-
Notifications
You must be signed in to change notification settings - Fork 82
perf(az): optimize non-recursive _list_dir
#447
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,9 +1,8 @@ | ||
| from datetime import datetime, timedelta | ||
| import mimetypes | ||
| import os | ||
| from pathlib import Path, PurePosixPath | ||
| from typing import Any, Callable, Dict, Iterable, Optional, Tuple, Union | ||
|
|
||
| from pathlib import Path | ||
| from typing import Any, Callable, Dict, Iterable, Optional, Tuple, Union, cast | ||
|
|
||
| from ..client import Client, register_client_class | ||
| from ..cloudpath import implementation_registry | ||
|
|
@@ -20,6 +19,7 @@ | |
| BlobProperties, | ||
| ContentSettings, | ||
| generate_blob_sas, | ||
| BlobPrefix, | ||
| ) | ||
| except ModuleNotFoundError: | ||
| implementation_registry["azure"].dependencies_loaded = False | ||
|
|
@@ -155,7 +155,20 @@ def _is_file_or_dir(self, cloud_path: AzureBlobPath) -> Optional[str]: | |
| return "dir" | ||
|
|
||
| try: | ||
| self._get_metadata(cloud_path) | ||
| # the result of get_blob_properties is a BlobProperties object (dict mixin) | ||
| metadata = cast(BlobProperties, self._get_metadata(cloud_path)) | ||
|
|
||
| # content_type and content_md5 are never None for files in Azure Blob Storage | ||
| # if they are None, then the given path is a directory | ||
| # https://learn.microsoft.com/en-us/rest/api/storageservices/get-blob-properties?tabs=microsoft-entra-id#response-headers | ||
| is_folder = ( | ||
| metadata.content_settings.content_type is None | ||
| and metadata.content_settings.content_md5 is None | ||
| ) | ||
|
|
||
| if is_folder: | ||
| return "dir" | ||
|
|
||
| return "file" | ||
| except ResourceNotFoundError: | ||
| prefix = cloud_path.blob | ||
|
|
@@ -181,17 +194,14 @@ def _exists(self, cloud_path: AzureBlobPath) -> bool: | |
| def _list_dir( | ||
| self, cloud_path: AzureBlobPath, recursive: bool = False | ||
| ) -> Iterable[Tuple[AzureBlobPath, bool]]: | ||
| # shortcut if listing all available containers | ||
| if not cloud_path.container: | ||
| if recursive: | ||
| raise NotImplementedError( | ||
| "Cannot recursively list all containers and contents; you can get all the containers then recursively list each separately." | ||
| ) | ||
|
|
||
| yield from ( | ||
| (self.CloudPath(f"az://{c.name}"), True) | ||
| for c in self.service_client.list_containers() | ||
| ) | ||
| for container in self.service_client.list_containers(): | ||
| yield self.CloudPath(f"az://{container.name}"), True | ||
|
|
||
| if not recursive: | ||
| continue | ||
|
|
||
| yield from self._list_dir(self.CloudPath(f"az://{container.name}"), recursive=True) | ||
| return | ||
|
|
||
| container_client = self.service_client.get_container_client(cloud_path.container) | ||
|
|
@@ -200,30 +210,30 @@ def _list_dir( | |
| if prefix and not prefix.endswith("/"): | ||
| prefix += "/" | ||
|
|
||
| yielded_dirs = set() | ||
|
|
||
| # NOTE: Not recursive may be slower than necessary since it just filters | ||
| # the recursive implementation | ||
| for o in container_client.list_blobs(name_starts_with=prefix): | ||
| # get directory from this path | ||
| for parent in PurePosixPath(o.name[len(prefix) :]).parents: | ||
| # if we haven't surfaced this directory already | ||
| if parent not in yielded_dirs and str(parent) != ".": | ||
| # skip if not recursive and this is beyond our depth | ||
| if not recursive and "/" in str(parent): | ||
| continue | ||
|
|
||
| yield ( | ||
| self.CloudPath(f"az://{cloud_path.container}/{prefix}{parent}"), | ||
| True, # is a directory | ||
| ) | ||
| yielded_dirs.add(parent) | ||
|
|
||
| # skip file if not recursive and this is beyond our depth | ||
| if not recursive and "/" in o.name[len(prefix) :]: | ||
| continue | ||
|
|
||
| yield (self.CloudPath(f"az://{cloud_path.container}/{o.name}"), False) # is a file | ||
| if not recursive: | ||
| blobs = container_client.walk_blobs(name_starts_with=prefix) | ||
|
M0dEx marked this conversation as resolved.
|
||
| else: | ||
| blobs = container_client.list_blobs(name_starts_with=prefix) | ||
|
|
||
| def is_dir(blob: Union[BlobProperties, BlobPrefix]) -> bool: | ||
| # walk_blobs returns a BlobPrefix object for directories | ||
| # https://learn.microsoft.com/en-us/python/api/azure-storage-blob/azure.storage.blob.blobprefix?view=azure-python | ||
| if isinstance(blob, BlobPrefix): | ||
| return True | ||
|
|
||
| # content_type and content_md5 are never None for files in Azure Blob Storage | ||
| # if they are None, then the given path is a directory | ||
| # https://learn.microsoft.com/en-us/rest/api/storageservices/get-blob-properties?tabs=microsoft-entra-id#response-headers | ||
| return ( | ||
| blob.content_settings.content_md5 is None | ||
| and blob.content_settings.content_type is None | ||
| ) | ||
|
|
||
| for blob in blobs: | ||
| # walk_blobs returns folders with a trailing slash | ||
| blob_path = blob.name.rstrip("/") | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is it problematic to keep the trailing slashes?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I did it to be consistent with the current behaviour, but we can simply not remove the trailing slashes. |
||
| blob_cloud_path = self.CloudPath(f"az://{cloud_path.container}/{blob_path}") | ||
| yield blob_cloud_path, is_dir(blob) | ||
|
|
||
| def _move_file( | ||
| self, src: AzureBlobPath, dst: AzureBlobPath, remove_src: bool = True | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you point to a more specific reference? I don't see the link you sent supporting the claim that this is definitive for folders.
It does seem like
x-ms-resource-typeis useful to answer the question directly, but only for certain configurations so could be included as an optimization. It may be the case that all the scenarios where those vars are none are also ones wherex-ms-resource-typeis available.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I agree the reference is sort of vague. I was mainly going of off these two statements:
Using this, coupled with our observations from real-life usage, we concluded that these parameters are always
Nonefor folders and neverNonefor blobs, even if they are empty.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Does your configuration have the
x-ms-resource-typeheader for these folder blobs?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
That header does not seem to be returned within
BlobProperties, nor can I see it in the in-browser Azure storage explorer.