-
Notifications
You must be signed in to change notification settings - Fork 50
Expand file tree
/
Copy path_fsid.py
More file actions
129 lines (111 loc) · 4.7 KB
/
_fsid.py
File metadata and controls
129 lines (111 loc) · 4.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
"""Filesystem identity (fsid) fallback computation.
This module provides `_fallback_fsid` to compute filesystem identity from
protocol, storage_options, and fsspec global config (`fsspec.config.conf`)
without instantiating the filesystem.
The fsid is used by __eq__, relative_to, and is_relative_to to determine
if two paths are on the same filesystem. The key insight is that many
storage_options (like authentication or performance settings) don't affect
*which* filesystem is being accessed, only *how* it's accessed.
For filesystems where fsid cannot be determined (e.g., memory filesystem,
unknown protocols), `_fallback_fsid` returns None and callers fall back to
comparing storage_options directly.
"""
from __future__ import annotations
from collections import ChainMap
from collections.abc import Mapping
from typing import Any
from fsspec.config import conf as fsspec_conf
from fsspec.utils import tokenize
__all__ = ["_fallback_fsid"]
def _fallback_fsid(protocol: str, storage_options: Mapping[str, Any]) -> str | None:
"""Compute fsid from protocol, storage_options, and fsspec global config."""
global_opts = fsspec_conf.get(protocol)
opts: Mapping[str, Any] = (
ChainMap(storage_options, global_opts) # type: ignore[arg-type]
if global_opts
else storage_options
)
match protocol:
# Static fsid (no instance attributes needed)
case "" | "file" | "local":
return "local"
case "http" | "https":
return "http"
case "memory" | "memfs":
return None # Non-durable, fall back to storage_options
case "data":
return None # Non-durable
# Host + port based
case "sftp" | "ssh":
host = opts.get("host", "")
port = opts.get("port", 22)
return f"sftp_{tokenize(host, port)}" if host else None
case "smb":
host = opts.get("host", "")
port = opts.get("port", 445)
return f"smb_{tokenize(host, port)}" if host else None
case "ftp":
host = opts.get("host", "")
port = opts.get("port", 21)
return f"ftp_{tokenize(host, port)}" if host else None
case "webhdfs" | "webHDFS":
host = opts.get("host", "")
port = opts.get("port", 50070)
return f"webhdfs_{tokenize(host, port)}" if host else None
# Cloud object storage
case "s3" | "s3a":
endpoint = opts.get("endpoint_url", "https://s3.amazonaws.com")
# Normalize AWS endpoints
from urllib.parse import urlparse
parsed = urlparse(endpoint)
if parsed.netloc.endswith(".amazonaws.com"):
return "s3_aws"
return f"s3_{tokenize(endpoint)}"
case "gcs" | "gs":
return "gcs" # Single global endpoint
case "abfs" | "az":
account = opts.get("account_name", "")
return f"abfs_{tokenize(account)}" if account else None
case "adl":
tenant = opts.get("tenant_id", "")
store = opts.get("store_name", "")
return f"adl_{tokenize(tenant, store)}" if tenant and store else None
case "oci":
region = opts.get("region", "")
return f"oci_{tokenize(region)}" if region else None
case "oss":
endpoint = opts.get("endpoint", "")
return f"oss_{tokenize(endpoint)}" if endpoint else None
# Git-based
case "git":
path = opts.get("path", "")
ref = opts.get("ref", "")
return f"git_{tokenize(path, ref)}" if path else None
case "github":
org = opts.get("org", "")
repo = opts.get("repo", "")
sha = opts.get("sha", "")
return f"github_{tokenize(org, repo, sha)}" if org and repo else None
# Platform-specific
case "hf":
endpoint = opts.get("endpoint", "huggingface.co")
return f"hf_{tokenize(endpoint)}"
case "lakefs":
host = opts.get("host", "")
return f"lakefs_{tokenize(host)}" if host else None
case "webdav":
base_url = opts.get("base_url", "")
return f"webdav_{tokenize(base_url)}" if base_url else None
case "box":
return "box"
case "dropbox":
return "dropbox"
# Wrappers - delegate to underlying
case "simplecache" | "filecache" | "blockcache" | "cached":
return None # Complex, fall back
# Archive filesystems - need underlying fs info
case "zip" | "tar":
return None # Complex, fall back
# Default: unknown protocol, fall back to storage_options
case _:
return None