Skip to content

Commit 90a853d

Browse files
authored
fix(request fp): hash request kwargs and headers correctly (#255)
2 parents f305580 + 809d478 commit 90a853d

2 files changed

Lines changed: 46 additions & 3 deletions

File tree

scrapling/spiders/request.py

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,13 @@ def _convert_to_bytes(value: str | bytes) -> bytes:
2222
return value.encode(encoding="utf-8", errors="ignore")
2323

2424

25+
def _stable_value_repr(value: Any) -> str:
    """Return a deterministic string form of ``value`` for fingerprinting.

    The value is serialized with orjson using sorted dictionary keys, so two
    equal mappings built in a different insertion order yield the same text.
    Objects orjson cannot encode natively are passed through ``repr``.

    :param value: Any keyword-argument value attached to a request.
    :return: JSON text when orjson can encode ``value``; otherwise ``repr(value)``.
    """
    # Without OPT_NON_STR_KEYS a dict keyed by int/float/bool/None/etc. makes
    # orjson raise, and the ``repr`` fallback below is insertion-order
    # dependent -- equal dicts would then fingerprint differently.
    options = orjson.OPT_SORT_KEYS | orjson.OPT_NON_STR_KEYS
    try:
        return orjson.dumps(value, option=options, default=repr).decode()
    except TypeError:
        # orjson.JSONEncodeError is a TypeError subclass; anything orjson still
        # rejects (e.g. tuple dict keys, circular references) lands here.
        return repr(value)
30+
31+
2532
class Request:
2633
def __init__(
2734
self,
@@ -97,15 +104,19 @@ def update_fingerprint(
97104
}
98105

99106
if include_kwargs:
100-
kwargs = (key.lower() for key in self._session_kwargs.keys() if key.lower() not in ("data", "json"))
101-
data["kwargs"] = "".join(set(_convert_to_bytes(key).hex() for key in kwargs))
107+
filtered_kwargs = {
108+
key.lower(): _stable_value_repr(value)
109+
for key, value in self._session_kwargs.items()
110+
if key.lower() not in ("data", "json")
111+
}
112+
data["kwargs"] = tuple(sorted(filtered_kwargs.items()))
102113

103114
if include_headers:
104115
headers = self._session_kwargs.get("headers") or self._session_kwargs.get("extra_headers") or {}
105116
processed_headers = {}
106117
# Some header normalization
107118
for key, value in headers.items():
108-
processed_headers[_convert_to_bytes(key.lower()).hex()] = _convert_to_bytes(value.lower()).hex()
119+
processed_headers[_convert_to_bytes(key.lower()).hex()] = _convert_to_bytes(value).hex()
109120
data["headers"] = tuple(processed_headers.items())
110121

111122
fp = hashlib.sha1(orjson.dumps(data, option=orjson.OPT_SORT_KEYS), usedforsecurity=False).digest()

tests/spiders/test_request.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,38 @@ def test_fingerprint_different_urls(self):
9999
r2 = Request("https://example.com/page2")
100100
assert r1.update_fingerprint() != r2.update_fingerprint()
101101

102+
def test_fingerprint_include_kwargs_uses_kwarg_values(self):
    """Requests differing only in a kwarg value must not share a fingerprint."""
    url = "https://example.com"
    slow, slower = Request(url, timeout=1), Request(url, timeout=2)

    slow_fp = slow.update_fingerprint(include_kwargs=True)
    slower_fp = slower.update_fingerprint(include_kwargs=True)

    assert slow_fp != slower_fp
108+
109+
def test_fingerprint_include_kwargs_handles_non_primitive_values(self):
    """Kwargs holding dicts and arbitrary objects fingerprint consistently.

    Two requests with equal kwargs must match, and changing a nested dict
    value must change the fingerprint.
    """

    class _Opaque:
        def __repr__(self) -> str:
            return "_Opaque(stable)"

    marker = _Opaque()

    def build(proxy):
        # Every request shares the same opaque object; only the proxy varies.
        return Request("https://example.com", proxies={"http": proxy}, custom=marker)

    first = build("p1")
    twin = build("p1")
    other = build("p2")

    first_fp = first.update_fingerprint(include_kwargs=True)
    # Kept from the original test: presumably clears a cached fingerprint so
    # the next call recomputes it (Request internals are not visible here).
    twin._fp = None
    twin_fp = twin.update_fingerprint(include_kwargs=True)
    other_fp = other.update_fingerprint(include_kwargs=True)

    assert first_fp == twin_fp
    assert first_fp != other_fp
126+
127+
def test_fingerprint_include_headers_preserves_header_value_case(self):
    """Header values that differ only by letter case yield distinct fingerprints."""
    url = "https://example.com"
    upper = Request(url, headers={"X-Test": "A"})
    lower = Request(url, headers={"X-Test": "a"})

    upper_fp = upper.update_fingerprint(include_headers=True)
    lower_fp = lower.update_fingerprint(include_headers=True)

    assert upper_fp != lower_fp
133+
102134

103135
class TestRequestCopy:
104136
"""Test Request copy functionality."""

0 commit comments

Comments
 (0)