diff --git a/scrapling/spiders/request.py b/scrapling/spiders/request.py index ce728ee2..ef766b2c 100644 --- a/scrapling/spiders/request.py +++ b/scrapling/spiders/request.py @@ -22,6 +22,13 @@ def _convert_to_bytes(value: str | bytes) -> bytes: return value.encode(encoding="utf-8", errors="ignore") +def _stable_value_repr(value: Any) -> str: + try: + return orjson.dumps(value, option=orjson.OPT_SORT_KEYS, default=repr).decode() + except TypeError: + return repr(value) + + class Request: def __init__( self, @@ -97,15 +104,19 @@ def update_fingerprint( } if include_kwargs: - kwargs = (key.lower() for key in self._session_kwargs.keys() if key.lower() not in ("data", "json")) - data["kwargs"] = "".join(set(_convert_to_bytes(key).hex() for key in kwargs)) + filtered_kwargs = { + key.lower(): _stable_value_repr(value) + for key, value in self._session_kwargs.items() + if key.lower() not in ("data", "json") + } + data["kwargs"] = tuple(sorted(filtered_kwargs.items())) if include_headers: headers = self._session_kwargs.get("headers") or self._session_kwargs.get("extra_headers") or {} processed_headers = {} # Some header normalization for key, value in headers.items(): - processed_headers[_convert_to_bytes(key.lower()).hex()] = _convert_to_bytes(value.lower()).hex() + processed_headers[_convert_to_bytes(key.lower()).hex()] = _convert_to_bytes(value).hex() data["headers"] = tuple(processed_headers.items()) fp = hashlib.sha1(orjson.dumps(data, option=orjson.OPT_SORT_KEYS), usedforsecurity=False).digest() diff --git a/tests/spiders/test_request.py b/tests/spiders/test_request.py index 997a71ba..00f6c49c 100644 --- a/tests/spiders/test_request.py +++ b/tests/spiders/test_request.py @@ -99,6 +99,38 @@ def test_fingerprint_different_urls(self): r2 = Request("https://example.com/page2") assert r1.update_fingerprint() != r2.update_fingerprint() + def test_fingerprint_include_kwargs_uses_kwarg_values(self): + """Test kwargs with different values produce different fingerprints.""" + r1 = Request("https://example.com", timeout=1) + r2 = Request("https://example.com", timeout=2) + + assert r1.update_fingerprint(include_kwargs=True) != r2.update_fingerprint(include_kwargs=True) + + def test_fingerprint_include_kwargs_handles_non_primitive_values(self): + class _Opaque: + def __repr__(self) -> str: + return "_Opaque(stable)" + + opaque = _Opaque() + r1 = Request("https://example.com", proxies={"http": "p1"}, custom=opaque) + r2 = Request("https://example.com", proxies={"http": "p1"}, custom=opaque) + r3 = Request("https://example.com", proxies={"http": "p2"}, custom=opaque) + + fp1 = r1.update_fingerprint(include_kwargs=True) + r2._fp = None + fp2 = r2.update_fingerprint(include_kwargs=True) + fp3 = r3.update_fingerprint(include_kwargs=True) + + assert fp1 == fp2 + assert fp1 != fp3 + + def test_fingerprint_include_headers_preserves_header_value_case(self): + """Test header values are fingerprinted without lowercasing.""" + r1 = Request("https://example.com", headers={"X-Test": "A"}) + r2 = Request("https://example.com", headers={"X-Test": "a"}) + + assert r1.update_fingerprint(include_headers=True) != r2.update_fingerprint(include_headers=True) + class TestRequestCopy: """Test Request copy functionality."""