Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 14 additions & 3 deletions scrapling/spiders/request.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,13 @@ def _convert_to_bytes(value: str | bytes) -> bytes:
return value.encode(encoding="utf-8", errors="ignore")


def _stable_value_repr(value: Any) -> str:
    """Return a text form of *value* suitable for inclusion in a fingerprint.

    The value is serialized with ``orjson`` using sorted keys, so two dicts
    holding the same items in a different insertion order yield the same
    text. Objects ``orjson`` cannot serialize natively are passed through
    ``repr``. If serialization still raises ``TypeError``, the plain
    ``repr(value)`` is returned instead.

    NOTE(review): the result is only as stable as the ``__repr__`` of the
    objects involved -- confirm callers do not pass objects whose repr
    changes between runs.
    """
    try:
        encoded = orjson.dumps(value, default=repr, option=orjson.OPT_SORT_KEYS)
    except TypeError:
        # Serialization failed even with the repr fallback; use repr directly.
        return repr(value)
    return encoded.decode()


class Request:
def __init__(
self,
Expand Down Expand Up @@ -97,15 +104,19 @@ def update_fingerprint(
}

if include_kwargs:
kwargs = (key.lower() for key in self._session_kwargs.keys() if key.lower() not in ("data", "json"))
data["kwargs"] = "".join(set(_convert_to_bytes(key).hex() for key in kwargs))
filtered_kwargs = {
key.lower(): _stable_value_repr(value)
for key, value in self._session_kwargs.items()
if key.lower() not in ("data", "json")
}
data["kwargs"] = tuple(sorted(filtered_kwargs.items()))

if include_headers:
headers = self._session_kwargs.get("headers") or self._session_kwargs.get("extra_headers") or {}
processed_headers = {}
# Some header normalization
for key, value in headers.items():
processed_headers[_convert_to_bytes(key.lower()).hex()] = _convert_to_bytes(value.lower()).hex()
processed_headers[_convert_to_bytes(key.lower()).hex()] = _convert_to_bytes(value).hex()
data["headers"] = tuple(processed_headers.items())

fp = hashlib.sha1(orjson.dumps(data, option=orjson.OPT_SORT_KEYS), usedforsecurity=False).digest()
Expand Down
32 changes: 32 additions & 0 deletions tests/spiders/test_request.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,38 @@ def test_fingerprint_different_urls(self):
r2 = Request("https://example.com/page2")
assert r1.update_fingerprint() != r2.update_fingerprint()

def test_fingerprint_include_kwargs_uses_kwarg_values(self):
    """Requests differing only in a kwarg value must fingerprint differently."""
    short_timeout = Request("https://example.com", timeout=1)
    long_timeout = Request("https://example.com", timeout=2)

    fp_short = short_timeout.update_fingerprint(include_kwargs=True)
    fp_long = long_timeout.update_fingerprint(include_kwargs=True)

    assert fp_short != fp_long

def test_fingerprint_include_kwargs_handles_non_primitive_values(self):
    """Dict and custom-object kwargs must fingerprint deterministically.

    Two requests with equal kwargs share a fingerprint; changing a value
    nested inside the ``proxies`` dict produces a different one.
    """

    class _Opaque:
        # Fixed repr so the object's text form is the same on every call.
        def __repr__(self) -> str:
            return "_Opaque(stable)"

    opaque = _Opaque()
    baseline = Request("https://example.com", proxies={"http": "p1"}, custom=opaque)
    duplicate = Request("https://example.com", proxies={"http": "p1"}, custom=opaque)
    other_proxy = Request("https://example.com", proxies={"http": "p2"}, custom=opaque)

    baseline_fp = baseline.update_fingerprint(include_kwargs=True)
    # presumably clears a cached fingerprint so it is recomputed -- TODO confirm
    duplicate._fp = None
    duplicate_fp = duplicate.update_fingerprint(include_kwargs=True)
    other_proxy_fp = other_proxy.update_fingerprint(include_kwargs=True)

    assert baseline_fp == duplicate_fp
    assert baseline_fp != other_proxy_fp

def test_fingerprint_include_headers_preserves_header_value_case(self):
    """Header values differing only in letter case must fingerprint differently."""
    upper_value = Request("https://example.com", headers={"X-Test": "A"})
    lower_value = Request("https://example.com", headers={"X-Test": "a"})

    fp_upper = upper_value.update_fingerprint(include_headers=True)
    fp_lower = lower_value.update_fingerprint(include_headers=True)

    assert fp_upper != fp_lower


class TestRequestCopy:
"""Test Request copy functionality."""
Expand Down