Skip to content

Commit 6d6da78

Browse files
booghetaGallaecio
authored and committed
Add a keep_fragments parameter to the request_fingerprint function (scrapy#4104)
1 parent 66cbcee commit 6d6da78

File tree

2 files changed

+19
-6
lines changed

2 files changed

+19
-6
lines changed

scrapy/utils/request.py

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616

1717

1818
_fingerprint_cache = weakref.WeakKeyDictionary()
19-
def request_fingerprint(request, include_headers=None):
19+
def request_fingerprint(request, include_headers=None, keep_fragments=False):
2020
"""
2121
Return the request fingerprint.
2222
@@ -42,24 +42,30 @@ def request_fingerprint(request, include_headers=None):
4242
the fingerprint. If you want to include specific headers use the
4343
include_headers argument, which is a list of Request headers to include.
4444
45+
Also, servers usually ignore fragments in urls when handling requests,
46+
so they are also ignored by default when calculating the fingerprint.
47+
If you want to include them, set the keep_fragments argument to True
48+
(for instance when handling requests with a headless browser).
49+
4550
"""
4651
if include_headers:
4752
include_headers = tuple(to_bytes(h.lower())
4853
for h in sorted(include_headers))
4954
cache = _fingerprint_cache.setdefault(request, {})
50-
if include_headers not in cache:
55+
cache_key = (include_headers, keep_fragments)
56+
if cache_key not in cache:
5157
fp = hashlib.sha1()
5258
fp.update(to_bytes(request.method))
53-
fp.update(to_bytes(canonicalize_url(request.url)))
59+
fp.update(to_bytes(canonicalize_url(request.url, keep_fragments=keep_fragments)))
5460
fp.update(request.body or b'')
5561
if include_headers:
5662
for hdr in include_headers:
5763
if hdr in request.headers:
5864
fp.update(hdr)
5965
for v in request.headers.getlist(hdr):
6066
fp.update(v)
61-
cache[include_headers] = fp.hexdigest()
62-
return cache[include_headers]
67+
cache[cache_key] = fp.hexdigest()
68+
return cache[cache_key]
6369

6470

6571
def request_authenticate(request, username, password):

tests/test_utils_request.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ def test_request_fingerprint(self):
1717
self.assertNotEqual(request_fingerprint(r1), request_fingerprint(r2))
1818

1919
# make sure caching is working
20-
self.assertEqual(request_fingerprint(r1), _fingerprint_cache[r1][None])
20+
self.assertEqual(request_fingerprint(r1), _fingerprint_cache[r1][(None, False)])
2121

2222
r1 = Request("http://www.example.com/members/offers.html")
2323
r2 = Request("http://www.example.com/members/offers.html")
@@ -42,6 +42,13 @@ def test_request_fingerprint(self):
4242
self.assertEqual(request_fingerprint(r3, include_headers=['accept-language', 'sessionid']),
4343
request_fingerprint(r3, include_headers=['SESSIONID', 'Accept-Language']))
4444

45+
r1 = Request("http://www.example.com/test.html")
46+
r2 = Request("http://www.example.com/test.html#fragment")
47+
self.assertEqual(request_fingerprint(r1), request_fingerprint(r2))
48+
self.assertEqual(request_fingerprint(r1), request_fingerprint(r1, keep_fragments=True))
49+
self.assertNotEqual(request_fingerprint(r2), request_fingerprint(r2, keep_fragments=True))
50+
self.assertNotEqual(request_fingerprint(r1), request_fingerprint(r2, keep_fragments=True))
51+
4552
r1 = Request("http://www.example.com")
4653
r2 = Request("http://www.example.com", method='POST')
4754
r3 = Request("http://www.example.com", method='POST', body=b'request body')

0 commit comments

Comments
 (0)