
Commit 8047e0e

Rémi Lapeyre authored and taleinat committed
bpo-35922: Fix RobotFileParser when robots.txt has no relevant crawl delay or request rate (GH-11791)
Co-Authored-By: Tal Einat <taleinat+github@gmail.com>
1 parent 3a1d50e commit 8047e0e
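
The failure this commit fixes: when robots.txt defines no group matching the queried user agent and no default (User-agent: *) group, RobotFileParser.default_entry stays None, so crawl_delay() and request_rate() dereferenced None and raised AttributeError instead of reporting "no rule". A minimal sketch of the before/after behavior; the simplest trigger, used here, is an empty robots.txt:

    import urllib.robotparser

    parser = urllib.robotparser.RobotFileParser()
    parser.parse([])  # empty robots.txt: no groups are built, default_entry stays None

    # Before this commit, both calls raised
    # AttributeError: 'NoneType' object has no attribute 'delay' / 'req_rate'.
    # With the fix they return None, meaning "no relevant rule defined".
    print(parser.crawl_delay('*'))   # None
    print(parser.request_rate('*'))  # None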

3 files changed: +26 −14 lines


Lib/test/test_robotparser.py

Lines changed: 16 additions & 12 deletions
@@ -97,30 +97,38 @@ class RejectAllRobotsTest(BaseRobotTest, unittest.TestCase):
 
 
 class BaseRequestRateTest(BaseRobotTest):
+    request_rate = None
+    crawl_delay = None
 
     def test_request_rate(self):
+        parser = self.parser
         for url in self.good + self.bad:
             agent, url = self.get_agent_and_url(url)
             with self.subTest(url=url, agent=agent):
-                if self.crawl_delay:
-                    self.assertEqual(
-                        self.parser.crawl_delay(agent), self.crawl_delay
-                    )
-                if self.request_rate:
+                self.assertEqual(parser.crawl_delay(agent), self.crawl_delay)
+
+                parsed_request_rate = parser.request_rate(agent)
+                self.assertEqual(parsed_request_rate, self.request_rate)
+                if self.request_rate is not None:
                     self.assertIsInstance(
-                        self.parser.request_rate(agent),
+                        parsed_request_rate,
                         urllib.robotparser.RequestRate
                     )
                     self.assertEqual(
-                        self.parser.request_rate(agent).requests,
+                        parsed_request_rate.requests,
                         self.request_rate.requests
                     )
                     self.assertEqual(
-                        self.parser.request_rate(agent).seconds,
+                        parsed_request_rate.seconds,
                         self.request_rate.seconds
                     )
 
 
+class EmptyFileTest(BaseRequestRateTest, unittest.TestCase):
+    robots_txt = ''
+    good = ['/foo']
+
+
 class CrawlDelayAndRequestRateTest(BaseRequestRateTest, unittest.TestCase):
     robots_txt = """\
 User-agent: figtree
@@ -141,10 +149,6 @@ class CrawlDelayAndRequestRateTest(BaseRequestRateTest, unittest.TestCase):
 
 class DifferentAgentTest(CrawlDelayAndRequestRateTest):
     agent = 'FigTree Robot libwww-perl/5.04'
-    # these are not actually tested, but we still need to parse it
-    # in order to accommodate the input parameters
-    request_rate = None
-    crawl_delay = None
 
 
 class InvalidRequestRateTest(BaseRobotTest, unittest.TestCase):
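
The new EmptyFileTest drives the fixed code paths through the shared BaseRequestRateTest helper, whose class-level request_rate = None and crawl_delay = None now double as the expected values. A standalone sketch of the same checks; the test class name here is illustrative, not part of the suite:

    import unittest
    import urllib.robotparser

    class EmptyRobotsTxtSketch(unittest.TestCase):
        # Illustrative restatement of what EmptyFileTest exercises.
        def test_empty_file_defaults(self):
            parser = urllib.robotparser.RobotFileParser()
            parser.parse([])  # robots_txt = ''
            # An empty robots.txt allows everything...
            self.assertTrue(parser.can_fetch('*', '/foo'))
            # ...and, with this fix, reports "no rule" as None.
            self.assertIsNone(parser.crawl_delay('*'))
            self.assertIsNone(parser.request_rate('*'))

    if __name__ == '__main__':
        unittest.main()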

Lib/urllib/robotparser.py

Lines changed: 6 additions & 2 deletions
@@ -186,15 +186,19 @@ def crawl_delay(self, useragent):
         for entry in self.entries:
             if entry.applies_to(useragent):
                 return entry.delay
-        return self.default_entry.delay
+        if self.default_entry:
+            return self.default_entry.delay
+        return None
 
     def request_rate(self, useragent):
         if not self.mtime():
             return None
         for entry in self.entries:
             if entry.applies_to(useragent):
                 return entry.req_rate
-        return self.default_entry.req_rate
+        if self.default_entry:
+            return self.default_entry.req_rate
+        return None
 
     def site_maps(self):
         if not self.sitemaps:
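
The guard matters beyond empty files: default_entry is only populated by a "User-agent: *" group, so a robots.txt that names specific agents leaves it None for every other agent. A sketch under that assumption, with the agent name and directives borrowed from the test file's robots.txt:

    import urllib.robotparser

    parser = urllib.robotparser.RobotFileParser()
    parser.parse([
        'User-agent: figtree',
        'Crawl-delay: 3',
        'Disallow: /tmp',
    ])

    print(parser.crawl_delay('figtree'))      # 3, the figtree group applies
    # No matching group and no 'User-agent: *' default:
    # previously AttributeError, now None.
    print(parser.crawl_delay('AnotherBot'))   # None
    print(parser.request_rate('AnotherBot'))  # None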
Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
+Fix :meth:`RobotFileParser.crawl_delay` and
+:meth:`RobotFileParser.request_rate` to return ``None`` rather than
+raise :exc:`AttributeError` when no relevant rule is defined in the
+robots.txt file. Patch by Rémi Lapeyre.
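
For contrast, the default-entry fallback that the changed lines now guard still answers normally when a "User-agent: *" group exists; a sketch of that unchanged path:

    import urllib.robotparser

    parser = urllib.robotparser.RobotFileParser()
    parser.parse([
        'User-agent: *',
        'Crawl-delay: 5',
        'Disallow: /private',
    ])

    # 'User-agent: *' populates default_entry, so any agent gets the fallback:
    print(parser.crawl_delay('SomeBot'))   # 5
    # No Request-rate directive anywhere, so default_entry.req_rate is None:
    print(parser.request_rate('SomeBot'))  # None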
