
Commit 8ddf081

Correctly detect when all managed crawlers are done in CrawlerRunner
1 parent 68954fa commit 8ddf081

2 files changed: 27 additions, 16 deletions


scrapy/commands/shell.py

Lines changed: 2 additions & 1 deletion
@@ -53,7 +53,8 @@ def run(self, args, opts):
 
         # The crawler is created this way since the Shell manually handles the
         # crawling engine, so the set up in the crawl method won't work
-        crawler = self.crawler_process._create_logged_crawler(spidercls)
+        crawler = self.crawler_process._create_crawler(spidercls)
+        self.crawler_process._setup_crawler_logging(crawler)
         # The Shell class needs a persistent engine in the crawler
         crawler.engine = crawler._create_engine()
         crawler.engine.start()

scrapy/crawler.py

Lines changed: 25 additions & 15 deletions
@@ -76,22 +76,21 @@ def __init__(self, settings):
         smcls = load_object(settings['SPIDER_MANAGER_CLASS'])
         self.spiders = smcls.from_settings(settings.frozencopy())
         self.crawlers = set()
-        self.crawl_deferreds = set()
+        self._active = set()
 
     def crawl(self, spidercls, *args, **kwargs):
-        crawler = self._create_logged_crawler(spidercls)
+        crawler = self._create_crawler(spidercls)
+        self._setup_crawler_logging(crawler)
         self.crawlers.add(crawler)
-
         d = crawler.crawl(*args, **kwargs)
-        self.crawl_deferreds.add(d)
-        return d
+        self._active.add(d)
 
-    def _create_logged_crawler(self, spidercls):
-        crawler = self._create_crawler(spidercls)
-        log_observer = log.start_from_crawler(crawler)
-        if log_observer:
-            crawler.signals.connect(log_observer.stop, signals.engine_stopped)
-        return crawler
+        def _done(result):
+            self.crawlers.discard(crawler)
+            self._active.discard(d)
+            return result
+
+        return d.addBoth(_done)
 
     def _create_crawler(self, spidercls):
         if isinstance(spidercls, six.string_types):
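
Note on the hunk above: the runner previously only ever added to crawl_deferreds, so it had no way to tell when every crawl had finished. The _done callback registered with addBoth runs on success and on failure alike, which is what keeps _active accurate. A minimal Twisted sketch of that bookkeeping pattern (illustrative only, not part of this commit):

from twisted.internet import defer

active = set()

def track(d):
    """Track a deferred until it fires, mirroring CrawlerRunner._active above."""
    active.add(d)

    def _done(result):
        active.discard(d)   # runs whether the deferred succeeded or failed
        return result       # pass the original result/failure through unchanged
    return d.addBoth(_done)

d = track(defer.Deferred())
assert active               # still pending
d.callback("finished")
assert not active           # fired, so it left the active set
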
@@ -100,13 +99,22 @@ def _create_crawler(self, spidercls):
         crawler_settings = self.settings.copy()
         spidercls.update_settings(crawler_settings)
         crawler_settings.freeze()
+        return Crawler(spidercls, crawler_settings)
 
-        crawler = Crawler(spidercls, crawler_settings)
-        return crawler
+    def _setup_crawler_logging(self, crawler):
+        log_observer = log.start_from_crawler(crawler)
+        if log_observer:
+            crawler.signals.connect(log_observer.stop, signals.engine_stopped)
 
     def stop(self):
         return defer.DeferredList(c.stop() for c in self.crawlers)
 
+    @defer.inlineCallbacks
+    def join(self):
+        """Wait for all managed crawlers to complete"""
+        while self._active:
+            yield defer.DeferredList(self._active)
+
 
 class CrawlerProcess(CrawlerRunner):
     """A class to run multiple scrapy crawlers in a process simultaneously"""
@@ -135,13 +143,15 @@ def _signal_kill(self, signum, _):
 
     def start(self, stop_after_crawl=True):
         if stop_after_crawl:
-            d = defer.DeferredList(self.crawl_deferreds)
+            d = self.join()
+            # Don't start the reactor if the deferreds are already fired
            if d.called:
-                # Don't start the reactor if the deferreds are already fired
                return
            d.addBoth(lambda _: self._stop_reactor())
+
        if self.settings.getbool('DNSCACHE_ENABLED'):
            reactor.installResolver(CachingThreadedResolver(reactor))
+
        reactor.addSystemEventTrigger('before', 'shutdown', self.stop)
        reactor.run(installSignalHandlers=False)  # blocking call
 
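
With join() in place, a caller can start several crawls through one CrawlerRunner and wait for all of them to complete before stopping the reactor. A minimal usage sketch; the spider class, get_project_settings, and the reactor wiring are assumptions for illustration, not part of this commit:

from twisted.internet import reactor

import scrapy
from scrapy.crawler import CrawlerRunner
from scrapy.utils.project import get_project_settings


class ExampleSpider(scrapy.Spider):
    """Hypothetical spider, used only to demonstrate the runner."""
    name = 'example'
    start_urls = ['http://example.com/']

    def parse(self, response):
        self.log(response.url)


runner = CrawlerRunner(get_project_settings())
runner.crawl(ExampleSpider)   # each crawl's deferred is tracked in _active
runner.crawl(ExampleSpider)   # a second crawl; join() waits for both

d = runner.join()             # fires only once every managed crawler is done
d.addBoth(lambda _: reactor.stop())
reactor.run()

CrawlerProcess.start(stop_after_crawl=True) now goes through the same join() call, so crawls that are added while others are still running are also awaited before the reactor is stopped.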
