ReactorNotRestartable error in while loop with scrapy
By default, `CrawlerProcess.start()` will stop the Twisted reactor it creates when all crawlers have finished.
You should call `process.start(stop_after_crawl=False)` if you create a new `process` in each iteration of the loop.
Another option is to handle the Twisted reactor yourself and use CrawlerRunner
. The docs have an example on doing that.
I was able to solve this problem like this: `process.start()` should be called only once, after all crawls have been scheduled.
from time import sleepfrom scrapy import signalsfrom scrapy.crawler import CrawlerProcessfrom scrapy.utils.project import get_project_settingsfrom scrapy.xlib.pydispatch import dispatcherresult = Nonedef set_result(item): result = itemwhile True: process = CrawlerProcess(get_project_settings()) dispatcher.connect(set_result, signals.item_scraped) process.crawl('my_spider')process.start()
Ref http://crawl.blog/scrapy-loop/
import scrapy from scrapy.crawler import CrawlerProcess from scrapy.utils.project import get_project_settings from twisted.internet import reactor from twisted.internet.task import deferLater def sleep(self, *args, seconds): """Non blocking sleep callback""" return deferLater(reactor, seconds, lambda: None) process = CrawlerProcess(get_project_settings()) def _crawl(result, spider): deferred = process.crawl(spider) deferred.addCallback(lambda results: print('waiting 100 seconds before restart...')) deferred.addCallback(sleep, seconds=100) deferred.addCallback(_crawl, spider) return deferred_crawl(None, MySpider)process.start()