Scraping concurrently with Selenium in Python
Here is a different approach that I've had success with: you create and start the workers under __main__, and each worker pulls its jobs from a shared task_q.
import multiprocessing
import traceback


class scrapeWorker(multiprocessing.Process):
    def __init__(self, worker_num, task_q, result_q):
        super().__init__()
        self.worker_num = worker_num
        self.task_q = task_q
        self.result_q = result_q
        self.scraper = my_scraper_class()  # this contains driver code, methods, etc.

    def handleWork(self, work):
        assert isinstance(work, (tuple, list)), "work should be a tuple or list. found {}".format(type(work))
        assert len(work) == 2, "len(work) != 2. found {}".format(work)
        assert isinstance(work[1], dict), "work[1] should be a dict. found {}".format(type(work[1]))

        # do the work: work == ['method_name', {kwargs}]
        result = getattr(self.scraper, work[0])(**work[1])

        self.result_q.put(result)

    # worker.run() is actually called via worker.start()
    def run(self):
        try:
            # the driver is started here, inside the child process,
            # so each worker owns its own browser instance
            self.scraper.startDriving()

            while True:
                work = self.task_q.get()

                # 'KILL' is the sentinel telling this worker to shut down
                if work == 'KILL':
                    self.scraper.driver.quit()
                    break

                self.handleWork(work)
        except Exception:
            print(traceback.format_exc())
            raise


if __name__ == "__main__":
    num_workers = 4

    manager = multiprocessing.Manager()
    task_q = manager.Queue()
    result_q = manager.Queue()

    # start the workers; each blocks on task_q.get() until work arrives
    workers = []
    for worker_num in range(num_workers):
        worker = scrapeWorker(worker_num, task_q, result_q)
        worker.start()
        workers.append(worker)

    # you decide what job_stuff is
    # work == ['method_name', {'kw_1': val_1, ...}]
    for work in job_stuff:
        task_q.put(work)

    # collect one result per job before shutting anything down
    results = []
    while len(results) < len(job_stuff):
        results.append(result_q.get())

    # tell each worker to quit its driver and exit its loop
    for worker in workers:
        task_q.put('KILL')

    for worker in workers:
        worker.join()

    print("finished!")
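The code above deliberately leaves my_scraper_class and job_stuff up to you. As a minimal sketch of what they could look like, assuming Chrome as the browser and a hypothetical scrape_page method (the scraper just needs methods you can name in a work item), something like:

from selenium import webdriver

class my_scraper_class:
    # called once per worker, inside run(), so each process
    # gets its own browser instance
    def startDriving(self):
        self.driver = webdriver.Chrome()

    # scrape_page is a hypothetical example method; any method
    # named in a work item is dispatched via getattr() in handleWork()
    def scrape_page(self, url):
        self.driver.get(url)
        return self.driver.title

# each work item is ['method_name', {kwargs}], matching handleWork()
job_stuff = [
    ['scrape_page', {'url': 'https://example.com'}],
    ['scrape_page', {'url': 'https://example.org'}],
]

The key design point is that the driver is created in startDriving(), which runs inside the child process, not in __init__, which runs in the parent: a live browser handle can't be pickled across process boundaries.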