Scraping concurrently with selenium in python

Here is a different approach that I've had success with: you start your workers from __main__, and each worker pulls jobs from the shared task_q.

import multiprocessing
import traceback

class scrapeWorker(multiprocessing.Process):
    def __init__(self, worker_num, task_q, result_q):
        super().__init__()
        self.worker_num = worker_num
        self.task_q = task_q
        self.result_q = result_q
        self.scraper = my_scraper_class()  # this contains driver code, methods, etc.

    def handleWork(self, work):
        assert isinstance(work, (tuple, list)), "work should be a tuple or list. found {}".format(type(work))
        assert len(work) == 2, "len(work) != 2. found {}".format(work)
        assert isinstance(work[1], dict), "work[1] should be a dict. found {}".format(type(work[1]))

        # do the work: call the named scraper method with the given kwargs
        result = getattr(self.scraper, work[0])(**work[1])

        self.result_q.put(result)

    # worker.run() is actually called via worker.start()
    def run(self):
        try:
            self.scraper.startDriving()

            while True:
                work = self.task_q.get()

                if work == 'KILL':
                    self.scraper.driver.quit()
                    break

                self.handleWork(work)
        except Exception:
            print(traceback.format_exc())
            raise

if __name__ == "__main__":
    num_workers = 4

    manager = multiprocessing.Manager()
    task_q = manager.Queue()
    result_q = manager.Queue()

    # start the workers; each one drives its own browser in its own process
    workers = []
    for worker_num in range(num_workers):
        worker = scrapeWorker(worker_num, task_q, result_q)
        worker.start()
        workers.append(worker)

    # you decide what job_stuff is
    # work == [ 'method_name', {'kw_1': val_1, ...} ]
    for work in job_stuff:
        task_q.put(work)

    # collect one result per submitted job
    results = []
    while len(results) < len(job_stuff):
        results.append(result_q.get())

    # tell every worker to shut down its driver and exit
    for worker in workers:
        task_q.put("KILL")

    for worker in workers:
        worker.join()

    print("finished!")
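
For completeness, here is a minimal sketch of what my_scraper_class and job_stuff could look like. The class name and startDriving() come from the worker code above; the scrape_title method, the headless Chrome options, and the example URLs are assumptions purely for illustration. The key points are that each work item has the shape ['method_name', {kwargs}] the worker expects, and that whatever the method returns must be picklable since it travels back through result_q.

from selenium import webdriver

class my_scraper_class(object):
    def startDriving(self):
        # each worker process launches its own browser instance
        # (headless Chrome here is just an assumption)
        options = webdriver.ChromeOptions()
        options.add_argument("--headless")
        self.driver = webdriver.Chrome(options=options)

    def scrape_title(self, url):
        # hypothetical scrape method: navigate and return something picklable
        self.driver.get(url)
        return (url, self.driver.title)

# hypothetical job list; each item matches [ 'method_name', {'kw_1': val_1, ...} ]
job_stuff = [
    ['scrape_title', {'url': 'https://example.com/page/1'}],
    ['scrape_title', {'url': 'https://example.com/page/2'}],
]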