Always run a constant number of subprocesses in parallel
One approach is to poll the running subprocesses and top the pool back up whenever a slot frees - this avoids callbacks entirely, since a Python callback can't be sent to a subprocess as a command-line parameter:
    import subprocess
    import time

    # Assumes urllist, a list of URL strings, is defined elsewhere
    NextURLNo = 0
    MaxProcesses = 20
    MaxUrls = 100000  # Note: this would be better as len(urllist)
    Processes = []

    def StartNew():
        """ Start a new subprocess if there is work to do. """
        global NextURLNo
        global Processes

        if NextURLNo < MaxUrls:
            proc = subprocess.Popen(['python', 'script.py', urllist[NextURLNo]])
            print("Started to process %s" % urllist[NextURLNo])
            NextURLNo += 1
            Processes.append(proc)

    def CheckRunning():
        """ Check any running processes and start new ones if there are spare slots. """
        global Processes
        global NextURLNo

        for p in range(len(Processes) - 1, -1, -1):  # Check the processes in reverse order
            if Processes[p].poll() is not None:  # poll() returns None while the process is still running
                del Processes[p]  # Remove from list - this is why we needed reverse order

        while (len(Processes) < MaxProcesses) and (NextURLNo < MaxUrls):  # More to do and some spare slots
            StartNew()

    if __name__ == "__main__":
        CheckRunning()  # This will start the max processes running
        while len(Processes) > 0:  # Something is still going on
            time.sleep(0.1)  # You may wish to change the time for this
            CheckRunning()

        print("Done!")
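For completeness, here is a minimal sketch of what the hypothetical script.py worker launched above might look like - the snippets only assume it takes a URL as its first command-line argument, so this fetch-and-report body is an assumption:

    # script.py - hypothetical worker: fetch the single URL given on the command line
    import sys
    import urllib.request

    def main():
        url = sys.argv[1]
        with urllib.request.urlopen(url) as response:
            data = response.read()
        print("Fetched %d bytes from %s" % (len(data), url))

    if __name__ == "__main__":
        main()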
Alternatively, keep count as you start them and have each subprocess trigger a callback as it exits, starting a new one whenever URL list entries remain to process.
For example, assuming that your subprocess somehow calls the OnExit method passed to it as it ends (passing a function this way is exactly what does not work as a command-line argument, which is why the polling version above avoids it):
    import subprocess
    import time

    # Assumes urllist, a list of URL strings, is defined elsewhere
    NextURLNo = 0
    MaxProcesses = 20
    NoSubProcess = 0
    MaxUrls = 100000

    def StartNew():
        """ Start a new subprocess if there is work to do. """
        global NextURLNo
        global NoSubProcess

        if NextURLNo < MaxUrls:
            # NB: a function object cannot actually be passed as a command-line
            # argument - see the note above and the watcher-thread sketch below
            subprocess.Popen(['python', 'script.py', urllist[NextURLNo], OnExit])
            print("Started to process %s" % urllist[NextURLNo])
            NextURLNo += 1
            NoSubProcess += 1

    def OnExit():
        global NoSubProcess
        NoSubProcess -= 1

    if __name__ == "__main__":
        for n in range(MaxProcesses):
            StartNew()
        while NoSubProcess > 0:
            time.sleep(1)
            if NextURLNo < MaxUrls:
                for n in range(NoSubProcess, MaxProcesses):
                    StartNew()
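Since a function object cannot cross a process boundary as a command-line argument, one workable substitute is a watcher thread per subprocess that blocks on wait() and fires the callback in the parent. A minimal sketch under that assumption (script.py and urllist are carried over from the snippets above):

    import subprocess
    import threading

    def launch_with_callback(args, on_exit):
        """ Start a subprocess and invoke on_exit in the parent once it finishes. """
        def watcher():
            proc = subprocess.Popen(args)
            proc.wait()   # blocks only this watcher thread, not the main program
            on_exit()     # runs on the watcher thread - guard shared state with a lock

        thread = threading.Thread(target=watcher)
        thread.start()
        return thread

    # Usage with the snippet above (OnExit would then need a lock around NoSubProcess):
    # launch_with_callback(['python', 'script.py', urllist[NextURLNo]], OnExit)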
To keep a constant number of concurrent requests, you could use a thread pool:
    #!/usr/bin/env python
    from multiprocessing.dummy import Pool  # thread pool behind the multiprocessing API

    def process_url(url):
        pass  # ... handle a single url

    urllist = [url1, url2, url3, .. , url100000]

    for _ in Pool(20).imap_unordered(process_url, urllist):
        pass
To run processes instead of threads, remove .dummy from the import.
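For instance, a sketch of the process-based variant with process_url filled in to shell out to the same hypothetical script.py as the earlier answers (the urllist contents here are placeholder assumptions):

    #!/usr/bin/env python
    import subprocess
    from multiprocessing import Pool  # .dummy removed: real worker processes

    def process_url(url):
        # Run the worker script for a single url and return its exit status
        return url, subprocess.call(['python', 'script.py', url])

    if __name__ == "__main__":  # guard required for multiprocessing on Windows
        urllist = ['http://example.com/page%d' % i for i in range(100)]  # placeholder
        with Pool(20) as pool:
            for url, status in pool.imap_unordered(process_url, urllist):
                print(url, status)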