What is the fastest way to send 100,000 HTTP requests in Python?
A Twisted-less solution, using only the standard library's threads and queues:
from urlparse import urlparsefrom threading import Threadimport httplib, sysfrom Queue import Queueconcurrent = 200def doWork(): while True: url = q.get() status, url = getStatus(url) doSomethingWithResult(status, url) q.task_done()def getStatus(ourl): try: url = urlparse(ourl) conn = httplib.HTTPConnection(url.netloc) conn.request("HEAD", url.path) res = conn.getresponse() return res.status, ourl except: return "error", ourldef doSomethingWithResult(status, url): print status, urlq = Queue(concurrent * 2)for i in range(concurrent): t = Thread(target=doWork) t.daemon = True t.start()try: for url in open('urllist.txt'): q.put(url.strip()) q.join()except KeyboardInterrupt: sys.exit(1)
This one is slightly faster than the Twisted solution and uses less CPU.
Things have changed quite a bit since 2010 when this was posted and I haven't tried all the other answers, but I have tried a few, and I found this to work the best for me using Python 3.6.
I was able to fetch about ~150 unique domains per second running on AWS.
"""Concurrent HEAD-request benchmark using a thread pool (Python 3.6+).

Reads domain names from '../data/sample_1k.txt' (skipping the first line),
issues an HTTP HEAD request to each via `requests`, and reports how long
the whole batch took.
"""
import concurrent.futures
import requests
import time

results = []
CONNECTIONS = 100  # thread-pool size
TIMEOUT = 5        # per-request timeout, seconds

domains = open('../data/sample_1k.txt').read().splitlines()
targets = [f'http://{name}' for name in domains[1:]]


def load_url(url, timeout):
    """Return the HTTP status code of a HEAD request to *url*."""
    response = requests.head(url, timeout=timeout)
    return response.status_code


with concurrent.futures.ThreadPoolExecutor(max_workers=CONNECTIONS) as pool:
    pending = (pool.submit(load_url, target, TIMEOUT) for target in targets)
    started = time.time()
    for completed in concurrent.futures.as_completed(pending):
        try:
            outcome = completed.result()
        except Exception as exc:
            # Record the exception type as a string in place of a status code.
            outcome = str(type(exc))
        finally:
            results.append(outcome)
            print(str(len(results)), end="\r")  # live progress counter
    finished = time.time()

print(f'Took {finished-started:.2f} s')
A solution using the Tornado asynchronous networking library:
"""Fire HEAD requests at every URL in 'urls.txt' with Tornado's async client.

Runs a single-threaded event loop; a shared counter tracks in-flight
requests and stops the loop when the last one completes.
"""
from tornado import ioloop, httpclient

# Number of requests still in flight; the IOLoop stops when it reaches 0.
i = 0


def handle_request(response):
    """Completion callback: print the status code, stop the loop when done."""
    print(response.code)
    global i
    i -= 1
    if i == 0:
        ioloop.IOLoop.instance().stop()


http_client = httpclient.AsyncHTTPClient()
for url in open('urls.txt'):
    i += 1
    # NOTE(review): the `callback` positional argument to fetch() was
    # removed in Tornado 6 — this snippet requires Tornado < 6 (or a
    # rewrite using `await http_client.fetch(...)`).  Confirm the
    # installed version before relying on it.
    http_client.fetch(url.strip(), handle_request, method='HEAD')

# Bug fix: with an empty urls.txt no callback ever fires, so the original
# started an IOLoop that would spin forever.  Only start if work is queued.
if i:
    ioloop.IOLoop.instance().start()
This code uses non-blocking network I/O and has no thread-count restriction. It can scale to tens of thousands of open connections. It runs in a single thread but will be way faster than any threading solution. Check out non-blocking I/O.