How to parallelize file downloads?
You could use a thread pool to download files in parallel:
#!/usr/bin/env python3
"""Download several files in parallel using a pool of worker threads."""
from multiprocessing.dummy import Pool  # use threads for I/O bound tasks
from urllib.request import urlretrieve

urls = [...]  # placeholder: replace with the list of URLs to download

# The context manager shuts the pool down once map() has returned, so the
# worker threads are not left running after the downloads finish.
with Pool(4) as pool:  # download 4 files at a time
    # map() blocks until every URL has been fetched; each item of result is
    # the (local_filename, headers) pair returned by urlretrieve.
    result = pool.map(urlretrieve, urls)
You could also download several files at once in a single thread using asyncio:
#!/usr/bin/env python3
"""Download several files concurrently in a single thread using asyncio."""
import asyncio
import logging

import aiohttp  # $ pip install aiohttp


async def download(url, session, semaphore, chunk_size=1 << 15):
    """Download *url* to a local file, streaming it in chunks.

    Args:
        url: address of the file to fetch.
        session: aiohttp client session used to issue the GET request.
        semaphore: asyncio semaphore bounding the number of downloads
            that may run at the same time.
        chunk_size: maximum number of bytes requested per read.

    Returns:
        A ``(filename, (status, headers))`` tuple where *headers* is a
        tuple of ``(name, value)`` pairs from the response.
    """
    async with semaphore:  # limit number of concurrent downloads
        # NOTE(review): url2filename is not defined in this snippet; it is
        # expected to map a URL to a local file name -- confirm it is in scope.
        filename = url2filename(url)
        logging.info('downloading %s', filename)
        # "async with" releases the connection even if reading or writing fails.
        async with session.get(url) as response:
            with open(filename, 'wb') as file:
                while True:  # save file
                    chunk = await response.content.read(chunk_size)
                    if not chunk:  # an empty chunk signals end of the body
                        break
                    file.write(chunk)
        logging.info('done %s', filename)
    return filename, (response.status, tuple(response.headers.items()))


async def main(urls, max_concurrent=4):
    """Download every URL in *urls*, at most *max_concurrent* at a time.

    Returns the list of ``download()`` results in the same order as *urls*.
    """
    # The semaphore and the session are created inside the running event
    # loop, and the session is closed asynchronously on exit.
    semaphore = asyncio.Semaphore(max_concurrent)
    async with aiohttp.ClientSession() as session:
        download_tasks = (download(url, session, semaphore) for url in urls)
        return await asyncio.gather(*download_tasks)


urls = [...]  # placeholder: replace with the list of URLs to download
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')
# asyncio.run() creates the event loop, runs main() to completion and
# closes the loop afterwards.
result = asyncio.run(main(urls))