How to parallelize file downloads?

You could use a thread pool to download files in parallel:

#!/usr/bin/env python3
from multiprocessing.dummy import Pool  # use threads for I/O-bound tasks
from urllib.request import urlretrieve

urls = [...]
result = Pool(4).map(urlretrieve, urls)  # download 4 files at a time
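
For comparison, the standard library's concurrent.futures gives you the same thread-pool pattern with per-file error reporting. This is a sketch on my part, not part of the original answer (ThreadPoolExecutor and as_completed are real stdlib APIs; urls is the same placeholder as above):

#!/usr/bin/env python3
from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.request import urlretrieve

urls = [...]

with ThreadPoolExecutor(max_workers=4) as executor:
    # submit one download per URL; keep a future -> url mapping for reporting
    futures = {executor.submit(urlretrieve, url): url for url in urls}
    for future in as_completed(futures):
        url = futures[future]
        try:
            path, headers = future.result()  # urlretrieve returns (filename, headers)
            print('saved', url, 'as', path)
        except OSError as error:  # urllib.error.URLError subclasses OSError
            print('failed', url, error)

Collecting results via as_completed() lets one failed download report its error without stopping the remaining downloads.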

You could also download several files at once in a single thread using asyncio:

#!/usr/bin/env python3
import asyncio
import logging
from contextlib import closing

import aiohttp  # $ pip install aiohttp

@asyncio.coroutine
def download(url, session, semaphore, chunk_size=1<<15):
    with (yield from semaphore):  # limit number of concurrent downloads
        filename = url2filename(url)
        logging.info('downloading %s', filename)
        response = yield from session.get(url)
        with closing(response), open(filename, 'wb') as file:
            while True:  # save file
                chunk = yield from response.content.read(chunk_size)
                if not chunk:
                    break
                file.write(chunk)
        logging.info('done %s', filename)
    return filename, (response.status, tuple(response.headers.items()))

urls = [...]
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')
with closing(asyncio.get_event_loop()) as loop, \
     closing(aiohttp.ClientSession()) as session:
    semaphore = asyncio.Semaphore(4)
    download_tasks = (download(url, session, semaphore) for url in urls)
    result = loop.run_until_complete(asyncio.gather(*download_tasks))
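
Note the @asyncio.coroutine / yield from style above dates from the Python 3.4 era and was removed in Python 3.11. A rough modern equivalent using async/await and aiohttp 3.x (my sketch, not the original answer; it assumes the same url2filename() helper mentioned below) would be:

#!/usr/bin/env python3
import asyncio

import aiohttp  # $ pip install aiohttp

async def download(url, session, semaphore, chunk_size=1 << 15):
    async with semaphore:  # limit number of concurrent downloads
        filename = url2filename(url)
        async with session.get(url) as response:
            with open(filename, 'wb') as file:
                # stream the body to disk in chunk_size pieces
                async for chunk in response.content.iter_chunked(chunk_size):
                    file.write(chunk)
        return filename, response.status

async def main(urls):
    semaphore = asyncio.Semaphore(4)
    async with aiohttp.ClientSession() as session:
        return await asyncio.gather(
            *(download(url, session, semaphore) for url in urls))

urls = [...]
result = asyncio.run(main(urls))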

where url2filename() is a small helper that turns a URL into a local file name; it is defined here.
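
If the linked definition is unavailable, a minimal stand-in (my assumption about its behavior, not the linked code) could be:

import os
from urllib.parse import urlsplit

def url2filename(url):
    """Return the last path component of the URL, e.g. 'http://host/a/b.txt' -> 'b.txt'."""
    return os.path.basename(urlsplit(url).path)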