Download image with selenium python

python firefox selenium selenium-webdriver

Here's a complete example (using google's recaptcha as a target):

import urllib.request

from selenium import webdriver
from selenium.webdriver.common.by import By

# Download the image an <img> element points at by requesting its URL again.
driver = webdriver.Firefox()
try:
    driver.get('http://www.google.com/recaptcha/demo/recaptcha')

    # get the image source
    img = driver.find_element(By.XPATH, '//div[@id="recaptcha_image"]/img')
    src = img.get_attribute('src')

    # download the image (urlretrieve lives in urllib.request on Python 3)
    urllib.request.urlretrieve(src, "captcha.png")
finally:
    # Always end the browser session, even when the lookup or download fails.
    driver.quit()

UPDATE:

The problem with dynamically generated images is that a new image is generated each time you request it, so downloading the `src` URL again returns a different image from the one shown in the browser. In that case, you have several options:

take a screenshot

from selenium import webdriver

# Capture the page exactly as the browser rendered it instead of requesting
# the image URL a second time.
driver = webdriver.Firefox()
try:
    driver.get('https://moscowsg.megafon.ru/ps/scc/php/cryptographp.php?PHPSESSID=mfc540jkbeme81qjvh5t0v0bnjdr7oc6&ref=114&w=150')
    driver.save_screenshot("screenshot.png")
finally:
    # Always end the browser session, even when loading or saving fails.
    driver.quit()

simulate right click + "Save As". See this thread for more info.

Hope that helps.

python firefox selenium selenium-webdriver

It's fine to take a screenshot of the whole page and then crop the image out of it, but you can also use one of the webdriver's "find" methods to locate the element you want to save and write out its "screenshot_as_png" property, as shown below:

from selenium import webdriver
from selenium.webdriver.common.by import By

# Save a screenshot of a single element rather than of the whole page.
driver = webdriver.Firefox()
try:
    driver.get('https://www.webpagetest.org/')

    # Locate the element first so that a failed lookup does not leave an
    # empty output file behind.
    element = driver.find_element(By.XPATH, '/html/body/div[1]/div[5]/div[2]/table[1]/tbody/tr/td[1]/a/div')
    with open('filename.png', 'wb') as file:
        file.write(element.screenshot_as_png)
finally:
    # Always end the browser session, even when the lookup or write fails.
    driver.quit()

Sometimes this can raise an error related to scrolling (for example, when the element is outside the visible area), but depending on your needs, it's a good way to get the image.

python firefox selenium selenium-webdriver

The problem of using save_screenshot is that we cannot save an image in its original quality and cannot restore the alpha channel in an image. Therefore, I propose another solution. Here is a complete example using the selenium-wire library suggested by @codam_hsmits. It is possible to download images via ChromeDriver.

I have defined the following function to inspect each captured request and save its response body to a file when necessary.

import datetime
import os
import time
from mimetypes import guess_extension
from urllib.parse import urlparse

from seleniumwire import webdriver  # Import from seleniumwire


def download_assets(requests,
                    asset_dir="temp",
                    default_fname="unnamed",
                    skip_domains=("facebook", "google", "yahoo", "agkn", "2mdn"),
                    exts=(".png", ".jpeg", ".jpg", ".svg", ".gif", ".pdf", ".bmp", ".webp", ".ico"),
                    append_ext=False):
    """Save the response bodies of captured requests to files under *asset_dir*.

    Each response whose Content-Type maps to an extension listed in *exts* is
    written to ``asset_dir/<host>/<url path>``. Files that already exist are
    left untouched, so the function can be called repeatedly on a growing
    list of requests.

    Args:
        requests: Iterable of captured requests; each item must expose
            ``url`` and ``response`` (with ``headers`` and ``body``).
        asset_dir: Root directory for the saved files.
        default_fname: Base name used when the URL path is empty or ends
            with a slash.
        skip_domains: Substrings; a request is skipped when any of them
            occurs in the host part of its URL.
        exts: Allowed file extensions, including the leading dot.
        append_ext: When true, append ``_<default_fname><ext>`` to paths that
            do not already end with the guessed extension.

    Returns:
        A dict mapping each file path that was written successfully to the
        URL it came from.
    """
    asset_list = {}
    for req_idx, request in enumerate(requests):
        # request.response.body is the raw response body in bytes
        if request is None or request.response is None or request.response.headers is None or 'Content-Type' not in request.response.headers:
            continue

        # Drop parameters such as "; charset=utf-8" before guessing.
        ext = guess_extension(request.response.headers['Content-Type'].split(';')[0].strip())
        if not ext or ext not in exts:
            # Don't know the file extension, or it is not in the whitelist
            continue

        parsed_url = urlparse(request.url)
        if any(d in parsed_url.netloc for d in skip_domains):
            continue

        frelpath = parsed_url.path.strip()
        if frelpath == "" or frelpath.endswith(("\\", "/")):
            # No file name in the URL: synthesize one.
            timestamp = str(datetime.datetime.now().replace(microsecond=0).isoformat())
            frelpath = frelpath + f"{default_fname}_{req_idx}_{timestamp}{ext}"
        elif append_ext and not frelpath.endswith(ext):
            frelpath = frelpath + f"_{default_fname}{ext}"  # Missing file extension but may not be a problem

        # Strip *all* leading separators: os.path.join discards the preceding
        # components when a later one is absolute, so a path such as
        # "//cdn/x.png" would otherwise be written outside asset_dir.
        frelpath = frelpath.lstrip("\\/")

        fpath = os.path.join(asset_dir, parsed_url.netloc, frelpath)
        if os.path.isfile(fpath):
            continue

        print(f"Downloading {request.url} to {fpath}")
        try:
            # makedirs is inside the try because it fails when an earlier
            # download created a file where a directory is now needed.
            os.makedirs(os.path.dirname(fpath), exist_ok=True)
            with open(fpath, "wb") as file:
                file.write(request.response.body)
        except OSError:
            # Best effort: report and move on. Catching only OSError (instead
            # of a bare except) lets Ctrl-C still interrupt the caller.
            print(f"Cannot download {request.url} to {fpath}")
            continue
        # Record only files that were actually written.
        asset_list[fpath] = request.url
    return asset_list

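Let's download some images from the Google homepage to the temp folder. (Note that the default skip_domains contains "google", so assets served from hosts whose name contains "google" are skipped unless you pass a different skip_domains.)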

# Create a new instance of the Chrome/Firefox driver
driver = webdriver.Chrome()
try:
    # Go to the Google home page
    driver.get('https://www.google.com')

    # Download content to temp folder
    asset_dir = "temp"
    while True:
        # Browse in the opened window; the captured requests are scanned for
        # new images once per second.
        time.sleep(1)
        download_assets(driver.requests, asset_dir=asset_dir)
except KeyboardInterrupt:
    # The polling loop has no exit condition; Ctrl-C is the normal way to stop.
    pass
finally:
    # Without this the cleanup after the infinite loop is never reached.
    driver.quit()

Note that it cannot tell which images are actually visible on the page and which are only loaded in the background, so the user should actively click buttons or links to trigger new download requests.

CodeHunter

Download image with selenium python

Recent Posts

How can I color dots in a xy scatterplot according to column value?

How to update a claim in ASP.NET Identity?

What does {0} mean when initializing an object?

Accessing members of items in a JSONArray with Java

How to log SQL statements in Spring Boot?

Powershell Get-WebSite name parameter is ignored

How to detect scroll to bottom of html element

Java synchronized method

How to test controllers with CodeIgniter?

Detect Visual Composer

Matplotlib: Specify format of floats for tick labels

Rails join a list of strings with commas and "and" before the last