Using Python Selenium to download a file in memory, not in disk (tag: selenium)

Using Python Selenium to download a file in memory, not in disk


What you are asking for can be accomplished by setting Chrome preferences through Selenium's add_experimental_option. I also redesigned your code to loop through the table and extract each href, which is then passed to StringIO. No files are downloaded to my local system using this code.

If I missed something please let me know.

"""Load every .xls file linked from the sample page into pandas without saving it to disk.

Selenium is used only to render the page and collect the download links;
each workbook is then fetched straight into an in-memory buffer.
"""
from io import BytesIO
from time import sleep
from urllib.request import urlopen

import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

chrome_options = Options()
chrome_options.add_argument("--incognito")
chrome_options.add_argument("--disable-infobars")
chrome_options.add_argument("start-maximized")
chrome_options.add_argument("--disable-extensions")
chrome_options.add_argument("--disable-popup-blocking")

# Chrome profile preferences controlling automatic downloads.
prefs = {
    'profile.default_content_setting_values': {
        'automatic_downloads': 0
    },
    'profile.content_settings.exceptions': {
        'automatic_downloads': 0
    }
}
chrome_options.add_experimental_option('prefs', prefs)

# NOTE(review): newer Selenium 4 releases expect the chromedriver path to be
# supplied through a Service object instead of positionally; the call is left
# unchanged to match the Selenium version this snippet was written for.
driver = webdriver.Chrome('/usr/local/bin/chromedriver', options=chrome_options)

try:
    url_main = 'https://file-examples.com/index.php/sample-documents-download/sample-xls-download/'
    driver.get(url_main)

    # find_elements(By.XPATH, ...) replaces find_elements_by_xpath, which has
    # been removed from current Selenium releases.
    elements = driver.find_elements(By.XPATH, '//*[@id="table-files"]//td/a')

    for element in elements:
        href = element.get_attribute("href")
        if not href or not href.endswith('.xls'):
            continue

        # The href is only a URL string, so wrapping it in StringIO would just
        # round-trip that text. Fetch the workbook bytes explicitly and hand
        # pandas an in-memory binary buffer instead.
        with urlopen(href) as response:
            workbook = BytesIO(response.read())

        df = pd.read_excel(workbook)
        print(df.to_string(index=False))

        # Long pause kept from the original snippet; placement inside the loop
        # is inferred from the collapsed source -- TODO confirm.
        sleep(360)
finally:
    # Always shut the browser down, even if a download or parse fails.
    driver.quit()

# Output printed for one of the files (as posted with the original snippet):
#
#      First Name  Last Name  Gender        Country  Age        Date    Id
#   1       Dulce      Abril  Female  United States   32  15/10/2017  1562
#   2        Mara  Hashimoto  Female  Great Britain   25  16/08/2016  1582
#   3      Philip       Gent    Male         France   36  21/05/2015  2587
#   4    Kathleen     Hanner  Female  United States   25  15/10/2017  3549
#   5     Nereida    Magwood  Female  United States   58  16/08/2016  2468
#   6      Gaston      Brumm    Male  United States   24  21/05/2015  2554
#   7        Etta       Hurn  Female  Great Britain   56  15/10/2017  3598
#   8     Earlean     Melgar  Female  United States   27  16/08/2016  2456
#   9    Vincenza    Weiland  Female  United States   40  21/05/2015  6548

Here is an example using a RAMDISK that was mentioned in the comments. This option does not use selenium add_experimental_option or StringIO.

"""Load every .xls file linked from the sample page via an in-memory filesystem.

Selenium collects the download links; each workbook is fetched, stored in a
PyFilesystem in-memory filesystem ("RAMDISK") and read back into pandas, so
nothing is written to the local disk.
"""
from io import BytesIO
from time import sleep
from urllib.request import urlopen

import fs
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

chrome_options = Options()
chrome_options.add_argument("--incognito")
chrome_options.add_argument("--disable-infobars")
chrome_options.add_argument("start-maximized")
chrome_options.add_argument("--disable-extensions")
chrome_options.add_argument("--disable-popup-blocking")

# NOTE(review): newer Selenium 4 releases expect the chromedriver path to be
# supplied through a Service object instead of positionally; the call is left
# unchanged to match the Selenium version this snippet was written for.
driver = webdriver.Chrome('/usr/local/bin/chromedriver', options=chrome_options)

try:
    url_main = 'https://file-examples.com/index.php/sample-documents-download/sample-xls-download/'
    driver.get(url_main)

    # find_elements(By.XPATH, ...) replaces find_elements_by_xpath, which has
    # been removed from current Selenium releases.
    elements = driver.find_elements(By.XPATH, '//*[@id="table-files"]//td/a')

    # Create RAMDISK
    with fs.open_fs('mem://') as mem_fs:
        mem_fs.makedir('hidden_dir')

        for element in elements:
            href = element.get_attribute("href")
            if not href or not href.endswith('.xls'):
                continue

            # Fetch the workbook itself; storing only the href text in the
            # RAMDISK would leave pandas to download the file on its own.
            with urlopen(href) as response:
                payload = response.read()

            # Binary mode: an .xls workbook is not text. The file name is kept
            # from the original snippet even though the content is Excel data.
            with mem_fs.open('hidden_dir/file1.csv', 'wb') as in_file:
                in_file.write(payload)

            with mem_fs.open('hidden_dir/file1.csv', 'rb') as out_file:
                df = pd.read_excel(BytesIO(out_file.read()))

            print(df.to_string(index=False))
            # same output as above

            # Long pause kept from the original snippet.
            sleep(360)
finally:
    # Always shut the browser down, even if a download or parse fails.
    driver.quit()


IMO, using Selenium is clearly unnecessary. Using only requests + BeautifulSoup + pandas is enough (this would be much faster than using Selenium, and needs less code).

Code below:

"""Read the first sample .xls file into pandas using requests + BeautifulSoup only.

The file is downloaded into memory and parsed from a BytesIO buffer.
"""
from io import BytesIO

import pandas as pd
import requests
from bs4 import BeautifulSoup

# A timeout keeps the script from hanging forever on an unresponsive server.
response = requests.get(
    "https://file-examples.com/index.php/sample-documents-download/sample-xls-download/",
    timeout=30,
)
response.raise_for_status()  # fail loudly instead of parsing an error page
soup = BeautifulSoup(response.text, "html.parser")

# get the download link
link_tag = soup.select_one(".file-link > a")
if link_tag is None or not link_tag.get("href"):
    # select_one returns None when nothing matches; report that explicitly
    # rather than failing with an AttributeError on None.
    raise RuntimeError("No download link matched the selector '.file-link > a'")
file_link = link_tag.get("href")

# download it in memory
file_response = requests.get(file_link, timeout=30)
file_response.raise_for_status()
bytes_of_file = file_response.content

df = pd.read_excel(BytesIO(bytes_of_file))
print(df)

Result:

   0 First Name  Last Name  Gender        Country  Age        Date    Id
0  1      Dulce      Abril  Female  United States   32  15/10/2017  1562
1  2       Mara  Hashimoto  Female  Great Britain   25  16/08/2016  1582
2  3     Philip       Gent    Male         France   36  21/05/2015  2587
3  4   Kathleen     Hanner  Female  United States   25  15/10/2017  3549
4  5    Nereida    Magwood  Female  United States   58  16/08/2016  2468
5  6     Gaston      Brumm    Male  United States   24  21/05/2015  2554
6  7       Etta       Hurn  Female  Great Britain   56  15/10/2017  3598
7  8    Earlean     Melgar  Female  United States   27  16/08/2016  2456
8  9   Vincenza    Weiland  Female  United States   40  21/05/2015  6548

And this doesn't write any Excel files to disk; the file is held only in memory.