Using Python Selenium to download a file in memory, not on disk
What you are asking for can be accomplished by using Selenium's add_experimental_option. I also redesigned your code to loop through the table, extract each href, and pass it to StringIO. No files are downloaded to my local system using this code.
If I missed something please let me know.
# Scrape the sample-XLS page with Selenium, then load every linked .xls file
# into a pandas DataFrame without writing anything to the local disk.
from io import BytesIO
from urllib.request import urlopen

import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

chrome_options = Options()
chrome_options.add_argument("--incognito")
chrome_options.add_argument("--disable-infobars")
chrome_options.add_argument("start-maximized")
chrome_options.add_argument("--disable-extensions")
chrome_options.add_argument("--disable-popup-blocking")

# Hand the automatic_downloads preferences to Chrome via the experimental
# "prefs" option.
prefs = {
    'profile.default_content_setting_values': {'automatic_downloads': 0},
    'profile.content_settings.exceptions': {'automatic_downloads': 0},
}
chrome_options.add_experimental_option('prefs', prefs)

# NOTE: newer Selenium 4 releases no longer accept the chromedriver path as a
# positional argument; there, pass
# service=Service('/usr/local/bin/chromedriver') instead.
driver = webdriver.Chrome('/usr/local/bin/chromedriver', options=chrome_options)

url_main = 'https://file-examples.com/index.php/sample-documents-download/sample-xls-download/'

try:
    driver.get(url_main)
    links = driver.find_elements(By.XPATH, '//*[@id="table-files"]//td/a')
    # Read each href exactly once, while the browser session is still alive;
    # the element handles cannot be queried after quit().
    xls_urls = [
        href
        for href in (link.get_attribute("href") for link in links)
        if href and href.endswith('.xls')
    ]
finally:
    # Always release the browser, even if the page could not be scraped.
    driver.quit()

for xls_url in xls_urls:
    # An href is only a URL. Wrapping it in StringIO would buffer the URL
    # text, not the workbook, so fetch the bytes and keep them in a BytesIO
    # buffer instead -- the file never touches the disk.
    with urlopen(xls_url, timeout=30) as response:
        xls_buffer = BytesIO(response.read())
    df = pd.read_excel(xls_buffer)
    print(df.to_string(index=False))

# Example output:
#
#    First Name  Last Name  Gender  Country        Age  Date        Id
# 1  Dulce       Abril      Female  United States  32   15/10/2017  1562
# 2  Mara        Hashimoto  Female  Great Britain  25   16/08/2016  1582
# 3  Philip      Gent       Male    France         36   21/05/2015  2587
# 4  Kathleen    Hanner     Female  United States  25   15/10/2017  3549
# 5  Nereida     Magwood    Female  United States  58   16/08/2016  2468
# 6  Gaston      Brumm      Male    United States  24   21/05/2015  2554
# 7  Etta        Hurn       Female  Great Britain  56   15/10/2017  3598
# 8  Earlean     Melgar     Female  United States  27   16/08/2016  2456
# 9  Vincenza    Weiland    Female  United States  40   21/05/2015  6548
Here is an example using a RAMDISK that was mentioned in the comments. This option does not use selenium add_experimental_option or StringIO.
# Scrape the sample-XLS page with Selenium, stage every linked .xls file in
# an in-memory filesystem (RAMDISK), and load it from there into pandas.
from urllib.request import urlopen

import fs
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

chrome_options = Options()
chrome_options.add_argument("--incognito")
chrome_options.add_argument("--disable-infobars")
chrome_options.add_argument("start-maximized")
chrome_options.add_argument("--disable-extensions")
chrome_options.add_argument("--disable-popup-blocking")

# NOTE: newer Selenium 4 releases no longer accept the chromedriver path as a
# positional argument; there, pass
# service=Service('/usr/local/bin/chromedriver') instead.
driver = webdriver.Chrome('/usr/local/bin/chromedriver', options=chrome_options)

url_main = 'https://file-examples.com/index.php/sample-documents-download/sample-xls-download/'

try:
    driver.get(url_main)
    links = driver.find_elements(By.XPATH, '//*[@id="table-files"]//td/a')
    # Read each href exactly once, while the browser session is still alive.
    xls_urls = [
        href
        for href in (link.get_attribute("href") for link in links)
        if href and href.endswith('.xls')
    ]
finally:
    # Always release the browser, even if the page could not be scraped.
    driver.quit()

# Create RAMDISK
with fs.open_fs('mem://') as mem_fs:
    mem_fs.makedir('hidden_dir')

    for xls_url in xls_urls:
        # One RAM file per workbook, named after the last URL segment.
        ram_path = 'hidden_dir/' + xls_url.rsplit('/', 1)[-1]

        # Store the workbook's bytes (not just its URL) on the RAMDISK;
        # binary mode keeps the XLS content intact.
        with urlopen(xls_url, timeout=30) as response:
            with mem_fs.open(ram_path, 'wb') as ram_file:
                ram_file.write(response.read())

        with mem_fs.open(ram_path, 'rb') as ram_file:
            df = pd.read_excel(ram_file)
        print(df.to_string(index=False))

# same output as above
IMO, using Selenium is clearly unnecessary. Using only requests + BeautifulSoup + pandas is enough.
(This would be much faster than using Selenium, and needs less code.)
Code below:
# Fetch the sample-XLS page with requests, locate the first download link
# with BeautifulSoup, and load the workbook into pandas from memory.
from io import BytesIO

import requests
from bs4 import BeautifulSoup
import pandas as pd

# A timeout keeps the script from hanging on an unresponsive server, and
# raise_for_status() stops an HTTP error page from being parsed as content.
response = requests.get(
    "https://file-examples.com/index.php/sample-documents-download/sample-xls-download/",
    timeout=30,
)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")

# get the download link
link_tag = soup.select_one(".file-link > a")
if link_tag is None or not link_tag.get("href"):
    # select_one() returns None when nothing matches; fail with a clear
    # message rather than an AttributeError on None.
    raise RuntimeError("No download link matching '.file-link > a' was found on the page")
file_link = link_tag.get("href")

# download it in memory
file_response = requests.get(file_link, timeout=30)
file_response.raise_for_status()
bytes_of_file = file_response.content
df = pd.read_excel(BytesIO(bytes_of_file))
print(df)
Result:
   0  First Name  Last Name  Gender  Country        Age  Date        Id
0  1  Dulce       Abril      Female  United States  32   15/10/2017  1562
1  2  Mara        Hashimoto  Female  Great Britain  25   16/08/2016  1582
2  3  Philip      Gent       Male    France         36   21/05/2015  2587
3  4  Kathleen    Hanner     Female  United States  25   15/10/2017  3549
4  5  Nereida     Magwood    Female  United States  58   16/08/2016  2468
5  6  Gaston      Brumm      Male    United States  24   21/05/2015  2554
6  7  Etta        Hurn       Female  Great Britain  56   15/10/2017  3598
7  8  Earlean     Melgar     Female  United States  27   16/08/2016  2456
8  9  Vincenza    Weiland    Female  United States  40   21/05/2015  6548
And this doesn't save any Excel files to disk.