Extracting the required information from a script tag of a scraped webpage using BeautifulSoup
This is a task that is best suited for a tool like Selenium,
as the site uses the script to populate the page with the table after it loads, and it is not trivial to parse the values from the script
source:
import collections
import re
import urllib.parse

from bs4 import BeautifulSoup as soup
from selenium import webdriver

# A season span such as "2019-2020" inside a menu option's URL.
SEASON_RE = re.compile(r'\d{4}-\d{4}')
# The same span including its surrounding slashes, as it appears in a path.
SEASON_PATH_RE = re.compile(r'/\d{4}-\d{4}/')

# Runs inside the loaded page. For the first five <li> elements that directly
# follow an "li.green-subheader", it collects the league title plus the
# [value, text] pairs of the two "team-menu" selects: every team option except
# the first, and every year option except the first two.
LEAGUE_MENU_JS = """
    var results = [];
    for (var i of Array.from(document.querySelectorAll('li.green-subheader + li')).slice(0, 5)){
        results.push({
            league: i.querySelector('.league-title').textContent,
            teams: Array.from(i.querySelectorAll('select:nth-of-type(1).team-menu option')).map(x => [x.getAttribute('value'), x.textContent]).slice(1),
            years: Array.from(i.querySelectorAll('select:nth-of-type(2).team-menu option')).map(x => [x.getAttribute('value'), x.textContent]).slice(2)
        })
    }
    return results;
"""


def _parse_salary_table(html):
    """Return the body rows of the page's ``#table`` element as dicts.

    The column names are the text of the ``th`` cells in the third header
    row; each body row becomes a dict pairing those names with the stripped
    text of its ``td`` cells. Cells beyond the shorter of the two sequences
    are dropped by ``zip``.
    """
    # Parse once and reuse the tree for both the header and the body rows.
    page = soup(html, 'html.parser')
    headers = [
        th.get_text(strip=True)
        for th in page.select('#table thead tr:nth-of-type(3) th')
    ]
    return [
        dict(zip(headers, [td.get_text(strip=True) for td in row.select('td')]))
        for row in page.select('#table tbody tr')
    ]


# NOTE(review): newer Selenium releases expect the driver path to be passed via
# a Service object rather than positionally -- confirm against the installed
# version.
d = webdriver.Chrome('/path/to/chromedriver')
try:
    url = 'https://www.capology.com/club/leicester/salaries/2019-2020/'
    d.get(url)
    league_teams = d.execute_script(LEAGUE_MENU_JS)

    # vals[league][team][season label] -> list of row dicts for that page.
    vals = collections.defaultdict(dict)
    for league in league_teams:
        years = league['years']
        # Prepend a 2020-2021 entry derived from the first collected year URL,
        # then keep only the first four seasons.
        current_season = [SEASON_RE.sub('2020-2021', years[0][0]), '2020-21']
        for year_href, full_year in [current_season, *years][:4]:
            # "/2019-2020/" -> "2019-2020/", appended to the team URL below.
            season_path = SEASON_PATH_RE.findall(year_href)[0][1:]
            for team_href, team in league['teams']:
                d.get(urllib.parse.urljoin(url, team_href) + season_path)
                team_seasons = vals[league['league']].setdefault(team, {})
                team_seasons[full_year] = _parse_salary_table(d.page_source)
finally:
    # Always shut the browser down, even if a page fails to load or parse.
    d.quit()