How to parse an HTML table with rowspans in Python?
You'll have to track the rowspans on previous rows, one per column.
You could do this simply by copying the integer value of a rowspan into a dictionary keyed by column, and having each subsequent row decrement that value until it drops to 1 (or, for ease of coding, store the value minus 1 and count down to 0). Then you can adjust the column positions of cells in subsequent rows based on the rowspans still active from preceding rows.
Your table complicates this a little by using a default span of size 2, incrementing in steps of two, but that can easily be brought back to manageable numbers by dividing by 2.
Rather than use massive CSS selectors, select just the table rows and we'll iterate over those:
# Build `roster`: one dict per class found in the timetable, with its start and
# end block, day number, teacher, classroom and course.
roster = []

# Maps a day (column) number to the number of upcoming rows in which that
# column is still occupied by a cell spanning down from an earlier row.
rowspans = {}

# every second row in the table
rows = page.select('html > body > center > table > tr')[1:21:2]

for block, row in enumerate(rows, 1):
    # take direct child td cells, but skip the first cell:
    daycells = row.select('> td')[1:]
    rowspan_offset = 0

    # Bind daynum before the loop: if this row has no day cells at all (every
    # column is covered by a rowspan from above), the for loop never runs and
    # the end-of-row pass below would otherwise hit an unbound name (first
    # row) or the previous row's stale value, skipping the decrement.
    daynum = 0

    for position, daycell in enumerate(daycells, 1):
        # rowspan handling; shift right past every column that is still
        # occupied by a spanning cell, consuming one row of each such span.
        daynum = position + rowspan_offset
        while rowspans.get(daynum, 0):
            rowspan_offset += 1
            rowspans[daynum] -= 1
            daynum += 1

        # daynum is now the correct day number for this cell.
        # A rowspan of 2 is a single block here, so halve the value and store
        # the number of *additional* rows the cell covers (0 for one block).
        rowspan = (int(daycell.get('rowspan', 2)) // 2) - 1
        if rowspan:
            rowspans[daynum] = rowspan

        texts = daycell.select("table > tr > td > font")
        if texts:
            # class info found; exactly three <font> elements are expected,
            # otherwise the unpacking raises ValueError.
            teacher, classroom, course = (
                c.get_text(strip=True) for c in texts)
            roster.append({
                'blok_start': block,
                'blok_eind': block + rowspan,
                'dag': daynum,
                'leraar': teacher,
                'lokaal': classroom,
                'vak': course,
            })

    # days that were skipped at the end of the row due to a rowspan still
    # need one row consumed from their span (5 is the hard-coded day count).
    while daynum < 5:
        daynum += 1
        if rowspans.get(daynum, 0):
            rowspans[daynum] -= 1
This produces the correct output:
[{'blok_eind': 2, 'blok_start': 1, 'dag': 5, 'leraar': u'BLEEJ002', 'lokaal': u'ALK B021', 'vak': u'WEBD'}, {'blok_eind': 3, 'blok_start': 2, 'dag': 3, 'leraar': u'BLEEJ002', 'lokaal': u'ALK B021B', 'vak': u'WEBD'}, {'blok_eind': 4, 'blok_start': 3, 'dag': 5, 'leraar': u'DOODF000', 'lokaal': u'ALK C212', 'vak': u'PROJ-T'}, {'blok_eind': 5, 'blok_start': 4, 'dag': 3, 'leraar': u'BLEEJ002', 'lokaal': u'ALK B021B', 'vak': u'MENT'}, {'blok_eind': 7, 'blok_start': 6, 'dag': 5, 'leraar': u'JONGJ003', 'lokaal': u'ALK B008', 'vak': u'BURG'}, {'blok_eind': 8, 'blok_start': 7, 'dag': 3, 'leraar': u'FLUIP000', 'lokaal': u'ALK B004', 'vak': u'ICT algemeen Prakti'}, {'blok_eind': 9, 'blok_start': 8, 'dag': 5, 'leraar': u'KOOLE000', 'lokaal': u'ALK B008', 'vak': u'NED'}]
Moreover, this code will continue to work even if courses span more than 2 blocks, or just one block; any rowspan size is supported.
It may be better to use one of bs4's built-in methods, such as "findAll", to parse your table.
You can use the following code:
from pprint import pprint
from bs4 import BeautifulSoup
import requests

# Download the timetable page and parse it.
r = requests.get("http://rooster.horizoncollege.nl/rstr/ECO/AMR/400-ECO/Roosters/36"
                 "/c/c00025.htm")
content = r.content
page = BeautifulSoup(content, "html")

# Direct <tr> children of the first table; the first one is dropped.
table = page.find('table')
trs = table.findAll("tr", {}, recursive=False)
tr_count = 0
trs.pop(0)

# Pass 1: flatten the table into a dict keyed by "<row>-<column>" strings,
# holding the stripped text of each cell.
final_table = {}
for tr in trs:
    tds = tr.findAll("td", {}, recursive=False)
    if tds:
        td_count = 0
        tds.pop(0)  # skip the first cell of the row
        for td in tds:
            # NOTE(review): the nesting of the occupied-column check below was
            # reconstructed from whitespace-collapsed code -- confirm.
            if td.has_attr('rowspan'):
                final_table[str(tr_count) + "-" + str(td_count)] = td.text.strip()
                # A rowspan of 4 also fills the same column in the next row.
                if int(td.attrs['rowspan']) == 4:
                    final_table[str(tr_count + 1) + "-" + str(td_count)] = td.text.strip()
                # If the next column was already filled in (by a spanning cell
                # from the previous row), step over it.  `in` replaces
                # dict.has_key(), which no longer exists in Python 3.
                if str(tr_count) + "-" + str(td_count + 1) in final_table:
                    td_count = td_count + 1
            td_count = td_count + 1
        # Only rows that actually contain cells advance the row counter.
        tr_count = tr_count + 1

# Pass 2: turn the non-empty cells into roster entries.
roster = []
for i in range(0, 10):  # iterate over time
    for j in range(0, 5):  # iterate over day
        item = final_table[str(i) + "-" + str(j)]
        if item:
            block_eind = i + 1
            try:
                # Same text in the row below means the event covers two blocks.
                if final_table[str(i + 1) + "-" + str(j)] == item:
                    block_eind = i + 2
            except KeyError:
                # No cell recorded below this one (e.g. the last row).
                pass
            try:
                # The separators are whitespace-sensitive and must match the
                # cell text exactly.
                lokaal = item.split('\r\n \n\n')[0]
                leraar = item.split('\r\n \n\n')[1].split('\n \n\r\n')[0]
                vak = item.split('\n \n\r\n')[1]
            except IndexError:
                # Cell text did not contain the expected separators.
                lokaal = leraar = vak = "---"
            dayroster = {
                "dag": j + 1,
                "blok_start": i + 1,
                "blok_eind": block_eind,
                "lokaal": lokaal,
                "leraar": leraar,
                "vak": vak,
            }
            # The same event as it would have been recorded had it started one
            # row earlier.
            dayroster_double = dict(dayroster, blok_start=i)
            # use to prevent double dict for same event
            if dayroster_double not in roster:
                roster.append(dayroster)

print(roster)