
BeautifulSoup Grab Visible Webpage Text


Try this:

from bs4 import BeautifulSoup
from bs4.element import Comment
import urllib.request


def tag_visible(element):
    if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']:
        return False
    if isinstance(element, Comment):
        return False
    return True


def text_from_html(body):
    soup = BeautifulSoup(body, 'html.parser')
    texts = soup.findAll(text=True)
    visible_texts = filter(tag_visible, texts)
    return u" ".join(t.strip() for t in visible_texts)


html = urllib.request.urlopen('http://www.nytimes.com/2009/12/21/us/21storm.html').read()
print(text_from_html(html))
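On newer BeautifulSoup releases the same thing is usually spelled find_all(string=True) instead of findAll(text=True). Here is a minimal sketch of the same filter with those names; the helper name visible_text_from_html is made up for illustration and is not part of the answer above.

from bs4 import BeautifulSoup
from bs4.element import Comment


def visible_text_from_html(body):
    soup = BeautifulSoup(body, 'html.parser')
    texts = soup.find_all(string=True)            # every text node in the tree
    visible = (t for t in texts
               if t.parent.name not in ['style', 'script', 'head', 'title', 'meta', '[document]']
               and not isinstance(t, Comment))    # also skip HTML comments
    return " ".join(t.strip() for t in visible)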


The approved answer from @jbochi does not work for me. The str() function call raises an exception because it cannot encode the non-ASCII characters in the BeautifulSoup element. Here is a more succinct way to filter the example web page down to its visible text.

from bs4 import BeautifulSoup

html = open('21storm.html').read()
soup = BeautifulSoup(html, 'html.parser')
[s.extract() for s in soup(['style', 'script', '[document]', 'head', 'title'])]
visible_text = soup.getText()
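If you would rather pull the page over HTTP than from a saved copy, the same extract-then-getText approach applies. This is only a sketch, reusing the New York Times URL from the first answer; swap in whatever page you actually need.

import urllib.request
from bs4 import BeautifulSoup

html = urllib.request.urlopen('http://www.nytimes.com/2009/12/21/us/21storm.html').read()
soup = BeautifulSoup(html, 'html.parser')
for s in soup(['style', 'script', 'head', 'title', 'meta', '[document]']):
    s.extract()                                   # remove the element and its text from the tree
visible_text = soup.get_text(separator=' ', strip=True)
print(visible_text)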


import urllib.request
from bs4 import BeautifulSoup

url = "https://www.yahoo.com"
html = urllib.request.urlopen(url).read()
soup = BeautifulSoup(html, 'html.parser')

# kill all script and style elements
for script in soup(["script", "style"]):
    script.extract()    # rip it out

# get text
text = soup.get_text()

# break into lines and remove leading and trailing space on each
lines = (line.strip() for line in text.splitlines())
# break multi-headlines into a line each
chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
# drop blank lines
text = '\n'.join(chunk for chunk in chunks if chunk)

print(text)
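For reuse, the same pipeline can be wrapped in a small function. The sketch below assumes the third-party requests library is installed (urllib.request works just as well), and visible_page_text is a made-up name for illustration, not something defined in the answers above.

import requests
from bs4 import BeautifulSoup


def visible_page_text(url):
    html = requests.get(url, timeout=10).text
    soup = BeautifulSoup(html, 'html.parser')
    for tag in soup(['script', 'style']):
        tag.extract()                             # drop non-visible script/style content
    lines = (line.strip() for line in soup.get_text().splitlines())
    chunks = (phrase.strip() for line in lines for phrase in line.split('  '))
    return '\n'.join(chunk for chunk in chunks if chunk)


print(visible_page_text('https://www.yahoo.com'))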