BeautifulSoup Grab Visible Webpage Text
Try this:
from bs4 import BeautifulSoupfrom bs4.element import Commentimport urllib.requestdef tag_visible(element): if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']: return False if isinstance(element, Comment): return False return Truedef text_from_html(body): soup = BeautifulSoup(body, 'html.parser') texts = soup.findAll(text=True) visible_texts = filter(tag_visible, texts) return u" ".join(t.strip() for t in visible_texts)html = urllib.request.urlopen('http://www.nytimes.com/2009/12/21/us/21storm.html').read()print(text_from_html(html))
The approved answer from @jbochi does not work for me. The str() function call raises an exception because it cannot encode the non-ascii characters in the BeautifulSoup element. Here is a more succinct way to filter the example web page to visible text.
html = open('21storm.html').read()soup = BeautifulSoup(html)[s.extract() for s in soup(['style', 'script', '[document]', 'head', 'title'])]visible_text = soup.getText()
import urllibfrom bs4 import BeautifulSoupurl = "https://www.yahoo.com"html = urllib.urlopen(url).read()soup = BeautifulSoup(html)# kill all script and style elementsfor script in soup(["script", "style"]): script.extract() # rip it out# get texttext = soup.get_text()# break into lines and remove leading and trailing space on eachlines = (line.strip() for line in text.splitlines())# break multi-headlines into a line eachchunks = (phrase.strip() for line in lines for phrase in line.split(" "))# drop blank linestext = '\n'.join(chunk for chunk in chunks if chunk)print(text.encode('utf-8'))