
Parse HTML and preserve original content


I highly recommend the pyquery package for Python. It is a jQuery-like interface layered on top of the extremely reliable lxml package, a Python binding to libxml2.

I believe this does exactly what you want, with a quite familiar interface.

from pyquery import PyQuery as pq

html = '''<div class=header><span class=title>Foo</span></div><p>1<p>2<table><tr><td>1</td></tr></table>'''

doc = pq(html)
doc('.header .title').text('my new content')
print(doc)

Output:

<div><div class="header"><span class="title">my new content</span></div><p>1</p><p>2</p><table><tr><td>1</td></tr></table></div>

The closing </p> tags can't be helped: lxml keeps the content of the original document, not its formatting quirks. Paragraphs can be written with or without a closing tag, and lxml chooses the more standard, explicitly closed form when it serializes. I don't believe you'll find a (bug-free) parser that does better.
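If you want to see that the repair happens at parse time rather than in pyquery itself, here is a minimal sketch using lxml directly (assuming lxml is installed; the exact wrapper element in the output may vary):

from lxml import html

# libxml2's HTML parser closes the open <p> elements while building the tree,
# so any serialization of that tree will contain the closing tags.
fragment = html.fromstring('<div class=header><span class=title>Foo</span></div><p>1<p>2')
print(html.tostring(fragment))
# e.g. b'<div><div class="header"><span class="title">Foo</span></div><p>1</p><p>2</p></div>'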


Note: I'm on Python 3.

This will only handle a subset of CSS selectors, but it may be enough for your purposes.

from html.parser import HTMLParser


class AttrQuery():
    # One query: a replacement text plus a list of selector dicts that must all
    # be satisfied by the stack of currently open tags.
    def __init__(self):
        self.repl_text = ""
        self.selectors = []

    def add_css_sel(self, seltext):
        # Parse a space-separated, CSS-like selector string: "#id", ".class",
        # "tag.class" or "tag".
        sels = seltext.split(" ")
        for selector in sels:
            if selector[:1] == "#":
                self.add_selector({"id": selector[1:]})
            elif selector[:1] == ".":
                self.add_selector({"class": selector[1:]})
            elif "." in selector:
                html_tag, html_class = selector.split(".")
                self.add_selector({"html_tag": html_tag, "class": html_class})
            else:
                self.add_selector({"html_tag": selector})

    def add_selector(self, selector_dict):
        self.selectors.append(selector_dict)

    def match_test(self, tagwithattrs_list):
        # Every condition of every selector must match some open tag.
        for selector in self.selectors:
            for condition in selector:
                condition_value = selector[condition]
                if not self._condition_test(tagwithattrs_list, condition, condition_value):
                    return False
        return True

    def _condition_test(self, tagwithattrs_list, condition, condition_value):
        for tagwithattrs in tagwithattrs_list:
            try:
                if condition_value == tagwithattrs[condition]:
                    return True
            except KeyError:
                pass
        return False


class HTMLAttrParser(HTMLParser):
    # Streams through the HTML keeping a stack of open tags; whenever a text
    # node matches a query it records (line, column, length, replacement) so
    # the original source string can be patched in place afterwards.
    # Expects a module-level `debug` flag (set in the usage example below).
    def __init__(self, html, **kwargs):
        super().__init__(**kwargs)
        self.tagwithattrs_list = []
        self.queries = []
        self.matchrepl_list = []
        self.html = html

    def handle_starttag(self, tag, attrs):
        tagwithattrs = dict(attrs)
        tagwithattrs["html_tag"] = tag
        self.tagwithattrs_list.append(tagwithattrs)
        if debug:
            print("push\t", end="")
            for attrname in tagwithattrs:
                print("{}:{}, ".format(attrname, tagwithattrs[attrname]), end="")
            print("")

    def handle_endtag(self, tag):
        # Pop open tags until the one this end tag closes is found.
        try:
            while True:
                tagwithattrs = self.tagwithattrs_list.pop()
                if debug:
                    print("pop \t", end="")
                    for attrname in tagwithattrs:
                        print("{}:{}, ".format(attrname, tagwithattrs[attrname]), end="")
                    print("")
                if tag == tagwithattrs["html_tag"]:
                    break
        except IndexError:
            raise IndexError("Found a close-tag for a non-existent element.")

    def handle_data(self, data):
        if self.tagwithattrs_list:
            for query in self.queries:
                if query.match_test(self.tagwithattrs_list):
                    line, position = self.getpos()
                    length = len(data)
                    match_replace = (line - 1, position, length, query.repl_text)
                    self.matchrepl_list.append(match_replace)

    def addquery(self, query):
        self.queries.append(query)

    def transform(self):
        # Apply the recorded replacements to the original source, last match
        # first so earlier (line, position) pairs remain valid.
        split_html = self.html.split("\n")
        self.matchrepl_list.reverse()
        if debug:
            print("\nreversed list of matches (line, position, len, repl_text):\n{}\n".format(self.matchrepl_list))
        for line, position, length, repl_text in self.matchrepl_list:
            oldline = split_html[line]
            newline = oldline[:position] + repl_text + oldline[position + length:]
            split_html = split_html[:line] + [newline] + split_html[line + 1:]
        return "\n".join(split_html)

See the example usage below.

html_test = """<div class=header><span class=title>Foo</span></div><p>1<p>2<table><tr><td class=hi><div id=there>1</div></td></tr></table>"""
debug = False

parser = HTMLAttrParser(html_test)

query = AttrQuery()
query.repl_text = "Bar"
query.add_selector({"html_tag": "div", "class": "header"})
query.add_selector({"class": "title"})
parser.addquery(query)

query = AttrQuery()
query.repl_text = "InTable"
query.add_css_sel("table tr td.hi #there")
parser.addquery(query)

parser.feed(html_test)
transformed_html = parser.transform()
print("transformed html:\n{}".format(transformed_html))

Output:

transformed html:
<div class=header><span class=title>Bar</span></div><p>1<p>2<table><tr><td class=hi><div id=there>InTable</div></td></tr></table>
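Because transform() patches the matched data spans back into the original string by line and column, everything around them (unquoted attributes, indentation, the unclosed <p> tags) is left exactly as it was. Here is a small sketch of the same classes run against multi-line input; the HTML below is made up for illustration:

multiline_html = """<div class=header>
    <span class=title>Foo</span>
</div>
<p>1
<p>2"""

debug = False
parser = HTMLAttrParser(multiline_html)

# A descendant selector built from tag.class parts via add_css_sel.
query = AttrQuery()
query.repl_text = "Replaced"
query.add_css_sel("div.header span.title")
parser.addquery(query)

parser.feed(multiline_html)
print(parser.transform())
# Only "Foo" becomes "Replaced"; the indentation and unquoted attributes are untouched.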


OK, I have done this in a few languages, and I have to say the best parser I have seen that preserves whitespace and even HTML comments is:

Jericho, which is unfortunately Java.

That is, Jericho knows how to parse and preserve fragments.

Yes, I know it's Java, but you could easily make a RESTful service with a tiny bit of Java that would take the payload and convert it. In the Java REST service you could use JRuby, Jython, Rhino JavaScript, etc. to coordinate with Jericho.