What is the best practice for parsing HTML in Swift?
There are several good libraries for parsing HTML from Swift and Objective-C, such as the following:
Take a look at the following examples for the four libraries listed above, which mainly parse using XPath 2.0 queries:
hpple:
// Load the HTML file at `path` and wrap it in an hpple document.
let data = NSData(contentsOfFile: path)
let doc = TFHpple(htmlData: data)

// Query the href attributes of <a> elements whose value ends with ".txt"
// and print the content of each match.
if let elements = doc.searchWithXPathQuery("//a/@href[ends-with(.,'.txt')]") as? [TFHppleElement] {
    for element in elements {
        println(element.content)
    }
}
NDHpple:
// Read the file at `path` and decode it as a UTF-8 string.
// NOTE: both force-unwraps crash if the file is missing or is not valid UTF-8.
let data = NSData(contentsOfFile: path)!
let html = NSString(data: data, encoding: NSUTF8StringEncoding)!
let doc = NDHpple(HTMLData: html)

// Query the href attributes of <a> elements whose value ends with ".txt"
// and print the content of the first child of each match.
if let elements = doc.searchWithXPathQuery("//a/@href[ends-with(.,'.txt')]") {
    for element in elements {
        println(element.children?.first?.content)
    }
}
Kanna (XPath and CSS Selectors):
// Sample document to parse.
let html = "<html><head></head><body><ul><li><input type='image' name='input1' value='string1value' class='abc' /></li><li><input type='image' name='input2' value='string2value' class='def' /></li></ul><span class='spantext'><b>Hello World 1</b></span><span class='spantext'><b>Hello World 2</b></span><a href='example.com'>example(English)</a><a href='example.co.jp'>example(JP)</a></body>"

// Parse the string as UTF-8 HTML, then run the XPath query from the body node.
if let doc = Kanna.HTML(html: html, encoding: NSUTF8StringEncoding) {
    var bodyNode = doc.body

    // Query the href attributes of <a> elements whose value ends with ".txt"
    // and print the contents of each match.
    if let inputNodes = bodyNode?.xpath("//a/@href[ends-with(.,'.txt')]") {
        for node in inputNodes {
            println(node.contents)
        }
    }
}
Fuzi (XPath and CSS Selectors):
// Sample document to parse.
let html = "<html><head></head><body><ul><li><input type='image' name='input1' value='string1value' class='abc' /></li><li><input type='image' name='input2' value='string2value' class='def' /></li></ul><span class='spantext'><b>Hello World 1</b></span><span class='spantext'><b>Hello World 2</b></span><a href='example.com'>example(English)</a><a href='example.co.jp'>example(JP)</a></body>"

do {
    // if encoding is omitted, it defaults to NSUTF8StringEncoding
    let doc = try HTMLDocument(string: html, encoding: NSUTF8StringEncoding)

    // XPath queries
    for anchor in doc.xpath("//a/@href[ends-with(.,'.txt')]") {
        print(anchor.stringValue)
    }
} catch let error {
    // Creating the document can throw; print the error.
    print(error)
}
The ends-with function is part of XPath 2.0.
SwiftSoup (CSS Selectors):
do {
    // Parse the HTML string ("..." stands in for the real markup).
    let doc: Document = try SwiftSoup.parse("...")

    let links: Elements = try doc.select("a[href]") // a with href
    let pngs: Elements = try doc.select("img[src$=.png]") // img with src ending .png
    let masthead: Element? = try doc.select("div.masthead").first() // div with class=masthead
    let resultLinks: Elements? = try doc.select("h3.r > a") // direct a after h3
} catch Exception.Error(let type, let message) {
    // SwiftSoup-specific error: print its message.
    print(message)
} catch {
    // Any other error.
    print("error")
}
Ji (XPath):
// Build a Ji document from the page URL.
// NOTE: the force-unwrap is safe only because the URL string is a known-valid literal.
let jiDoc = Ji(htmlURL: URL(string: "http://www.apple.com/support")!)

// Take the first node matching the <title> element inside <head>.
let titleNode = jiDoc?.xPath("//head/title")?.first
print("title: \(titleNode?.content)") // title: Optional("Official Apple Support")
I hope this helps you.
You could try this swift-html-parser:
https://github.com/tid-kijyun/Swift-HTML-Parser
It helps a lot.
And to read your HTML from a text file, you can do the following:
// Name of the file to read.
let file = "file.txt"

if let dirs : [String] = NSSearchPathForDirectoriesInDomains(NSSearchPathDirectory.DocumentDirectory, NSSearchPathDomainMask.AllDomainsMask, true) as? [String] {
    let dir = dirs[0] //documents directory
    let path = dir.stringByAppendingPathComponent(file);

    // Read the file as UTF-8; passing `error: nil` discards any read error.
    // `html` is only visible inside this block, so use it here.
    let html = String(contentsOfFile: path, encoding: NSUTF8StringEncoding, error: nil)
}
Edit:
To get what you need, you could use something like the following example:
import Foundation

let html = "theHtmlYouWannaParse"

// Parse the HTML; a failure is reported through `err`.
var err : NSError?
var parser = HTMLParser(html: html, error: &err)
if err != nil {
    println(err)
    exit(1)
}

var bodyNode = parser.body

// Print the contents of every <b> tag found under the body.
if let inputNodes = bodyNode?.findChildTags("b") {
    for node in inputNodes {
        println(node.contents)
    }
}

// Print the href attribute of every <a> tag found under the body.
if let inputNodes = bodyNode?.findChildTags("a") {
    for node in inputNodes {
        println(node.getAttributeNamed("href")) // <- here you get each file's link
    }
}