Scraping HTML tables into R data frames using the XML package
…or a shorter approach:
library(XML)
library(RCurl)
library(rlist)

# Download the raw HTML of the page as a single string.
# NOTE(review): ssl.verifypeer = FALSE disables TLS certificate verification.
# It is kept from the original snippet; prefer leaving verification on when a
# CA bundle is available.
webpage <- getURL(
  "https://en.wikipedia.org/wiki/Brazil_national_football_team",
  .opts = list(ssl.verifypeer = FALSE)
)

# Convert the tables found in the page into a list, one element per table.
tables <- readHTMLTable(webpage)

# Drop the NULL entries from the top level of the list.
tables <- list.clean(tables, fun = is.null, recursive = FALSE)

# Number of rows of each remaining table (first element of dim()).
n.rows <- unlist(lapply(tables, function(tbl) dim(tbl)[1]))
The table picked is the longest one on the page:
# Position of the largest value in n.rows, then show the table at that
# position.
longest_idx <- which.max(n.rows)
tables[[longest_idx]]
library(RCurl)
library(XML)

# Download page using RCurl.
# You may need to set proxy details, etc., in the call to getURL.
# https is used because getURL does not follow redirects unless asked to.
theurl <- "https://en.wikipedia.org/wiki/Brazil_national_football_team"
webpage <- getURL(theurl)

# Process escape characters: re-read the downloaded string line by line.
tc <- textConnection(webpage)
webpage <- readLines(tc)
close(tc)

# Parse the html tree, ignoring errors on the page.
pagetree <- htmlTreeParse(webpage, error = function(...) {})

# Navigate your way through the tree. It may be possible to do this more
# efficiently using getNodeSet.
# NOTE(review): these positional indices are tied to the page layout at the
# time of writing; verify them against the current page.
body <- pagetree$children$html$children$body
divbodyContent <- body$children$div$children[[1]]$children$div$children[[4]]
tables <- divbodyContent$children[names(divbodyContent) == "table"]

# Text of a node's "text" child, or NA when the node has no such child, so
# the vapply() calls below always receive exactly one string per cell.
cell_text <- function(node) {
  value <- node$children$text$value
  if (is.null(value)) NA_character_ else value
}

# In this case, the required table is the only one with class
# "wikitable sortable".
tableclasses <- vapply(
  tables,
  function(node) {
    cls <- node$attributes["class"]
    if (is.null(cls)) NA_character_ else unname(cls)
  },
  character(1)
)
wanted <- which(tableclasses == "wikitable sortable")
if (length(wanted) == 0L) {
  stop("No table with class \"wikitable sortable\" was found", call. = FALSE)
}
thetable <- tables[[wanted[1L]]]

# Get column headers from the first row of the table.
headers <- thetable$children[[1]]$children
columnnames <- unname(vapply(headers, cell_text, character(1)))

# Get rows from table: every child after the header row is a data row.
# Dropping the first child (instead of looping over 2:length(...)) is safe
# even when the table contains only the header row.
datarows <- thetable$children[-1]
row_values <- lapply(datarows, function(row_node) {
  tablerow <- row_node$children
  # The first cell's text is read from the second child of that cell.
  opponent <- cell_text(tablerow[[1]]$children[[2]])
  others <- unname(vapply(tablerow[-1], cell_text, character(1)))
  c(opponent, others)
})

# Bind all rows at once; unname() keeps list names from becoming row names.
content <- if (length(row_values) > 0L) {
  do.call(rbind, unname(row_values))
} else {
  matrix(character(0), nrow = 0L, ncol = length(columnnames))
}

# Convert to data frame.
colnames(content) <- columnnames
as.data.frame(content, stringsAsFactors = FALSE)
Edited to add:
Sample output
   Opponent Played Won Drawn Lost Goals for Goals against % Won
1 Argentina     94  36    24   34       148           150 38.3%
2  Paraguay     72  44    17   11       160            61 61.1%
3   Uruguay     72  33    19   20       127            93 45.8%
...
Another option, using XPath:
library(RCurl)
library(XML)

# Download the page. https is used because getURL does not follow redirects
# unless asked to.
theurl <- "https://en.wikipedia.org/wiki/Brazil_national_football_team"
webpage <- getURL(theurl)

# Re-read the downloaded string line by line.
tc <- textConnection(webpage)
webpage <- readLines(tc)
close(tc)

# Parse into an internal document so that XPath queries can be run on it;
# parser errors on the page are ignored.
pagetree <- htmlTreeParse(
  webpage,
  error = function(...) {},
  useInternalNodes = TRUE
)

# Extract table header and contents.
row_xpath <- "//*/table[@class='wikitable sortable']/tr"
tablehead <- xpathSApply(pagetree, paste0(row_xpath, "/th"), xmlValue)
results <- xpathSApply(pagetree, paste0(row_xpath, "/td"), xmlValue)

if (length(tablehead) == 0L) {
  stop(
    "No header cells found for a table with class 'wikitable sortable'",
    call. = FALSE
  )
}

# Convert character vector to data frame. The column count comes from the
# header row rather than a hard-coded 8, so it matches the names set below.
content <- as.data.frame(
  matrix(results, ncol = length(tablehead), byrow = TRUE),
  stringsAsFactors = FALSE
)

# Clean up the results: strip non-breaking spaces (U+00A0), including the
# mis-decoded form in which a stray U+00C2 character precedes the space.
nbsp_junk <- "\u00c2?\u00a0|\u00c2 "
content[, 1] <- gsub(nbsp_junk, "", content[, 1])
tablehead <- gsub(nbsp_junk, "", tablehead)
names(content) <- tablehead
This produces the following result:
> head(content)
   Opponent Played Won Drawn Lost Goals for Goals against % Won
1 Argentina     94  36    24   34       148           150 38.3%
2  Paraguay     72  44    17   11       160            61 61.1%
3   Uruguay     72  33    19   20       127            93 45.8%
4     Chile     64  45    12    7       147            53 70.3%
5      Peru     39  27     9    3        83            27 69.2%
6    Mexico     36  21     6    9        69            34 58.3%