Scraping HTML tables into R data frames using the XML package (tags: xml)

Scraping html tables into R data frames using the XML package


…or a shorter attempt, using `readHTMLTable`:

library(XML)
library(RCurl)
library(rlist)

# Download the raw HTML of the page (despite its name, `theurl` holds the
# page source returned by getURL, not the address).
# NOTE(review): ssl.verifypeer = FALSE switches off TLS certificate
# verification. It is kept from the original, but should be dropped wherever
# a CA bundle is available.
theurl <- getURL(
  "https://en.wikipedia.org/wiki/Brazil_national_football_team",
  .opts = list(ssl.verifypeer = FALSE)
)

# Read the HTML tables found in the page, then discard the NULL entries.
tables <- readHTMLTable(theurl)
tables <- list.clean(tables, fun = is.null, recursive = FALSE)

# Number of rows of each remaining table. vapply() guarantees exactly one
# integer per table, so the positions in `n.rows` always line up with the
# positions in `tables` (unlist(lapply(...)) would silently drop NULLs).
n.rows <- vapply(tables, nrow, integer(1))

The table picked is the longest one on the page (the one with the most rows):

# Position of the table with the largest row count, then show that table.
longest <- which.max(n.rows)
tables[[longest]]


library(RCurl)
library(XML)

# Download page using RCurl.
# You may need to set proxy details, etc., in the call to getURL.
# The https address is used because getURL does not follow redirects unless
# asked to, so a plain-http address that redirects would only return the
# redirect stub.
theurl <- "https://en.wikipedia.org/wiki/Brazil_national_football_team"
webpage <- getURL(theurl)

# Process escape characters
tc <- textConnection(webpage)
webpage <- readLines(tc)
close(tc)

# Parse the html tree, ignoring errors on the page
pagetree <- htmlTreeParse(webpage, error = function(...) {})

# Navigate your way through the tree. It may be possible to do this more
# efficiently using getNodeSet. The child positions below are hard-coded for
# one particular page layout.
body <- pagetree$children$html$children$body
divbodyContent <- body$children$div$children[[1]]$children$div$children[[4]]
tables <- divbodyContent$children[names(divbodyContent) == "table"]

# In this case, the required table is the only one with class
# "wikitable sortable"
tableclasses <- sapply(tables, function(x) x$attributes["class"])
thetable <- tables[which(tableclasses == "wikitable sortable")]$table
if (is.null(thetable)) {
  stop("No table with class \"wikitable sortable\" was found.", call. = FALSE)
}

# Get column headers (first child of the table is the header row)
headers <- thetable$children[[1]]$children
columnnames <- unname(sapply(headers, function(x) x$children$text$value))

# Get rows from table: every child after the header row is a data row.
row_nodes <- thetable$children[-1]
if (length(row_nodes) == 0) {
  stop("The selected table has no data rows.", call. = FALSE)
}
rows <- lapply(row_nodes, function(row) {
  cells <- row$children
  # The first cell keeps its text one level deeper than the other cells.
  opponent <- cells[[1]]$children[[2]]$children$text$value
  others <- unname(sapply(cells[-1], function(x) x$children$text$value))
  c(opponent, others)
})
# Bind all rows at once instead of growing a matrix inside a loop; unname()
# stops the node names from turning into (duplicated) row names.
content <- do.call(rbind, unname(rows))

# Convert to data frame
colnames(content) <- columnnames
as.data.frame(content)

Edited to add:

Sample output

                     Opponent Played Won Drawn Lost Goals for Goals against  % Won
    1               Argentina     94  36    24   34       148           150  38.3%
    2                Paraguay     72  44    17   11       160            61  61.1%
    3                 Uruguay     72  33    19   20       127            93  45.8%
    ...


Another option, using XPath:

library(RCurl)
library(XML)

# Download the page. The https address is used because getURL does not follow
# redirects unless asked to, so a plain-http address that redirects would only
# return the redirect stub.
theurl <- "https://en.wikipedia.org/wiki/Brazil_national_football_team"
webpage <- getURL(theurl)
tc <- textConnection(webpage)
webpage <- readLines(tc)
close(tc)

# Parse into an internal document so that XPath queries can be run on it;
# parser errors on the page are ignored.
pagetree <- htmlTreeParse(
  webpage,
  error = function(...) {},
  useInternalNodes = TRUE
)

# Extract table header and contents
table_xpath <- "//*/table[@class='wikitable sortable']"
tablehead <- xpathSApply(pagetree, paste0(table_xpath, "/tr/th"), xmlValue)
results <- xpathSApply(pagetree, paste0(table_xpath, "/tr/td"), xmlValue)

# Convert character vector to dataframe: cells arrive row by row and are
# folded into 8 columns.
content <- as.data.frame(matrix(results, ncol = 8, byrow = TRUE))

# Clean up the results.
# NOTE(review): the pattern is spelled as an explicit non-breaking-space
# escape. The sample output keeps ordinary spaces ("Goals for"), so ordinary
# spaces must not be stripped here; confirm against the live page.
content[, 1] <- gsub("\u00a0", "", content[, 1])
tablehead <- gsub("\u00a0", "", tablehead)
names(content) <- tablehead

This produces the following result:

> head(content)
   Opponent Played Won Drawn Lost Goals for Goals against % Won
1 Argentina     94  36    24   34       148           150 38.3%
2  Paraguay     72  44    17   11       160            61 61.1%
3   Uruguay     72  33    19   20       127            93 45.8%
4     Chile     64  45    12    7       147            53 70.3%
5      Peru     39  27     9    3        83            27 69.2%
6    Mexico     36  21     6    9        69            34 58.3%