I posted the code Nexis importer last year, but it tuned out that the HTML format of the database service is less consistent than I though, so I changed the logic. The new version is dependent less on the structure of the HTML files, but more on the format of the content.
library(XML) #might need libxml2-dev via apt-get command readNewsDir <- function(dir,...){ names <- list.files(dir, full.names = TRUE, recursive = TRUE) df <- data.frame(head = c(), body = c(), pub = c(), datetime = c(), edition = c(), length = c(), stringsAsFactors = FALSE) for(name in names){ #print(file) if(grepl('\\.html$|\\.htm$|\\.xhtml$', name, ignore.case = TRUE)){ #print(file) df <- rbind(df, readNexisHTML(name, ...)) } } return(df) } #readNexisHTML('/home/kohei/Documents/Syria report/nexis.html') readNexisHTML <- function(name, sep = ' '){ heads <- c() bodies <- c() bys <- c() pubs <- c() datetimes <- c() editions <- c() lengths <- c() #Convert format cat('Reading', name, '\n') # HTML cleaning------------------------------------------------ lines <- scan(name, what="character", sep='\n', quiet=TRUE, encoding = "UTF-8") docnum <- 0 for(i in 1:length(lines)){ #print(lines[i]) lines[i] <- gsub('', '', lines[i]) lines[i] <- gsub(' -->', '', lines[i]) } lines[i+1] = '' # Fix EOF problem html <- paste(lines, collapse='\n') # Write to debug #cat(html, file="converted.html", sep="", append=FALSE) # Main process------------------------------------------------ #Load as DOM object doc <- htmlParse(html , encoding="UTF-8") # Remove index indexns <- getNodeSet(doc, '/html/body//doc[.//table]') for(indexn in indexns){ #print(xmlValue(indexn)) removeNodes(indexn) } for(node in getNodeSet(doc, '/html/body//doc')){ pub <- NA datetime <- NA head <- NA by <- NA edition <- NA section <- NA length <- NA i <- 1 for(div in getNodeSet(node, './/div')){ value <- cleanNews(xmlValue(div)) #print(paste(i, value)) if(i == 1 & grepl('\\d+ of \\d+ DOCUMENTS', value)){ i <- 2 }else if(i == 2){ #print(paste('pub', value)) pub <- value i <- 3 }else if(i == 3 & grepl('^(January|February|March|April|May|June|July|August|September|October|November|December)', value)){ dateline <- value #print(paste('date', value)) match <- regexpr(paste0('(January|February|March|April|May|June|July|August|September|October|November|December)', '[, ]+([0-9]{1,2})', '[, ]+([0-9]{4})', '([,; ]+(Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday))?', '([, ]+(.+))?'), value, perl=TRUE) date <- c() for(j in 1:length(attr(match, "capture.start"))){ from <- attr(match, "capture.start")[j] to <- attr(match, "capture.start")[j] + attr(match, "capture.length")[j] date <- c(date, substr(dateline, from, to)) } month <- gsub('[^a-zA-Z]', '', date[1]) day <- gsub('[^0-9]', '', date[2]) year <- gsub('[^0-9]', '', date[3]) datetime <- format(strptime(paste(month, day, year, '12:00 AM'), format='%B %d %Y %I:%M %p'), '%Y-%m-%d %H:%M:%S UTC') if(length(date) == 7){ edition <- cleanNews(date[7]) #print(date) #print(dateline) #print(edition) } i <- 4 }else if(i == 4 & !grepl('[A-Z]+:', value)){ head <- value # Sometimes does not exists i <- 8 }else if(i >= 4 & grepl('BYLINE:', value)){ by <- sub('BYLINE: ', '', value) i <- 8 }else if(i >= 4 & grepl('SECTION:', value)){ section <- sub('SECTION: ', '', value) i <- 8 }else if(i >= 4 & grepl('LENGTH:', value)){ length <- strsplit(value, ' ')[[1]][2] i <- 8 }else if(i >= 4 & grepl('[A-Z]+:', value)){ i <- 8 }else if(i == 8){ paras <- c() for(p in getNodeSet(div, 'p')){ paras <- c(paras, cleanNews(xmlValue(p))) } if(length(paras) > 0){ body <- paste(paras, sep = '', collapse=sep) } break() } } if(!is.na(body)){ heads <- c(heads, head) bodies <- c(bodies, body) bys <- c(bys, by) pubs <- c(pubs, pub) datetimes <- c(datetimes, datetime) editions <- c(editions, edition) lengths <- c(lengths, length) } } return(data.frame(head = as.character(heads), pub = as.character(pubs), datetime = as.POSIXct(datetimes, tz = 'UTC'), by = as.factor(bys), edition = as.character(editions), length = as.numeric(lengths), body = as.character(bodies), stringsAsFactors = FALSE)) } cleanNews <- function(text){ text <- gsub("\\r\\n|\\r|\\n|\\t", " ", text) text <- gsub("[[:cntrl:]]", " ", text, perl = TRUE) text <- gsub("\\s\\s+", " ", text) text <- gsub("^\\s+|\\s+$", "", text) return(text) }