This April, I wrote an R script to import files downloaded from Nexis and Factiva. Factiva does not offer a file-download function, but its search-result pages can be saved as HTML files and imported into R with this script.
library(XML) #might need libxml2-dev installed via apt-get

# Read all Nexis or Factiva HTML files under a directory (recursively) and
# combine the articles into a single data frame.
readNewsDir <- function(dir, type){
    files <- list.files(dir, full.names = TRUE, recursive = TRUE)
    df <- data.frame(head = c(), body = c(), pub = c(), datetime = c(),
                     edition = c(), length = c(), stringsAsFactors = FALSE)
    for(file in files){
        #print(file)
        if(grepl('\\.html$|\\.htm$|\\.xhtml$', file, ignore.case = TRUE)){
            #print(file)
            if(type == 'nexis'){
                df <- rbind(df, readNexisHTML(file))
            }else if(type == 'factiva'){
                df <- rbind(df, readFactivaHTML(file))
            }
        }
    }
    return(df)
}

# Parse one HTML file downloaded from Nexis and return a data frame with one row per article.
#readNexisHTML('/home/kohei/Documents/Syria report/nexis.html')
readNexisHTML <- function(file, sep = ' | '){
    heads <- c()
    bodies <- c()
    pubs <- c()
    datetimes <- c()
    timezones <- c()
    editions <- c()
    lengths <- c()

    # Convert format
    cat('Reading', file, '\n')
    f <- file(file, encoding = "UTF-8")
    lines <- readLines(f)
    close(f)
    docnum <- 0
    for(i in seq_along(lines)){
        # Nexis wraps its XML section in HTML comments to hide it from browsers;
        # strip the comment markers so that htmlParse() can see the <doc> elements.
        lines[i] <- gsub('<!--', '', lines[i])
        lines[i] <- gsub(' -->', '', lines[i])
    }
    html <- paste(lines, collapse='\n')

    # Write the converted HTML to a file for debugging (uncomment and change the path if needed)
    #cat(html, file="/home/kohei/Documents/R/converted.html", sep="", append=FALSE)

    # Extract elements
    doc <- htmlParse(html, encoding="UTF-8")
    ids <- paste0('doc_id_', 1:500)
    for(id in ids){
        query <- paste0('/html/body//doc[@id="', id , '"]')
        nodes <- getNodeSet(doc, query)
        if(length(nodes)){
            node <- nodes[[1]]
        }else{
            next #cannot break since there are errors in the HTML
        }

        pub <- ''
        #pubns <- getNodeSet(node, './/div[@class="c10"]/p[@class="c11"]/span[@class="c12"]')
        pubns <- getNodeSet(node, './/div[@class="c0"]/p[@class="c1"]/span[@class="c2"]')
        if(length(pubns)){
            i <- 1
            for(pubn in pubns){
                if(grepl('DOCUMENTS$', xmlValue(pubn))){
                    docnum <- i
                }
                if(i == docnum + 1){
                    pub <- xmlValue(pubn)
                    pub <- cleanNews(pub)
                }
                i <- i + 1
            }
        }
        if(nchar(pub) == 0) pub <- NA
        #print(pub)

        date <- ''
        dateline <- ''
        #datelinens <- getNodeSet(node, './/div[@class="c13" or @class="c14"]/p[@class="c11"]')
        datelinens <- getNodeSet(node, './/div[@class="c3" or @class="c4"]/p[@class="c1"]')
        if(length(datelinens)) dateline <- xmlValue(datelinens[[1]])
        #print(datelinens)
        dates <- strsplit(sub(',', '', dateline, fixed = TRUE), ' ', fixed = TRUE)[[1]]
        date <- paste(dates[1], dates[2], dates[3], sep=' ')
        if(nchar(date) == 0) date <- NA

        edition <- ''
        if(length(dates) >= 5){
            edition <- paste(dates[5:length(dates)], collapse = ' ')
            edition <- cleanNews(edition)
        }

        time <- ''
        timezone <- ''
        if(grepl('^[0-9]{1,2}:[0-9]{1,2} (AM|PM)', edition)){
            timeline <- edition
            timeline.parts <- strsplit(timeline, ' ')[[1]]
            #print(edition)
            #print(timeline.parts)
            time <- paste(timeline.parts[1], timeline.parts[2])
            if(length(timeline.parts) > 2){
                timezone <- paste(timeline.parts[3:length(timeline.parts)], collapse = ' ')
            }
            edition <- ''
            #print(time)
        }
        if(nchar(time) == 0) time <- '12:00 AM'
        if(nchar(edition) == 0) edition <- ''
        if(nchar(timezone) == 0) timezone <- ''

        body <- ''
        #bodyns <- getNodeSet(node, './/div[@class="c0"]/p[@class="c17" or @class="c18"]/span[@class="c12" or @class="c14"]')
        #bodyns <- getNodeSet(node, './/div[@class="c5"]/p[@class="c15" or @class="c9" or @class="c9"]/span[@class="c2" or @class="c3"]')
        bodyns1 <- getNodeSet(node, './/div[(@class="c5") and count(.//p) > 1]//p')
        bodyns2 <- getNodeSet(node, './/div[(@class="c4") and count(.//p) > 1]//p')
        if(length(bodyns1) > length(bodyns2)){
            bodyns <- bodyns1
        }else{
            bodyns <- bodyns2
        }
        #if(is.null(bodyns)) print(node)
        if(length(bodyns)){
            paras <- c()
            for(bodyn in bodyns){
                para <- xmlValue(bodyn)
                para <- cleanNews(para)
                paras <- append(paras, para)
            }
            body <- paste(paras, sep = '', collapse = sep)
            if(nchar(body) == 0) body <- NA
        }

        by <- ''
        code <- ''
        head <- ''
        length <- 0
        #attribns <- getNodeSet(node, './/div[@class="c0"]/p[@class="c5" and .//@class="c12"]')
        attribns1 <- getNodeSet(node, './/div[@class="c5"]/p[@class="c6"]')
        attribns2 <- getNodeSet(node, './/div[@class="c4"]/p[@class="c5"]')
        if(length(attribns1) > length(attribns2)){
            attribns <- attribns1
        }else{
            attribns <- attribns2
        }
        if(length(attribns)){
            for(attribn in attribns){
                attrib <- xmlValue(attribn)
                attrib <- gsub("^\\s+|\\s+$", "", attrib)
                #print(attrib)
                if(grepl('^BYLINE: ', attrib)){
                    by <- gsub('BYLINE: ', '', attrib)
                }
                if(grepl('^LENGTH: ', attrib)){
                    length <- as.numeric(gsub('[^0-9]', '', attrib))
                }
                if(grepl('^JOURNAL-CODE: ', attrib)){
                    code <- gsub('JOURNAL-CODE: ', '', attrib)
                }
                if(!grepl('^[A-Z\\-]{6,20}: ', attrib)){
                    #exclude "BYLINE:", "SECTION:", "PUBLICATION-TYPE:" etc.
                    head <- cleanNews(attrib)
                }
            }
        }
        if(nchar(by) == 0) by <- NA
        if(nchar(code) == 0) code <- NA

        #print(paste(date, time, sep=' '))
        datetime <- format(strptime(paste(date, time), format='%B %d %Y %I:%M %p'),
                           '%Y-%m-%d %H:%M:%S UTC')
        #print(paste(date, time))

        heads <- append(heads, head)
        bodies <- append(bodies, body)
        pubs <- append(pubs, pub)
        datetimes <- append(datetimes, datetime)
        timezones <- append(timezones, timezone)
        editions <- append(editions, edition)
        lengths <- append(lengths, length)
    }
    #print(datetimes)
    return(data.frame(head = as.character(heads),
                      pub = as.character(pubs),
                      datetime = as.POSIXct(datetimes, tz = 'UTC'),
                      timezone = as.character(timezones),
                      edition = as.factor(editions),
                      length = as.numeric(lengths),
                      body = as.character(bodies),
                      stringsAsFactors = FALSE))
}

# Parse one HTML page saved from Factiva search results and return a data frame
# with one row per article.
#readFactivaHTML('/home/kohei/Documents/Syria report/factiva.html')
readFactivaHTML <- function(file, sep = ' | '){
    heads <- c()
    bodies <- c()
    pubs <- c()
    datetimes <- c()
    timezones <- c()
    editions <- c()
    lengths <- c()

    cat('Reading', file, '\n')
    library(XML)
    doc <- htmlParse(file, encoding="UTF-8")
    nodes <- getNodeSet(doc, '/html/body//div[contains(@class, "article")]')
    for(node in nodes){
        #print(node)
        head <- ''
        headns <- getNodeSet(node, './div[@id="hd"]')
        if(length(headns)){
            head <- xmlValue(headns[[1]])
            head <- cleanNews(head)
        }
        if(nchar(head) == 0) head <- NA

        body <- ''
        bodyns <- getNodeSet(node, './p[contains(@class, "articleParagraph") and not(.//pre)]')
        if(length(bodyns)){
            paras <- c()
            for(bodyn in bodyns){
                para <- xmlValue(bodyn)
                para <- cleanNews(para)
                paras <- append(paras, para)
            }
            body <- paste(paras, sep = '', collapse = sep)
        }
        if(nchar(body) == 0) body <- NA

        pub <- ''
        length <- 0
        date <- ''
        time <- ''
        pos <- 0
        posTime <- 0
        attribns <- getNodeSet(node, './div[not(@id) and not(@class)]')
        if(length(attribns)){
            for(attribn in attribns){
                pos <- pos + 1
                #print(paste(posTime, pos))
                attrib <- xmlValue(attribn)
                attrib <- gsub("^\\s+|\\s+$", "", attrib)
                #print(attrib)
                if(grepl(' words$', attrib)){
                    length <- as.numeric(gsub(' words$', '', attrib))
                }else if(grepl('[0-9]{1,2} [a-zA-Z]+ [0-9]{4}', attrib)){
                    date <- attrib
                    #date <- strsplit(attrib, ' ', fixed = TRUE)[[1]]
                }else if(grepl('[0-9]{2}:[0-9]{2}', attrib)){
                    time <- attrib
                    posTime <- pos
                }else if(pos == (posTime + 1)){
                    #publication name must be next to the time
                    pub <- attrib
                }
            }
        }
        if(nchar(pub) == 0) pub <- NA
        if(nchar(date) == 0) date <- NA
        if(nchar(time) == 0) time <- NA

        #print(paste(pub, date[1], date[2], date[3], time, head, length))
        #print(paste(date, time, sep=' '))
        datetime <- format(strptime(paste(date, ' ', time, sep=''), format='%d %B %Y %H:%M'),
                           '%Y-%m-%d %H:%M:%S UTC')
        #print(paste(pub, datetime, head, length))
        #print(body)

        heads <- append(heads, head)
        bodies <- append(bodies, body)
        pubs <- append(pubs, pub)
        datetimes <- append(datetimes, datetime)
        timezones <- append(timezones, '')
        editions <- append(editions, '')
        lengths <- append(lengths, length)
    }
    return(data.frame(head = as.character(heads),
                      pub = as.character(pubs),
                      datetime = as.POSIXct(datetimes, tz = 'UTC'),
                      timezone = as.character(timezones),
                      edition = editions,
                      length = as.numeric(lengths),
                      body = as.character(bodies),
                      stringsAsFactors = FALSE))
}

# Collapse line breaks, tabs, control characters and repeated spaces in article text.
cleanNews <- function(text){
    text <- gsub("\\r\\n|\\r|\\n|\\t", " ", text)
    text <- gsub("[[:cntrl:]]", " ", text, perl = TRUE)
    text <- gsub("\\s\\s+", " ", text)
    text <- gsub("^\\s+|\\s+$", "", text)
    return(text)
}

# Remove Reuters credit lines ("Writing by...", "Editing by..." etc.) and the word
# "Reuters" from article text.
cleanReuters <- function(text){
    text <- gsub('\\(Writing by.{1,500}\\)$', '', text, ignore.case = TRUE, perl = TRUE)
    text <- gsub('\\(Editing by.{1,500}\\)$', '', text, ignore.case = TRUE, perl = TRUE)
    text <- gsub('\\(Reporting by.{1,500}\\)$', '', text, ignore.case = TRUE, perl = TRUE)
    text <- gsub('\\(Additional reporting by.{1,500}\\)$', '', text, ignore.case = TRUE, perl = TRUE)
    text <- gsub('Reuters', '', text, ignore.case = TRUE)
    text <- cleanNews(text)
    return(text)
}
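As a quick usage sketch (the folder paths below are placeholders, not part of the script): save the Nexis downloads and the Factiva search-result pages as HTML files into separate folders, source the functions above, and call readNewsDir() once per source.

# Hypothetical example: the paths stand in for wherever the HTML files are saved.
nexis   <- readNewsDir('~/Documents/news/nexis', 'nexis')     # Nexis downloads
factiva <- readNewsDir('~/Documents/news/factiva', 'factiva') # saved Factiva result pages

# Each call returns a data frame with one row per article and the columns
# head, pub, datetime, timezone, edition, length and body.
str(nexis)
nexis$body <- cleanReuters(nexis$body) # optionally strip Reuters credit lines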