This April, I created an R script to import files downloaded from Nexis and Factiva. Factiva does not offer a file download function, but its search result pages can be saved as HTML files and imported into R with this script (a short usage sketch follows the code).
library(XML) #on Debian/Ubuntu, this package might need libxml2-dev (apt-get install libxml2-dev)
readNewsDir <- function(dir, type){
files <- list.files(dir, full.names = TRUE, recursive = TRUE)
df <- data.frame() #rows are appended by the format-specific readers below
for(file in files){
#print(file)
if(grepl('\\.html$|\\.htm$|\\.xhtml$', file, ignore.case = TRUE)){
#print(file)
if(type == 'nexis'){
df <- rbind(df, readNexisHTML(file))
}else if(type == 'factiva'){
df <- rbind(df, readFactivaHTML(file))
}
}
}
return(df)
}
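#Example usage (the directory is hypothetical): import all Nexis files under a folder
#nexis <- readNewsDir('/home/kohei/Documents/Syria report', 'nexis')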
#readNexisHTML('/home/kohei/Documents/Syria report/nexis.html')
readNexisHTML <- function(file, sep = ' | '){
heads <- c()
bodies <- c()
pubs <- c()
datetimes <- c()
timezones <- c()
editions <- c()
lengths <- c()
#Convert format
cat('Reading', file, '\n')
f <- file(file, encoding = "UTF-8")
lines <- readLines(f)
close(f)
docnum <- 0
for(i in 1:length(lines)){
#the <DOC> markup is hidden inside HTML comments; strip the markers so htmlParse sees the tags
lines[i] <- gsub('<!-- ', '', lines[i])
lines[i] <- gsub(' -->', '', lines[i])
}
html <- paste(lines, collapse='\n')
#Write converted HTML to a file for debugging (adjust the path and uncomment if needed)
#cat(html, file="/home/kohei/Documents/R/converted.html", sep="", append=FALSE)
#Extract elements
doc <- htmlParse(html , encoding="UTF-8")
ids <- paste0('doc_id_', 1:500) #document nodes are identified as doc_id_1, doc_id_2, ...; assume no more than 500 per file
for(id in ids){
query <- paste0('/html/body//doc[@id="', id , '"]')
nodes <- getNodeSet(doc, query)
if(length(nodes)){
node <- nodes[[1]]
}else{
next #cannot break early because IDs can be missing due to errors in the HTML
}
pub <- ''
#pubns <- getNodeSet(node, './/div[@class="c10"]/p[@class="c11"]/span[@class="c12"]')
pubns <- getNodeSet(node, './/div[@class="c0"]/p[@class="c1"]/span[@class="c2"]')
if(length(pubns)){
i <- 1
for(pubn in pubns){
if(grepl('DOCUMENTS$', xmlValue(pubn))){ #locate the 'X of Y DOCUMENTS' header
docnum <- i
}
if(i == docnum + 1){ #the publication name comes right after that header
pub <- xmlValue(pubn)
pub <- cleanNews(pub)
}
i <- i + 1
}
}
if(nchar(pub) == 0) pub <- NA
#print(pub)
date <- ''
dateline <- '' #initialize so documents without a dateline do not raise an error
#datelinens <- getNodeSet(node, './/div[@class="c13" or @class="c14"]/p[@class="c11"]')
datelinens <- getNodeSet(node, './/div[@class="c3" or @class="c4"]/p[@class="c1"]')
if(length(datelinens)) dateline <- xmlValue(datelinens[[1]])
#print(datelinens)
dates <- strsplit(sub(',', '', dateline, fixed = TRUE), ' ', fixed = TRUE)[[1]]
if(length(dates) >= 3) date <- paste(dates[1], dates[2], dates[3], sep=' ') #e.g. "April 1 2012"
if(nchar(date) == 0) date <- NA
edition <- ''
if(length(dates) >= 5){
edition <- paste(dates[5:length(dates)], collapse = ' ')
edition <- cleanNews(edition)
}
time <- ''
timezone <- ''
if(grepl('^[0-9]{1,2}:[0-9]{1,2} (AM|PM)', edition)){ #the 'edition' slot sometimes holds a time line instead, e.g. '11:30 PM GMT'
timeline <- edition
timeline.parts <- strsplit(timeline, ' ')[[1]]
#print(edition)
#print(timeline.parts)
time <- paste(timeline.parts[1], timeline.parts[2])
if(length(timeline.parts) > 2){
timezone <- paste(timeline.parts[3:length(timeline.parts)], collapse = ' ')
}
edition <- ''
#print(time)
}
if(nchar(time) == 0) time <- '12:00 AM' #default to midnight when no time is given
#edition and timezone are left as '' when absent
body <- ''
#bodyns <- getNodeSet(node, './/div[@class="c0"]/p[@class="c17" or @class="c18"]/span[@class="c12" or @class="c14"]')
#bodyns <- getNodeSet(node, './/div[@class="c5"]/p[@class="c15" or @class="c9" or @class="c9"]/span[@class="c2" or @class="c3"]')
#Nexis files come in two slightly different layouts; use whichever selector matches more paragraphs
bodyns1 <- getNodeSet(node, './/div[(@class="c5") and count(.//p) > 1]//p')
bodyns2 <- getNodeSet(node, './/div[(@class="c4") and count(.//p) > 1]//p')
if(length(bodyns1) > length(bodyns2)){
bodyns <- bodyns1
}else{
bodyns <- bodyns2
}
#if(is.null(bodyns)) print(node)
if(length(bodyns)){
paras <- c()
for(bodyn in bodyns){
para <- xmlValue(bodyn)
para <- cleanNews(para)
paras <- append(paras, para)
}
body <- paste(paras, sep = '', collapse=sep)
}
if(nchar(body) == 0) body <- NA
by <- ''
code <- ''
head <- ''
length <- 0
#attribns <- getNodeSet(node, './/div[@class="c0"]/p[@class="c5" and .//@class="c12"]')
attribns1 <- getNodeSet(node, './/div[@class="c5"]/p[@class="c6"]')
attribns2 <- getNodeSet(node, './/div[@class="c4"]/p[@class="c5"]')
if(length(attribns1) > length(attribns2)){
attribns <- attribns1
}else{
attribns <- attribns2
}
if(length(attribns)){
for(attribn in attribns){
attrib <- xmlValue(attribn)
attrib <- gsub("^\\s+|\\s+$", "", attrib)
#print(attrib)
if(grepl('^BYLINE: ', attrib)){
by <- gsub('BYLINE: ', '', attrib)
}
if(grepl('^LENGTH: ', attrib)){
length <- as.numeric(gsub('[^0-9]', '', attrib))
}
if(grepl('^JOURNAL-CODE: ', attrib)){
code <- gsub('JOURNAL-CODE: ', '', attrib)
}
if(!grepl('^[A-Z\\-]{6,20}: ', attrib)){ #exclude "BYLINE:", "SECTION:", "PUBLICATION-TYPE:" etc.
head <- cleanNews(attrib)
}
}
}
if(nchar(by) == 0) by <- NA
if(nchar(code) == 0) code <- NA
#print(paste(date, time, sep=' '))
datetime <- format(strptime(paste(date, time), format='%B %d %Y %I:%M %p'), '%Y-%m-%d %H:%M:%S UTC') #%B assumes an English locale
#print(paste(date, time))
heads <- append(heads, head)
bodies <- append(bodies, body)
pubs <- append(pubs, pub)
datetimes <- append(datetimes, datetime)
timezones <- append(timezones, timezone)
editions <- append(editions, edition)
lengths <- append(lengths, length)
}
#print(datetimes)
return(data.frame(head = as.character(heads),
pub = as.character(pubs),
datetime = as.POSIXct(datetimes, tz = 'UTC'),
timezone = as.character(timezones),
edition = as.character(editions), #keep as character, consistent with stringsAsFactors = FALSE and the Factiva reader
length = as.numeric(lengths),
body = as.character(bodies),
stringsAsFactors = FALSE))
}
#readFactivaHTML('/home/kohei/Documents/Syria report/factiva.html')
readFactivaHTML <- function(file, sep = ' | '){
heads <- c()
bodies <- c()
pubs <- c()
datetimes <- c()
timezones <- c()
editions <- c()
lengths <- c()
cat('Reading', file, '\n')
doc <- htmlParse(file, encoding="UTF-8")
nodes <- getNodeSet(doc, '/html/body//div[contains(@class, "article")]')
for(node in nodes){
#print(node)
head <- ''
headns <- getNodeSet(node, './div[@id="hd"]')
if(length(headns)){
head <- xmlValue(headns[[1]])
head <- cleanNews(head)
}
if(nchar(head) == 0) head <- NA
body <- ''
bodyns <- getNodeSet(node, './p[contains(@class, "articleParagraph") and not(.//pre)]')
if(length(bodyns)){
paras <- c()
for(bodyn in bodyns){
para <- xmlValue(bodyn)
para <- cleanNews(para)
paras <- append(paras, para)
}
body <- paste(paras, sep = '', collapse = sep)
}
if(nchar(body) == 0) body <- NA
pub <- ''
length <- 0
date <- ''
time <- ''
pos <- 0
posTime <- 0
attribns <- getNodeSet(node, './div[not(@id) and not(@class)]') #metadata lines (word count, date, time, source) carry no id or class
if(length(attribns)){
for(attribn in attribns){
pos <- pos + 1
#print(paste(posTime, pos))
attrib <- xmlValue(attribn)
attrib <- gsub("^\\s+|\\s+$", "", attrib)
#print(attrib)
if(grepl(' words$', attrib)){
length <- as.numeric(gsub(' words$', '', attrib))
}else if(grepl('[0-9]{1,2} [a-zA-Z]+ [0-9]{4}', attrib)){
date <- attrib
#date <- strsplit(attrib, ' ', fixed = TRUE)[[1]]
}else if(grepl('[0-9]{2}:[0-9]{2}', attrib)){
time <- attrib
posTime <- pos
}else if(pos == (posTime + 1)){ #publication name must be next to the time
pub <- attrib
}
}
}
if(nchar(pub) == 0) pub <- NA
if(nchar(date) == 0) date <- NA
if(nchar(time) == 0) time <- NA
#print(paste(pub, date[1], date[2], date[3], time, head, length))
#print(paste(date, time, sep=' '))
datetime <- format(strptime(paste(date, time), format='%d %B %Y %H:%M'), '%Y-%m-%d %H:%M:%S UTC') #%B assumes an English locale
#print(paste(pub, datetime, head, length))
#print(body)
heads <- append(heads, head)
bodies <- append(bodies, body)
pubs <- append(pubs, pub)
datetimes <- append(datetimes, datetime)
timezones <- append(timezones, '')
editions <- append(editions, '')
lengths <- append(lengths, length)
}
return(data.frame(head = as.character(heads),
pub = as.character(pubs),
datetime = as.POSIXct(datetimes, tz = 'UTC'),
timezone = as.character(timezones),
edition = editions,
length = as.numeric(lengths),
body = as.character(bodies),
stringsAsFactors = FALSE))
}
cleanNews <- function(text){
text <- gsub("\\r\\n|\\r|\\n|\\t", " ", text)
text <- gsub("[[:cntrl:]]", " ", text, perl = TRUE)
text <- gsub("\\s\\s+", " ", text)
text <- gsub("^\\s+|\\s+$", "", text)
return(text)
}
cleanReuters <- function(text){
text <- gsub('\\(Writing by.{1,500}\\)$', '', text, ignore.case = TRUE, perl = TRUE)
text <- gsub('\\(Editing by.{1,500}\\)$', '', text, ignore.case = TRUE, perl = TRUE)
text <- gsub('\\(Reporting by.{1,500}\\)$', '', text, ignore.case = TRUE, perl = TRUE)
text <- gsub('\\(Additional reporting by.{1,500}\\)$', '', text, ignore.case = TRUE, perl = TRUE)
text <- gsub('Reuters', '', text, ignore.case = TRUE)
text <- cleanNews(text)
return(text)
}
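#Example with made-up names: cleanReuters('Assad spoke. (Reporting by John Doe; Editing by Jane Roe)')
#returns 'Assad spoke.'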

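To import a whole folder of saved pages, source the script and call readNewsDir() with the directory and the service type ('nexis' or 'factiva'). Below is a minimal sketch; the file name readNews.R and the directories are hypothetical:

source('readNews.R') #this script
nexis <- readNewsDir('~/Documents/nexis', 'nexis')
factiva <- readNewsDir('~/Documents/factiva', 'factiva')
nrow(nexis) #number of imported articles
head(nexis$head) #headlines of the first few articles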