News data importer for R

This April, I created an R script to import files downloaded from Nexis and Factiva. Factiva does not offer a file-download function, but its search-result pages can be saved as HTML files and imported into R with this script.
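Once the functions below are sourced, a whole folder of saved pages can be read in a single call. The directory paths here are only examples:

nexis <- readNewsDir('~/Documents/news/nexis', type = 'nexis')
factiva <- readNewsDir('~/Documents/news/factiva', type = 'factiva')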

library(XML) # may require libxml2-dev, installable via apt-get on Debian/Ubuntu

readNewsDir <- function(dir, type){
  files <- list.files(dir, full.names = TRUE, recursive = TRUE)
  df <- data.frame() # columns are created by the first rbind() below
  for(file in files){
    #print(file)
    if(grepl('\\.html$|\\.htm$|\\.xhtml$', file, ignore.case = TRUE)){
      #print(file)
      if(type == 'nexis'){
        df <- rbind(df, readNexisHTML(file))
      }else if(type == 'factiva'){
        df <- rbind(df, readFactivaHTML(file))
      }
    }
  }
  return(df)
}
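One note on this loop: rbind() copies the accumulated data frame on every iteration, so reading a large archive gets slow. A minimal alternative sketch that collects the per-file results in a list and binds them once (readNewsDir2 is an invented name, not part of the script):

readNewsDir2 <- function(dir, type){
  files <- list.files(dir, full.names = TRUE, recursive = TRUE)
  files <- files[grepl('\\.html$|\\.htm$|\\.xhtml$', files, ignore.case = TRUE)]
  reader <- if(type == 'nexis') readNexisHTML else readFactivaHTML
  do.call(rbind, lapply(files, reader)) # one rbind instead of one per file
}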

#readNexisHTML('/home/kohei/Documents/Syria report/nexis.html')
readNexisHTML <- function(file, sep = ' | '){
    
  heads <- c()
  bodies <- c()
  pubs <- c()
  datetimes <- c()
  timezones <- c()
  editions <- c()
  lengths <- c()
  
  #Convert format
  cat('Reading', file, '\n')
  f <- file(file, encoding = "UTF-8")
  lines <- readLines(f)
  close(f)
  docnum <- 0
  for(i in 1:length(lines)){
    lines[i] <- gsub('<!-- ', '', lines[i]) # strip the HTML comment markers that hide document content
    lines[i] <- gsub(' -->', '', lines[i])
  }
  html <- paste(lines, collapse='\n')

  #Write converted HTML to a file for debugging (path is machine-specific; disabled by default)
  #cat(html, file="/home/kohei/Documents/R/converted.html", sep="", append=FALSE)
  
  #Extract elements
  doc <- htmlParse(html , encoding="UTF-8")
  ids <- paste0('doc_id_', 1:500) # assume at most 500 documents per file
  for(id in ids){
    query <- paste0('/html/body//doc[@id="', id , '"]')
    nodes <- getNodeSet(doc, query)
    if(length(nodes)){
      node <- nodes[[1]]
    }else{
      next # cannot break here: IDs may be missing because of errors in the HTML
    }

    pub <- ''
    #pubns <- getNodeSet(node, './/div[@class="c10"]/p[@class="c11"]/span[@class="c12"]')
    pubns <- getNodeSet(node, './/div[@class="c0"]/p[@class="c1"]/span[@class="c2"]')
    if(length(pubns)){
      i <- 1
      for(pubn in pubns){
        if(grepl('DOCUMENTS$', xmlValue(pubn))){
          docnum <- i
        }
        if(i == docnum + 1){
          pub <- xmlValue(pubn)
          pub <- cleanNews(pub)
        }
        i <- i + 1
      }
    }
    if(nchar(pub) == 0) pub <- NA
    #print(pub)
    
    date <- ''
    dateline <- '' # guard: keeps strsplit() below from hitting an undefined variable when no dateline is found
    #datelinens <- getNodeSet(node, './/div[@class="c13" or @class="c14"]/p[@class="c11"]')
    datelinens <- getNodeSet(node, './/div[@class="c3" or @class="c4"]/p[@class="c1"]')
    if(length(datelinens)) dateline <- xmlValue(datelinens[[1]])
    #print(datelinens)
    dates <- strsplit(sub(',', '', dateline, fixed = TRUE), ' ', fixed = TRUE)[[1]]
    date <- paste(dates[1], dates[2], dates[3], sep=' ')
    if(nchar(date) == 0) date <- NA
    
    edition <- ''
    if(length(dates) >= 5){
      edition <- paste(dates[5:length(dates)], collapse = ' ')
      edition <- cleanNews(edition)
    }
    time <- ''
    timezone <- ''
    if(grepl('^[0-9]{1,2}:[0-9]{1,2} (AM|PM)', edition)){
      timeline <- edition
      timeline.parts <- strsplit(timeline, ' ')[[1]]
      #print(edition)
      #print(timeline.parts)
      time <- paste(timeline.parts[1], timeline.parts[2])
      if(length(timeline.parts) > 2){
        timezone <- paste(timeline.parts[3:length(timeline.parts)], collapse = ' ')
      }
      edition <- ''
      #print(time)
    }
    if(nchar(time) == 0) time <- '12:00 AM'
    if(nchar(edition) == 0) edition <- ''
    if(nchar(timezone) == 0) timezone <- ''
        
    body <- ''
    #bodyns <- getNodeSet(node, './/div[@class="c0"]/p[@class="c17" or @class="c18"]/span[@class="c12" or @class="c14"]')
    #bodyns <- getNodeSet(node, './/div[@class="c5"]/p[@class="c15" or @class="c9" or @class="c9"]/span[@class="c2" or @class="c3"]')
    bodyns1 <- getNodeSet(node, './/div[(@class="c5") and count(.//p) > 1]//p')
    bodyns2 <- getNodeSet(node, './/div[(@class="c4") and count(.//p) > 1]//p')
    if(length(bodyns1) > length(bodyns2)){
      bodyns <- bodyns1
    }else{
      bodyns <- bodyns2
    }
    #if(is.null(bodyns)) print(node)
    if(length(bodyns)){
      paras <- c()
      for(bodyn in bodyns){
        para <- xmlValue(bodyn)
        para <- cleanNews(para)
        paras <- append(paras, para)
      }
      body <- paste(paras, sep = '', collapse=sep)
      if(nchar(body) == 0) body <- NA
    } 
    
    by <- ''
    code <- ''
    head <- ''
    length <- 0
    #attribns <- getNodeSet(node, './/div[@class="c0"]/p[@class="c5" and .//@class="c12"]')
    attribns1 <- getNodeSet(node, './/div[@class="c5"]/p[@class="c6"]')
    attribns2 <- getNodeSet(node, './/div[@class="c4"]/p[@class="c5"]')
    if(length(attribns1) > length(attribns2)){
      attribns <- attribns1
    }else{
      attribns <- attribns2
    }
    if(length(attribns)){
      for(attribn in attribns){
        attrib <- xmlValue(attribn)
        attrib <- gsub("^\\s+|\\s+$", "", attrib)
        #print(attrib)
        if(grepl('^BYLINE: ', attrib)){
          by <- gsub('BYLINE: ', '', attrib)
        }
        if(grepl('^LENGTH: ', attrib)){
          length <- as.numeric(gsub('[^0-9]', '', attrib))
        }
        if(grepl('^JOURNAL-CODE: ', attrib)){
          code <- gsub('JOURNAL-CODE: ', '', attrib)
        }
        if(!grepl('^[A-Z\\-]{6,20}: ', attrib)){ #exclude "BYLINE:", "SECTION:", "PUBLICATION-TYPE:" etc.
          head <- cleanNews(attrib)
        }
      }
    }
    if(nchar(by) == 0) by <- NA
    if(nchar(code) == 0) code <- NA
    
    #print(paste(date, time, sep=' '))
    datetime <- format(strptime(paste(date, time), format='%B %d %Y %I:%M %p'), '%Y-%m-%d %H:%M:%S UTC')
    #print(paste(date, time))
    
    heads <- append(heads, head)
    bodies <- append(bodies, body)
    pubs <- append(pubs, pub)
    datetimes <- append(datetimes, datetime)
    
    timezones <- append(timezones, timezone)
    editions <- append(editions, edition)
    lengths <- append(lengths, length)
  }
  #print(datetimes)
  return(data.frame(head = as.character(heads), 
                    pub = as.character(pubs), 
                    datetime = as.POSIXct(datetimes, tz = 'UTC'), 
                    timezone = as.character(timezones), 
                    edition = as.factor(editions), 
                    length = as.numeric(lengths),
                    body = as.character(bodies), 
                    stringsAsFactors = FALSE))
}
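A caveat on the datetime step above: strptime()'s %B matches month names in the current locale, so the English datelines in Nexis files parse to NA on systems with a non-English locale. Switching the time locale to C before parsing avoids this:

Sys.setlocale('LC_TIME', 'C') # make %B match English month names
strptime('April 7 2014 9:30 AM', format = '%B %d %Y %I:%M %p')
# "2014-04-07 09:30:00"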

#readFactivaHTML('/home/kohei/Documents/Syria report/factiva.html')
readFactivaHTML <- function(file, sep = ' | '){
  
  heads <- c()
  bodies <- c()
  pubs <- c()
  datetimes <- c()
  timezones <- c()
  editions <- c()
  lengths <- c()
  
  cat('Reading', file, '\n')
  
  library(XML)
  doc <- htmlParse(file, encoding="UTF-8")
  nodes <- getNodeSet(doc, '/html/body//div[contains(@class, "article")]')
  
  for(node in nodes){
    #print(node)
    
    head <- ''
    headns <- getNodeSet(node, './div[@id="hd"]')
    if(length(headns)){
      head <- xmlValue(headns[[1]])
      head <- cleanNews(head)
    }
    if(nchar(head) == 0) head <- NA
    
    body <- ''
    bodyns <- getNodeSet(node, './p[contains(@class, "articleParagraph") and not(.//pre)]')
    if(length(bodyns)){
      paras <- c()
      for(bodyn in bodyns){
        para <- xmlValue(bodyn)
        para <- cleanNews(para)
        paras <- append(paras, para)
      }
      body <- paste(paras, sep = '', collapse = sep)
    }
    if(nchar(body) == 0) body <- NA
    
    pub <- ''
    length <- 0
    date <- ''
    time <- ''
    pos <- 0
    posTime <- 0
    attribns <- getNodeSet(node, './div[not(@id) and not(@class)]')
    if(length(attribns)){
      for(attribn in attribns){
        pos <- pos + 1
        #print(paste(posTime, pos))
        attrib <- xmlValue(attribn)
        attrib <- gsub("^\\s+|\\s+$", "", attrib)
        #print(attrib)
        if(grepl(' words$', attrib)){
          length <- as.numeric(gsub(' words$', '', attrib))
        }else if(grepl('[0-9]{1,2} [a-zA-Z]+ [0-9]{4}', attrib)){
          date <- attrib
          #date <- strsplit(attrib, ' ', fixed = TRUE)[[1]]
        }else if(grepl('[0-9]{2}:[0-9]{2}', attrib)){
          time <- attrib
          posTime <- pos
        }else if(pos == (posTime + 1)){ #publication name must be next to the time
          pub <- attrib
        }
      }
    }
    if(nchar(pub) == 0) pub <- NA
    if(nchar(date) == 0) date <- NA
    if(nchar(time) == 0) time <- NA
    
    #print(paste(pub, date[1], date[2], date[3], time, head, length))
    #print(paste(date, time, sep=' '))
    datetime <- format(strptime(paste(date, ' ', time, sep=''), format='%d %B %Y %H:%M'), '%Y-%m-%d %H:%M:%S UTC')
    #print(paste(pub, datetime, head, length))
    #print(body)
    
    heads <- append(heads, head)
    bodies <- append(bodies, body)
    pubs <- append(pubs, pub)
    datetimes <- append(datetimes, datetime)
    timezones <- append(timezones, '')
    editions <- append(editions, '')
    lengths <- append(lengths, length)
  }
  return(data.frame(head = as.character(heads), 
                    pub = as.character(pubs), 
                    datetime = as.POSIXct(datetimes, tz = 'UTC'),
                    timezone = as.character(timezones), 
                    edition = editions,
                    length = as.numeric(lengths), 
                    body = as.character(bodies), 
                    stringsAsFactors = FALSE))  
}
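Factiva datelines are day-month-year with a 24-hour clock, hence the '%d %B %Y %H:%M' format above. Note that an article without a time gets an NA datetime here, whereas the Nexis reader falls back to 12:00 AM:

strptime('7 April 2014 09:30', format = '%d %B %Y %H:%M')
# "2014-04-07 09:30:00"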

cleanNews <- function(text){
  text <- gsub("\\r\\n|\\r|\\n|\\t", " ", text)
  text <- gsub("[[:cntrl:]]", " ", text, perl = TRUE)
  text <- gsub("\\s\\s+", " ", text)
  text <- gsub("^\\s+|\\s+$", "", text)
  return(text)
}
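cleanNews() replaces line breaks, tabs, and other control characters with spaces, collapses repeated whitespace, and trims both ends:

cleanNews('  BREAKING:\r\n  markets \t fall  ')
# "BREAKING: markets fall"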

cleanReuters <- function(text){
  text <- gsub('\\(Writing by.{1,500}\\)$', '', text, ignore.case = TRUE, perl = TRUE)
  text <- gsub('\\(Editing by.{1,500}\\)$', '', text, ignore.case = TRUE, perl = TRUE)
  text <- gsub('\\(Reporting by.{1,500}\\)$', '', text, ignore.case = TRUE, perl = TRUE)
  text <- gsub('\\(Additional reporting by.{1,500}\\)$', '', text, ignore.case = TRUE, perl = TRUE)
  text <- gsub('Reuters', '', text, ignore.case = TRUE)
  text <- cleanNews(text)
  return(text)
}
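cleanReuters() strips the trailing reporting, writing, and editing credits that Reuters appends to wire copy, then removes the agency name itself before the usual cleanup. The credit line below is invented for illustration:

cleanReuters('Fighting continued in Aleppo. (Reporting by John Doe; Editing by Jane Roe)')
# "Fighting continued in Aleppo."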