News data importer for R

This April, I created an R script to import files downloaded from Nexis and Factiva. Factiva does not offer a file-download function, but its search-result pages can be saved as HTML files and imported into R with this script.
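Once the functions below are sourced, a whole folder of saved pages can be read in a single call. The directory paths here are only examples:

nexis <- readNewsDir('~/Documents/news/nexis', type = 'nexis')
factiva <- readNewsDir('~/Documents/news/factiva', type = 'factiva')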

library(XML) # may require libxml2-dev, installable via apt-get on Debian/Ubuntu

readNewsDir <- function(dir, type){
  files <- list.files(dir, full.names = TRUE, recursive = TRUE)
  df <- data.frame() # columns are created by the first rbind() below
  for(file in files){
    #print(file)
    if(grepl('\\.html$|\\.htm$|\\.xhtml$', file, ignore.case = TRUE)){
      #print(file)
      if(type == 'nexis'){
        df <- rbind(df, readNexisHTML(file))
      }else if(type == 'factiva'){
        df <- rbind(df, readFactivaHTML(file))
      }
    }
  }
  return(df)
}
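One note on this loop: rbind() copies the accumulated data frame on every iteration, so reading a large archive gets slow. A minimal alternative sketch that collects the per-file results in a list and binds them once (readNewsDir2 is an invented name, not part of the script):

readNewsDir2 <- function(dir, type){
  files <- list.files(dir, full.names = TRUE, recursive = TRUE)
  files <- files[grepl('\\.html$|\\.htm$|\\.xhtml$', files, ignore.case = TRUE)]
  reader <- if(type == 'nexis') readNexisHTML else readFactivaHTML
  do.call(rbind, lapply(files, reader)) # one rbind instead of one per file
}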

#readNexisHTML('/home/kohei/Documents/Syria report/nexis.html')
readNexisHTML <- function(file, sep = ' | '){
    
  heads <- c()
  bodies <- c()
  pubs <- c()
  datetimes <- c()
  timezones <- c()
  editions <- c()
  lengths <- c()
  
  #Convert format
  cat('Reading', file, '\n')
  f <- file(file, encoding = "UTF-8")
  lines <- readLines(f)
  close(f)
  docnum <- 0
  for(i in 1:length(lines)){
    lines[i] <- gsub('<!-- ', '', lines[i]) # strip the HTML comment markers that hide document content
    lines[i] <- gsub(' -->', '', lines[i])
  }
  html <- paste(lines, collapse='\n')

  #Write converted HTML to a file for debugging (path is machine-specific; disabled by default)
  #cat(html, file="/home/kohei/Documents/R/converted.html", sep="", append=FALSE)
  
  #Extract elements
  doc <- htmlParse(html , encoding="UTF-8")
  ids <- paste0('doc_id_', 1:500) # assume at most 500 documents per file
  for(id in ids){
    query <- paste0('/html/body//doc[@id="', id , '"]')
    nodes <- getNodeSet(doc, query)
    if(length(nodes)){
      node <- nodes[[1]]
    }else{
      next # cannot break here: IDs may be missing because of errors in the HTML
    }

    pub <- ''
    #pubns <- getNodeSet(node, './/div[@class="c10"]/p[@class="c11"]/span[@class="c12"]')
    pubns <- getNodeSet(node, './/div[@class="c0"]/p[@class="c1"]/span[@class="c2"]')
    if(length(pubns)){
      i <- 1
      for(pubn in pubns){
        if(grepl('DOCUMENTS$', xmlValue(pubn))){
          docnum <- i
        }
        if(i == docnum + 1){
          pub <- xmlValue(pubn)
          pub <- cleanNews(pub)
        }
        i <- i + 1
      }
    }
    if(nchar(pub) == 0) pub <- NA
    #print(pub)
    
    date <- ''
    dateline <- '' # guard: keeps strsplit() below from hitting an undefined variable when no dateline is found
    #datelinens <- getNodeSet(node, './/div[@class="c13" or @class="c14"]/p[@class="c11"]')
    datelinens <- getNodeSet(node, './/div[@class="c3" or @class="c4"]/p[@class="c1"]')
    if(length(datelinens)) dateline <- xmlValue(datelinens[[1]])
    #print(datelinens)
    dates <- strsplit(sub(',', '', dateline, fixed = TRUE), ' ', fixed = TRUE)[[1]]
    date <- paste(dates[1], dates[2], dates[3], sep=' ')
    if(nchar(date) == 0) date <- NA
    
    edition <- ''
    if(length(dates) >= 5){
      edition <- paste(dates[5:length(dates)], collapse = ' ')
      edition <- cleanNews(edition)
    }
    time <- ''
    timezone <- ''
    if(grepl('^[0-9]{1,2}:[0-9]{1,2} (AM|PM)', edition)){
      timeline <- edition
      timeline.parts <- strsplit(timeline, ' ')[[1]]
      #print(edition)
      #print(timeline.parts)
      time <- paste(timeline.parts[1], timeline.parts[2])
      if(length(timeline.parts) > 2){
        timezone <- paste(timeline.parts[3:length(timeline.parts)], collapse = ' ')
      }
      edition <- ''
      #print(time)
    }
    if(nchar(time) == 0) time <- '12:00 AM'
    if(nchar(edition) == 0) edition <- ''
    if(nchar(timezone) == 0) timezone <- ''
        
    body <- ''
    #bodyns <- getNodeSet(node, './/div[@class="c0"]/p[@class="c17" or @class="c18"]/span[@class="c12" or @class="c14"]')
    #bodyns <- getNodeSet(node, './/div[@class="c5"]/p[@class="c15" or @class="c9" or @class="c9"]/span[@class="c2" or @class="c3"]')
    bodyns1 <- getNodeSet(node, './/div[(@class="c5") and count(.//p) > 1]//p')
    bodyns2 <- getNodeSet(node, './/div[(@class="c4") and count(.//p) > 1]//p')
    if(length(bodyns1) > length(bodyns2)){
      bodyns <- bodyns1
    }else{
      bodyns <- bodyns2
    }
    #if(is.null(bodyns)) print(node)
    if(length(bodyns)){
      paras <- c()
      for(bodyn in bodyns){
        para <- xmlValue(bodyn)
        para <- cleanNews(para)
        paras <- append(paras, para)
      }
      body <- paste(paras, sep = '', collapse=sep)
      if(nchar(body) == 0) body <- NA
    } 
    
    by <- ''
    code <- ''
    head <- ''
    length <- 0
    #attribns <- getNodeSet(node, './/div[@class="c0"]/p[@class="c5" and .//@class="c12"]')
    attribns1 <- getNodeSet(node, './/div[@class="c5"]/p[@class="c6"]')
    attribns2 <- getNodeSet(node, './/div[@class="c4"]/p[@class="c5"]')
    if(length(attribns1) > length(attribns2)){
      attribns <- attribns1
    }else{
      attribns <- attribns2
    }
    if(length(attribns)){
      for(attribn in attribns){
        attrib <- xmlValue(attribn)
        attrib <- gsub("^\\s+|\\s+$", "", attrib)
        #print(attrib)
        if(grepl('^BYLINE: ', attrib)){
          by <- gsub('BYLINE: ', '', attrib)
        }
        if(grepl('^LENGTH: ', attrib)){
          length <- as.numeric(gsub('[^0-9]', '', attrib))
        }
        if(grepl('^JOURNAL-CODE: ', attrib)){
          code <- gsub('JOURNAL-CODE: ', '', attrib)
        }
        if(!grepl('^[A-Z\\-]{6,20}: ', attrib)){ #exclude "BYLINE:", "SECTION:", "PUBLICATION-TYPE:" etc.
          head <- cleanNews(attrib)
        }
      }
    }
    if(nchar(by) == 0) by <- NA
    if(nchar(code) == 0) code <- NA
    
    #print(paste(date, time, sep=' '))
    datetime <- format(strptime(paste(date, time), format='%B %d %Y %I:%M %p'), '%Y-%m-%d %H:%M:%S UTC')
    #print(paste(date, time))
    
    heads <- append(heads, head)
    bodies <- append(bodies, body)
    pubs <- append(pubs, pub)
    datetimes <- append(datetimes, datetime)
    
    timezones <- append(timezones, timezone)
    editions <- append(editions, edition)
    lengths <- append(lengths, length)
  }
  #print(datetimes)
  return(data.frame(head = as.character(heads), 
                    pub = as.character(pubs), 
                    datetime = as.POSIXct(datetimes, tz = 'UTC'), 
                    timezone = as.character(timezones), 
                    edition = as.factor(editions), 
                    length = as.numeric(lengths),
                    body = as.character(bodies), 
                    stringsAsFactors = FALSE))
}
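A caveat on the datetime step above: strptime()'s %B matches month names in the current locale, so the English datelines in Nexis files parse to NA on systems with a non-English locale. Switching the time locale to C before parsing avoids this:

Sys.setlocale('LC_TIME', 'C') # make %B match English month names
strptime('April 7 2014 9:30 AM', format = '%B %d %Y %I:%M %p')
# "2014-04-07 09:30:00"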

#readFactivaHTML('/home/kohei/Documents/Syria report/factiva.html')
readFactivaHTML <- function(file, sep = ' | '){
  
  heads <- c()
  bodies <- c()
  pubs <- c()
  datetimes <- c()
  timezones <- c()
  editions <- c()
  lengths <- c()
  
  cat('Reading', file, '\n')
  
  library(XML)
  doc <- htmlParse(file, encoding="UTF-8")
  nodes <- getNodeSet(doc, '/html/body//div[contains(@class, "article")]')
  
  for(node in nodes){
    #print(node)
    
    head <- ''
    headns <- getNodeSet(node, './div[@id="hd"]')
    if(length(headns)){
      head <- xmlValue(headns[[1]])
      head <- cleanNews(head)
    }
    if(nchar(head) == 0) head <- NA
    
    body <- ''
    bodyns <- getNodeSet(node, './p[contains(@class, "articleParagraph") and not(.//pre)]')
    if(length(bodyns)){
      paras <- c()
      for(bodyn in bodyns){
        para <- xmlValue(bodyn)
        para <- cleanNews(para)
        paras <- append(paras, para)
      }
      body <- paste(paras, sep = '', collapse = sep)
    }
    if(nchar(body) == 0) body <- NA
    
    pub <- ''
    length <- 0
    date <- ''
    time <- ''
    pos <- 0
    posTime <- 0
    attribns <- getNodeSet(node, './div[not(@id) and not(@class)]')
    if(length(attribns)){
      for(attribn in attribns){
        pos <- pos + 1
        #print(paste(posTime, pos))
        attrib <- xmlValue(attribn)
        attrib <- gsub("^\\s+|\\s+$", "", attrib)
        #print(attrib)
        if(grepl(' words$', attrib)){
          length <- as.numeric(gsub(' words$', '', attrib))
        }else if(grepl('[0-9]{1,2} [a-zA-Z]+ [0-9]{4}', attrib)){
          date <- attrib
          #date <- strsplit(attrib, ' ', fixed = TRUE)[[1]]
        }else if(grepl('[0-9]{2}:[0-9]{2}', attrib)){
          time <- attrib
          posTime <- pos
        }else if(pos == (posTime + 1)){ #publication name must be next to the time
          pub <- attrib
        }
      }
    }
    if(nchar(pub) == 0) pub <- NA
    if(nchar(date) == 0) date <- NA
    if(nchar(time) == 0) time <- NA
    
    #print(paste(pub, date[1], date[2], date[3], time, head, length))
    #print(paste(date, time, sep=' '))
    datetime <- format(strptime(paste(date, ' ', time, sep=''), format='%d %B %Y %H:%M'), '%Y-%m-%d %H:%M:%S UTC')
    #print(paste(pub, datetime, head, length))
    #print(body)
    
    heads <- append(heads, head)
    bodies <- append(bodies, body)
    pubs <- append(pubs, pub)
    datetimes <- append(datetimes, datetime)
    timezones <- append(timezones, '')
    editions <- append(editions, '')
    lengths <- append(lengths, length)
  }
  return(data.frame(head = as.character(heads), 
                    pub = as.character(pubs), 
                    datetime = as.POSIXct(datetimes, tz = 'UTC'),
                    timezone = as.character(timezones), 
                    edition = editions,
                    length = as.numeric(lengths), 
                    body = as.character(bodies), 
                    stringsAsFactors = FALSE))  
}
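Factiva datelines are day-month-year with a 24-hour clock, hence the '%d %B %Y %H:%M' format above. Note that an article without a time gets an NA datetime here, whereas the Nexis reader falls back to 12:00 AM:

strptime('7 April 2014 09:30', format = '%d %B %Y %H:%M')
# "2014-04-07 09:30:00"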

cleanNews <- function(text){
  text <- gsub("\\r\\n|\\r|\\n|\\t", " ", text)
  text <- gsub("[[:cntrl:]]", " ", text, perl = TRUE)
  text <- gsub("\\s\\s+", " ", text)
  text <- gsub("^\\s+|\\s+$", "", text)
  return(text)
}
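cleanNews() replaces line breaks, tabs, and other control characters with spaces, collapses repeated whitespace, and trims both ends:

cleanNews('  BREAKING:\r\n  markets \t fall  ')
# "BREAKING: markets fall"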

cleanReuters <- function(text){
  text <- gsub('\\(Writing by.{1,500}\\)$', '', text, ignore.case = TRUE, perl = TRUE)
  text <- gsub('\\(Editing by.{1,500}\\)$', '', text, ignore.case = TRUE, perl = TRUE)
  text <- gsub('\\(Reporting by.{1,500}\\)$', '', text, ignore.case = TRUE, perl = TRUE)
  text <- gsub('\\(Additional reporting by.{1,500}\\)$', '', text, ignore.case = TRUE, perl = TRUE)
  text <- gsub('Reuters', '', text, ignore.case = TRUE)
  text <- cleanNews(text)
  return(text)
}
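cleanReuters() strips the trailing reporting, writing, and editing credits that Reuters appends to wire copy, then removes the agency name itself before the usual cleanup. The credit line below is invented for illustration:

cleanReuters('Fighting continued in Aleppo. (Reporting by John Doe; Editing by Jane Roe)')
# "Fighting continued in Aleppo."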