Nexis news importer updated

KoheiJune 18, 2015December 22, 2019

I posted the code Nexis importer last year, but it tuned out that the HTML format of the database service is less consistent than I though, so I changed the logic. The new version is dependent less on the structure of the HTML files, but more on the format of the content.

library(XML) #might need libxml2-dev via apt-get command

readNewsDir <- function(dir,...){
  names <- list.files(dir, full.names = TRUE, recursive = TRUE)
  df <- data.frame(head = c(), body = c(), pub = c(), datetime = c(), edition = c(), length = c(), stringsAsFactors = FALSE)
  for(name in names){
    #print(file)
    if(grepl('\\.html$|\\.htm$|\\.xhtml$', name, ignore.case = TRUE)){
      #print(file)
      df <- rbind(df, readNexisHTML(name, ...))
    }
  }
  return(df)
}

#readNexisHTML('/home/kohei/Documents/Syria report/nexis.html')
readNexisHTML <- function(name, sep = ' '){
  
  heads <- c()
  bodies <- c()
  bys <- c()
  pubs <- c()
  datetimes <- c()
  editions <- c()
  lengths <- c()
  
  #Convert format
  cat('Reading', name, '\n')
  
  # HTML cleaning------------------------------------------------
  
  lines <- scan(name, what="character", sep='\n', quiet=TRUE, encoding = "UTF-8")
  docnum <- 0
  for(i in 1:length(lines)){
    #print(lines[i])
    lines[i] <- gsub('', '', lines[i])
    lines[i] <- gsub(' -->', '', lines[i])
  }
  lines[i+1] = '' # Fix EOF problem
  html <- paste(lines, collapse='\n')
  
  # Write to debug
  #cat(html, file="converted.html", sep="", append=FALSE)
  
  # Main process------------------------------------------------
  
  #Load as DOM object
  doc <- htmlParse(html , encoding="UTF-8")
  
  # Remove index
  indexns <- getNodeSet(doc, '/html/body//doc[.//table]')
  for(indexn in indexns){
    #print(xmlValue(indexn))
    removeNodes(indexn)
  }
  
  for(node in getNodeSet(doc, '/html/body//doc')){
    
    pub <- NA
    datetime <- NA
    head <- NA
    by <- NA
    edition <- NA
    section <- NA
    length <- NA
    
    i <- 1
    for(div in getNodeSet(node, './/div')){
      value <- cleanNews(xmlValue(div))
      #print(paste(i, value))
      
      if(i == 1 & grepl('\\d+ of \\d+ DOCUMENTS', value)){
        i <- 2
      }else if(i == 2){
        #print(paste('pub', value))
        pub <- value
        i <- 3
      }else if(i == 3 & grepl('^(January|February|March|April|May|June|July|August|September|October|November|December)', value)){
        dateline <- value
        #print(paste('date', value))
        match <- regexpr(paste0('(January|February|March|April|May|June|July|August|September|October|November|December)',
                     '[, ]+([0-9]{1,2})',
                     '[, ]+([0-9]{4})',
                     '([,; ]+(Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday))?',
                     '([, ]+(.+))?'), value, perl=TRUE)
        date <- c()
        for(j in 1:length(attr(match, "capture.start"))){
          from <- attr(match, "capture.start")[j]
          to <- attr(match, "capture.start")[j] + attr(match, "capture.length")[j]
          date <- c(date, substr(dateline, from, to))
        }
        month <- gsub('[^a-zA-Z]', '', date[1])
        day <- gsub('[^0-9]', '', date[2])
        year <- gsub('[^0-9]', '', date[3])
        
        datetime <- format(strptime(paste(month, day, year, '12:00 AM'), 
                                   format='%B %d %Y %I:%M %p'), '%Y-%m-%d %H:%M:%S UTC')
        
        if(length(date) == 7){
          edition <- cleanNews(date[7])
          #print(date)
          #print(dateline)
          #print(edition)
        }
        i <- 4
        
      }else if(i == 4 & !grepl('[A-Z]+:', value)){
        head <- value # Sometimes does not exists
        i <- 8
      }else if(i >= 4 & grepl('BYLINE:', value)){
        by <- sub('BYLINE: ', '', value)
        i <- 8
      }else if(i >= 4 & grepl('SECTION:', value)){
        section <- sub('SECTION: ', '', value)
        i <- 8
      }else if(i >= 4 & grepl('LENGTH:', value)){
        length <- strsplit(value, ' ')[[1]][2]
        i <- 8
      }else if(i >= 4 & grepl('[A-Z]+:', value)){
        i <- 8
      }else if(i == 8){
        paras <- c()
        for(p in getNodeSet(div, 'p')){ 
          paras <- c(paras, cleanNews(xmlValue(p)))
        }
        if(length(paras) > 0){
          body <- paste(paras, sep = '', collapse=sep)
        }
        break()
      }
    }
    if(!is.na(body)){
      heads <- c(heads, head)
      bodies <- c(bodies, body)
      bys <- c(bys, by)
      pubs <- c(pubs, pub)
      datetimes <- c(datetimes, datetime)
      editions <- c(editions, edition)
      lengths <- c(lengths, length)
    }
  }

  
  return(data.frame(head = as.character(heads), 
                    pub = as.character(pubs), 
                    datetime = as.POSIXct(datetimes, tz = 'UTC'),
                    by = as.factor(bys), 
                    edition = as.character(editions), 
                    length = as.numeric(lengths),
                    body = as.character(bodies), 
                    stringsAsFactors = FALSE))
}

cleanNews <- function(text){
  text <- gsub("\\r\\n|\\r|\\n|\\t", " ", text)
  text <- gsub("[[:cntrl:]]", " ", text, perl = TRUE)
  text <- gsub("\\s\\s+", " ", text)
  text <- gsub("^\\s+|\\s+$", "", text)
  return(text)
}

Kohei

Posts created 116

Leave a Reply Cancel reply

Develop efficient custom functions using quanteda v4.0 – Kohei Watanabe on New tokens object in quanteda v4.0April 16, 2024
[…] most important change in quanteda v4.0 is the creation of the external pointer-based tokens object, called tokens_xptr, that allows…
Setting fonts to plot Chinese polarity words in LSS – Kohei Watanabe on New paper on historical geopolitical threats to the USFebruary 19, 2024
[…] models are measuring to others. I am using this function myself in my project on construction of a geopolitical…
New paper on semantic temporality analysis – Kohei Watanabe on New paper on Latent Semantic ScalingAugust 29, 2023
[…] on temporal orientation of texts appeared in Research & Politics. In this study we applied latent semantic scaling (LSS)…
Kohei on Tutorial websites on LSS and Seeded LDAAugust 26, 2023
Please use base R's set.seed() before running the command.
Marli Fernandes on Tutorial websites on LSS and Seeded LDAAugust 24, 2023
I am currently using the seededlda package. I am using the following code: slda <- textmodel_seededlda(dfmt, dict, residual = 2)…

Back To Top