I believe that the sentence is the optimal unit of sentiment analysis, but splitting whole news articles into sentences is often tricky because news contains a lot of quotations. If we simply chop up texts based on punctuation, quoted passages get split across different sentences. This code is meant to avoid such problems as much as possible. It was originally written for Russian-language texts but should work with English now.
library(stringi)

#' Split article bodies into sentence-like units and write them to a file
#'
#' Each body is passed through insertSeparator(), which marks unit boundaries
#' with '|'. The body is then split on '|', units that do not match `pattern`
#' are dropped, whitespace is normalised, and the result is written as a
#' tab-separated table.
#'
#' @param df_items Data frame with 'tid' (item id) and 'body' (text) columns.
#' @param len_min Minimum number of tokens a unit must contain before a split
#'   is allowed; forwarded to insertSeparator().
#' @param quote Quotation mark used to detect quoted passages; forwarded to
#'   insertSeparator().
#' @param file Path of the output file.
#' @param pattern Regular expression (ICU syntax) a unit must match to be
#'   kept. The default keeps units with at least one ASCII letter or digit,
#'   which is language dependent; e.g. '[\\p{L}\\p{N}]' accepts any script.
#' @param verbose If TRUE, print the row index of each item as it is
#'   processed.
#' @return Invisibly, the data frame of units with columns 'tid', 'uid'
#'   (running index of the unit within its item) and 'text'.
unitize <- function(df_items, len_min = 10, quote = '"',
                    file = 'item_units.csv', pattern = '[a-zA-Z0-9]',
                    verbose = TRUE) {
  stopifnot(
    is.data.frame(df_items),
    all(c('tid', 'body') %in% names(df_items))
  )

  # Collect one data frame per item and bind once at the end; rbind() inside
  # the loop copies the accumulated result on every iteration.
  pieces <- vector('list', nrow(df_items))

  # seq_len() yields an empty sequence for a zero-row input (1:0 would not).
  for (i in seq_len(nrow(df_items))) {
    if (verbose) {
      print(i)
    }

    body <- insertSeparator(df_items$body[i], len_min, quote)
    if (!nzchar(body)) {
      next
    }

    units <- unlist(strsplit(body, '|', fixed = TRUE))
    units <- units[stri_detect_regex(units, pattern)]
    len <- length(units)
    if (len == 0L) {
      # Nothing survived the filter (e.g. a body made only of punctuation).
      next
    }

    units <- stri_replace_all_regex(units, '\\s\\s+', ' ') # Remove duplicated spaces
    units <- stri_trim_both(units)

    pieces[[i]] <- data.frame(
      tid = rep(df_items$tid[i], len),
      uid = seq_len(len),
      text = units,
      stringsAsFactors = FALSE
    )
  }

  pieces <- Filter(Negate(is.null), pieces)
  if (length(pieces) > 0L) {
    df_units <- do.call(rbind, pieces)
  } else {
    # Keep the column layout even when there is nothing to write.
    df_units <- data.frame(
      tid = df_items$tid[0],
      uid = integer(0),
      text = character(0),
      stringsAsFactors = FALSE
    )
  }

  write.table(df_units, file = file, sep = "\t", quote = TRUE, qmethod = "double")
  invisible(df_units)
}

#' Insert '|' split marks after sentence-final tokens
#'
#' The text is tokenised on single spaces. A '|' is inserted after a token
#' ending in '.', '!' or '?' provided that at least `len_min` tokens have been
#' emitted since the last split and the token is not inside an open quotation
#' or an open round bracket. Tokens that already contain '|' (paragraph
#' separators in the input) reset the quote/bracket state; they are kept when
#' the current unit has reached `len_min` tokens and dropped otherwise, so
#' short paragraphs are merged into the following text.
#'
#' @param text A single string. NA or zero-length input yields ''.
#' @param len_min Minimum number of tokens in a unit before a split is allowed.
#' @param quote Quotation mark character used to track quoted passages.
#' @return A single string with ' | ' marking the unit boundaries.
insertSeparator <- function(text, len_min = 10, quote = '"') {
  if (length(text) > 1L) {
    stop("insertSeparator() expects a single string", call. = FALSE)
  }
  if (length(text) == 0L || is.na(text)) {
    return('')
  }
  text <- as.character(text)

  # Remove wrong paragraph separator (one not preceded by sentence-final
  # punctuation).
  text <- stri_replace_all_regex(text, '([^.!?]) \\| ', '$1 ')
  tokens <- stri_split_fixed(text, ' ')[[1]]

  # Per-token properties, computed once for the whole vector.
  has_sep <- stri_detect_fixed(tokens, '|')
  one_quote <- stri_count_fixed(tokens, quote) == 1 # Excludes one-word quotations
  has_open <- stri_detect_fixed(tokens, '(')
  has_close <- stri_detect_fixed(tokens, ')')
  ends_sentence <- stri_detect_regex(tokens, '([.!?])$')

  # Every token emits at most itself plus one split mark.
  out <- character(2L * length(tokens))
  n_out <- 0L

  in_quote <- FALSE
  in_bracket <- FALSE
  len <- 0

  for (j in seq_along(tokens)) {
    # Reset flags at a paragraph separator.
    if (has_sep[j]) {
      in_quote <- FALSE
      in_bracket <- FALSE
    }

    # A token with exactly one quote mark opens or closes a quotation.
    in_quote <- xor(in_quote, one_quote[j])
    # A token with only '(' opens a bracket, one with only ')' closes it;
    # a token containing both (one-word bracket) leaves the state unchanged.
    if (has_open[j] != has_close[j]) {
      in_bracket <- has_open[j]
    }

    if (len < len_min) {
      # Unit still too short: swallow paragraph separators, keep the rest.
      if (!has_sep[j]) {
        n_out <- n_out + 1L
        out[n_out] <- tokens[j]
        len <- len + 1
      }
    } else if (has_sep[j]) {
      n_out <- n_out + 1L
      out[n_out] <- tokens[j]
      len <- 0
    } else if (!in_quote && !in_bracket && ends_sentence[j]) {
      out[n_out + 1L] <- tokens[j]
      out[n_out + 2L] <- '|' # Insert split mark
      n_out <- n_out + 2L
      len <- 0
    } else {
      n_out <- n_out + 1L
      out[n_out] <- tokens[j]
      len <- len + 1
    }
  }

  paste(out[seq_len(n_out)], collapse = ' ')
}