I believe that the sentence is the optimal unit for sentiment analysis, but splitting whole news articles into sentences is often tricky because news contains a lot of quotations. If we simply chop up texts at punctuation marks, quoted text gets split across different sentences. This code is meant to avoid such problems as much as possible. It was originally written for Russian-language texts but should now work with English.
library(stringi)
# Split the body of every item into sentence-like units and save them as a
# tab-separated file.
#
# df_items: data frame with 'tid' (item id) and 'body' (item text) variables.
# len_min:  minimum number of tokens a unit must have before it may be closed
#           (passed on to insertSeparator).
# quote:    quotation character; no split is made inside an open quotation
#           (passed on to insertSeparator).
# file:     path of the output file.
#
# Returns (invisibly) the data frame that was written, with one row per unit:
# 'tid' (id of the source item), 'uid' (1-based position of the unit within
# the item) and 'text' (the unit itself).
unitize <- function(df_items, len_min=10, quote='"', file='item_units.csv'){
  n_items <- nrow(df_items)
  pieces <- vector('list', n_items) # One slot per item; skipped items stay NULL
  for(i in seq_len(n_items)){ # seq_len() so that an empty input runs zero times
    print(i) # Progress indicator
    raw <- df_items$body[i]
    if(length(raw) != 1L || is.na(raw)) next # Missing body: nothing to split
    body <- insertSeparator(raw, len_min, quote)
    if(nchar(body) == 0) next
    units <- unlist(strsplit(body, '|', fixed=TRUE))
    units <- units[grepl('[a-zA-Z0-9]', units)] # Language dependent
    len <- length(units)
    # Without this guard 1:len would be c(1, 0) and data.frame() would fail
    # for bodies that contain no ASCII letters or digits at all
    if(len == 0) next
    units <- stri_replace_all_regex(units, '\\s\\s+', ' ') # Remove duplicated spaces
    units <- stri_trim_both(units)
    pieces[[i]] <- data.frame(tid=rep(df_items$tid[i], len), uid=seq_len(len), text=units, stringsAsFactors=FALSE)
  }
  # Bind once at the end instead of growing the data frame inside the loop
  pieces <- Filter(Negate(is.null), pieces)
  if(length(pieces) > 0){
    df_units <- do.call(rbind, pieces)
  }else{
    df_units <- data.frame()
  }
  write.table(df_units, file=file, sep="\t", quote=TRUE, qmethod="double")
  invisible(df_units)
}
# Insert ' | ' split marks after sentence-ending tokens of a single text.
#
# The text is tokenized on single spaces. A token that ends with '.', '!' or
# '?' closes the current unit only when at least len_min tokens have been
# collected since the last split and the token is neither inside an open
# quotation nor inside an open bracket. Tokens containing '|' are treated as
# paragraph separators: they reset the quotation/bracket state, and they are
# dropped when the current unit is still shorter than len_min.
#
# text:    a single character string.
# len_min: minimum number of tokens in a unit before a split is allowed.
# quote:   quotation character; a token containing it exactly once toggles
#          the "inside quotation" state.
#
# Returns the text as one string with '|' marking the unit boundaries
# ('' for a missing text).
insertSeparator <- function(text, len_min=10, quote='"'){
  # A missing text would otherwise fail in the if() conditions below
  if(length(text) == 1L && is.na(text)) return('')
  flag_quote <- FALSE
  flag_bracket <- FALSE
  text <- stri_replace_all_regex(text, '([^.!?]) \\| ', '$1 ') # Remove wrong paragraph separator
  tokens <- stri_split_fixed(text, ' ', simplify=TRUE)
  # Every token yields at most two output elements (itself and a split mark),
  # so the output can be allocated once instead of being grown with c()
  tokens2 <- character(2L * length(tokens))
  n_out <- 0L
  len <- 0
  for(token in tokens){
    has_sep <- stri_detect_fixed(token, '|')
    # Reset flags by the paragraph separator
    if(has_sep){
      flag_quote <- FALSE
      flag_bracket <- FALSE
    }
    # Set flags; a token holding both ends of a quotation or of a bracket
    # (one-word quotation/bracket) leaves the state unchanged
    flag_quote <- xor(flag_quote, stri_count_fixed(token, quote) == 1)
    has_open <- stri_detect_fixed(token, '(')
    has_close <- stri_detect_fixed(token, ')')
    if(has_open != has_close) flag_bracket <- has_open
    if(len < len_min){
      # Unit still too short: keep words, drop paragraph separators
      if(!has_sep){
        n_out <- n_out + 1L
        tokens2[n_out] <- token
        len <- len + 1
      }
    }else if(has_sep){
      n_out <- n_out + 1L
      tokens2[n_out] <- token
      len <- 0
    }else if(!flag_quote && !flag_bracket && stri_detect_regex(token, '([.!?])$')){
      tokens2[n_out + 1L] <- token
      tokens2[n_out + 2L] <- '|' # Insert split mark
      n_out <- n_out + 2L
      len <- 0
    }else{
      n_out <- n_out + 1L
      tokens2[n_out] <- token
      len <- len + 1
    }
  }
  paste(tokens2[seq_len(n_out)], collapse=' ')
}
