Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions R/sentiment_engines.R
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ compute_sentiment_lexicons <- function(x, tokens, dv, lexicons, how, do.sentence
RcppParallel::setThreadOptions(numThreads = threads)
if (is_only_character(x)) x <- quanteda::corpus(x)
if (do.sentence == TRUE) {
tokens <- tokenize_texts(quanteda::texts(x), tokens, type = "sentence")
tokens <- tokenize_texts(as.character(x), tokens, type = "sentence")
valenceType <- ifelse(is.null(lexicons[["valence"]]), 0,
ifelse(colnames(lexicons[["valence"]])[2] == "y", 1, 2))
s <- compute_sentiment_sentences(unlist(tokens, recursive = FALSE),
Expand All @@ -50,7 +50,7 @@ compute_sentiment_lexicons <- function(x, tokens, dv, lexicons, how, do.sentence
data.table::setcolorder(s, c("id", "sentence_id", "word_count"))
}
} else {
tokens <- tokenize_texts(quanteda::texts(x), tokens, type = "word")
tokens <- tokenize_texts(as.character(x), tokens, type = "word")
if (is.null(lexicons[["valence"]])) { # call to C++ code
s <- compute_sentiment_onegrams(tokens, lexicons, how)
} else {
Expand Down
10 changes: 5 additions & 5 deletions R/sentocorpus.R
Original file line number Diff line number Diff line change
Expand Up @@ -246,7 +246,7 @@ add_features <- function(corpus, featuresdf = NULL, keywords = NULL, do.binary =
stop("Please provide a list with proper names as part of the 'keywords' argument.")
if (!is_names_correct(names(keywords)))
stop("At least one feature's name in 'keywords' contains '-'. Please provide proper names.")
textsAll <- quanteda::texts(corpus)
textsAll <- as.character(corpus)
if (do.binary == TRUE) fct <- stringi::stri_detect
else fct <- stringi::stri_count
N <- length(keywords)
Expand Down Expand Up @@ -343,7 +343,7 @@ corpus_summarize <- function(x, by = "day", features = NULL) {
# statistics
dt <- data.table::data.table(
quanteda::docvars(x),
"nTokens" = as.numeric(sapply(tokenize_texts(quanteda::texts(x)), length))
"nTokens" = as.numeric(sapply(tokenize_texts(as.character(x)), length))
)

if (!is.null(features)) {
Expand Down Expand Up @@ -416,7 +416,7 @@ as.sento_corpus.corpus <- function(x, dates = NULL, do.clean = FALSE) {
features$date <- dates # avoids accidental duplication
}
dt <- data.table::data.table("id" = quanteda::docnames(x),
"texts" = quanteda::texts(x),
"texts" = as.character(x),
features) # includes date column
data.table::setcolorder(dt, c("id", "date", "texts"))
sento_corpus(dt, do.clean)
Expand Down Expand Up @@ -522,14 +522,14 @@ as.sento_corpus <- function(x, dates = NULL, do.clean = FALSE) {

#' @export
as.data.table.sento_corpus <- function(x, ...) {
  # Flatten the corpus into a single data.table: document names as 'id', the
  # raw texts, and every document-level variable from quanteda::docvars().
  # The texts are extracted once through as.character(); the former duplicate
  # extraction through the deprecated quanteda::texts() was overwritten
  # immediately and only triggered the deprecation path, so it is dropped.
  dt <- data.table::data.table(
    id = quanteda::docnames(x),
    texts = as.character(x),
    quanteda::docvars(x)
  )
  # Move the identifying columns to the front (reorders dt by reference).
  # NOTE(review): relies on a "date" docvar being present -- presumably
  # guaranteed for a sento_corpus; confirm against the constructor.
  data.table::setcolorder(dt, c("id", "date", "texts"))
  dt
}

#' @export
as.data.frame.sento_corpus <- function(x, ...) {
  # Bind the document-level variables and the raw texts into one data.frame.
  # Only the as.character() extraction is kept; the earlier assignment through
  # the deprecated quanteda::texts() was a dead store overwritten right away.
  df <- cbind(quanteda::docvars(x), texts = as.character(x))
  # Return "date" and "texts" first, followed by all remaining columns in
  # their existing order.
  lead_cols <- c("date", "texts")
  df[, c(lead_cols, setdiff(colnames(df), lead_cols))]
}

Expand Down
2 changes: 1 addition & 1 deletion R/sentomeasures_main.R
Original file line number Diff line number Diff line change
Expand Up @@ -286,7 +286,7 @@ sento_measures <- function(sento_corpus, lexicons, ctr) {
#' list_valence_shifters[["en"]][, c("x", "t")])
#' sent1 <- compute_sentiment(corpusSample, l1, how = "counts")
#' sent2 <- compute_sentiment(corpusSample, l2, do.sentence = TRUE)
#' sent3 <- compute_sentiment(quanteda::texts(corpusSample), l2,
#' sent3 <- compute_sentiment(as.character(corpusSample), l2,
#' do.sentence = TRUE)
#' ctr <- ctr_agg(howTime = c("linear"), by = "year", lag = 3)
#'
Expand Down
2 changes: 1 addition & 1 deletion R/sentomeasures_measures_xyz.R
Original file line number Diff line number Diff line change
Expand Up @@ -166,7 +166,7 @@ measures_update <- function(sento_measures, sento_corpus, lexicons) {
ctr <- sento_measures$ctr
sentiment <- sento_measures$sentiment
partialCorpus <- quanteda::corpus_subset(sento_corpus, !quanteda::docnames(sento_corpus) %in% sentiment$id)
if (length(quanteda::texts(partialCorpus)) > 0) {
if (quanteda::ndoc(partialCorpus) > 0) {
partialSentiment <- compute_sentiment(partialCorpus, lexicons, how = ctr$within$howWithin, nCore = ctr$nCore)
sentiment <- merge(sentiment, partialSentiment)
}
Expand Down
2 changes: 1 addition & 1 deletion appendix/run_timings.R
Original file line number Diff line number Diff line change
Expand Up @@ -266,7 +266,7 @@ timingsFull.many <- lapply(nTexts, function(n) {
cat("Run timings for texts size of", n, "\n")
corpus <- quanteda::corpus(do.call(rbind, lapply(1:25, function(j) usnews))[keep[1:n], ],
text_field = "texts")
texts <- quanteda::texts(corpus)
texts <- as.character(corpus)
out <- microbenchmark(
sentoUnigramsAllFunc(texts),
sentoUnigramsAllFeaturesFunc(corpus),
Expand Down
2 changes: 1 addition & 1 deletion man/aggregate.sentiment.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion man/merge.sentiment.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion tests/testthat/test_aggregation.R
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ test_that("Aggregation control function breaks when wrong inputs supplied", {

# aggregate.sentiment
s1 <- compute_sentiment(corpus, lex, how = "proportional")
s2 <- compute_sentiment(quanteda::texts(corpus), lex, how = "counts")
s2 <- compute_sentiment(as.character(corpus), lex, how = "counts")
s3 <- compute_sentiment(corpus, lexClust, how = "proportionalSquareRoot", do.sentence = TRUE)
sentimentAgg <- aggregate(s3, ctr_agg(lag = 7), do.full = FALSE)
test_that("Test input and output of sentiment aggregation functionality", {
Expand Down
52 changes: 28 additions & 24 deletions tests/testthat/test_sentiment_computation.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

context("Sentiment computation")

library("sentometrics")
library("data.table")
library("quanteda")
library("tm")
Expand All @@ -19,7 +20,7 @@ txt <- system.file("texts", "txt", package = "tm")
scorp <- tm::SimpleCorpus(tm::DirSource(txt))
# scorp$content[1] <- "A text for which we want to calculate above average sentiment."
# scorp$content[2] <- "A text for which we want to calculate below average sentiment."
scorp$content[3] <- quanteda::texts(corpus)[3]
scorp$content[3] <- as.character(corpus)[3]

# VCorpus creation
reuters <- system.file("texts", "crude", package = "tm")
Expand All @@ -45,6 +46,8 @@ names(lexWrong)[2] <- "frr"

### tests from here ###

load(system.file("extdata", "test_data.rda", package = "sentometrics")) # benchmark sentiment scores

sanity_sentiment <- function(texts, lexicon, valence = NULL) {
setkey(lexicon, "x")
if (!is.null(valence)) setkey(valence, "x")
Expand Down Expand Up @@ -73,21 +76,21 @@ sanity_sentiment <- function(texts, lexicon, valence = NULL) {
}

sentimentList <- list(
s1 = compute_sentiment(quanteda::texts(corpus), lex, how = "counts"),
s2 = compute_sentiment(quanteda::texts(corpus), lex[1:3], how = "counts"),
s3 = compute_sentiment(quanteda::texts(corpus), lex, how = "proportional"),
s4 = compute_sentiment(quanteda::texts(corpus), lex, how = "proportionalPol"),
s1 = compute_sentiment(as.character(corpus), lex, how = "counts"),
s2 = compute_sentiment(as.character(corpus), lex[1:3], how = "counts"),
s3 = compute_sentiment(as.character(corpus), lex, how = "proportional"),
s4 = compute_sentiment(as.character(corpus), lex, how = "proportionalPol"),
s5 = compute_sentiment(quanteda::corpus(usnews[1:250, "texts"]), lex, how = "counts"),
s6 = compute_sentiment(quanteda::corpus(usnews[1:250, c("texts", "wsj", "economy")], text_field = "texts"),
lex, how = "counts"),
s7 = compute_sentiment(corpus, lex, how = "counts"),
s8 = compute_sentiment(quanteda::texts(corpus), lexSplit, how = "counts"),
# s9 = compute_sentiment(quanteda::texts(corpus), lex, how = "TF", nCore = 2), # no multicore computation in CRAN checks
s10 = compute_sentiment(quanteda::texts(corpus), lexClust, how = "counts"),
s8 = compute_sentiment(as.character(corpus), lexSplit, how = "counts"),
# s9 = compute_sentiment(as.character(corpus), lex, how = "TF", nCore = 2), # no multicore computation in CRAN checks
s10 = compute_sentiment(as.character(corpus), lexClust, how = "counts"),
s11 = compute_sentiment(corpus, lexClust, how = "proportional"),
s12 = compute_sentiment(quanteda::texts(corpus), lexClust, how = "proportionalPol"),
s13 = compute_sentiment(corpus, lex, how = "exponential"),
s14 = compute_sentiment(corpus, lex, how = "inverseExponential"),
s12 = compute_sentiment(as.character(corpus), lexClust, how = "proportionalPol"),
# s13 = compute_sentiment(corpus, lex, how = "exponential"),
# s14 = compute_sentiment(corpus, lex, how = "inverseExponential"),
s15 = compute_sentiment(corpus, lex, how = "UShaped"),
s16 = compute_sentiment(corpus, lex, how = "inverseUShaped"),
# s17 = compute_sentiment(corpus, lex, how = "TF"),
Expand All @@ -101,11 +104,6 @@ sentimentList <- list(
)

# compute_sentiment
# load(system.file("extdata", "test_data.rda", package = "sentometrics")) # benchmark sentiment scores
# test_that("Agreement between legacy benchmark and current produced sentiment scores", {
# expect_equal(test_data, sentimentList[1:11])
# })

test_that("Agreement between sentiment scores on document-level across input objects", {
expect_true(all(unlist(lapply(sentimentList, function(s) nrow(s) == 250))))
expect_true(all(unlist(lapply(sentimentList[-1], function(s) all(s$word_count == sentimentList$s1$word_count)))))
Expand All @@ -115,21 +113,27 @@ test_that("Agreement between sentiment scores on document-level across input obj
sentimentList$s5[, c("GI_en", "LM_en", "HENRY_en")])
expect_equivalent(sentimentList$s6[, -c(1:2)],
sentimentList$s7[, colnames(sentimentList$s6)[-c(1:2)], with = FALSE])
expect_error(compute_sentiment(quanteda::texts(corpus), lex, how = "notAnOption"))
expect_warning(compute_sentiment(quanteda::texts(corpus), lex, how = "counts", nCore = -1))
expect_error(compute_sentiment(quanteda::texts(corpus), list_lexicons))
expect_error(compute_sentiment(as.character(corpus), lex, how = "notAnOption"))
expect_warning(compute_sentiment(as.character(corpus), lex, how = "counts", nCore = -1))
expect_error(compute_sentiment(as.character(corpus), list_lexicons))
expect_true(all.equal(sentimentList$s3[3, -1],
compute_sentiment(scorp[3], lex, how = "proportional")[, -1]))
# expect_warning(compute_sentiment(vcorp, lex, how = "proportional"))
expect_error(compute_sentiment(corpusLang, lex, how = "proportional"))
expect_true("language" %in% colnames(quanteda::docvars(corpusLang)))
expect_error(compute_sentiment(corpusLang, lexWrong, how = "proportional"))
expect_true(all.equal(sentimentList$s1$GI_en, sanity_sentiment(quanteda::texts(corpus), lex$GI_en, lex$valence)))
expect_true(all.equal(sentimentList$s2$GI_en, sanity_sentiment(quanteda::texts(corpus), lex$GI_en)))

# expect_true(all.equal(test_data, sentimentList[1:11])) # compare with old sentiment scores
setcolorder(sentimentList[[7]], names(test_data[[7]])) # make column order the same
setcolorder(sentimentList[[10]], names(test_data[[10]])) # make column order the same
expect_equal(test_data, sentimentList[1:11])

expect_true(all.equal(sentimentList$s1$GI_en, sanity_sentiment(as.character(corpus), lex$GI_en, lex$valence)))
expect_true(all.equal(sentimentList$s2$GI_en, sanity_sentiment(as.character(corpus), lex$GI_en)))
})

sentimentSentenceList <- list(
s1 = compute_sentiment(quanteda::texts(corpus), lexClust, how = "counts", do.sentence = TRUE),
s1 = compute_sentiment(as.character(corpus), lexClust, how = "counts", do.sentence = TRUE),
s2 = compute_sentiment(quanteda::corpus(usnews[1:250, "texts"]),
lexClust, how = "counts", do.sentence = TRUE),
s3 = compute_sentiment(quanteda::corpus(usnews[1:250, c("texts", "wsj", "economy")], text_field = "texts"),
Expand Down Expand Up @@ -194,12 +198,12 @@ test_that("Correct binding of several sentiment objects", {
})

# tf-idf comparison sentometrics vs. quanteda
# Tokenize the lower-cased corpus texts into words once; the same tokens feed
# both the quanteda reference computation and compute_sentiment() below.
# (The duplicate tokenization through the deprecated quanteda::texts() was
# overwritten immediately and has been removed.)
toks <- stri_split_boundaries(stri_trans_tolower(as.character(corpus)), type = "word", skip_word_none = TRUE)
# Reference tf-idf weights computed by quanteda on those tokens.
dfmQ <- quanteda::dfm(as.tokens(toks)) %>% dfm_tfidf(k = 1)
# Per-document sums of the tf-idf weights of the GI_en words scored 1 and -1.
posScores <- rowSums(as.matrix(quanteda::dfm_select(dfmQ, lex$GI_en[y == 1, x])))
negScores <- rowSums(as.matrix(quanteda::dfm_select(dfmQ, lex$GI_en[y == -1, x])))
test_that("Same tf-idf scoring for sentometrics and quanteda", {
  # lex[-length(lex)] drops the last lexicon element -- presumably the valence
  # shifters, so that plain unigram scoring is compared; verify against the
  # construction of 'lex' earlier in this file.
  expect_equal(compute_sentiment(as.character(corpus), lex[-length(lex)], tokens = toks, "TFIDF")[["GI_en"]],
               unname(posScores - negScores))
})

4 changes: 2 additions & 2 deletions vignettes/examples/sentiment.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ tks <- as.list(tokens(corpus, what = "fastestword"))

lexicons <- sento_lexicons(list_lexicons[c("GI_en", "LM_en", "HENRY_en")])

compute_sentiment(texts(corpus), lexicons, how = "counts", tokens = tks)
compute_sentiment(as.character(corpus), lexicons, how = "counts", tokens = tks)
```

To provide your own tokenized input at the sentence level, note that you need to provide a `list` of `list`s and set `do.sentence = TRUE`. See one of the following examples for more information about sentence-level sentiment calculation.
Expand All @@ -97,7 +97,7 @@ To provide your own tokenized input on sentence-level, beware that you need to p
sentences <- tokens(corpus, what = "sentence")
tks2 <- lapply(sentences, function(s) as.list(tokens(s, what = "word")))

compute_sentiment(texts(corpus), lexicons[2:3], how = "counts", tokens = tks2, do.sentence = TRUE)
compute_sentiment(as.character(corpus), lexicons[2:3], how = "counts", tokens = tks2, do.sentence = TRUE)
```

### The three key approaches to the sentiment computation
Expand Down