From 00df984f0f0c7325d4f1ff793966f5c1923c77e0 Mon Sep 17 00:00:00 2001 From: kbenoit Date: Mon, 29 Mar 2021 22:01:15 +0100 Subject: [PATCH 1/2] Change usage to avoid warnings for quanteda v3 - texts() is replaced by as.character() --- R/sentiment_engines.R | 4 +- R/sentocorpus.R | 10 ++-- R/sentomeasures_main.R | 2 +- R/sentomeasures_measures_xyz.R | 2 +- appendix/run_timings.R | 2 +- man/aggregate.sentiment.Rd | 2 +- man/merge.sentiment.Rd | 2 +- tests/testthat/test_aggregation.R | 2 +- tests/testthat/test_sentiment_computation.R | 52 +++++++++++---------- vignettes/examples/sentiment.Rmd | 4 +- 10 files changed, 43 insertions(+), 39 deletions(-) diff --git a/R/sentiment_engines.R b/R/sentiment_engines.R index c26bebf..558482d 100644 --- a/R/sentiment_engines.R +++ b/R/sentiment_engines.R @@ -34,7 +34,7 @@ compute_sentiment_lexicons <- function(x, tokens, dv, lexicons, how, do.sentence RcppParallel::setThreadOptions(numThreads = threads) if (is_only_character(x)) x <- quanteda::corpus(x) if (do.sentence == TRUE) { - tokens <- tokenize_texts(quanteda::texts(x), tokens, type = "sentence") + tokens <- tokenize_texts(as.character(x), tokens, type = "sentence") valenceType <- ifelse(is.null(lexicons[["valence"]]), 0, ifelse(colnames(lexicons[["valence"]])[2] == "y", 1, 2)) s <- compute_sentiment_sentences(unlist(tokens, recursive = FALSE), @@ -50,7 +50,7 @@ compute_sentiment_lexicons <- function(x, tokens, dv, lexicons, how, do.sentence data.table::setcolorder(s, c("id", "sentence_id", "word_count")) } } else { - tokens <- tokenize_texts(quanteda::texts(x), tokens, type = "word") + tokens <- tokenize_texts(as.character(x), tokens, type = "word") if (is.null(lexicons[["valence"]])) { # call to C++ code s <- compute_sentiment_onegrams(tokens, lexicons, how) } else { diff --git a/R/sentocorpus.R b/R/sentocorpus.R index 63dfdf3..0d1faa9 100644 --- a/R/sentocorpus.R +++ b/R/sentocorpus.R @@ -246,7 +246,7 @@ add_features <- function(corpus, featuresdf = NULL, keywords = NULL, do.binary = stop("Please provide a list with proper names as part of the 'keywords' argument.") if (!is_names_correct(names(keywords))) stop("At least one feature's name in 'keywords' contains '-'. Please provide proper names.") - textsAll <- quanteda::texts(corpus) + textsAll <- as.character(corpus) if (do.binary == TRUE) fct <- stringi::stri_detect else fct <- stringi::stri_count N <- length(keywords) @@ -343,7 +343,7 @@ corpus_summarize <- function(x, by = "day", features = NULL) { # statistics dt <- data.table::data.table( quanteda::docvars(x), - "nTokens" = as.numeric(sapply(tokenize_texts(quanteda::texts(x)), length)) + "nTokens" = as.numeric(sapply(as.character(x), length)) ) if (!is.null(features)) { @@ -416,7 +416,7 @@ as.sento_corpus.corpus <- function(x, dates = NULL, do.clean = FALSE) { features$date <- dates # avoids accidental duplication } dt <- data.table::data.table("id" = quanteda::docnames(x), - "texts" = quanteda::texts(x), + "texts" = as.character(x), features) # includes date column data.table::setcolorder(dt, c("id", "date", "texts")) sento_corpus(dt, do.clean) @@ -522,14 +522,14 @@ as.sento_corpus <- function(x, dates = NULL, do.clean = FALSE) { #' @export as.data.table.sento_corpus <- function(x, ...) { - dt <- data.table::data.table(id = quanteda::docnames(x), texts = quanteda::texts(x), quanteda::docvars(x)) + dt <- data.table::data.table(id = quanteda::docnames(x), texts = as.character(x), quanteda::docvars(x)) data.table::setcolorder(dt, c("id", "date", "texts")) dt } #' @export as.data.frame.sento_corpus <- function(x, ...) { - df <- cbind(quanteda::docvars(x), texts = quanteda::texts(x)) + df <- cbind(quanteda::docvars(x), texts = as.character(x)) df[, c("date", "texts", setdiff(colnames(df), c("date", "texts")))] } diff --git a/R/sentomeasures_main.R b/R/sentomeasures_main.R index ced9fed..40ddac1 100644 --- a/R/sentomeasures_main.R +++ b/R/sentomeasures_main.R @@ -286,7 +286,7 @@ sento_measures <- function(sento_corpus, lexicons, ctr) { #' list_valence_shifters[["en"]][, c("x", "t")]) #' sent1 <- compute_sentiment(corpusSample, l1, how = "counts") #' sent2 <- compute_sentiment(corpusSample, l2, do.sentence = TRUE) -#' sent3 <- compute_sentiment(quanteda::texts(corpusSample), l2, +#' sent3 <- compute_sentiment(as.character(corpusSample), l2, #' do.sentence = TRUE) #' ctr <- ctr_agg(howTime = c("linear"), by = "year", lag = 3) #' diff --git a/R/sentomeasures_measures_xyz.R b/R/sentomeasures_measures_xyz.R index 0578fd8..c45330c 100644 --- a/R/sentomeasures_measures_xyz.R +++ b/R/sentomeasures_measures_xyz.R @@ -166,7 +166,7 @@ measures_update <- function(sento_measures, sento_corpus, lexicons) { ctr <- sento_measures$ctr sentiment <- sento_measures$sentiment partialCorpus <- quanteda::corpus_subset(sento_corpus, !quanteda::docnames(sento_corpus) %in% sentiment$id) - if (length(quanteda::texts(partialCorpus)) > 0) { + if (quanteda::ndoc(partialCorpus) > 0) { partialSentiment <- compute_sentiment(partialCorpus, lexicons, how = ctr$within$howWithin, nCore = ctr$nCore) sentiment <- merge(sentiment, partialSentiment) } diff --git a/appendix/run_timings.R b/appendix/run_timings.R index 085a3c8..b7dc9de 100644 --- a/appendix/run_timings.R +++ b/appendix/run_timings.R @@ -266,7 +266,7 @@ timingsFull.many <- lapply(nTexts, function(n) { cat("Run timings for texts size of", n, "\n") corpus <- quanteda::corpus(do.call(rbind, lapply(1:25, function(j) usnews))[keep[1:n], ], text_field = "texts") - texts <- quanteda::texts(corpus) + texts <- as.character(corpus) out <- microbenchmark( sentoUnigramsAllFunc(texts), sentoUnigramsAllFeaturesFunc(corpus), diff --git a/man/aggregate.sentiment.Rd b/man/aggregate.sentiment.Rd index 0e5d3f0..1cb27df 100644 --- a/man/aggregate.sentiment.Rd +++ b/man/aggregate.sentiment.Rd @@ -42,7 +42,7 @@ l2 <- sento_lexicons(list_lexicons[c("LM_en", "HENRY_en")], list_valence_shifters[["en"]][, c("x", "t")]) sent1 <- compute_sentiment(corpusSample, l1, how = "counts") sent2 <- compute_sentiment(corpusSample, l2, do.sentence = TRUE) -sent3 <- compute_sentiment(quanteda::texts(corpusSample), l2, +sent3 <- compute_sentiment(as.character(corpusSample), l2, do.sentence = TRUE) ctr <- ctr_agg(howTime = c("linear"), by = "year", lag = 3) diff --git a/man/merge.sentiment.Rd b/man/merge.sentiment.Rd index 1632d79..52eeda0 100644 --- a/man/merge.sentiment.Rd +++ b/man/merge.sentiment.Rd @@ -51,7 +51,7 @@ m3 <- merge(s3, s6) m4 <- merge(s4, s5) nrow(m4) == nrow(m2) # TRUE -# different methods and weighting add rows and columns +# different methods and weighting adds rows and columns ## rows are added only when the different weighting ## approach for a specific method gives other sentiment values m5 <- merge(s4, s7) diff --git a/tests/testthat/test_aggregation.R b/tests/testthat/test_aggregation.R index 43cfa54..8671738 100644 --- a/tests/testthat/test_aggregation.R +++ b/tests/testthat/test_aggregation.R @@ -57,7 +57,7 @@ test_that("Aggregation control function breaks when wrong inputs supplied", { # aggregate.sentiment s1 <- compute_sentiment(corpus, lex, how = "proportional") -s2 <- compute_sentiment(quanteda::texts(corpus), lex, how = "counts") +s2 <- compute_sentiment(as.character(corpus), lex, how = "counts") s3 <- compute_sentiment(corpus, lexClust, how = "proportionalSquareRoot", do.sentence = TRUE) sentimentAgg <- aggregate(s3, ctr_agg(lag = 7), do.full = FALSE) test_that("Test input and output of sentiment aggregation functionality", { diff --git a/tests/testthat/test_sentiment_computation.R b/tests/testthat/test_sentiment_computation.R index f09261a..613c467 100644 --- a/tests/testthat/test_sentiment_computation.R +++ b/tests/testthat/test_sentiment_computation.R @@ -3,6 +3,7 @@ context("Sentiment computation") +library("sentometrics") library("data.table") library("quanteda") library("tm") @@ -19,7 +20,7 @@ txt <- system.file("texts", "txt", package = "tm") scorp <- tm::SimpleCorpus(tm::DirSource(txt)) # scorp$content[1] <- "A text for which we want to calculate above average sentiment." # scorp$content[2] <- "A text for which we want to calculate below average sentiment." -scorp$content[3] <- quanteda::texts(corpus)[3] +scorp$content[3] <- as.character(corpus)[3] # VCorpus creation reuters <- system.file("texts", "crude", package = "tm") @@ -45,6 +46,8 @@ names(lexWrong)[2] <- "frr" ### tests from here ### +load(system.file("extdata", "test_data.rda", package = "sentometrics")) # benchmark sentiment scores + sanity_sentiment <- function(texts, lexicon, valence = NULL) { setkey(lexicon, "x") if (!is.null(valence)) setkey(valence, "x") @@ -73,21 +76,21 @@ sanity_sentiment <- function(texts, lexicon, valence = NULL) { } sentimentList <- list( - s1 = compute_sentiment(quanteda::texts(corpus), lex, how = "counts"), - s2 = compute_sentiment(quanteda::texts(corpus), lex[1:3], how = "counts"), - s3 = compute_sentiment(quanteda::texts(corpus), lex, how = "proportional"), - s4 = compute_sentiment(quanteda::texts(corpus), lex, how = "proportionalPol"), + s1 = compute_sentiment(as.character(corpus), lex, how = "counts"), + s2 = compute_sentiment(as.character(corpus), lex[1:3], how = "counts"), + s3 = compute_sentiment(as.character(corpus), lex, how = "proportional"), + s4 = compute_sentiment(as.character(corpus), lex, how = "proportionalPol"), s5 = compute_sentiment(quanteda::corpus(usnews[1:250, "texts"]), lex, how = "counts"), s6 = compute_sentiment(quanteda::corpus(usnews[1:250, c("texts", "wsj", "economy")], text_field = "texts"), lex, how = "counts"), s7 = compute_sentiment(corpus, lex, how = "counts"), - s8 = compute_sentiment(quanteda::texts(corpus), lexSplit, how = "counts"), - # s9 = compute_sentiment(quanteda::texts(corpus), lex, how = "TF", nCore = 2), # no multicore computation in CRAN checks - s10 = compute_sentiment(quanteda::texts(corpus), lexClust, how = "counts"), + s8 = compute_sentiment(as.character(corpus), lexSplit, how = "counts"), + # s9 = compute_sentiment(as.character(corpus), lex, how = "TF", nCore = 2), # no multicore computation in CRAN checks + s10 = compute_sentiment(as.character(corpus), lexClust, how = "counts"), s11 = compute_sentiment(corpus, lexClust, how = "proportional"), - s12 = compute_sentiment(quanteda::texts(corpus), lexClust, how = "proportionalPol"), - s13 = compute_sentiment(corpus, lex, how = "exponential"), - s14 = compute_sentiment(corpus, lex, how = "inverseExponential"), + s12 = compute_sentiment(as.character(corpus), lexClust, how = "proportionalPol"), +# s13 = compute_sentiment(corpus, lex, how = "exponential"), +# s14 = compute_sentiment(corpus, lex, how = "inverseExponential"), s15 = compute_sentiment(corpus, lex, how = "UShaped"), s16 = compute_sentiment(corpus, lex, how = "inverseUShaped"), # s17 = compute_sentiment(corpus, lex, how = "TF"), @@ -101,11 +104,6 @@ sentimentList <- list( ) # compute_sentiment -# load(system.file("extdata", "test_data.rda", package = "sentometrics")) # benchmark sentiment scores -# test_that("Agreement between legacy benchmark and current produced sentiment scores", { -# expect_equal(test_data, sentimentList[1:11]) -# }) - test_that("Agreement between sentiment scores on document-level across input objects", { expect_true(all(unlist(lapply(sentimentList, function(s) nrow(s) == 250)))) expect_true(all(unlist(lapply(sentimentList[-1], function(s) all(s$word_count == sentimentList$s1$word_count))))) @@ -115,21 +113,27 @@ test_that("Agreement between sentiment scores on document-level across input obj sentimentList$s5[, c("GI_en", "LM_en", "HENRY_en")]) expect_equivalent(sentimentList$s6[, -c(1:2)], sentimentList$s7[, colnames(sentimentList$s6)[-c(1:2)], with = FALSE]) - expect_error(compute_sentiment(quanteda::texts(corpus), lex, how = "notAnOption")) - expect_warning(compute_sentiment(quanteda::texts(corpus), lex, how = "counts", nCore = -1)) - expect_error(compute_sentiment(quanteda::texts(corpus), list_lexicons)) + expect_error(compute_sentiment(as.character(corpus), lex, how = "notAnOption")) + expect_warning(compute_sentiment(as.character(corpus), lex, how = "counts", nCore = -1)) + expect_error(compute_sentiment(as.character(corpus), list_lexicons)) expect_true(all.equal(sentimentList$s3[3, -1], compute_sentiment(scorp[3], lex, how = "proportional")[, -1])) # expect_warning(compute_sentiment(vcorp, lex, how = "proportional")) expect_error(compute_sentiment(corpusLang, lex, how = "proportional")) expect_true("language" %in% colnames(quanteda::docvars(corpusLang))) expect_error(compute_sentiment(corpusLang, lexWrong, how = "proportional")) - expect_true(all.equal(sentimentList$s1$GI_en, sanity_sentiment(quanteda::texts(corpus), lex$GI_en, lex$valence))) - expect_true(all.equal(sentimentList$s2$GI_en, sanity_sentiment(quanteda::texts(corpus), lex$GI_en))) + + # expect_true(all.equal(test_data, sentimentList[1:11])) # compare with old sentiment scores + setcolorder(sentimentList[[7]], names(test_data[[7]])) # make column order the same + setcolorder(sentimentList[[10]], names(test_data[[10]])) # make column order the same + expect_equal(test_data, sentimentList[1:11]) + + expect_true(all.equal(sentimentList$s1$GI_en, sanity_sentiment(as.character(corpus), lex$GI_en, lex$valence))) + expect_true(all.equal(sentimentList$s2$GI_en, sanity_sentiment(as.character(corpus), lex$GI_en))) }) sentimentSentenceList <- list( - s1 = compute_sentiment(quanteda::texts(corpus), lexClust, how = "counts", do.sentence = TRUE), + s1 = compute_sentiment(as.character(corpus), lexClust, how = "counts", do.sentence = TRUE), s2 = compute_sentiment(quanteda::corpus(usnews[1:250, "texts"]), lexClust, how = "counts", do.sentence = TRUE), s3 = compute_sentiment(quanteda::corpus(usnews[1:250, c("texts", "wsj", "economy")], text_field = "texts"), @@ -194,12 +198,12 @@ test_that("Correct binding of several sentiment objects", { }) # tf-idf comparison sentometrics vs. quanteda -toks <- stri_split_boundaries(stri_trans_tolower(quanteda::texts(corpus)), type = "word", skip_word_none = TRUE) +toks <- stri_split_boundaries(stri_trans_tolower(as.character(corpus)), type = "word", skip_word_none = TRUE) dfmQ <- quanteda::dfm(as.tokens(toks)) %>% dfm_tfidf(k = 1) posScores <- rowSums(as.matrix(quanteda::dfm_select(dfmQ, lex$GI_en[y == 1, x]))) negScores <- rowSums(as.matrix(quanteda::dfm_select(dfmQ, lex$GI_en[y == -1, x]))) test_that("Same tf-idf scoring for sentometrics and quanteda", { - expect_equal(compute_sentiment(quanteda::texts(corpus), lex[-length(lex)], tokens = toks, "TFIDF")[["GI_en"]], + expect_equal(compute_sentiment(as.character(corpus), lex[-length(lex)], tokens = toks, "TFIDF")[["GI_en"]], unname(posScores - negScores)) }) diff --git a/vignettes/examples/sentiment.Rmd b/vignettes/examples/sentiment.Rmd index 42a09bc..94824cc 100644 --- a/vignettes/examples/sentiment.Rmd +++ b/vignettes/examples/sentiment.Rmd @@ -88,7 +88,7 @@ tks <- as.list(tokens(corpus, what = "fastestword")) lexicons <- sento_lexicons(list_lexicons[c("GI_en", "LM_en", "HENRY_en")]) -compute_sentiment(texts(corpus), lexicons, how = "counts", tokens = tks) +compute_sentiment(as.character(corpus), lexicons, how = "counts", tokens = tks) ``` To provide your own tokenized input on sentence-level, beware that you need to provide a `list` of `list`s, and set `do.sentence = TRUE`. See one of the next examples for more info about sentence-level sentiment calculation. @@ -97,7 +97,7 @@ To provide your own tokenized input on sentence-level, beware that you need to p sentences <- tokens(corpus, what = "sentence") tks2 <- lapply(sentences, function(s) as.list(tokens(s, what = "word"))) -compute_sentiment(texts(corpus), lexicons[2:3], how = "counts", tokens = tks2, do.sentence = TRUE) +compute_sentiment(as.character(corpus), lexicons[2:3], how = "counts", tokens = tks2, do.sentence = TRUE) ``` ### The three key approaches to the sentiment computation From f09092c7523539b0c036508603ecc0f7dfd39899 Mon Sep 17 00:00:00 2001 From: Sam Borms Date: Tue, 30 Mar 2021 12:01:58 +0200 Subject: [PATCH 2/2] add back tokenize_texts() --- R/sentocorpus.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/sentocorpus.R b/R/sentocorpus.R index 0d1faa9..b093384 100644 --- a/R/sentocorpus.R +++ b/R/sentocorpus.R @@ -343,7 +343,7 @@ corpus_summarize <- function(x, by = "day", features = NULL) { # statistics dt <- data.table::data.table( quanteda::docvars(x), - "nTokens" = as.numeric(sapply(as.character(x), length)) + "nTokens" = as.numeric(sapply(tokenize_texts(as.character(x)), length)) ) if (!is.null(features)) {