From 00df984f0f0c7325d4f1ff793966f5c1923c77e0 Mon Sep 17 00:00:00 2001
From: kbenoit <kbenoit@lse.ac.uk>
Date: Mon, 29 Mar 2021 22:01:15 +0100
Subject: [PATCH 1/2] Change usage to avoid warnings for quanteda v3

- texts() is replaced by as.character()
- length(texts(x)) > 0 is replaced by ndoc(x) > 0
---
 R/sentiment_engines.R                       |  4 +-
 R/sentocorpus.R                             | 10 ++--
 R/sentomeasures_main.R                      |  2 +-
 R/sentomeasures_measures_xyz.R              |  2 +-
 appendix/run_timings.R                      |  2 +-
 man/aggregate.sentiment.Rd                  |  2 +-
 man/merge.sentiment.Rd                      |  2 +-
 tests/testthat/test_aggregation.R           |  2 +-
 tests/testthat/test_sentiment_computation.R | 52 +++++++++++----------
 vignettes/examples/sentiment.Rmd            |  4 +-
 10 files changed, 43 insertions(+), 39 deletions(-)

diff --git a/R/sentiment_engines.R b/R/sentiment_engines.R
index c26bebf..558482d 100644
--- a/R/sentiment_engines.R
+++ b/R/sentiment_engines.R
@@ -34,7 +34,7 @@ compute_sentiment_lexicons <- function(x, tokens, dv, lexicons, how, do.sentence
   RcppParallel::setThreadOptions(numThreads = threads)
   if (is_only_character(x)) x <- quanteda::corpus(x)
   if (do.sentence == TRUE) {
-    tokens <- tokenize_texts(quanteda::texts(x), tokens, type = "sentence")
+    tokens <- tokenize_texts(as.character(x), tokens, type = "sentence")
     valenceType <- ifelse(is.null(lexicons[["valence"]]), 0,
                           ifelse(colnames(lexicons[["valence"]])[2] == "y", 1, 2))
     s <- compute_sentiment_sentences(unlist(tokens, recursive = FALSE),
@@ -50,7 +50,7 @@ compute_sentiment_lexicons <- function(x, tokens, dv, lexicons, how, do.sentence
       data.table::setcolorder(s, c("id", "sentence_id", "word_count"))
     }
   } else {
-    tokens <- tokenize_texts(quanteda::texts(x), tokens, type = "word")
+    tokens <- tokenize_texts(as.character(x), tokens, type = "word")
     if (is.null(lexicons[["valence"]])) { # call to C++ code
       s <- compute_sentiment_onegrams(tokens, lexicons, how)
     } else {
diff --git a/R/sentocorpus.R b/R/sentocorpus.R
index 63dfdf3..0d1faa9 100644
--- a/R/sentocorpus.R
+++ b/R/sentocorpus.R
@@ -246,7 +246,7 @@ add_features <- function(corpus, featuresdf = NULL, keywords = NULL, do.binary =
       stop("Please provide a list with proper names as part of the 'keywords' argument.")
     if (!is_names_correct(names(keywords)))
       stop("At least one feature's name in 'keywords' contains '-'. Please provide proper names.")
-    textsAll <- quanteda::texts(corpus)
+    textsAll <- as.character(corpus)
     if (do.binary == TRUE) fct <- stringi::stri_detect
     else fct <- stringi::stri_count
     N <- length(keywords)
@@ -343,7 +343,7 @@ corpus_summarize <- function(x, by = "day", features = NULL) {
   # statistics
   dt <- data.table::data.table(
     quanteda::docvars(x),
-    "nTokens" = as.numeric(sapply(tokenize_texts(quanteda::texts(x)), length))
+    "nTokens" = as.numeric(sapply(as.character(x), length))
   )
 
   if (!is.null(features)) {
@@ -416,7 +416,7 @@ as.sento_corpus.corpus <- function(x, dates = NULL, do.clean = FALSE) {
     features$date <- dates # avoids accidental duplication
   }
   dt <- data.table::data.table("id" = quanteda::docnames(x),
-                               "texts" = quanteda::texts(x),
+                               "texts" = as.character(x),
                                features) # includes date column
   data.table::setcolorder(dt, c("id", "date", "texts"))
   sento_corpus(dt, do.clean)
@@ -522,14 +522,14 @@ as.sento_corpus <- function(x, dates = NULL, do.clean = FALSE) {
 
 #' @export
 as.data.table.sento_corpus <- function(x, ...) {
-  dt <- data.table::data.table(id = quanteda::docnames(x), texts = quanteda::texts(x), quanteda::docvars(x))
+  dt <- data.table::data.table(id = quanteda::docnames(x), texts = as.character(x), quanteda::docvars(x))
   data.table::setcolorder(dt, c("id", "date", "texts"))
   dt
 }
 
 #' @export
 as.data.frame.sento_corpus <- function(x, ...) {
-  df <- cbind(quanteda::docvars(x), texts = quanteda::texts(x))
+  df <- cbind(quanteda::docvars(x), texts = as.character(x))
   df[, c("date", "texts", setdiff(colnames(df), c("date", "texts")))]
 }
 
diff --git a/R/sentomeasures_main.R b/R/sentomeasures_main.R
index ced9fed..40ddac1 100644
--- a/R/sentomeasures_main.R
+++ b/R/sentomeasures_main.R
@@ -286,7 +286,7 @@ sento_measures <- function(sento_corpus, lexicons, ctr) {
 #'                      list_valence_shifters[["en"]][, c("x", "t")])
 #' sent1 <- compute_sentiment(corpusSample, l1, how = "counts")
 #' sent2 <- compute_sentiment(corpusSample, l2, do.sentence = TRUE)
-#' sent3 <- compute_sentiment(quanteda::texts(corpusSample), l2,
+#' sent3 <- compute_sentiment(as.character(corpusSample), l2,
 #'                            do.sentence = TRUE)
 #' ctr <- ctr_agg(howTime = c("linear"), by = "year", lag = 3)
 #'
diff --git a/R/sentomeasures_measures_xyz.R b/R/sentomeasures_measures_xyz.R
index 0578fd8..c45330c 100644
--- a/R/sentomeasures_measures_xyz.R
+++ b/R/sentomeasures_measures_xyz.R
@@ -166,7 +166,7 @@ measures_update <- function(sento_measures, sento_corpus, lexicons) {
   ctr <- sento_measures$ctr
   sentiment <- sento_measures$sentiment
   partialCorpus <- quanteda::corpus_subset(sento_corpus, !quanteda::docnames(sento_corpus) %in% sentiment$id)
-  if (length(quanteda::texts(partialCorpus)) > 0) {
+  if (quanteda::ndoc(partialCorpus) > 0) {
     partialSentiment <- compute_sentiment(partialCorpus, lexicons, how = ctr$within$howWithin, nCore = ctr$nCore)
     sentiment <- merge(sentiment, partialSentiment)
   }
diff --git a/appendix/run_timings.R b/appendix/run_timings.R
index 085a3c8..b7dc9de 100644
--- a/appendix/run_timings.R
+++ b/appendix/run_timings.R
@@ -266,7 +266,7 @@ timingsFull.many <- lapply(nTexts, function(n) {
   cat("Run timings for texts size of", n, "\n")
   corpus <-  quanteda::corpus(do.call(rbind, lapply(1:25, function(j) usnews))[keep[1:n], ],
                               text_field = "texts")
-  texts <- quanteda::texts(corpus)
+  texts <- as.character(corpus)
   out <- microbenchmark(
     sentoUnigramsAllFunc(texts),
     sentoUnigramsAllFeaturesFunc(corpus),
diff --git a/man/aggregate.sentiment.Rd b/man/aggregate.sentiment.Rd
index 0e5d3f0..1cb27df 100644
--- a/man/aggregate.sentiment.Rd
+++ b/man/aggregate.sentiment.Rd
@@ -42,7 +42,7 @@ l2 <- sento_lexicons(list_lexicons[c("LM_en", "HENRY_en")],
                      list_valence_shifters[["en"]][, c("x", "t")])
 sent1 <- compute_sentiment(corpusSample, l1, how = "counts")
 sent2 <- compute_sentiment(corpusSample, l2, do.sentence = TRUE)
-sent3 <- compute_sentiment(quanteda::texts(corpusSample), l2,
+sent3 <- compute_sentiment(as.character(corpusSample), l2,
                            do.sentence = TRUE)
 ctr <- ctr_agg(howTime = c("linear"), by = "year", lag = 3)
 
diff --git a/man/merge.sentiment.Rd b/man/merge.sentiment.Rd
index 1632d79..52eeda0 100644
--- a/man/merge.sentiment.Rd
+++ b/man/merge.sentiment.Rd
@@ -51,7 +51,7 @@ m3 <- merge(s3, s6)
 m4 <- merge(s4, s5)
 nrow(m4) == nrow(m2) # TRUE
 
-# different methods and weighting add rows and columns
+# different methods and weighting adds rows and columns
 ## rows are added only when the different weighting
 ## approach for a specific method gives other sentiment values
 m5 <- merge(s4, s7)
diff --git a/tests/testthat/test_aggregation.R b/tests/testthat/test_aggregation.R
index 43cfa54..8671738 100644
--- a/tests/testthat/test_aggregation.R
+++ b/tests/testthat/test_aggregation.R
@@ -57,7 +57,7 @@ test_that("Aggregation control function breaks when wrong inputs supplied", {
 
 # aggregate.sentiment
 s1 <- compute_sentiment(corpus, lex, how = "proportional")
-s2 <- compute_sentiment(quanteda::texts(corpus), lex, how = "counts")
+s2 <- compute_sentiment(as.character(corpus), lex, how = "counts")
 s3 <- compute_sentiment(corpus, lexClust, how = "proportionalSquareRoot", do.sentence = TRUE)
 sentimentAgg <- aggregate(s3, ctr_agg(lag = 7), do.full = FALSE)
 test_that("Test input and output of sentiment aggregation functionality", {
diff --git a/tests/testthat/test_sentiment_computation.R b/tests/testthat/test_sentiment_computation.R
index f09261a..613c467 100644
--- a/tests/testthat/test_sentiment_computation.R
+++ b/tests/testthat/test_sentiment_computation.R
@@ -3,6 +3,7 @@
 
 context("Sentiment computation")
 
+library("sentometrics")
 library("data.table")
 library("quanteda")
 library("tm")
@@ -19,7 +20,7 @@ txt <- system.file("texts", "txt", package = "tm")
 scorp <- tm::SimpleCorpus(tm::DirSource(txt))
 # scorp$content[1] <- "A text for which we want to calculate above average sentiment."
 # scorp$content[2] <- "A text for which we want to calculate below average sentiment."
-scorp$content[3] <- quanteda::texts(corpus)[3]
+scorp$content[3] <- as.character(corpus)[3]
 
 # VCorpus creation
 reuters <- system.file("texts", "crude", package = "tm")
@@ -45,6 +46,8 @@ names(lexWrong)[2] <- "frr"
 
 ### tests from here ###
 
+load(system.file("extdata", "test_data.rda", package = "sentometrics")) # benchmark sentiment scores
+
 sanity_sentiment <- function(texts, lexicon, valence = NULL) {
   setkey(lexicon, "x")
   if (!is.null(valence)) setkey(valence, "x")
@@ -73,21 +76,21 @@ sanity_sentiment <- function(texts, lexicon, valence = NULL) {
 }
 
 sentimentList <- list(
-  s1 = compute_sentiment(quanteda::texts(corpus), lex, how = "counts"),
-  s2 = compute_sentiment(quanteda::texts(corpus), lex[1:3], how = "counts"),
-  s3 = compute_sentiment(quanteda::texts(corpus), lex, how = "proportional"),
-  s4 = compute_sentiment(quanteda::texts(corpus), lex, how = "proportionalPol"),
+  s1 = compute_sentiment(as.character(corpus), lex, how = "counts"),
+  s2 = compute_sentiment(as.character(corpus), lex[1:3], how = "counts"),
+  s3 = compute_sentiment(as.character(corpus), lex, how = "proportional"),
+  s4 = compute_sentiment(as.character(corpus), lex, how = "proportionalPol"),
   s5 = compute_sentiment(quanteda::corpus(usnews[1:250, "texts"]), lex, how = "counts"),
   s6 = compute_sentiment(quanteda::corpus(usnews[1:250, c("texts", "wsj", "economy")], text_field = "texts"),
                          lex, how = "counts"),
   s7 = compute_sentiment(corpus, lex, how = "counts"),
-  s8 = compute_sentiment(quanteda::texts(corpus), lexSplit, how = "counts"),
-  # s9 = compute_sentiment(quanteda::texts(corpus), lex, how = "TF", nCore = 2), # no multicore computation in CRAN checks
-  s10 = compute_sentiment(quanteda::texts(corpus), lexClust, how = "counts"),
+  s8 = compute_sentiment(as.character(corpus), lexSplit, how = "counts"),
+  # s9 = compute_sentiment(as.character(corpus), lex, how = "TF", nCore = 2), # no multicore computation in CRAN checks
+  s10 = compute_sentiment(as.character(corpus), lexClust, how = "counts"),
   s11 = compute_sentiment(corpus, lexClust, how = "proportional"),
-  s12 = compute_sentiment(quanteda::texts(corpus), lexClust, how = "proportionalPol"),
-  s13 = compute_sentiment(corpus, lex, how = "exponential"),
-  s14 = compute_sentiment(corpus, lex, how = "inverseExponential"),
+  s12 = compute_sentiment(as.character(corpus), lexClust, how = "proportionalPol"),
+#  s13 = compute_sentiment(corpus, lex, how = "exponential"),
+#  s14 = compute_sentiment(corpus, lex, how = "inverseExponential"),
   s15 = compute_sentiment(corpus, lex, how = "UShaped"),
   s16 = compute_sentiment(corpus, lex, how = "inverseUShaped"),
   # s17 = compute_sentiment(corpus, lex, how = "TF"),
@@ -101,11 +104,6 @@ sentimentList <- list(
 )
 
 # compute_sentiment
-# load(system.file("extdata", "test_data.rda", package = "sentometrics")) # benchmark sentiment scores
-# test_that("Agreement between legacy benchmark and current produced sentiment scores", {
-#   expect_equal(test_data, sentimentList[1:11])
-# })
-
 test_that("Agreement between sentiment scores on document-level across input objects", {
   expect_true(all(unlist(lapply(sentimentList, function(s) nrow(s) == 250))))
   expect_true(all(unlist(lapply(sentimentList[-1], function(s) all(s$word_count == sentimentList$s1$word_count)))))
@@ -115,21 +113,27 @@ test_that("Agreement between sentiment scores on document-level across input obj
                     sentimentList$s5[, c("GI_en", "LM_en", "HENRY_en")])
   expect_equivalent(sentimentList$s6[, -c(1:2)],
                     sentimentList$s7[, colnames(sentimentList$s6)[-c(1:2)], with = FALSE])
-  expect_error(compute_sentiment(quanteda::texts(corpus), lex, how = "notAnOption"))
-  expect_warning(compute_sentiment(quanteda::texts(corpus), lex, how = "counts", nCore = -1))
-  expect_error(compute_sentiment(quanteda::texts(corpus), list_lexicons))
+  expect_error(compute_sentiment(as.character(corpus), lex, how = "notAnOption"))
+  expect_warning(compute_sentiment(as.character(corpus), lex, how = "counts", nCore = -1))
+  expect_error(compute_sentiment(as.character(corpus), list_lexicons))
   expect_true(all.equal(sentimentList$s3[3, -1],
                         compute_sentiment(scorp[3], lex, how = "proportional")[, -1]))
   # expect_warning(compute_sentiment(vcorp, lex, how = "proportional"))
   expect_error(compute_sentiment(corpusLang, lex, how = "proportional"))
   expect_true("language" %in% colnames(quanteda::docvars(corpusLang)))
   expect_error(compute_sentiment(corpusLang, lexWrong, how = "proportional"))
-  expect_true(all.equal(sentimentList$s1$GI_en, sanity_sentiment(quanteda::texts(corpus), lex$GI_en, lex$valence)))
-  expect_true(all.equal(sentimentList$s2$GI_en, sanity_sentiment(quanteda::texts(corpus), lex$GI_en)))
+
+  # expect_true(all.equal(test_data, sentimentList[1:11])) # compare with old sentiment scores
+  setcolorder(sentimentList[[7]], names(test_data[[7]])) # make column order the same
+  setcolorder(sentimentList[[10]], names(test_data[[10]])) # make column order the same
+  expect_equal(test_data, sentimentList[1:11])
+
+  expect_true(all.equal(sentimentList$s1$GI_en, sanity_sentiment(as.character(corpus), lex$GI_en, lex$valence)))
+  expect_true(all.equal(sentimentList$s2$GI_en, sanity_sentiment(as.character(corpus), lex$GI_en)))
 })
 
 sentimentSentenceList <- list(
-  s1 = compute_sentiment(quanteda::texts(corpus), lexClust, how = "counts", do.sentence = TRUE),
+  s1 = compute_sentiment(as.character(corpus), lexClust, how = "counts", do.sentence = TRUE),
   s2 = compute_sentiment(quanteda::corpus(usnews[1:250, "texts"]),
                          lexClust, how = "counts", do.sentence = TRUE),
   s3 = compute_sentiment(quanteda::corpus(usnews[1:250, c("texts", "wsj", "economy")], text_field = "texts"),
@@ -194,12 +198,12 @@ test_that("Correct binding of several sentiment objects", {
 })
 
 # tf-idf comparison sentometrics vs. quanteda
-toks <- stri_split_boundaries(stri_trans_tolower(quanteda::texts(corpus)), type = "word", skip_word_none = TRUE)
+toks <- stri_split_boundaries(stri_trans_tolower(as.character(corpus)), type = "word", skip_word_none = TRUE)
 dfmQ <- quanteda::dfm(as.tokens(toks)) %>% dfm_tfidf(k = 1)
 posScores <- rowSums(as.matrix(quanteda::dfm_select(dfmQ, lex$GI_en[y == 1, x])))
 negScores <- rowSums(as.matrix(quanteda::dfm_select(dfmQ, lex$GI_en[y == -1, x])))
 test_that("Same tf-idf scoring for sentometrics and quanteda", {
-  expect_equal(compute_sentiment(quanteda::texts(corpus), lex[-length(lex)], tokens = toks, "TFIDF")[["GI_en"]],
+  expect_equal(compute_sentiment(as.character(corpus), lex[-length(lex)], tokens = toks, "TFIDF")[["GI_en"]],
                unname(posScores - negScores))
 })
 
diff --git a/vignettes/examples/sentiment.Rmd b/vignettes/examples/sentiment.Rmd
index 42a09bc..94824cc 100644
--- a/vignettes/examples/sentiment.Rmd
+++ b/vignettes/examples/sentiment.Rmd
@@ -88,7 +88,7 @@ tks <- as.list(tokens(corpus, what = "fastestword"))
 
 lexicons <- sento_lexicons(list_lexicons[c("GI_en", "LM_en", "HENRY_en")])
 
-compute_sentiment(texts(corpus), lexicons, how = "counts", tokens = tks)
+compute_sentiment(as.character(corpus), lexicons, how = "counts", tokens = tks)
 ```
 
 To provide your own tokenized input on sentence-level, beware that you need to provide a `list` of `list`s, and set `do.sentence = TRUE`. See one of the next examples for more info about sentence-level sentiment calculation.
@@ -97,7 +97,7 @@ To provide your own tokenized input on sentence-level, beware that you need to p
 sentences <- tokens(corpus, what = "sentence")
 tks2 <- lapply(sentences, function(s) as.list(tokens(s, what = "word")))
 
-compute_sentiment(texts(corpus), lexicons[2:3], how = "counts", tokens = tks2, do.sentence = TRUE)
+compute_sentiment(as.character(corpus), lexicons[2:3], how = "counts", tokens = tks2, do.sentence = TRUE)
 ```
 
 ### The three key approaches to the sentiment computation

From f09092c7523539b0c036508603ecc0f7dfd39899 Mon Sep 17 00:00:00 2001
From: Sam Borms <borms_sam@hotmail.com>
Date: Tue, 30 Mar 2021 12:01:58 +0200
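Subject: [PATCH 2/2] Add back tokenize_texts() in corpus_summarize()

The previous patch dropped the tokenize_texts() call when computing
"nTokens", so length() was applied to each document's single text string
(always 1) instead of to its tokens. Restore the tokenization step.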

---
 R/sentocorpus.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/sentocorpus.R b/R/sentocorpus.R
index 0d1faa9..b093384 100644
--- a/R/sentocorpus.R
+++ b/R/sentocorpus.R
@@ -343,7 +343,7 @@ corpus_summarize <- function(x, by = "day", features = NULL) {
   # statistics
   dt <- data.table::data.table(
     quanteda::docvars(x),
-    "nTokens" = as.numeric(sapply(as.character(x), length))
+    "nTokens" = as.numeric(sapply(tokenize_texts(as.character(x)), length))
   )
 
   if (!is.null(features)) {