From 7c5328828034f22a140b410516478f392d4427dd Mon Sep 17 00:00:00 2001
From: Michael Chirico <michael.chirico@grabtaxi.com>
Date: Fri, 20 Mar 2020 00:48:38 +0800
Subject: [PATCH 1/3] expand & improve error message for grouping by
 unsupported types

---
 NEWS.md               | 2 ++
 R/data.table.R        | 6 +++++-
 inst/tests/tests.Rraw | 7 +++++--
 3 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/NEWS.md b/NEWS.md
index 71fd76aa65..0acb3a0805 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -145,6 +145,8 @@ unit = "s")
 
 7. Added more explanation/examples to `?data.table` for how to use `.BY`, [#1363](https://github.com/Rdatatable/data.table/issues/1363).
 
+8. Improved the error message for attempting to group by a `list` column, [#4308](https://github.com/Rdatatable/data.table/issues/4308). Thanks @sindribaldur for reaching out about the unclear message. We don't offer `by=list_column` support because grouping in `data.table` requires sorting, and sort order on lists is not well-defined. Other frameworks accomplish group-by-list by doing equality comparisons (e.g. with hashing) and sorting arbitrarily; please add your support and use case in related issue [#1597](https://github.com/Rdatatable/data.table/issues/1597) if your work would benefit substantially from this.
+
 
 # data.table [v1.12.8](https://github.com/Rdatatable/data.table/milestone/15?closed=1)  (09 Dec 2019)
 
diff --git a/R/data.table.R b/R/data.table.R
index 9292ee940b..eea3393102 100644
--- a/R/data.table.R
+++ b/R/data.table.R
@@ -822,7 +822,11 @@ replace_dot_alias = function(e) {
         if (!is.list(byval)) stop("'by' or 'keyby' must evaluate to a vector or a list of vectors (where 'list' includes data.table and data.frame which are lists, too)")
         if (length(byval)==1L && is.null(byval[[1L]])) bynull=TRUE #3530 when by=(function()NULL)()
         if (!bynull) for (jj in seq_len(length(byval))) {
-          if (!typeof(byval[[jj]]) %chin% ORDERING_TYPES) stop("column or expression ",jj," of 'by' or 'keyby' is type ",typeof(byval[[jj]]),". Do not quote column names. Usage: DT[,sum(colC),by=list(colA,month(colB))]")
+          if (!(this_type <- typeof(byval[[jj]])) %chin% ORDERING_TYPES) {
+            if (this_type == 'list')
+              stop(gettextf("Column or expression %d of 'by' or 'keyby' is a list. In data.table, aggregation requires sorting by the grouping key, and lists can't be sorted in general. If you don't care about sort order, you might try converting the list to a string representation which can be sorted, e.g.  by = sapply(list_col, toString).", jj, domain="R-data.table"), domain = NA)
+            stop(gettextf("Column or expression %d of 'by' or 'keyby' is type '%s'. In data.table, aggregation requires sorting by the grouping key, and our internals don't support this type. If you have a compelling use case, please file an issue to support this type; as a workaround, consider converting the column to a sortable type (e.g. character), taking care to maintain distinctness in the process.", jj, this_type, domain="R-data.table"), domain=NA)
+          }
         }
         tt = vapply_1i(byval,length)
         if (any(tt!=xnrow)) stop(gettextf("The items in the 'by' or 'keyby' list are length(s) (%s). Each must be length %d; the same length as there are rows in x (after subsetting if i is provided).", paste(tt, collapse=","), xnrow, domain='R-data.table'))
diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
index 7cc6819e8f..713cb492e4 100644
--- a/inst/tests/tests.Rraw
+++ b/inst/tests/tests.Rraw
@@ -4,7 +4,6 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) {
   if (!identical(suppressWarnings(packageDescription("data.table")), NA)) {
     remove.packages("data.table")
     stop("This is dev mode but data.table was installed. Uninstalled it. Please q() this R session and try cc() again. The installed namespace causes problems in dev mode for the S4 tests.\n")
-  }
   if ((tt<-compiler::enableJIT(-1))>0)
     cat("This is dev mode and JIT is enabled (level ", tt, ") so there will be a brief pause around the first test.\n", sep="")
 } else {
@@ -13936,7 +13935,7 @@ test(1984.05, DT[ , sum(b), keyby = c, verbose = TRUE],
 ### hitting byval = eval(bysub, setattr(as.list(seq_along(xss)), ...)
 test(1984.06, DT[1:3, sum(a), by=b:c], data.table(b=10:8, c=1:3, V1=1:3))
 test(1984.07, DT[, sum(a), by=call('sin',pi)], error='must evaluate to a vector or a list of vectors')
-test(1984.08, DT[, sum(a), by=as.raw(0)],           error='column or expression.*type raw')
+test(1984.08, DT[, sum(a), by=as.raw(0)],           error="column or expression.*type 'raw'")
 test(1984.09, DT[, sum(a), by=.(1,1:2)],       error='The items.*list are length[(]s[)] [(]1,2[)].*Each must be length 10; .*rows in x.*after subsetting')
 options('datatable.optimize' = Inf)
 test(1984.10, DT[ , 1, by = .(a %% 2), verbose = TRUE],
@@ -16846,3 +16845,7 @@ A = data.table(A=c(complex(real = 1:3, imaginary=c(0, -1, 1)), NaN))
 test(2138.3, rbind(A,B), data.table(A=c(as.character(A$A), B$A)))
 A = data.table(A=as.complex(rep(NA, 5)))
 test(2138.4, rbind(A,B), data.table(A=c(as.character(A$A), B$A)))
+
+#4308 better error message for grouping by list
+DT = data.table(l = list(1, 1:2, 1, 1:3), v=1:4)
+test(2139, DT[ , sum(v), by = list(l)], error="Column or expression.*is a list")

From 20a5012282dd321de99c9dd4b1a9ba8dd9dbf116 Mon Sep 17 00:00:00 2001
From: Michael Chirico <michael.chirico@grabtaxi.com>
Date: Fri, 20 Mar 2020 00:49:54 +0800
Subject: [PATCH 2/3] accidental backspace

---
 inst/tests/tests.Rraw | 1 +
 1 file changed, 1 insertion(+)

diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
index 713cb492e4..0c1e02c3ee 100644
--- a/inst/tests/tests.Rraw
+++ b/inst/tests/tests.Rraw
@@ -4,6 +4,7 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) {
   if (!identical(suppressWarnings(packageDescription("data.table")), NA)) {
     remove.packages("data.table")
     stop("This is dev mode but data.table was installed. Uninstalled it. Please q() this R session and try cc() again. The installed namespace causes problems in dev mode for the S4 tests.\n")
+  }
   if ((tt<-compiler::enableJIT(-1))>0)
     cat("This is dev mode and JIT is enabled (level ", tt, ") so there will be a brief pause around the first test.\n", sep="")
 } else {

From f50677fea51dada6d6c3e2e19f2cf88c7bc62e53 Mon Sep 17 00:00:00 2001
From: Matt Dowle <mattjdowle@gmail.com>
Date: Mon, 14 Jun 2021 13:33:50 -0600
Subject: [PATCH 3/3] one error message instead of two, added feature request
 link to the message itself, moved test up

---
 R/data.table.R        | 4 +---
 inst/tests/tests.Rraw | 7 +++----
 2 files changed, 4 insertions(+), 7 deletions(-)

diff --git a/R/data.table.R b/R/data.table.R
index e277571f0c..392599da71 100644
--- a/R/data.table.R
+++ b/R/data.table.R
@@ -882,9 +882,7 @@ replace_dot_alias = function(e) {
         if (length(byval)==1L && is.null(byval[[1L]])) bynull=TRUE #3530 when by=(function()NULL)()
         if (!bynull) for (jj in seq_len(length(byval))) {
           if (!(this_type <- typeof(byval[[jj]])) %chin% ORDERING_TYPES) {
-            if (this_type == 'list')
-              stop(gettextf("Column or expression %d of 'by' or 'keyby' is a list. In data.table, aggregation requires sorting by the grouping key, and lists can't be sorted in general. If you don't care about sort order, you might try converting the list to a string representation which can be sorted, e.g.  by = sapply(list_col, toString).", jj, domain="R-data.table"), domain = NA)
-            stop(gettextf("Column or expression %d of 'by' or 'keyby' is type '%s'. In data.table, aggregation requires sorting by the grouping key, and our internals don't support this type. If you have a compelling use case, please file an issue to support this type; as a workaround, consider converting the column to a sortable type (e.g. character), taking care to maintain distinctness in the process.", jj, this_type, domain="R-data.table"), domain=NA)
+            stop(gettextf("Column or expression %d of 'by' or 'keyby' is type '%s' which is not currently supported. If you have a compelling use case, please add it to https://github.com/Rdatatable/data.table/issues/1597. As a workaround, consider converting the column to a supported type, e.g. by=sapply(list_col, toString), whilst taking care to maintain distinctness in the process.", jj, this_type))
           }
         }
         tt = vapply_1i(byval,length)
diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
index 96c1de1df6..5b21448fa6 100644
--- a/inst/tests/tests.Rraw
+++ b/inst/tests/tests.Rraw
@@ -14086,7 +14086,9 @@ test(1984.05, DT[ , sum(b), keyby = c, verbose = TRUE],
 ### hitting byval = eval(bysub, setattr(as.list(seq_along(xss)), ...)
 test(1984.06, DT[1:3, sum(a), by=b:c], data.table(b=10:8, c=1:3, V1=1:3))
 test(1984.07, DT[, sum(a), by=call('sin',pi)], error='must evaluate to a vector or a list of vectors')
-test(1984.08, DT[, sum(a), by=as.raw(0)],           error="column or expression.*type 'raw'")
+test(1984.081, DT[, sum(a), by=as.raw(0)],     error="Column or expression.*1.*type 'raw'.*not.*supported")
+test(1984.082, data.table(A=1:4, L=list(1, 1:2, 1, 1:3), V=1:4)[, sum(V), by=.(A,L)],  # better error message, 4308
+               error="Column or expression.*2.*type 'list'.*not.*supported")
 test(1984.09, DT[, sum(a), by=.(1,1:2)],       error='The items.*list are length[(]s[)] [(]1,2[)].*Each must be length 10; .*rows in x.*after subsetting')
 options('datatable.optimize' = Inf)
 test(1984.10, DT[ , 1, by = .(a %% 2), verbose = TRUE],
@@ -17726,6 +17728,3 @@ if (test_bit64) {
   test(2193.2, X[Y, `:=`(y=i.y), on="x", by=.EACHI], data.table(x=1:3, y=as.integer64(10L,20L,NA)))
 }
 
-# better error message for grouping by list, #4308
-DT = data.table(l = list(1, 1:2, 1, 1:3), v=1:4)
-test(2194, DT[ , sum(v), by = list(l)], error="Column or expression.*is a list")