From 7c5328828034f22a140b410516478f392d4427dd Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Fri, 20 Mar 2020 00:48:38 +0800 Subject: [PATCH 1/3] expand & improve error message for grouping by unsupported types --- NEWS.md | 2 ++ R/data.table.R | 6 +++++- inst/tests/tests.Rraw | 7 +++++-- 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/NEWS.md b/NEWS.md index 71fd76aa65..0acb3a0805 100644 --- a/NEWS.md +++ b/NEWS.md @@ -145,6 +145,8 @@ unit = "s") 7. Added more explanation/examples to `?data.table` for how to use `.BY`, [#1363](https://github.com/Rdatatable/data.table/issues/1363). +8. Improved the error message for attempting to group by a `list` column, [#4308](https://github.com/Rdatatable/data.table/issues/4308). Thanks @sindribaldur for reaching out about the unclear message. We don't offer `by=list_column` support because grouping in `data.table` requires sorting, and sort order on lists is not well-defined. Other frameworks accomplish group-by-list by doing equality comparisons (e.g. with hashing) and sorting arbitrarily; please add your support and use case in related issue [#1597](https://github.com/Rdatatable/data.table/issues/1597) if your work would benefit substantially from this. + # data.table [v1.12.8](https://github.com/Rdatatable/data.table/milestone/15?closed=1) (09 Dec 2019) diff --git a/R/data.table.R b/R/data.table.R index 9292ee940b..eea3393102 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -822,7 +822,11 @@ replace_dot_alias = function(e) { if (!is.list(byval)) stop("'by' or 'keyby' must evaluate to a vector or a list of vectors (where 'list' includes data.table and data.frame which are lists, too)") if (length(byval)==1L && is.null(byval[[1L]])) bynull=TRUE #3530 when by=(function()NULL)() if (!bynull) for (jj in seq_len(length(byval))) { - if (!typeof(byval[[jj]]) %chin% ORDERING_TYPES) stop("column or expression ",jj," of 'by' or 'keyby' is type ",typeof(byval[[jj]]),". Do not quote column names. Usage: DT[,sum(colC),by=list(colA,month(colB))]") + if (!(this_type <- typeof(byval[[jj]])) %chin% ORDERING_TYPES) { + if (this_type == 'list') + stop(gettextf("Column or expression %d of 'by' or 'keyby' is a list. In data.table, aggregation requires sorting by the grouping key, and lists can't be sorted in general. If you don't care about sort order, you might try converting the list to a string representation which can be sorted, e.g. by = sapply(list_col, toString).", jj, domain="R-data.table"), domain = NA) + stop(gettextf("Column or expression %d of 'by' or 'keyby' is type '%s'. In data.table, aggregation requires sorting by the grouping key, and our internals don't support this type. If you have a compelling use case, please file an issue to support this type; as a workaround, consider converting the column to a sortable type (e.g. character), taking care to maintain distinctness in the process.", jj, this_type, domain="R-data.table"), domain=NA) + } } tt = vapply_1i(byval,length) if (any(tt!=xnrow)) stop(gettextf("The items in the 'by' or 'keyby' list are length(s) (%s). Each must be length %d; the same length as there are rows in x (after subsetting if i is provided).", paste(tt, collapse=","), xnrow, domain='R-data.table')) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 7cc6819e8f..713cb492e4 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -4,7 +4,6 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) { if (!identical(suppressWarnings(packageDescription("data.table")), NA)) { remove.packages("data.table") stop("This is dev mode but data.table was installed. Uninstalled it. Please q() this R session and try cc() again. The installed namespace causes problems in dev mode for the S4 tests.\n") - } if ((tt<-compiler::enableJIT(-1))>0) cat("This is dev mode and JIT is enabled (level ", tt, ") so there will be a brief pause around the first test.\n", sep="") } else { @@ -13936,7 +13935,7 @@ test(1984.05, DT[ , sum(b), keyby = c, verbose = TRUE], ### hitting byval = eval(bysub, setattr(as.list(seq_along(xss)), ...) test(1984.06, DT[1:3, sum(a), by=b:c], data.table(b=10:8, c=1:3, V1=1:3)) test(1984.07, DT[, sum(a), by=call('sin',pi)], error='must evaluate to a vector or a list of vectors') -test(1984.08, DT[, sum(a), by=as.raw(0)], error='column or expression.*type raw') +test(1984.08, DT[, sum(a), by=as.raw(0)], error="column or expression.*type 'raw'") test(1984.09, DT[, sum(a), by=.(1,1:2)], error='The items.*list are length[(]s[)] [(]1,2[)].*Each must be length 10; .*rows in x.*after subsetting') options('datatable.optimize' = Inf) test(1984.10, DT[ , 1, by = .(a %% 2), verbose = TRUE], @@ -16846,3 +16845,7 @@ A = data.table(A=c(complex(real = 1:3, imaginary=c(0, -1, 1)), NaN)) test(2138.3, rbind(A,B), data.table(A=c(as.character(A$A), B$A))) A = data.table(A=as.complex(rep(NA, 5))) test(2138.4, rbind(A,B), data.table(A=c(as.character(A$A), B$A))) + +#4308 better error message for grouping by list +DT = data.table(l = list(1, 1:2, 1, 1:3), v=1:4) +test(2139, DT[ , sum(v), by = list(l)], error="Column or expression.*is a list") From 20a5012282dd321de99c9dd4b1a9ba8dd9dbf116 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Fri, 20 Mar 2020 00:49:54 +0800 Subject: [PATCH 2/3] accidental backspace --- inst/tests/tests.Rraw | 1 + 1 file changed, 1 insertion(+) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 713cb492e4..0c1e02c3ee 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -4,6 +4,7 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) { if (!identical(suppressWarnings(packageDescription("data.table")), NA)) { remove.packages("data.table") stop("This is dev mode but data.table was installed. Uninstalled it. Please q() this R session and try cc() again. The installed namespace causes problems in dev mode for the S4 tests.\n") + } if ((tt<-compiler::enableJIT(-1))>0) cat("This is dev mode and JIT is enabled (level ", tt, ") so there will be a brief pause around the first test.\n", sep="") } else { From f50677fea51dada6d6c3e2e19f2cf88c7bc62e53 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Mon, 14 Jun 2021 13:33:50 -0600 Subject: [PATCH 3/3] one error message instead of two, added feature request link to the message itself, moved test up --- R/data.table.R | 4 +--- inst/tests/tests.Rraw | 7 +++---- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index e277571f0c..392599da71 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -882,9 +882,7 @@ replace_dot_alias = function(e) { if (length(byval)==1L && is.null(byval[[1L]])) bynull=TRUE #3530 when by=(function()NULL)() if (!bynull) for (jj in seq_len(length(byval))) { if (!(this_type <- typeof(byval[[jj]])) %chin% ORDERING_TYPES) { - if (this_type == 'list') - stop(gettextf("Column or expression %d of 'by' or 'keyby' is a list. In data.table, aggregation requires sorting by the grouping key, and lists can't be sorted in general. If you don't care about sort order, you might try converting the list to a string representation which can be sorted, e.g. by = sapply(list_col, toString).", jj, domain="R-data.table"), domain = NA) - stop(gettextf("Column or expression %d of 'by' or 'keyby' is type '%s'. In data.table, aggregation requires sorting by the grouping key, and our internals don't support this type. If you have a compelling use case, please file an issue to support this type; as a workaround, consider converting the column to a sortable type (e.g. character), taking care to maintain distinctness in the process.", jj, this_type, domain="R-data.table"), domain=NA) + stop(gettextf("Column or expression %d of 'by' or 'keyby' is type '%s' which is not currently supported. If you have a compelling use case, please add it to https://github.com/Rdatatable/data.table/issues/1597. As a workaround, consider converting the column to a supported type, e.g. by=sapply(list_col, toString), whilst taking care to maintain distinctness in the process.", jj, this_type)) } } tt = vapply_1i(byval,length) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 96c1de1df6..5b21448fa6 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -14086,7 +14086,9 @@ test(1984.05, DT[ , sum(b), keyby = c, verbose = TRUE], ### hitting byval = eval(bysub, setattr(as.list(seq_along(xss)), ...) test(1984.06, DT[1:3, sum(a), by=b:c], data.table(b=10:8, c=1:3, V1=1:3)) test(1984.07, DT[, sum(a), by=call('sin',pi)], error='must evaluate to a vector or a list of vectors') -test(1984.08, DT[, sum(a), by=as.raw(0)], error="column or expression.*type 'raw'") +test(1984.081, DT[, sum(a), by=as.raw(0)], error="Column or expression.*1.*type 'raw'.*not.*supported") +test(1984.082, data.table(A=1:4, L=list(1, 1:2, 1, 1:3), V=1:4)[, sum(V), by=.(A,L)], # better error message, 4308 + error="Column or expression.*2.*type 'list'.*not.*supported") test(1984.09, DT[, sum(a), by=.(1,1:2)], error='The items.*list are length[(]s[)] [(]1,2[)].*Each must be length 10; .*rows in x.*after subsetting') options('datatable.optimize' = Inf) test(1984.10, DT[ , 1, by = .(a %% 2), verbose = TRUE], @@ -17726,6 +17728,3 @@ if (test_bit64) { test(2193.2, X[Y, `:=`(y=i.y), on="x", by=.EACHI], data.table(x=1:3, y=as.integer64(10L,20L,NA))) } -# better error message for grouping by list, #4308 -DT = data.table(l = list(1, 1:2, 1, 1:3), v=1:4) -test(2194, DT[ , sum(v), by = list(l)], error="Column or expression.*is a list")