From 679c573207fec3249e69791682fd28fb9c8dd36d Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Sat, 30 Oct 2021 23:19:29 -0700 Subject: [PATCH 1/5] add cols= argument to unique.data.table --- R/duplicated.R | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/R/duplicated.R b/R/duplicated.R index 4fc7c8d166..bc7825b072 100644 --- a/R/duplicated.R +++ b/R/duplicated.R @@ -23,7 +23,7 @@ duplicated.data.table = function(x, incomparables=FALSE, fromLast=FALSE, by=seq_ res } -unique.data.table = function(x, incomparables=FALSE, fromLast=FALSE, by=seq_along(x), ...) { +unique.data.table = function(x, incomparables=FALSE, fromLast=FALSE, by=seq_along(x), cols=NULL, ...) { if (!cedta()) return(NextMethod("unique")) # nocov if (!isFALSE(incomparables)) { .NotYetUsed("incomparables != FALSE") @@ -31,6 +31,8 @@ unique.data.table = function(x, incomparables=FALSE, fromLast=FALSE, by=seq_alon if (nrow(x) <= 1L) return(x) if (!length(by)) by = NULL #4594 o = forderv(x, by=by, sort=FALSE, retGrp=TRUE) + if (is.null(cols)) cols = names(x) else cols = c(by, cols) + x = .shallow(x, cols) # if by=key(x), forderv tests for orderedness within it quickly and will short-circuit # there isn't any need in unique() to call uniqlist like duplicated does; uniqlist returns a new nrow(x) vector anyway and isn't # as efficient as forderv returning empty o when input is already ordered From 2d50461304e738a0908d1b62ecae99d781eb45f4 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Sat, 30 Oct 2021 23:28:37 -0700 Subject: [PATCH 2/5] tests, manual --- inst/tests/tests.Rraw | 8 ++++++++ man/duplicated.Rd | 5 ++++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 6382a13a85..a4b2f80102 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18348,3 +18348,11 @@ test(2225.1, groupingsets(data.table(iris), j=sum(Sepal.Length), by=c('Sp'='Spec test(2225.2, groupingsets(data.table(iris), 
j=mean(Sepal.Length), by=c('Sp'='Species'), sets=list('Species')), groupingsets(data.table(iris), j=mean(Sepal.Length), by=c('Species'), sets=list('Species'))) +# cols argument for unique.data.table, #5243 +DT = data.table(g = rep(letters, 3), v1=1:78, v2=78:1) +test(2226.1, unique(DT, by='g', cols='v1'), DT[1:26, !'v2']) +test(2226.2, unique(DT, by='g', cols='v2'), DT[1:26, !'v1']) +## no duplicates +test(2226.3, unique(DT[1:26], by='g', cols='v1'), DT[1:26, !'v2']) +## invalid columns fail as expected +test(2226.4, unique(DT, by='g', cols='v3'), error="non-existing column(s)") diff --git a/man/duplicated.Rd b/man/duplicated.Rd index a9c333beb5..945b404f70 100644 --- a/man/duplicated.Rd +++ b/man/duplicated.Rd @@ -28,7 +28,8 @@ memory efficient. \usage{ \method{duplicated}{data.table}(x, incomparables=FALSE, fromLast=FALSE, by=seq_along(x), \dots) -\method{unique}{data.table}(x, incomparables=FALSE, fromLast=FALSE, by=seq_along(x), \dots) +\method{unique}{data.table}(x, incomparables=FALSE, fromLast=FALSE, +by=seq_along(x), cols=NULL, \dots) \method{anyDuplicated}{data.table}(x, incomparables=FALSE, fromLast=FALSE, by=seq_along(x), \dots) @@ -46,6 +47,8 @@ correspond to \code{duplicated = FALSE}.} of columns from \code{x} to use for uniqueness checks. By default all columns are being used. That was changed recently for consistency to data.frame methods. In version \code{< 1.9.8} default was \code{key(x)}.} +\item{cols}{Columns (in addition to \code{by}) from \code{x} to include in the + resulting \code{data.table}.} \item{na.rm}{Logical (default is \code{FALSE}). 
Should missing values (including \code{NaN}) be removed?} } From 01466569c31e7eb3f269f901c914a030f75e8106 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Sat, 30 Oct 2021 23:32:01 -0700 Subject: [PATCH 3/5] NEWS --- NEWS.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/NEWS.md b/NEWS.md index 5faf40723f..ec0fdf8556 100644 --- a/NEWS.md +++ b/NEWS.md @@ -207,6 +207,8 @@ # v1.14.4 0.4826 0.5586 0.6586 0.6329 0.7348 1.318 100 ``` +31. `unique.data.table` gains an argument, `cols`, to specify which columns (in addition to those in `by`) to include in the resulting `data.table`, [#5243](https://github.com/Rdatatable/data.table/issues/5243). When `cols` is supplied, the result has columns `c(by, cols)`, in that order. This saves the memory overhead of subsetting unneeded columns, and provides a cleaner API for a common operation previously available from more convoluted code. Thanks to @MichaelChirico for the suggestion & implementation. + ## BUG FIXES 1. `by=.EACHI` when `i` is keyed but `on=` different columns than `i`'s key could create an invalidly keyed result, [#4603](https://github.com/Rdatatable/data.table/issues/4603) [#4911](https://github.com/Rdatatable/data.table/issues/4911). Thanks to @myoung3 and @adamaltmejd for reporting, and @ColeMiller1 for the PR. An invalid key is where a `data.table` is marked as sorted by the key columns but the data is not sorted by those columns, leading to incorrect results from subsequent queries. 
From 065890d421ae9b69d12c0493d7e85282f437e5dd Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Sun, 31 Oct 2021 00:00:54 -0700 Subject: [PATCH 4/5] need to retain keys --- R/duplicated.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/duplicated.R b/R/duplicated.R index bc7825b072..ceab3b8d83 100644 --- a/R/duplicated.R +++ b/R/duplicated.R @@ -32,7 +32,7 @@ unique.data.table = function(x, incomparables=FALSE, fromLast=FALSE, by=seq_alon if (!length(by)) by = NULL #4594 o = forderv(x, by=by, sort=FALSE, retGrp=TRUE) if (is.null(cols)) cols = names(x) else cols = c(by, cols) - x = .shallow(x, cols) + x = .shallow(x, cols, retain.key=TRUE) # if by=key(x), forderv tests for orderedness within it quickly and will short-circuit # there isn't any need in unique() to call uniqlist like duplicated does; uniqlist returns a new nrow(x) vector anyway and isn't # as efficient as forderv returning empty o when input is already ordered From 1bcd54dbfbb1b3c250b4731b24dd79ee9d9e4bee Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Sun, 31 Oct 2021 00:20:56 -0700 Subject: [PATCH 5/5] dont shallow copy unless requested; mention col order in man --- R/duplicated.R | 5 +++-- man/duplicated.Rd | 6 +++++- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/R/duplicated.R b/R/duplicated.R index ceab3b8d83..901d6e3c01 100644 --- a/R/duplicated.R +++ b/R/duplicated.R @@ -31,8 +31,9 @@ unique.data.table = function(x, incomparables=FALSE, fromLast=FALSE, by=seq_alon if (nrow(x) <= 1L) return(x) if (!length(by)) by = NULL #4594 o = forderv(x, by=by, sort=FALSE, retGrp=TRUE) - if (is.null(cols)) cols = names(x) else cols = c(by, cols) - x = .shallow(x, cols, retain.key=TRUE) + if (!is.null(cols)) { + x = .shallow(x, c(by, cols), retain.key=TRUE) + } # if by=key(x), forderv tests for orderedness within it quickly and will short-circuit # there isn't any need in unique() to call uniqlist like duplicated does; uniqlist returns a new nrow(x) vector anyway and 
isn't # as efficient as forderv returning empty o when input is already ordered diff --git a/man/duplicated.Rd b/man/duplicated.Rd index 945b404f70..daf7c39d58 100644 --- a/man/duplicated.Rd +++ b/man/duplicated.Rd @@ -62,7 +62,11 @@ handle cases where limitations in floating point representation is undesirable. \code{v1.9.4} introduces \code{anyDuplicated} method for data.tables and is similar to base in functionality. It also implements the logical argument -\code{fromLast} for all three functions, with default value \code{FALSE}. +\code{fromLast} for all three functions, with default value +\code{FALSE}. + +Note: When \code{cols} is specified, the resulting table will have +columns \code{c(by, cols)}, in that order. } \value{ \code{duplicated} returns a logical vector of length \code{nrow(x)}