From 679c573207fec3249e69791682fd28fb9c8dd36d Mon Sep 17 00:00:00 2001
From: Michael Chirico <michaelchirico4@gmail.com>
Date: Sat, 30 Oct 2021 23:19:29 -0700
Subject: [PATCH 1/5] add cols= argument to unique.data.table

---
 R/duplicated.R | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/R/duplicated.R b/R/duplicated.R
index 4fc7c8d166..bc7825b072 100644
--- a/R/duplicated.R
+++ b/R/duplicated.R
@@ -23,7 +23,7 @@ duplicated.data.table = function(x, incomparables=FALSE, fromLast=FALSE, by=seq_
   res
 }
 
-unique.data.table = function(x, incomparables=FALSE, fromLast=FALSE, by=seq_along(x), ...) {
+unique.data.table = function(x, incomparables=FALSE, fromLast=FALSE, by=seq_along(x), cols=NULL, ...) {
   if (!cedta()) return(NextMethod("unique")) # nocov
   if (!isFALSE(incomparables)) {
     .NotYetUsed("incomparables != FALSE")
@@ -31,6 +31,8 @@ unique.data.table = function(x, incomparables=FALSE, fromLast=FALSE, by=seq_alon
   if (nrow(x) <= 1L) return(x)
   if (!length(by)) by = NULL  #4594
   o = forderv(x, by=by, sort=FALSE, retGrp=TRUE)
+  if (is.null(cols)) cols = names(x) else cols = c(by, cols)
+  x = .shallow(x, cols)
   # if by=key(x), forderv tests for orderedness within it quickly and will short-circuit
   # there isn't any need in unique() to call uniqlist like duplicated does; uniqlist returns a new nrow(x) vector anyway and isn't
   # as efficient as forderv returning empty o when input is already ordered

From 2d50461304e738a0908d1b62ecae99d781eb45f4 Mon Sep 17 00:00:00 2001
From: Michael Chirico <michaelchirico4@gmail.com>
Date: Sat, 30 Oct 2021 23:28:37 -0700
Subject: [PATCH 2/5] tests, manual

---
 inst/tests/tests.Rraw | 8 ++++++++
 man/duplicated.Rd     | 5 ++++-
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
index 6382a13a85..a4b2f80102 100644
--- a/inst/tests/tests.Rraw
+++ b/inst/tests/tests.Rraw
@@ -18348,3 +18348,11 @@ test(2225.1, groupingsets(data.table(iris), j=sum(Sepal.Length), by=c('Sp'='Spec
 test(2225.2, groupingsets(data.table(iris), j=mean(Sepal.Length), by=c('Sp'='Species'), sets=list('Species')),
              groupingsets(data.table(iris), j=mean(Sepal.Length), by=c('Species'), sets=list('Species')))
 
+# cols argument for unique.data.table, #5243
+DT = data.table(g = rep(letters, 3), v1=1:78, v2=78:1)
+test(2226.1, unique(DT, by='g', cols='v1'), DT[1:26, !'v2'])
+test(2226.2, unique(DT, by='g', cols='v2'), DT[1:26, !'v1'])
+## no duplicates
+test(2226.3, unique(DT[1:26], by='g', cols='v1'), DT[1:26, !'v2'])
+## invalid columns fail as expected
+test(2226.4, unique(DT, by='g', cols='v3'), error="non-existing column(s)")
diff --git a/man/duplicated.Rd b/man/duplicated.Rd
index a9c333beb5..945b404f70 100644
--- a/man/duplicated.Rd
+++ b/man/duplicated.Rd
@@ -28,7 +28,8 @@ memory efficient.
 \usage{
 \method{duplicated}{data.table}(x, incomparables=FALSE, fromLast=FALSE, by=seq_along(x), \dots)
 
-\method{unique}{data.table}(x, incomparables=FALSE, fromLast=FALSE, by=seq_along(x), \dots)
+\method{unique}{data.table}(x, incomparables=FALSE, fromLast=FALSE,
+by=seq_along(x), cols=NULL, \dots)
 
 \method{anyDuplicated}{data.table}(x, incomparables=FALSE, fromLast=FALSE, by=seq_along(x), \dots)
 
@@ -46,6 +47,8 @@ correspond to \code{duplicated = FALSE}.}
 of columns from \code{x} to use for uniqueness checks. By default all columns
 are being used. That was changed recently for consistency to data.frame methods.
 In version \code{< 1.9.8} default was \code{key(x)}.}
+\item{cols}{Columns (in addition to \code{by}) from \code{x} to include in the
+  resulting \code{data.table}; the default, \code{NULL}, keeps all columns.}
 \item{na.rm}{Logical (default is \code{FALSE}). Should missing values (including
 \code{NaN}) be removed?}
 }

From 01466569c31e7eb3f269f901c914a030f75e8106 Mon Sep 17 00:00:00 2001
From: Michael Chirico <michaelchirico4@gmail.com>
Date: Sat, 30 Oct 2021 23:32:01 -0700
Subject: [PATCH 3/5] NEWS

---
 NEWS.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/NEWS.md b/NEWS.md
index 5faf40723f..ec0fdf8556 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -207,6 +207,8 @@
     #  v1.14.4  0.4826  0.5586  0.6586  0.6329  0.7348  1.318   100
     ```
 
+31. `unique.data.table` gains an argument, `cols`, to specify a subset of columns (in addition to the `by` columns, which always come first) to include in the resulting `data.table`, [#5243](https://github.com/Rdatatable/data.table/issues/5243). This saves the memory overhead of subsetting unneeded columns, and provides a cleaner API for a common operation previously available only through more convoluted code. Thanks to @MichaelChirico for the suggestion & implementation.
+
 ## BUG FIXES
 
 1. `by=.EACHI` when `i` is keyed but `on=` different columns than `i`'s key could create an invalidly keyed result, [#4603](https://github.com/Rdatatable/data.table/issues/4603) [#4911](https://github.com/Rdatatable/data.table/issues/4911). Thanks to @myoung3 and @adamaltmejd for reporting, and @ColeMiller1 for the PR. An invalid key is where a `data.table` is marked as sorted by the key columns but the data is not sorted by those columns, leading to incorrect results from subsequent queries.

From 065890d421ae9b69d12c0493d7e85282f437e5dd Mon Sep 17 00:00:00 2001
From: Michael Chirico <michaelchirico4@gmail.com>
Date: Sun, 31 Oct 2021 00:00:54 -0700
Subject: [PATCH 4/5] need to retain keys

---
 R/duplicated.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/duplicated.R b/R/duplicated.R
index bc7825b072..ceab3b8d83 100644
--- a/R/duplicated.R
+++ b/R/duplicated.R
@@ -32,7 +32,7 @@ unique.data.table = function(x, incomparables=FALSE, fromLast=FALSE, by=seq_alon
   if (!length(by)) by = NULL  #4594
   o = forderv(x, by=by, sort=FALSE, retGrp=TRUE)
   if (is.null(cols)) cols = names(x) else cols = c(by, cols)
-  x = .shallow(x, cols)
+  x = .shallow(x, cols, retain.key=TRUE)
   # if by=key(x), forderv tests for orderedness within it quickly and will short-circuit
   # there isn't any need in unique() to call uniqlist like duplicated does; uniqlist returns a new nrow(x) vector anyway and isn't
   # as efficient as forderv returning empty o when input is already ordered

From 1bcd54dbfbb1b3c250b4731b24dd79ee9d9e4bee Mon Sep 17 00:00:00 2001
From: Michael Chirico <michaelchirico4@gmail.com>
Date: Sun, 31 Oct 2021 00:20:56 -0700
Subject: [PATCH 5/5] don't shallow copy unless requested; mention col order in
 man

---
 R/duplicated.R    | 5 +++--
 man/duplicated.Rd | 6 +++++-
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/R/duplicated.R b/R/duplicated.R
index ceab3b8d83..901d6e3c01 100644
--- a/R/duplicated.R
+++ b/R/duplicated.R
@@ -31,8 +31,9 @@ unique.data.table = function(x, incomparables=FALSE, fromLast=FALSE, by=seq_alon
   if (nrow(x) <= 1L) return(x)
   if (!length(by)) by = NULL  #4594
   o = forderv(x, by=by, sort=FALSE, retGrp=TRUE)
-  if (is.null(cols)) cols = names(x) else cols = c(by, cols)
-  x = .shallow(x, cols, retain.key=TRUE)
+  if (!is.null(cols)) {
+      x = .shallow(x, c(by, cols), retain.key=TRUE)
+  }
   # if by=key(x), forderv tests for orderedness within it quickly and will short-circuit
   # there isn't any need in unique() to call uniqlist like duplicated does; uniqlist returns a new nrow(x) vector anyway and isn't
   # as efficient as forderv returning empty o when input is already ordered
diff --git a/man/duplicated.Rd b/man/duplicated.Rd
index 945b404f70..daf7c39d58 100644
--- a/man/duplicated.Rd
+++ b/man/duplicated.Rd
@@ -62,7 +62,11 @@ handle cases where limitations in floating point representation is undesirable.
 
 \code{v1.9.4} introduces \code{anyDuplicated} method for data.tables and is
 similar to base in functionality. It also implements the logical argument
-\code{fromLast} for all three functions, with default value \code{FALSE}.
+\code{fromLast} for all three functions, with default value
+\code{FALSE}.
+
+Note: When \code{cols} is specified, the resulting table will have
+columns \code{c(by, cols)}, in that order.
 }
 \value{
 \code{duplicated} returns a logical vector of length \code{nrow(x)}