From 8667a2f5a355ee7781f1fe5a5fe2fde2e67122c9 Mon Sep 17 00:00:00 2001 From: jangorecki Date: Thu, 2 Apr 2020 14:00:49 +0100 Subject: [PATCH 1/4] setorderv gets new arg neworder, closes #4012 --- NEWS.md | 14 +++++++++++ R/setkey.R | 57 ++++++++++++++++++++++++++----------------- inst/tests/tests.Rraw | 19 +++++++++++++++ man/setorder.Rd | 13 ++++++++-- src/reorder.c | 5 ++-- 5 files changed, 82 insertions(+), 26 deletions(-) diff --git a/NEWS.md b/NEWS.md index 71fd76aa65..59596479dd 100644 --- a/NEWS.md +++ b/NEWS.md @@ -81,6 +81,20 @@ unit = "s") 14. Added support for `round()` and `trunc()` to extend functionality of `ITime`. `round()` and `trunc()` can be used with argument units: "hours" or "minutes". Thanks to @JensPederM for the suggestion and PR. +15. Function `setorderv` gains a new argument `neworder`, which lets the user specify a custom row order directly, [#4012](https://github.com/Rdatatable/data.table/issues/4012). + +```r +DT = data.table(id1 = c("a","b","c","d"), v1 = rnorm(4)) + +# move first row to the end +setorderv(DT, neworder = c(2:4,1L)) +DT + +# random order +setorderv(DT, neworder = sample(nrow(DT))) +DT +``` + ## BUG FIXES 1. A NULL timezone on POSIXct was interpreted by `as.IDate` and `as.ITime` as UTC rather than the session's default timezone (`tz=""`) , [#4085](https://github.com/Rdatatable/data.table/issues/4085). 
diff --git a/R/setkey.R b/R/setkey.R index 334ca1e801..1e130777ce 100644 --- a/R/setkey.R +++ b/R/setkey.R @@ -274,30 +274,43 @@ setorder = function(x, ..., na.last=FALSE) setorderv(x, cols, order, na.last) } -setorderv = function(x, cols = colnames(x), order=1L, na.last=FALSE) -{ - if (is.null(cols)) return(x) +setorderv = function(x, cols = colnames(x), order=1L, na.last=FALSE, neworder) { if (!is.data.frame(x)) stop("x must be a data.frame or data.table") - na.last = as.logical(na.last) - if (is.na(na.last) || !length(na.last)) stop('na.last must be logical TRUE/FALSE') - if (!is.character(cols)) stop("cols is not a character vector. Please see further information in ?setorder.") - if (!length(cols)) { - warning("cols is a character vector of zero length. Use NULL instead, or wrap with suppressWarnings() to avoid this warning.") - return(x) - } - if (!all(nzchar(cols))) stop("cols contains some blanks.") # TODO: probably I'm checking more than necessary here.. there are checks in 'forderv' as well - # remove backticks from cols - cols = gsub("`", "", cols, fixed = TRUE) - miss = !(cols %chin% colnames(x)) - if (any(miss)) stop("some columns are not in the data.table: ", paste(cols[miss], collapse=",")) - if (".xi" %chin% colnames(x)) stop("x contains a column called '.xi'. Conflicts with internal use by data.table.") - for (i in cols) { - .xi = x[[i]] # [[ is copy on write, otherwise checking type would be copying each column - if (!typeof(.xi) %chin% ORDERING_TYPES) stop("Column '",i,"' is type '",typeof(.xi),"' which is not supported for ordering currently.") - } - if (!is.character(cols) || length(cols)<1L) stop("Internal error. 
'cols' should be character at this point in setkey; please report.") # nocov + if (is.null(cols) || !length(x)) return(x) + if (!missing(neworder)) { + if (!missing(cols)) + stop("Provide either cols or neworder, not both") + if (!missing(order)) + warning("Argument order is ignored when neworder argument was provided") + if (!missing(na.last)) + warning("Argument na.last is ignored when neworder argument was provided") + if (length(neworder) != nrow(x)) + stop("Provided neworder is a different length than nrow of provided data.table") + if (!is.integer(neworder) && is.numeric(neworder)) + neworder = as.integer(neworder) + o = neworder + } else { + na.last = as.logical(na.last) + if (is.na(na.last) || !length(na.last)) stop('na.last must be logical TRUE/FALSE') + if (!is.character(cols)) stop("cols is not a character vector. Please see further information in ?setorder.") + if (!length(cols)) { + warning("cols is a character vector of zero length. Use NULL instead, or wrap with suppressWarnings() to avoid this warning.") + return(x) + } + if (!all(nzchar(cols))) stop("cols contains some blanks.") # TODO: probably I'm checking more than necessary here.. there are checks in 'forderv' as well + # remove backticks from cols + cols = gsub("`", "", cols, fixed = TRUE) + miss = !(cols %chin% colnames(x)) + if (any(miss)) stop("some columns are not in the data.table: ", paste(cols[miss], collapse=",")) + if (".xi" %chin% colnames(x)) stop("x contains a column called '.xi'. Conflicts with internal use by data.table.") + for (i in cols) { + .xi = x[[i]] # [[ is copy on write, otherwise checking type would be copying each column + if (!typeof(.xi) %chin% ORDERING_TYPES) stop("Column '",i,"' is type '",typeof(.xi),"' which is not supported for ordering currently.") + } + if (!is.character(cols) || length(cols)<1L) stop("Internal error. 
'cols' should be character at this point in setkey; please report.") # nocov - o = forderv(x, cols, sort=TRUE, retGrp=FALSE, order=order, na.last=na.last) + o = forderv(x, cols, sort=TRUE, retGrp=FALSE, order=order, na.last=na.last) + } if (length(o)) { .Call(Creorder, x, o) if (is.data.frame(x) & !is.data.table(x)) { diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 7cc6819e8f..2a6f226364 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -16846,3 +16846,22 @@ A = data.table(A=c(complex(real = 1:3, imaginary=c(0, -1, 1)), NaN)) test(2138.3, rbind(A,B), data.table(A=c(as.character(A$A), B$A))) A = data.table(A=as.complex(rep(NA, 5))) test(2138.4, rbind(A,B), data.table(A=c(as.character(A$A), B$A))) + +# setorderv could take index vector too #4012 +DT = data.table(id1 = c("a","b","c","d"), v1 = rnorm(4)) +d = copy(DT) +test(2139.01, setorderv(DT, neworder = c(2:4,1L)), d[c(2:4,1L)]) # move first row to the end +DT = copy(d) +s = sample(nrow(DT)) +test(2139.02, setorderv(DT, neworder = s), d[s]) # random order +DT = copy(d) +test(2139.03, setorderv(DT, order=1L, neworder = 1:4), d, warning = "Argument order is ignored") +test(2139.04, setorderv(DT, na.last=FALSE, neworder = 1:4), d, warning = "Argument na.last is ignored") +test(2139.05, setorderv(DT, order=1L, na.last=FALSE, neworder = 1:4), d, warning = c("Argument order is ignored","Argument na.last is ignored")) +test(2139.06, setorderv(DT, cols="id1", neworder = 1:4), error = "Provide either cols or neworder, not both") +test(2139.07, setorderv(DT, neworder = 1:3), error = "Provided neworder is a different length than nrow of provided data.table") +test(2139.08, setorderv(DT, neworder = 1:5), error = "Provided neworder is a different length than nrow of provided data.table") +test(2139.09, setorderv(DT, neworder = c(1L,1L,2L,3L)), error = "duplicated.") +test(2139.10, setorderv(DT, neworder = c(1L,2L,3L,NA_integer_)), error = "NA") +test(2139.11, setorderv(DT, neworder = 
c(1L,0L,2L,3L)), error = "out of range") +test(2139.12, setorderv(DT, neworder = c(1L,5L,2L,3L)), error = "out of range") diff --git a/man/setorder.Rd b/man/setorder.Rd index 6e7b598427..cd04eed393 100644 --- a/man/setorder.Rd +++ b/man/setorder.Rd @@ -28,7 +28,7 @@ Also note that \code{data.table} always reorders in "C-locale" (see Details). To \usage{ setorder(x, \dots, na.last=FALSE) -setorderv(x, cols = colnames(x), order=1L, na.last=FALSE) +setorderv(x, cols = colnames(x), order=1L, na.last=FALSE, neworder) # optimised to use data.table's internal fast order # x[order(., na.last=TRUE)] } @@ -48,6 +48,7 @@ when \code{b} is of type \code{character} as well. } \code{na.last=NA} is valid only for \code{x[order(., na.last)]} and its default is \code{TRUE}. \code{setorder} and \code{setorderv} only accept \code{TRUE}/\code{FALSE} with default \code{FALSE}. } +\item{neworder}{ Integer vector, strict permutation of \code{1:nrow(x)}, no repeats, zeros, NAs, also known as a \emph{shuffle}. } } \details{ \code{data.table} implements its own fast radix-based ordering. See the references for some exposition on the concept of radix sort. @@ -115,7 +116,6 @@ If you require a copy, take a copy first (using \code{DT2 = copy(DT)}). See \code{\link{setDF}}, \code{\link{copy}}, \code{\link{setNumericRounding}} } \examples{ - set.seed(45L) DT = data.table(A=sample(3, 10, TRUE), B=sample(letters[1:3], 10, TRUE), C=sample(10)) @@ -125,6 +125,15 @@ setorder(DT, A, -B) # same as above, but using setorderv setorderv(DT, c("A", "B"), c(1, -1)) + +# neworder +DT = data.table(id1 = c("a","b","c","d"), v1 = rnorm(4)) + +# move first row to the end +setorderv(DT, neworder = c(2:4,1L)) + +# random order +setorderv(DT, neworder = sample(nrow(DT))) } \keyword{ data } diff --git a/src/reorder.c b/src/reorder.c index da3784e94d..eb18766198 100644 --- a/src/reorder.c +++ b/src/reorder.c @@ -2,7 +2,7 @@ SEXP reorder(SEXP x, SEXP order) { - // For internal use only by setkey(). 
+ // For internal use by setkeyv and setorderv // 'order' must be a strict permutation of 1:n; i.e. no repeats, zeros, NAs. Also known as a shuffle. // If only a small subset in the middle is reordered, the ends are moved in to avoid wasteful work. // x may be a vector, or a list of same-length vectors (typically a data.table). @@ -52,7 +52,8 @@ SEXP reorder(SEXP x, SEXP order) i+1, idx[i], length(order)); // This should run in reasonable time because although 'seen' is random write, it is writing to just 1 byte * nrow // which is relatively small and has a good chance of fitting in cache. - // A worry mitigated by this check is a user passing their own incorrect ordering using ::: to reach this internal. + // A worry mitigated by this check is a user passing their own incorrect ordering using ::: to reach this internal - it happened to be used on SO already so is likely to happen + // There is also new arg to setorderv which is likely to hit this // This check is once up front, and then idx is applied to all the columns which is where the most time is spent. 
} From 6dfd502dd597fb017e26cbe05d6f82c92ba5a22f Mon Sep 17 00:00:00 2001 From: jangorecki Date: Sat, 11 Apr 2020 17:27:27 +0100 Subject: [PATCH 2/4] codecov missing test, unfold C code for more --- inst/tests/tests.Rraw | 3 ++- src/reorder.c | 36 ++++++++++++++++++++++-------------- 2 files changed, 24 insertions(+), 15 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 2a6f226364..7ec1d720f1 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -16861,7 +16861,8 @@ test(2139.05, setorderv(DT, order=1L, na.last=FALSE, neworder = 1:4), d, warning test(2139.06, setorderv(DT, cols="id1", neworder = 1:4), error = "Provide either cols or neworder, not both") test(2139.07, setorderv(DT, neworder = 1:3), error = "Provided neworder is a different length than nrow of provided data.table") test(2139.08, setorderv(DT, neworder = 1:5), error = "Provided neworder is a different length than nrow of provided data.table") -test(2139.09, setorderv(DT, neworder = c(1L,1L,2L,3L)), error = "duplicated.") +test(2139.09, setorderv(DT, neworder = c(1L,1L,2L,3L)), error = "duplicated") test(2139.10, setorderv(DT, neworder = c(1L,2L,3L,NA_integer_)), error = "NA") test(2139.11, setorderv(DT, neworder = c(1L,0L,2L,3L)), error = "out of range") test(2139.12, setorderv(DT, neworder = c(1L,5L,2L,3L)), error = "out of range") +test(2139.13, setorderv(DT, neworder = as.numeric(s)), d[s]) diff --git a/src/reorder.c b/src/reorder.c index eb18766198..023ff8257f 100644 --- a/src/reorder.c +++ b/src/reorder.c @@ -1,7 +1,6 @@ #include "data.table.h" -SEXP reorder(SEXP x, SEXP order) -{ +SEXP reorder(SEXP x, SEXP order) { // For internal use by setkeyv and setorderv // 'order' must be a strict permutation of 1:n; i.e. no repeats, zeros, NAs. Also known as a shuffle. // If only a small subset in the middle is reordered, the ends are moved in to avoid wasteful work. 
@@ -19,29 +18,39 @@ SEXP reorder(SEXP x, SEXP order) error(_("Column %d is length %d which differs from length of column 1 (%d). Invalid data.table."), i+1, length(v), nrow); if (SIZEOF(v) > maxSize) maxSize=SIZEOF(v); - if (ALTREP(v)) SET_VECTOR_ELT(x, i, copyAsPlain(v)); + if (ALTREP(v)) + SET_VECTOR_ELT(x, i, copyAsPlain(v)); } copySharedColumns(x); // otherwise two columns which point to the same vector would be reordered and then re-reordered, issues linked in PR#3768 } else { if (SIZEOF(x)!=4 && SIZEOF(x)!=8 && SIZEOF(x)!=16) error(_("reorder accepts vectors but this non-VECSXP is type '%s' which isn't yet supported (SIZEOF=%d)"), type2char(TYPEOF(x)), SIZEOF(x)); - if (ALTREP(x)) error(_("Internal error in reorder.c: cannot reorder an ALTREP vector. Please see NEWS item 2 in v1.11.4 and report this as a bug.")); // # nocov + if (ALTREP(x)) + error(_("Internal error in reorder.c: cannot reorder an ALTREP vector. Please see NEWS item 2 in v1.11.4 and report this as a bug.")); // # nocov maxSize = SIZEOF(x); nrow = length(x); ncol = 1; } - if (!isInteger(order)) error(_("order must be an integer vector")); - if (length(order) != nrow) error(_("nrow(x)[%d]!=length(order)[%d]"),nrow,length(order)); + if (!isInteger(order)) + error(_("order must be an integer vector")); + if (length(order) != nrow) + error(_("nrow(x)[%d]!=length(order)[%d]"),nrow,length(order)); int nprotect = 0; - if (ALTREP(order)) { order=PROTECT(copyAsPlain(order)); nprotect++; } // TODO: if it's an ALTREP sequence some optimizations are possible rather than expand + if (ALTREP(order)) { + order=PROTECT(copyAsPlain(order)); nprotect++; + } // TODO: if it's an ALTREP sequence some optimizations are possible rather than expand const int *restrict idx = INTEGER(order); int i=0; - while (i Date: Sat, 11 Apr 2020 17:59:50 +0100 Subject: [PATCH 3/4] address lines revealed by codecov --- R/setkey.R | 2 -- inst/tests/tests.Rraw | 8 ++++++-- src/reorder.c | 2 +- 3 files changed, 7 insertions(+), 5 
deletions(-) diff --git a/R/setkey.R b/R/setkey.R index 1e130777ce..a719049ffa 100644 --- a/R/setkey.R +++ b/R/setkey.R @@ -284,8 +284,6 @@ setorderv = function(x, cols = colnames(x), order=1L, na.last=FALSE, neworder) { warning("Argument order is ignored when neworder argument was provided") if (!missing(na.last)) warning("Argument na.last is ignored when neworder argument was provided") - if (length(neworder) != nrow(x)) - stop("Provided neworder is a different length than nrow of provided data.table") if (!is.integer(neworder) && is.numeric(neworder)) neworder = as.integer(neworder) o = neworder diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 7ec1d720f1..1658b8d414 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -13725,6 +13725,9 @@ test(1967.622, setnames(x, 1:2, c("a",NA)), error = "NA in 'new' at positions [2 test(1967.63, setcolorder(x, c(1, 1)), error = 'Item 2 of order (1) is either NA, out of range [1,2], or is duplicated. The new order must be a strict permutation of 1:n') test(1967.64, setcolorder(x, 1+3i), error = 'must be character or numeric') test(1967.65, setcolorder(x, 300), error = 'specify non existing column*.*300') +d = data.table(1:2, 2:3) +setattr(d, "names", NULL) +test(1967.651, setcolorder(d, 2:1), error="dt passed to setcolorder has no names") test(1967.66, rbindlist(list(x), idcol = FALSE), rbindlist(list(x))) test(1967.67, rbindlist(list(x), idcol = 1+3i), error = 'idcol must be a logical') @@ -16859,10 +16862,11 @@ test(2139.03, setorderv(DT, order=1L, neworder = 1:4), d, warning = "Argument or test(2139.04, setorderv(DT, na.last=FALSE, neworder = 1:4), d, warning = "Argument na.last is ignored") test(2139.05, setorderv(DT, order=1L, na.last=FALSE, neworder = 1:4), d, warning = c("Argument order is ignored","Argument na.last is ignored")) test(2139.06, setorderv(DT, cols="id1", neworder = 1:4), error = "Provide either cols or neworder, not both") -test(2139.07, setorderv(DT, neworder = 1:3), error = 
"Provided neworder is a different length than nrow of provided data.table") -test(2139.08, setorderv(DT, neworder = 1:5), error = "Provided neworder is a different length than nrow of provided data.table") +test(2139.07, setorderv(DT, neworder = 1:3), error = "length must be equal to nrow") +test(2139.08, setorderv(DT, neworder = 1:5), error = "length must be equal to nrow") test(2139.09, setorderv(DT, neworder = c(1L,1L,2L,3L)), error = "duplicated") test(2139.10, setorderv(DT, neworder = c(1L,2L,3L,NA_integer_)), error = "NA") test(2139.11, setorderv(DT, neworder = c(1L,0L,2L,3L)), error = "out of range") test(2139.12, setorderv(DT, neworder = c(1L,5L,2L,3L)), error = "out of range") test(2139.13, setorderv(DT, neworder = as.numeric(s)), d[s]) +test(2139.14, setorderv(DT, neworder=c("a","b")), error="must be an integer vector") diff --git a/src/reorder.c b/src/reorder.c index 023ff8257f..2042e5bb25 100644 --- a/src/reorder.c +++ b/src/reorder.c @@ -34,7 +34,7 @@ SEXP reorder(SEXP x, SEXP order) { if (!isInteger(order)) error(_("order must be an integer vector")); if (length(order) != nrow) - error(_("nrow(x)[%d]!=length(order)[%d]"),nrow,length(order)); + error(_("order length must be equal to nrow of x: nrow(x)[%d]!=length(order)[%d]"),nrow,length(order)); int nprotect = 0; if (ALTREP(order)) { order=PROTECT(copyAsPlain(order)); nprotect++; From 3b18044311cbe76e041528fe6c872fd3f3ebeb51 Mon Sep 17 00:00:00 2001 From: jangorecki Date: Mon, 18 May 2020 19:23:05 +0100 Subject: [PATCH 4/4] setorder doc, move most to forder.Rd --- man/setorder.Rd | 120 +++++++++--------------------------------------- 1 file changed, 21 insertions(+), 99 deletions(-) diff --git a/man/setorder.Rd b/man/setorder.Rd index cd04eed393..2179bb54c4 100644 --- a/man/setorder.Rd +++ b/man/setorder.Rd @@ -1,119 +1,41 @@ \name{setorder} \alias{setorder} \alias{setorderv} -\alias{order} -\alias{fastorder} -\alias{forder} -\alias{forderv} - \title{Fast row reordering of a data.table by reference} 
\description{ -In \code{data.table} parlance, all \code{set*} functions change their input -\emph{by reference}. That is, no copy is made at all, other than temporary -working memory, which is as large as one column. The only other -\code{data.table} operator that modifies input by reference is \code{\link{:=}}. -Check out the \code{See Also} section below for other \code{set*} function -\code{data.table} provides. - -\code{setorder} (and \code{setorderv}) reorders the rows of a \code{data.table} -based on the columns (and column order) provided. It reorders the table -\emph{by reference} and is therefore very memory efficient. - -Note that queries like \code{x[order(.)]} are optimised internally to use \code{data.table}'s fast order. - -Also note that \code{data.table} always reorders in "C-locale" (see Details). To sort by session locale, use \code{x[base::order(.)]}. - -\code{bit64::integer64} type is also supported for reordering rows of a \code{data.table}. + \code{setorder} (and \code{setorderv}) reorders the rows of a \code{data.table} based on the columns (and column order) provided. It reorders the table \emph{by reference} and is therefore very memory efficient. } - \usage{ -setorder(x, \dots, na.last=FALSE) -setorderv(x, cols = colnames(x), order=1L, na.last=FALSE, neworder) -# optimised to use data.table's internal fast order -# x[order(., na.last=TRUE)] + setorder(x, \dots, na.last=FALSE) + setorderv(x, cols = colnames(x), order=1L, na.last=FALSE, neworder) } \arguments{ -\item{x}{ A \code{data.table}. } -\item{\dots}{ The columns to sort by. Do not quote column names. If \code{\dots} -is missing (ex: \code{setorder(x)}), \code{x} is rearranged based on all -columns in ascending order by default. To sort by a column in descending order -prefix the symbol \code{"-"} which means "descending" (\emph{not} "negative", in this context), i.e., \code{setorder(x, a, -b, c)}. The \code{-b} works -when \code{b} is of type \code{character} as well. 
} -\item{cols}{ A character vector of column names of \code{x} by which to order. By default, sorts over all columns; \code{cols = NULL} will return \code{x} untouched. Do not add \code{"-"} here. Use \code{order} argument instead. } -\item{order}{ An integer vector with only possible values of \code{1} and -\code{-1}, corresponding to ascending and descending order. The length of -\code{order} must be either \code{1} or equal to that of \code{cols}. If -\code{length(order) == 1}, it is recycled to \code{length(cols)}. } -\item{na.last}{ \code{logical}. If \code{TRUE}, missing values in the data are placed last; if \code{FALSE}, they are placed first; if \code{NA} they are removed. -\code{na.last=NA} is valid only for \code{x[order(., na.last)]} and its -default is \code{TRUE}. \code{setorder} and \code{setorderv} only accept -\code{TRUE}/\code{FALSE} with default \code{FALSE}. } -\item{neworder}{ Integer vector, strict permutation of \code{1:nrow(x)}, no repeats, zeros, NAs, also known as a \emph{shuffle}. } + \item{x}{ A \code{data.table}. } + \item{\dots}{ The columns to sort by. Do not quote column names. If \code{\dots} is missing (ex: \code{setorder(x)}), \code{x} is rearranged based on all columns in ascending order by default. To sort by a column in descending order prefix the symbol \code{"-"} which means \emph{descending} (not \emph{negative}, in this context), i.e., \code{setorder(x, a, -b, c)}. The \code{-b} works when \code{b} is of type \code{character} as well. } + \item{cols}{ A character vector of column names of \code{x} by which to order. By default, sorts over all columns; \code{cols = NULL} will return \code{x} untouched. Do not add \code{"-"} here. Use \code{order} argument instead. } + \item{order}{ An integer vector with only possible values of \code{1} and \code{-1}, corresponding to ascending and descending order. The length of \code{order} must be either \code{1} or equal to that of \code{cols}. 
If \code{length(order) == 1}, it is recycled to \code{length(cols)}. } + \item{na.last}{ \code{logical}. If \code{TRUE}, missing values in the data are placed last; if \code{FALSE} (default), they are placed first. } + \item{neworder}{ Integer vector giving a custom row order to use instead of ordering by columns. It has to be a strict permutation of \code{1:nrow(x)} (no repeats, zeros or NAs), also known as a \emph{shuffle}. } } \details{ -\code{data.table} implements its own fast radix-based ordering. See the references for some exposition on the concept of radix sort. + \code{setorder} accepts unquoted column names (with names preceded with a \code{-} sign for descending order) and reorders \code{data.table} rows +\emph{by reference}, e.g., \code{setorder(x, a, -b, c)}. We emphasize that this means \emph{descending} and not \emph{negative} because the implementation simply reverses the sort order, as opposed to sorting the opposite of the input (which would be inefficient). -\code{setorder} accepts unquoted column names (with names preceded with a -\code{-} sign for descending order) and reorders \code{data.table} rows -\emph{by reference}, for e.g., \code{setorder(x, a, -b, c)}. We emphasize that -this means "descending" and not "negative" because the implementation simply -reverses the sort order, as opposed to sorting the opposite of the input -(which would be inefficient). + Note that \code{-b} also works with columns of type \code{character} unlike \code{\link[base]{order}}, which requires \code{-xtfrm(y)} instead (which is slow). -Note that \code{-b} also works with columns of type \code{character} unlike -\code{\link[base]{order}}, which requires \code{-xtfrm(y)} instead (which is slow). -\code{setorderv} in turn accepts a character vector of column names and an -integer vector of column order separately. + \code{setorderv} in turn accepts a character vector of column names and an integer vector of column order separately. 
-Note that \code{\link{setkey}} still requires and will always sort only in -ascending order, and is different from \code{setorder} in that it additionally -sets the \code{sorted} attribute. - -\code{na.last} argument, by default, is \code{FALSE} for \code{setorder} and -\code{setorderv} to be consistent with \code{data.table}'s \code{setkey} and -is \code{TRUE} for \code{x[order(.)]} to be consistent with \code{base::order}. -Only \code{x[order(.)]} can have \code{na.last = NA} as it is a subset operation -as opposed to \code{setorder} or \code{setorderv} which reorders the data.table -by reference. - -\code{data.table} always reorders in "C-locale". -As a consequence, the ordering may be different to that obtained by \code{base::order}. -In English locales, for example, sorting is case-sensitive in C-locale. -Thus, sorting \code{c("c", "a", "B")} returns \code{c("B", "a", "c")} in \code{data.table} - but \code{c("a", "B", "c")} in \code{base::order}. Note this makes no difference in most cases -of data; both return identical results on ids where only upper-case or lower-case letters are present (\code{"AB123" < "AC234"} -is true in both), or on country names and other proper nouns which are consistently capitalized. -For example, neither \code{"America" < "Brazil"} nor -\code{"america" < "brazil"} are affected since the first letter is consistently -capitalized. - -Using C-locale makes the behaviour of sorting in \code{data.table} more consistent across sessions and locales. -The behaviour of \code{base::order} depends on assumptions about the locale of the R session. -In English locales, \code{"america" < "BRAZIL"} is true by default -but false if you either type \code{Sys.setlocale(locale="C")} or the R session has been started in a C locale -for you -- which can happen on servers/services since the locale comes from the environment the R session -was started in. 
By contrast, \code{"america" < "BRAZIL"} is always \code{FALSE} in \code{data.table} regardless of the way your R session was started. - -If \code{setorder} results in reordering of the rows of a keyed \code{data.table}, -then its key will be set to \code{NULL}. + Note that \code{\link{setkey}} still requires and will always sort only in ascending order, and is different from \code{setorder} in that it additionally sets the \code{sorted} attribute. } -\value{ -The input is modified by reference, and returned (invisibly) so it can be used -in compound statements; e.g., \code{setorder(DT,a,-b)[, cumsum(c), by=list(a,b)]}. -If you require a copy, take a copy first (using \code{DT2 = copy(DT)}). See -\code{\link{copy}}. +\note{ + \code{data.table} always reorders in \emph{C-locale}, see \code{\link{forder}} for details. To reorder by session locale, use \code{setorderv(DT, neworder=base::order(.))}. } -\references{ - \url{https://en.wikipedia.org/wiki/Radix_sort}\cr - \url{https://en.wikipedia.org/wiki/Counting_sort}\cr - \url{http://stereopsis.com/radix.html}\cr - \url{https://codercorner.com/RadixSortRevisited.htm}\cr - \url{https://medium.com/basecs/getting-to-the-root-of-sorting-with-radix-sort-f8e9240d4224} +\value{ + The input is modified by reference, and returned (invisibly) so it can be used in compound statements; e.g., \code{setorder(DT,a,-b)[, cumsum(c), by=list(a,b)]}. If you require a copy, take a copy first (using \code{DT2 = copy(DT)}). See \code{\link{copy}}. + If \code{setorder} results in reordering of the rows of a keyed \code{data.table}, then its \emph{key} will be set to \code{NULL}. 
} \seealso{ - \code{\link{setkey}}, \code{\link{setcolorder}}, \code{\link{setattr}}, - \code{\link{setnames}}, \code{\link{set}}, \code{\link{:=}}, \code{\link{setDT}}, - \code{\link{setDF}}, \code{\link{copy}}, \code{\link{setNumericRounding}} + \code{\link{forder}}, \code{\link{setkey}}, \code{\link{setcolorder}}, \code{\link{copy}} } \examples{ set.seed(45L) @@ -133,7 +55,7 @@ DT = data.table(id1 = c("a","b","c","d"), v1 = rnorm(4)) setorderv(DT, neworder = c(2:4,1L)) # random order -setorderv(DT, neworder = sample(nrow(DT))) +setorderv(DT, neworder = sample.int(nrow(DT))) } \keyword{ data }