From 740d262d82e265174e3e680c3a26aebd1064b7fe Mon Sep 17 00:00:00 2001 From: jangorecki Date: Mon, 18 May 2020 12:04:14 +0100 Subject: [PATCH 01/14] order optimizes to forderv #3023 --- R/data.table.R | 69 +++++++++++++++++++++++++++++++- R/setkey.R | 14 +++---- inst/tests/tests.Rraw | 92 +++++++++++++++++++++++++++++++++++++------ 3 files changed, 154 insertions(+), 21 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index 98651d6e0b..6102d54b0c 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -99,11 +99,71 @@ replace_dot_alias = function(e) { e } -.massagei = function(x) { +.massagei = function(x, dt=NULL, verbose=FALSE, ienv=NULL) { # J alias for list as well in i, just if the first symbol # if x = substitute(base::order) then as.character(x[[1L]]) == c("::", "base", "order") if (x %iscall% c("J",".")) x[[1L]] = quote(list) + # optimize order() to forderv(); evaluates: decreasing, method, na.last. #3023, possibly #3261 as well + if (!is.null(dt) && getOption("datatable.optimize")>=1L && x%iscall%"order") { + call.nm = names(x) + ## escape unsupported method + if ("method" %chin% call.nm) { + method = x[["method"]] + if (!is.character(method)) method = eval(method, ienv) + if (!identical(method, "radix")) return(x) + } + ## escape invalid decreasing, outsource raising error + if ("decreasing" %chin% call.nm) { + decreasing = x[["decreasing"]] + if (!is.logical(decreasing)) decreasing = eval(decreasing, ienv) + if (!is.logical(decreasing) || !length(decreasing) || anyNA(decreasing)) return(x) + } else decreasing = NULL + ## escape invalid na.last, outsource raising error + if ("na.last" %chin% call.nm) { + na.last = x[["na.last"]] + if (!is.logical(na.last)) na.last = eval(na.last, ienv) + if (!is.logical(na.last)) return(x) + } else na.last = TRUE + ## decompose variables in dots + order.args = c("decreasing","method","na.last") ## formalArgs(order) - "..." 
+ order.call = if (!is.null(call.nm)) x[!call.nm %chin% order.args] else x + dots = as.list(order.call[-1]) + ## escapy empty input + if (!length(dots)) return(x) + order.vars = all.vars(order.call) + ## escape constant order(x, 1L) + if (length(dots)!=length(order.vars)) return(x) + ## escape for any non-dt var + if (any(!order.vars %chin% names(dt))) return(x) + ## escape for any unsupported type + supported = c("integer","double","logical","character","complex") + if (any(vapply(order.vars, function(v) !typeof(dt[[v]])%chin%supported, NA))) return(x) + ## decreasing recycle + decreasing = if (is.null(decreasing)) rep(FALSE, length(order.vars)) else { + if (!(length(decreasing)==1L || length(decreasing)==length(order.vars))) stop("'decreasing' must be either length 1, or length of the variables passed to order") + if (length(decreasing)==1L && length(order.vars)>1L) decreasing = rep(decreasing, length(order.vars)) else decreasing + } + ## forderv arguments + by = vector("character", length(order.vars)) + order = rep.int(1L, length(order.vars)) + order[decreasing] = -1L + ## language objects for each of order dots element + for (i in seq_along(dots)) { + dot = dots[[i]] + while (dot %iscall% c("-", "+") && length(dot)==2L) { + if (dot[[1L]]=="-") order[i] = -order[i] + dot = dot[[2L]] + } + if (is.symbol(dot)) { + var = as.character(dot) + if (!var %chin% order.vars) stop("internal error: a dots element is symbol but is not any of order.vars, should have been caught already") # nocov + by[i] = var + } else return(x) + } + x = as.call(list(quote(forderv), quote(x), by=by, retGrp=FALSE, sort=TRUE, order=order, na.last=na.last)) + if (verbose) cat(sprintf("order call in 'i' optimized to '%s'\n", deparse(x, width.cutoff=500L)[1L])) + } x } @@ -355,8 +415,13 @@ replace_dot_alias = function(e) { } else if (!is.name(isub)) { ienv = new.env(parent=parent.frame()) + isub = .massagei(isub, dt=x, verbose=verbose, ienv=ienv) + ## this functionality has been moved to 
.massagei+forderv (#3023) branch below, but this forder will be still used when a variable in `order` is not a DT column, or order(x, (y)), order(-(x)), etc if (getOption("datatable.optimize")>=1L) assign("order", forder, ienv) - i = tryCatch(eval(.massagei(isub), x, ienv), error=function(e) { + i = if (is.call(isub) && isub[[1L]]==quote(forderv)) { ## order has been optimized to forderv #3023 + fo = eval(isub) ## forderv(x, ...) + if (!length(fo)) seq_len(nrow(x)) else fo + } else tryCatch(eval(isub, x, ienv), error=function(e) { if (grepl(":=.*defined for use in j.*only", e$message)) stop("Operator := detected in i, the first argument inside DT[...], but is only valid in the second argument, j. Most often, this happens when forgetting the first comma (e.g. DT[newvar := 5] instead of DT[ , new_var := 5]). Please double-check the syntax. Run traceback(), and debugger() to get a line number.") else diff --git a/R/setkey.R b/R/setkey.R index 334ca1e801..b75676d01e 100644 --- a/R/setkey.R +++ b/R/setkey.R @@ -186,18 +186,18 @@ forderv = function(x, by=seq_along(x), retGrp=FALSE, sort=TRUE, order=1L, na.las .Call(Cforder, x, by, retGrp, sort, order, na.last) # returns integer() if already sorted, regardless of sort=TRUE|FALSE } -forder = function(..., na.last=TRUE, decreasing=FALSE) +forder = function(..., na.last=TRUE, decreasing=FALSE, method=c("auto","shell","radix")) { + if (!missing(method) && !identical(method, "radix")) return(base::order(...=..., na.last=na.last, decreasing=decreasing, method=method)) sub = substitute(list(...)) tt = sapply(sub, function(x) is.null(x) || (is.symbol(x) && !nzchar(x))) - if (any(tt)) sub[tt] = NULL # remove any NULL or empty arguments; e.g. test 1962.052: forder(DT, NULL) and forder(DT, ) + if (any(tt)) stop("[f]order argument ", paste(which(tt)-1L, collapse=", "), " is NULL or empty") # raises error consistent to base::order, invalidates e.g. 
test 1962.052: forder(DT, NULL) and forder(DT, ) if (length(sub)<2L) return(NULL) # forder() with no arguments returns NULL consistent with base::order asc = rep.int(1L, length(sub)-1L) # ascending (1) or descending (-1) per column # the idea here is to intercept - (and unusual --+ deriving from built expressions) before vectors in forder(DT, -colA, colB) so that : # 1) - on character vector works; ordinarily in R that fails with type error # 2) each column/expression can have its own +/- more easily that having to use a separate decreasing=TRUE/FALSE # 3) we can pass the decreasing (-) flag to C and avoid what normally happens in R; i.e. allocate a new vector and apply - to every element first - # We intercept the unevaluated expressions and massage them before evaluating in with(DT) scope or not depending on the first item. for (i in seq.int(2L, length(sub))) { v = sub[[i]] while (v %iscall% c('-', '+') && length(v)==2L) { @@ -219,10 +219,10 @@ forder = function(..., na.last=TRUE, decreasing=FALSE) } else { data = eval(sub, parent.frame(), parent.frame()) } - stopifnot(isTRUEorFALSE(decreasing)) - o = forderv(data, seq_along(data), sort=TRUE, retGrp=FALSE, order= if (decreasing) -asc else asc, na.last) - if (!length(o) && length(data)>=1L) o = seq_along(data[[1L]]) else o - o + stopifnot(is.logical(decreasing), !anyNA(decreasing)) + asc[decreasing] = -(asc[decreasing]) + o = forderv(data, seq_along(data), sort=TRUE, retGrp=FALSE, order=asc, na.last=na.last) + if (!length(o) && length(data)>=1L) seq_along(data[[1L]]) else o } fsort = function(x, decreasing=FALSE, na.last=FALSE, internal=FALSE, verbose=FALSE, ...) 
diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index ed17470383..e022b403aa 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -13373,18 +13373,19 @@ test(1962.0471, forderv(mean), error="'x' argument must test(1962.0472, forderv(DT, by=mean), error="argument specifying columns must be character or numeric") test(1962.0473, forderv(NULL), error="DT is an empty list() of 0 columns") -setDF(DT) -test(1962.0481, forder(DT), 3:1) -L = as.list(DT) -test(1962.0482, forder(L), 3:1) -test(1962.0483, forder(), NULL) -setDT(DT) -test(1962.049, forder(DT[ , 0L]), error = 'Attempting to order a 0-column') -test(1962.050, forder(DT, decreasing = NA), error = 'isTRUEorFALSE(decreasing) is not TRUE') -test(1962.051, forder(DT, decreasing = 1.4), error = 'isTRUEorFALSE(decreasing) is not TRUE') -test(1962.052, forder(DT, NULL), 3:1) -test(1962.053, forder(DT), 3:1) -test(1962.054, forder(DT, ), 3:1) +## made inactive by #4346: forder should not accept list, but forderv should +#setDF(DT) +#test(1962.0481, forder(DT), 3:1) +#L = as.list(DT) +#test(1962.0482, forder(L), 3:1) +#test(1962.0483, forder(), NULL) +#setDT(DT) +#test(1962.049, forder(DT[ , 0L]), error = 'Attempting to order a 0-column') +#test(1962.050, forder(DT, decreasing = NA), error = 'isTRUEorFALSE(decreasing) is not TRUE') +#test(1962.051, forder(DT, decreasing = 1.4), error = 'isTRUEorFALSE(decreasing) is not TRUE') +#test(1962.052, forder(DT, NULL), 3:1) +#test(1962.053, forder(DT), 3:1) +#test(1962.054, forder(DT, ), 3:1) test(1962.055, fsort(as.double(DT$a), internal = TRUE), error = 'Internal code should not be being called on type double') @@ -16853,3 +16854,70 @@ A = data.table(A=c(complex(real = 1:3, imaginary=c(0, -1, 1)), NaN)) test(2138.3, rbind(A,B), data.table(A=c(as.character(A$A), B$A))) A = data.table(A=as.complex(rep(NA, 5))) test(2138.4, rbind(A,B), data.table(A=c(as.character(A$A), B$A))) + +# optimize order() to forderv(); evaluates: decreasing, method, na.last. 
#3023, possibly #3261 as well +TEST = 2139 +opt.msg = "order call in 'i' optimized to 'forderv.*" +# test base R order arguments because we rely on that during order->forderv optimization +order.args = c("...", "na.last", "decreasing", "method") +if (base::getRversion() < "3.3.0") order.args = setdiff(order.args, "method") +test(TEST+0.01, formalArgs(order), order.args) +test(TEST+0.02, isTRUE(formals(order)$na.last)) # quite unlikely that would be changed, but doesnt hurt to test +# test new optimization +d = data.table(x=3:1, y=c(2L,1L,3L), z=1:3) +ix = with(d, order(x, z=y)) +test(TEST+0.11, output=opt.msg, d[order(x, z=y, na.last=TRUE), verbose=TRUE], d[ix]) # named dots are fine +ix = with(d, order(x, z=I(y))) +test(TEST+0.12, notOutput=opt.msg, d[order(x, z=I(y), na.last=TRUE), verbose=TRUE], d[ix]) # calls arent fine because we would need to evaluate dots +ix = with(d, order(x, I(y))) +test(TEST+0.13, notOutput=opt.msg, d[order(x, I(y), na.last=TRUE), verbose=TRUE], d[ix]) # unnamed as well wont work +ix = with(d, order(x, y, decreasing=c(FALSE,FALSE))) +test(TEST+0.14, output=opt.msg, d[order(x, y, decreasing=c(FALSE,FALSE)), verbose=TRUE], d[ix]) # decreasing evaluated +ix = with(d, order(x, y, na.last=TRUE)) +test(TEST+0.15, output=opt.msg, d[order(x, y, na.last=TRUE), verbose=TRUE], d[ix]) +ix = with(d, order(y, x, na.last=TRUE)) +test(TEST+0.16, output=opt.msg, d[order(y, x, na.last=TRUE), verbose=TRUE], d[ix]) +ix = with(d, order(x, y, na.last=FALSE)) +test(TEST+0.17, output=opt.msg, d[order(x, y, na.last=as.logical(FALSE)), verbose=TRUE], d[ix]) ## evaluates na.last +ix = with(d, order(y, x, na.last=FALSE)) +test(TEST+0.18, output=opt.msg, d[order(y, x, na.last=as.logical(FALSE)), verbose=TRUE], d[ix]) +ix = with(d, order(y, x, decreasing=TRUE)) +test(TEST+0.19, output=opt.msg, d[order(-y, -x), verbose=TRUE], d[ix]) +test(TEST+0.20, output=opt.msg, d[order(y, x, decreasing=TRUE), verbose=TRUE], d[ix]) +test(TEST+0.21, output=opt.msg, d[order(+y, +x, 
decreasing=TRUE), verbose=TRUE], d[ix]) +ix = with(d, order(y, x, decreasing=c(FALSE,FALSE))) +test(TEST+0.22, output=opt.msg, d[order(-y, -x, decreasing=c(TRUE,TRUE)), verbose=TRUE], d[ix]) +test(TEST+0.23, output=opt.msg, d[order(y, x, decreasing=c(FALSE,FALSE)), verbose=TRUE], d[ix]) +test(TEST+0.24, output=opt.msg, d[order(+y, +x, decreasing=c(FALSE,FALSE)), verbose=TRUE], d[ix]) +test(TEST+0.25, output=opt.msg, d[order(+y, -x, decreasing=c(FALSE,TRUE)), verbose=TRUE], d[ix]) +test(TEST+0.26, output=opt.msg, d[order(-y, +x, decreasing=c(TRUE,FALSE)), verbose=TRUE], d[ix]) +ix = with(d, order(x, decreasing=FALSE)) +test(TEST+0.27, output=opt.msg, d[order(-x, decreasing=TRUE), verbose=TRUE], d[ix]) +test(TEST+0.28, output=opt.msg, d[order(x), verbose=TRUE], d[ix]) +ix = with(d, order(x, decreasing=TRUE)) +test(TEST+0.29, output=opt.msg, d[order(x, decreasing=TRUE), verbose=TRUE], d[ix]) +test(TEST+0.30, output=opt.msg, d[order(-x), verbose=TRUE], d[ix]) +ix = with(d, order(x)) +test(TEST+0.31, output=opt.msg, d[order(-++-++x), verbose=TRUE], d[ix]) +ix = with(d, order(x, decreasing=TRUE)) +test(TEST+0.32, output=opt.msg, d[order(-++-++x, decreasing=TRUE), verbose=TRUE], d[ix]) +ix = with(d, order(x, y, x)) +test(TEST+0.33, notOutput=opt.msg, output="forder.c", d[order(x, y, x), verbose=TRUE], d[ix]) # test duplicated order vars redirect to forder +ix = with(d, order()) +test(TEST+0.34, notOutput=opt.msg, d[order(), verbose=TRUE], d[ix]) # zero length input +ix = with(d, order(na.last=FALSE)) +test(TEST+0.35, notOutput=opt.msg, d[order(na.last=FALSE), verbose=TRUE], d[ix]) +test(TEST+0.41, notOutput=opt.msg, d[order(NULL), verbose=TRUE], error="order argument 1 is NULL or empty") +test(TEST+0.42, notOutput=opt.msg, d[order(1,,2,NULL), verbose=TRUE], error="order argument 2, 4 is NULL or empty") +test(TEST+0.43, notOutput=opt.msg, d[order(NULL, na.last=FALSE), verbose=TRUE], error="order argument 1 is NULL or empty") +test(TEST+0.44, notOutput=opt.msg, 
d[order(na.last=FALSE, NULL), verbose=TRUE], error="order argument 1 is NULL or empty") +# method and decreasing properly handled in in forder #4456 +d = data.table(x=2:1); ans = data.table(x=1:2) +test(TEST+0.91, output=opt.msg, d[order(x, method="radix"), verbose=TRUE], ans) # forderv +test(TEST+0.92, notOutput=opt.msg, output="forder.c", d[order(((x)), method="radix"), verbose=TRUE], ans) # forder +test(TEST+0.93, notOutput="forder.*", d[order(x, method="auto"), verbose=TRUE], ans) # base::order +test(TEST+0.94, output=opt.msg, data.table(x=2:1, y=2L)[order(x, y, decreasing=c(FALSE,FALSE)), verbose=TRUE], data.table(x=1:2,y=2L)) +test(TEST+0.95, notOutput=opt.msg, output="forder.c", data.table(x=2:1, y=2L)[order((x), y, decreasing=c(FALSE,FALSE)), verbose=TRUE], data.table(x=1:2, y=2L)) +d = data.table(x=3:1, y=c(2L,1L,3L), z=1:3) +ix = with(d, order(x, y, decreasing=c(FALSE,FALSE))) +test(TEST+0.96, d[with(d, order(x, y, decreasing=c(FALSE,FALSE)))], d[ix]) ## order masked with forder From f4f19c5d1ec51751f46a5fe0ebff7c1d6788d5ba Mon Sep 17 00:00:00 2001 From: jangorecki Date: Mon, 18 May 2020 12:54:32 +0100 Subject: [PATCH 02/14] add tests for using index after lazy order merged --- inst/tests/tests.Rraw | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index e022b403aa..1f00e14ae3 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -16911,6 +16911,16 @@ test(TEST+0.41, notOutput=opt.msg, d[order(NULL), verbose=TRUE], error="order ar test(TEST+0.42, notOutput=opt.msg, d[order(1,,2,NULL), verbose=TRUE], error="order argument 2, 4 is NULL or empty") test(TEST+0.43, notOutput=opt.msg, d[order(NULL, na.last=FALSE), verbose=TRUE], error="order argument 1 is NULL or empty") test(TEST+0.44, notOutput=opt.msg, d[order(na.last=FALSE, NULL), verbose=TRUE], error="order argument 1 is NULL or empty") +if ("lazy" %chin% formalArgs(forderv)) { ## this tests kicks in when #4386 merged + d = 
data.table(x=3:1, y=c(2L,1L,3L), z=1:3) + test(TEST+0.81, output="forder.*opt=-1", d[order(x, na.last=FALSE), verbose=TRUE], d[3:1]) + setindexv(d, list("x","y","z",c("x","y","z"),c("y","z"))) + test(TEST+0.82, output="forder.*opt=2", d[order(x, na.last=FALSE), verbose=TRUE], d[3:1]) + test(TEST+0.83, output="forder.*opt=2", d[order(x, y, z, na.last=FALSE), verbose=TRUE], d[3:1]) + test(TEST+0.84, output="forder.*opt=2", d[order(z, na.last=FALSE), verbose=TRUE], d) + setkeyv(d, "z") + test(TEST+0.85, output="forder.*opt=1", d[order(z, na.last=FALSE), verbose=TRUE], d) +} # method and decreasing properly handled in in forder #4456 d = data.table(x=2:1); ans = data.table(x=1:2) test(TEST+0.91, output=opt.msg, d[order(x, method="radix"), verbose=TRUE], ans) # forderv From a83d62e7f6522144c25085957b6eb49d93c9e346 Mon Sep 17 00:00:00 2001 From: jangorecki Date: Mon, 18 May 2020 13:44:31 +0100 Subject: [PATCH 03/14] fix verbose output caused by new opt, 1967.71, #4459 --- inst/tests/tests.Rraw | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 1f00e14ae3..3433d8ed3b 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -13748,7 +13748,7 @@ test(1967.69, !any(grepl('forder.c', verbose_output, fixed = TRUE))) test(1967.70, any(grepl('[1] 5', verbose_output, fixed = TRUE))) options('datatable.optimize' = 1L) test(1967.71, x[order(a), .N, verbose = TRUE], 5L, - output = "forder.c received 5 rows and 1 column") + output = "forder.c received 5 rows and 2 columns") ## this is now optimized to forderv(x, by="a") changing verbose output #4459 setkey(x) test(1967.72, x[x, .N, on = 'a', verbose = TRUE], 5L, output = "on= matches existing key") From ca4668fddcc6b474917a4c57d061c4c65aace3cc Mon Sep 17 00:00:00 2001 From: jangorecki Date: Mon, 18 May 2020 14:39:28 +0100 Subject: [PATCH 04/14] debug order function on travis and appveyor --- inst/tests/tests.Rraw | 5 +++-- 1 file changed, 3 insertions(+), 2 
deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 3433d8ed3b..17aea862c1 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -16859,10 +16859,11 @@ test(2138.4, rbind(A,B), data.table(A=c(as.character(A$A), B$A))) TEST = 2139 opt.msg = "order call in 'i' optimized to 'forderv.*" # test base R order arguments because we rely on that during order->forderv optimization +test(TEST+0.01, environmentName(environment(order)), "base") ## debug travis and appveyor order.args = c("...", "na.last", "decreasing", "method") if (base::getRversion() < "3.3.0") order.args = setdiff(order.args, "method") -test(TEST+0.01, formalArgs(order), order.args) -test(TEST+0.02, isTRUE(formals(order)$na.last)) # quite unlikely that would be changed, but doesnt hurt to test +if (!test(TEST+0.02, formalArgs(order), order.args)) cat(head(order), sep="\n") +test(TEST+0.03, isTRUE(formals(order)$na.last)) # quite unlikely that would be changed, but doesnt hurt to test # test new optimization d = data.table(x=3:1, y=c(2L,1L,3L), z=1:3) ix = with(d, order(x, z=y)) From 583e329af5bd1cf069948c5f6924ead013b070c0 Mon Sep 17 00:00:00 2001 From: jangorecki Date: Mon, 18 May 2020 15:14:47 +0100 Subject: [PATCH 05/14] fix bit64 mask of order --- inst/tests/tests.Rraw | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 17aea862c1..bd1760c829 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -16859,11 +16859,10 @@ test(2138.4, rbind(A,B), data.table(A=c(as.character(A$A), B$A))) TEST = 2139 opt.msg = "order call in 'i' optimized to 'forderv.*" # test base R order arguments because we rely on that during order->forderv optimization -test(TEST+0.01, environmentName(environment(order)), "base") ## debug travis and appveyor order.args = c("...", "na.last", "decreasing", "method") if (base::getRversion() < "3.3.0") order.args = setdiff(order.args, "method") -if (!test(TEST+0.02, 
formalArgs(order), order.args)) cat(head(order), sep="\n") -test(TEST+0.03, isTRUE(formals(order)$na.last)) # quite unlikely that would be changed, but doesnt hurt to test +test(TEST+0.01, formalArgs(base::order), order.args) # we need base:: prefix because bit64 masks order +test(TEST+0.02, isTRUE(formals(base::order)$na.last)) # quite unlikely that would be changed, but doesnt hurt to test # test new optimization d = data.table(x=3:1, y=c(2L,1L,3L), z=1:3) ix = with(d, order(x, z=y)) From d3ee8f6a844545a6aa65d4d96b78a08a7789a6eb Mon Sep 17 00:00:00 2001 From: jangorecki Date: Mon, 18 May 2020 18:08:57 +0100 Subject: [PATCH 06/14] tests, also for creating index in DT[order(.)] calls, #3261 --- R/data.table.R | 4 ++-- R/setkey.R | 3 ++- inst/tests/tests.Rraw | 13 +++++++++++++ 3 files changed, 17 insertions(+), 3 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index 6102d54b0c..7af2874cc0 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -139,9 +139,9 @@ replace_dot_alias = function(e) { ## escape for any unsupported type supported = c("integer","double","logical","character","complex") if (any(vapply(order.vars, function(v) !typeof(dt[[v]])%chin%supported, NA))) return(x) - ## decreasing recycle + ## decreasing recycle, outsource raising error decreasing = if (is.null(decreasing)) rep(FALSE, length(order.vars)) else { - if (!(length(decreasing)==1L || length(decreasing)==length(order.vars))) stop("'decreasing' must be either length 1, or length of the variables passed to order") + if (length(decreasing)!=1L && length(decreasing)!=length(order.vars)) return(x) if (length(decreasing)==1L && length(order.vars)>1L) decreasing = rep(decreasing, length(order.vars)) else decreasing } ## forderv arguments diff --git a/R/setkey.R b/R/setkey.R index b75676d01e..dd27b36f41 100644 --- a/R/setkey.R +++ b/R/setkey.R @@ -219,7 +219,8 @@ forder = function(..., na.last=TRUE, decreasing=FALSE, method=c("auto","shell"," } else { data = eval(sub, parent.frame(), 
parent.frame()) } - stopifnot(is.logical(decreasing), !anyNA(decreasing)) + if (!is.logical(decreasing) || anyNA(decreasing)) stop("'decreasing' must be logical non-NA") + if (length(decreasing)!=1L && length(decreasing)!=length(data)) stop("'decreasing' must be either length 1, or length of the variables passed to [f]order") asc[decreasing] = -(asc[decreasing]) o = forderv(data, seq_along(data), sort=TRUE, retGrp=FALSE, order=asc, na.last=na.last) if (!length(o) && length(data)>=1L) seq_along(data[[1L]]) else o diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index bd1760c829..539dbb3b17 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -16911,7 +16911,14 @@ test(TEST+0.41, notOutput=opt.msg, d[order(NULL), verbose=TRUE], error="order ar test(TEST+0.42, notOutput=opt.msg, d[order(1,,2,NULL), verbose=TRUE], error="order argument 2, 4 is NULL or empty") test(TEST+0.43, notOutput=opt.msg, d[order(NULL, na.last=FALSE), verbose=TRUE], error="order argument 1 is NULL or empty") test(TEST+0.44, notOutput=opt.msg, d[order(na.last=FALSE, NULL), verbose=TRUE], error="order argument 1 is NULL or empty") +test(TEST+0.45, notOutput=opt.msg, d[order(x, decreasing=1L), verbose=TRUE], error="'decreasing' must be logical non-NA") +test(TEST+0.46, notOutput=opt.msg, d[order(x, decreasing=NA), verbose=TRUE], error="'decreasing' must be logical non-NA") +test(TEST+0.47, notOutput=opt.msg, d[order(x, y, decreasing=c(FALSE,FALSE,FALSE)), verbose=TRUE], error="must be either length 1, or length of the variables.*order") +test(TEST+0.48, notOutput=opt.msg, d[order(x, na.last=1L), verbose=TRUE], error="must be logical TRUE, FALSE or NA of length 1") +test(TEST+0.49, notOutput=opt.msg, data.table(x=as.raw(1:2))[order(x), verbose=TRUE], error="order is type 'raw', not yet supported") +test(TEST+0.50, notOutput=opt.msg, d[order(x, y, method="auto"), verbose=TRUE], d[3:1]) if ("lazy" %chin% formalArgs(forderv)) { ## this tests kicks in when #4386 merged + # `order` 
could utilize existing index #3023 d = data.table(x=3:1, y=c(2L,1L,3L), z=1:3) test(TEST+0.81, output="forder.*opt=-1", d[order(x, na.last=FALSE), verbose=TRUE], d[3:1]) setindexv(d, list("x","y","z",c("x","y","z"),c("y","z"))) @@ -16920,6 +16927,12 @@ if ("lazy" %chin% formalArgs(forderv)) { ## this tests kicks in when #4386 merge test(TEST+0.84, output="forder.*opt=2", d[order(z, na.last=FALSE), verbose=TRUE], d) setkeyv(d, "z") test(TEST+0.85, output="forder.*opt=1", d[order(z, na.last=FALSE), verbose=TRUE], d) + # `DT[order(.), ...]` could create index on DT #3261 + d = data.table(x=3:1, y=c(2L,1L,3L), z=1:3) + op2 = options("datatable.forder.auto.index"=TRUE) + test(TEST+0.86, output="forder.*opt=-1", d[order(x, na.last=FALSE), verbose=TRUE], d[3:1]) + test(TEST+0.87, output="forder.*opt=2", d[order(x, na.last=FALSE), verbose=TRUE], d[3:1]) + options(op2) } # method and decreasing properly handled in in forder #4456 d = data.table(x=2:1); ans = data.table(x=1:2) From 0ad9f2c13ab5e8d702186ea5bf3812fe35953826 Mon Sep 17 00:00:00 2001 From: jangorecki Date: Mon, 18 May 2020 21:51:23 +0100 Subject: [PATCH 07/14] forder na.last=NA removes zeros from output --- R/setkey.R | 3 ++- inst/tests/tests.Rraw | 38 ++++++++++++++++++++++---------------- 2 files changed, 24 insertions(+), 17 deletions(-) diff --git a/R/setkey.R b/R/setkey.R index dd27b36f41..12f84536d5 100644 --- a/R/setkey.R +++ b/R/setkey.R @@ -223,7 +223,8 @@ forder = function(..., na.last=TRUE, decreasing=FALSE, method=c("auto","shell"," if (length(decreasing)!=1L && length(decreasing)!=length(data)) stop("'decreasing' must be either length 1, or length of the variables passed to [f]order") asc[decreasing] = -(asc[decreasing]) o = forderv(data, seq_along(data), sort=TRUE, retGrp=FALSE, order=asc, na.last=na.last) - if (!length(o) && length(data)>=1L) seq_along(data[[1L]]) else o + o = if (!length(o) && length(data)>=1L) seq_along(data[[1L]]) else o + if (is.na(na.last)) o[as.logical(o)] else o ## remove 
zeros, as base order #4346 } fsort = function(x, decreasing=FALSE, na.last=FALSE, internal=FALSE, verbose=FALSE, ...) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 539dbb3b17..45a7e3f7ad 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -5130,10 +5130,10 @@ test(1319.8, DT[order(list(DT$a, DT$b))], DT) # error="Column 1 of by= (1) is if (test_bit64) { set.seed(45L) DT <- data.table(x=as.integer64(c(-50, 0, 50, 1e18, 1e-18)), y=sample(5)) - ans1 <- forder(DT, x, na.last=TRUE, decreasing=FALSE) - ans2 <- forder(DT, x, na.last=FALSE, decreasing=FALSE) - ans3 <- forder(DT, x, na.last=TRUE, decreasing=TRUE) - ans4 <- forder(DT, x, na.last=FALSE, decreasing=TRUE) + ans1 <- forderv(DT, "x", na.last=TRUE, order=1L) + ans2 <- forderv(DT, "x", na.last=FALSE, order=1L) + ans3 <- forderv(DT, "x", na.last=TRUE, order=-1L) + ans4 <- forderv(DT, "x", na.last=FALSE, order=-1L) test(1320.01, ans1, as.integer(c(1,2,5,3,4))) test(1320.02, ans2, as.integer(c(1,2,5,3,4))) test(1320.03, ans3, as.integer(c(4,3,2,5,1))) @@ -5141,10 +5141,10 @@ if (test_bit64) { set.seed(45L) DT <- data.table(x=as.integer64(c(-50, 0, NA, 50, 1e18, NA, 1e-18)), y=sample(7)) - ans1 <- forder(DT, x, na.last=TRUE, decreasing=FALSE) - ans2 <- forder(DT, x, na.last=FALSE, decreasing=FALSE) - ans3 <- forder(DT, x, na.last=TRUE, decreasing=TRUE) - ans4 <- forder(DT, x, na.last=FALSE, decreasing=TRUE) + ans1 <- forderv(DT, "x", na.last=TRUE, order=1L) + ans2 <- forderv(DT, "x", na.last=FALSE, order=1L) + ans3 <- forderv(DT, "x", na.last=TRUE, order=-1L) + ans4 <- forderv(DT, "x", na.last=FALSE, order=-1L) test(1320.05, ans1, as.integer(c(1,2,7,4,5,3,6))) test(1320.06, ans2, as.integer(c(3,6,1,2,7,4,5))) @@ -5154,8 +5154,8 @@ if (test_bit64) { # missed test - checking na.last=NA! 
set.seed(45L) DT <- data.table(x=as.integer64(c(-50, 0, NA, 50, 1e18, NA, 1e-18)), y=sample(7)) - ans1 <- forder(DT, x, na.last=NA, decreasing=FALSE) - ans2 <- forder(DT, x, na.last=NA, decreasing=TRUE) + ans1 <- forderv(DT, "x", na.last=NA, order=1L) + ans2 <- forderv(DT, "x", na.last=NA, order=-1L) test(1320.09, ans1, as.integer(c(0,0,1,2,7,4,5))) test(1320.10, ans2, as.integer(c(0,0,5,4,2,7,1))) @@ -11877,17 +11877,17 @@ test(1843, is.sorted((0+0i)^(-3:3)), error = "type 'complex' is not yet supporte # make a minimal example where there's a group size of 2 in the 2nd column (type double) with an NA too and na.last=NA # covers the branch in forder.c:dsort line 1070 starting: if (nalast == 0 && n == 2) { DT = data.table(c("a","a","a","b","b"),c(2,1,3,NA,2)) -test(1844.1, forder(DT,V1,V2,na.last=NA), INT(2,1,3,0,5)) +test(1844.1, forderv(DT,c("V1","V2"),na.last=NA), INT(2,1,3,0,5)) DT = data.table(c("a","a","a","b","b"),c(2,1,3,2,NA)) -test(1844.2, forder(DT,V1,V2,na.last=NA), INT(2,1,3,0,4)) # prior to v1.12.0 this was 2,1,3,4,0. As long as it's the same with 0's removed, think it's ok +test(1844.2, forderv(DT,c("V1","V2"),na.last=NA), INT(2,1,3,0,4)) # prior to v1.12.0 this was 2,1,3,4,0. 
As long as it's the same with 0's removed, think it's ok # now with two NAs in that 2-group covers forder.c:forder line 1269 starting: else if (nalast == 0 && tmp==-2) { DT = data.table(c("a","a","a","b","b"),c(2,1,3,NA,NA)) -test(1844.3, forder(DT,V1,V2,na.last=NA), INT(2,1,3,0,0)) +test(1844.3, forderv(DT,c("V1","V2"),na.last=NA), INT(2,1,3,0,0)) DT = data.table(as.raw(0:6), 7:1) -test(1844.4, forder(DT,V1,V2), error="Column 1 passed to [f]order is type 'raw', not yet supported") -test(1844.5, forder(DT,V2,V1), error="Column 2 passed to [f]order is type 'raw', not yet supported") +test(1844.4, forderv(DT,c("V1","V2")), error="Column 1 passed to [f]order is type 'raw', not yet supported") +test(1844.5, forderv(DT,c("V2","V1")), error="Column 2 passed to [f]order is type 'raw', not yet supported") DT = data.table(as.raw(0:6), c(5L,5L,1L,2L,2L,2L,2L)) -test(1844.6, forder(DT,V2,V1), error="Column 2 passed to [f]order is type 'raw', not yet supported") +test(1844.6, forderv(DT,c("V2","V1")), error="Column 2 passed to [f]order is type 'raw', not yet supported") # fix for non-equi joins issue #1991. Thanks to Henrik for the nice minimal example. 
d1 <- data.table(x = c(rep(c("b", "a", "c"), each = 3), c("a", "b")), y = c(rep(c(1, 3, 6), 3), 6, 6), id = 1:11) @@ -16917,6 +16917,11 @@ test(TEST+0.47, notOutput=opt.msg, d[order(x, y, decreasing=c(FALSE,FALSE,FALSE) test(TEST+0.48, notOutput=opt.msg, d[order(x, na.last=1L), verbose=TRUE], error="must be logical TRUE, FALSE or NA of length 1") test(TEST+0.49, notOutput=opt.msg, data.table(x=as.raw(1:2))[order(x), verbose=TRUE], error="order is type 'raw', not yet supported") test(TEST+0.50, notOutput=opt.msg, d[order(x, y, method="auto"), verbose=TRUE], d[3:1]) +v1 = c(4L,1L,NA,2L) # forder removes 0 for NAs +d1 = data.table(v1) +test(TEST+0.51, forder(v1, na.last=NA), c(2L,4L,1L)) +test(TEST+0.52, forderv(d1,"v1",na.last=NA), c(0L,2L,4L,1L)) +test(TEST+0.53, output=opt.msg, d1[order(v1, na.last=NA), verbose=TRUE], d1[c(2L,4L,1L)]) if ("lazy" %chin% formalArgs(forderv)) { ## this tests kicks in when #4386 merged # `order` could utilize existing index #3023 d = data.table(x=3:1, y=c(2L,1L,3L), z=1:3) @@ -16944,3 +16949,4 @@ test(TEST+0.95, notOutput=opt.msg, output="forder.c", data.table(x=2:1, y=2L)[or d = data.table(x=3:1, y=c(2L,1L,3L), z=1:3) ix = with(d, order(x, y, decreasing=c(FALSE,FALSE))) test(TEST+0.96, d[with(d, order(x, y, decreasing=c(FALSE,FALSE)))], d[ix]) ## order masked with forder + From 22c27eb0f6536475d4eb3cbcb39b6ef178d8747e Mon Sep 17 00:00:00 2001 From: jangorecki Date: Mon, 18 May 2020 22:03:08 +0100 Subject: [PATCH 08/14] forder manual --- man/forder.Rd | 73 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) create mode 100644 man/forder.Rd diff --git a/man/forder.Rd b/man/forder.Rd new file mode 100644 index 0000000000..c8809ee915 --- /dev/null +++ b/man/forder.Rd @@ -0,0 +1,73 @@ +\name{forder} +\alias{order} +\alias{fastorder} +\alias{forderv} +\title{Fast Ordering Permutation} +\description{ + Finds permutation that reorders its input into specified order. 
+ + \code{forder} is a faster equivalent of \code{\link[base]{order}}. Argument \code{method} other than \code{"radix"} will redirect computation to \code{base::order}. + + \code{forderv} has a different interface and is meant to be used on \code{data.table}. If the input is already ordered, it returns \code{integer()} rather than \code{seq_len(nrow(x))}. Default value of \code{na.last} is \code{FALSE}, unlike \code{forder}. When \code{na.last} is \code{NA}, then zeros are returned for missing observations, unlike \code{forder}. + + Both functions find the order in \emph{C-locale}, for details see the \emph{Note} section. +} +\usage{ + forder(\dots, na.last=TRUE, decreasing=FALSE, method=c("auto","shell","radix")) + forderv(x, by=seq_along(x), retGrp=FALSE, sort=TRUE, order=1L, na.last=FALSE) +} +\arguments{ + \item{\dots}{ Vectors on which to find permutation, variable names must not be quoted. To find descending order prefix the symbol \code{"-"} which means \emph{descending} (not \emph{negative}, in this context), i.e., \code{forder(a, -b, c)}. The \code{-b} works when \code{b} is of type \code{character} as well. } + \item{na.last}{ \code{logical}. If \code{TRUE}, missing values in the data are placed last (default for \code{forder}); if \code{FALSE} they are placed first (default for \code{forderv}); if \code{NA} they are removed. } + \item{decreasing}{ \code{logical} defaults to \code{FALSE}. When \code{TRUE} then it finds descending order in data, or when combined with \code{"-"} prefixes it will invert the order. Scalar will be recycled to match the number of arguments passed to \dots. } + \item{method}{ \code{character}. When omitted or \code{"radix"}, \code{data.table}'s own radix ordering is used; any other value will redirect computation to \code{base::order}. } + \item{x}{ A \code{data.table}. } + \item{by}{ A \code{character} vector of column names of \code{x} by which to order. By default, find order over all columns; Do not add \code{"-"} prefixes here, use \code{order} argument instead. 
} + \item{retGrp}{ \code{logical} defaults to \code{FALSE}. When \code{TRUE} then resulting object will carry extra attributes \code{starts} and \code{maxgrpn}. } + \item{sort}{ \code{logical} defaults to \code{TRUE}. When \code{FALSE} then results will be only identifying the groups and not returning the order. } + \item{order}{ An \code{integer} vector with only possible values of \code{1} and \code{-1}, corresponding to ascending and descending order. The length of \code{order} must be either \code{1} or equal to that of \code{cols}. If \code{length(order)==1}, it is recycled to \code{length(by)}. } +} +\details{ + \code{data.table} implements its own fast \emph{radix}-based ordering. See the references for some exposition on the concept of \emph{radix} sort. + + Note that \code{data.table}'s fast order has been contributed to base R 3.3.0 in 2016. Since then \code{data.table}'s fast order evolved and is now parallel, uses multiple CPU threads. + + \code{bit64::integer64} type is also supported for finding order. + + Queries like \code{x[order(...)]} are optimised internally to use \code{data.table}'s fast order. Moreover queries \code{x[order(..., na.last=FALSE)]} can re-use existing indices. +} +\note{ + Using \emph{C-locale} makes the behaviour of ordering in \code{data.table} more consistent across sessions and locales. The behaviour of \code{base::order} depends on assumptions about the locale of the R session. In English locales, \code{"america" < "BRAZIL"} is \code{TRUE} by default but false if you either type \code{Sys.setlocale(locale="C")} or the R session has been started in a C locale for you -- which can happen on servers/services since the locale comes from the environment the R session was started in. By contrast, \code{"america" < "BRAZIL"} is always \code{FALSE} in \code{data.table} regardless of the way your R session was started. 
+} +\references{ + \url{https://en.wikipedia.org/wiki/Radix_sort}\cr + \url{https://en.wikipedia.org/wiki/Counting_sort}\cr + \url{http://stereopsis.com/radix.html}\cr + \url{https://codercorner.com/RadixSortRevisited.htm}\cr + \url{https://medium.com/basecs/getting-to-the-root-of-sorting-with-radix-sort-f8e9240d4224} +} +\value{ + Integer vector. + + \code{forder} returns a vector of the same length as the vectors in \dots. If \code{na.last} is \code{NA}, then returned values are limited to non-missing observations in input. + + \code{forderv} returns a vector of length \code{nrow(x)}; if \code{x} is already ordered by the provided \code{by} argument, a zero-length integer vector, \code{integer()}, is returned rather than \code{seq_len(nrow(x))}. If \code{na.last} is \code{NA}, then zeros are returned as the order of missing observations. +} +\seealso{ + \code{\link{setkey}}, \code{\link{setorder}}, \code{\link{fsort}} +} +\examples{ +x = 1:5 +d = data.table(x) +forder(x) +forderv(d, by="x") + +x[4L] = NA +set(d, 4L, "x", NA) +forder(x) +forderv(d, by="x") + +forder(x, na.last=NA) +forderv(d, by="x", na.last=NA) +} +\keyword{ data } From 3813425217201dcda74db72253f90ef9b3507dd0 Mon Sep 17 00:00:00 2001 From: jangorecki Date: Mon, 18 May 2020 22:26:08 +0100 Subject: [PATCH 09/14] manual fixes --- man/forder.Rd | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/man/forder.Rd b/man/forder.Rd index c8809ee915..ea119c6ebd 100644 --- a/man/forder.Rd +++ b/man/forder.Rd @@ -1,4 +1,5 @@ \name{forder} +\alias{forder} \alias{order} \alias{fastorder} \alias{forderv} @@ -57,6 +58,10 @@ \code{\link{setkey}}, \code{\link{setorder}}, \code{\link{fsort}} } \examples{ +## not yet exported +forder = data.table:::forder +forderv = data.table:::forderv + x = 1:5 d = data.table(x) forder(x) From 4b9a9d746bde268d611f23e8f8549491bb7aa836 Mon Sep 17 00:00:00 2001 From: jangorecki Date: Mon, 18 May 2020 22:27:01 +0100 Subject: [PATCH 10/14] rm duplicated aliases 
--- man/setorder.Rd | 4 ---- 1 file changed, 4 deletions(-) diff --git a/man/setorder.Rd b/man/setorder.Rd index 6e7b598427..307ab956d9 100644 --- a/man/setorder.Rd +++ b/man/setorder.Rd @@ -1,10 +1,6 @@ \name{setorder} \alias{setorder} \alias{setorderv} -\alias{order} -\alias{fastorder} -\alias{forder} -\alias{forderv} \title{Fast row reordering of a data.table by reference} \description{ From 618f4d50bf155f55a832d61a020c6dd220ac6536 Mon Sep 17 00:00:00 2001 From: jangorecki Date: Mon, 18 May 2020 22:41:37 +0100 Subject: [PATCH 11/14] test call in method arg --- inst/tests/tests.Rraw | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 45a7e3f7ad..7fa072e327 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -16916,7 +16916,7 @@ test(TEST+0.46, notOutput=opt.msg, d[order(x, decreasing=NA), verbose=TRUE], err test(TEST+0.47, notOutput=opt.msg, d[order(x, y, decreasing=c(FALSE,FALSE,FALSE)), verbose=TRUE], error="must be either length 1, or length of the variables.*order") test(TEST+0.48, notOutput=opt.msg, d[order(x, na.last=1L), verbose=TRUE], error="must be logical TRUE, FALSE or NA of length 1") test(TEST+0.49, notOutput=opt.msg, data.table(x=as.raw(1:2))[order(x), verbose=TRUE], error="order is type 'raw', not yet supported") -test(TEST+0.50, notOutput=opt.msg, d[order(x, y, method="auto"), verbose=TRUE], d[3:1]) +test(TEST+0.50, notOutput=opt.msg, d[order(x, y, method=as.character("auto")), verbose=TRUE], d[3:1]) v1 = c(4L,1L,NA,2L) # forder removes 0 for NAs d1 = data.table(v1) test(TEST+0.51, forder(v1, na.last=NA), c(2L,4L,1L)) From 04bdad98ab8185fd0f161a98a17109d0ff19dc8c Mon Sep 17 00:00:00 2001 From: jangorecki Date: Tue, 19 May 2020 12:00:43 +0100 Subject: [PATCH 12/14] improve manual --- man/forder.Rd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/man/forder.Rd b/man/forder.Rd index ea119c6ebd..3f747a10e1 100644 --- a/man/forder.Rd +++ b/man/forder.Rd @@ 
-26,7 +26,7 @@ \item{by}{ A \code{character} vector of column names of \code{x} by which to order. By default, find order over all columns; Do not add \code{"-"} prefixes here, use \code{order} argument instead. } \item{retGrp}{ \code{logical} defaults to \code{FALSE}. When \code{TRUE} then resulting object will carry extra attributes \code{starts} and \code{maxgrpn}. } \item{sort}{ \code{logical} defaults to \code{TRUE}. When \code{FALSE} then results will be only identifying the groups and not returning the order. } - \item{order}{ An \code{integer} vector with only possible values of \code{1} and \code{-1}, corresponding to ascending and descending order. The length of \code{order} must be either \code{1} or equal to that of \code{cols}. If \code{length(order)==1}, it is recycled to \code{length(by)}. } + \item{order}{ An \code{integer} vector with only possible values of \code{1} and \code{-1}, corresponding to ascending and descending order. The length of \code{order} must be either \code{1} or equal to that of \code{by}. If \code{length(order)==1}, it is recycled to \code{length(by)}. It is ignored when \code{sort} is \code{FALSE}. } } \details{ \code{data.table} implements its own fast \emph{radix}-based ordering. See the references for some exposition on the concept of \emph{radix} sort. 
From 4cbb8ce96585fc3657b382976c1f666b920b392e Mon Sep 17 00:00:00 2001 From: jangorecki Date: Tue, 19 May 2020 12:52:49 +0100 Subject: [PATCH 13/14] minor improvement and comments --- R/data.table.R | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index 7af2874cc0..a3c5b0332b 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -113,20 +113,20 @@ replace_dot_alias = function(e) { if (!is.character(method)) method = eval(method, ienv) if (!identical(method, "radix")) return(x) } - ## escape invalid decreasing, outsource raising error + ## escape invalid decreasing if ("decreasing" %chin% call.nm) { decreasing = x[["decreasing"]] if (!is.logical(decreasing)) decreasing = eval(decreasing, ienv) - if (!is.logical(decreasing) || !length(decreasing) || anyNA(decreasing)) return(x) + if (!is.logical(decreasing) || !length(decreasing) || anyNA(decreasing)) return(x) ## outsource raising error } else decreasing = NULL - ## escape invalid na.last, outsource raising error + ## escape invalid na.last if ("na.last" %chin% call.nm) { na.last = x[["na.last"]] if (!is.logical(na.last)) na.last = eval(na.last, ienv) - if (!is.logical(na.last)) return(x) + if (!is.logical(na.last)) return(x) ## outsource raising error } else na.last = TRUE ## decompose variables in dots - order.args = c("decreasing","method","na.last") ## formalArgs(order) - "..." 
+ order.args = c("decreasing","method","na.last") ## formalArgs(order) - "...", tested in main.Rraw order.call = if (!is.null(call.nm)) x[!call.nm %chin% order.args] else x dots = as.list(order.call[-1]) ## escapy empty input @@ -138,11 +138,11 @@ replace_dot_alias = function(e) { if (any(!order.vars %chin% names(dt))) return(x) ## escape for any unsupported type supported = c("integer","double","logical","character","complex") - if (any(vapply(order.vars, function(v) !typeof(dt[[v]])%chin%supported, NA))) return(x) - ## decreasing recycle, outsource raising error + if (any(vapply(order.vars, function(v) !typeof(dt[[v]])%chin%supported, NA))) return(x) ## outsource raising error + ## decreasing recycle decreasing = if (is.null(decreasing)) rep(FALSE, length(order.vars)) else { - if (length(decreasing)!=1L && length(decreasing)!=length(order.vars)) return(x) - if (length(decreasing)==1L && length(order.vars)>1L) decreasing = rep(decreasing, length(order.vars)) else decreasing + if (length(decreasing)!=1L && length(decreasing)!=length(order.vars)) return(x) ## outsource raising error + if (length(decreasing)==1L && length(order.vars)>1L) rep(decreasing, length(order.vars)) else decreasing } ## forderv arguments by = vector("character", length(order.vars)) From 7791b6ea9c3589020afdb5679627a383bb2a13f7 Mon Sep 17 00:00:00 2001 From: jangorecki Date: Wed, 20 May 2020 10:18:23 +0100 Subject: [PATCH 14/14] mention new na.last opt from lazy-forder --- man/forder.Rd | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/man/forder.Rd b/man/forder.Rd index 3f747a10e1..75bed39400 100644 --- a/man/forder.Rd +++ b/man/forder.Rd @@ -22,7 +22,7 @@ \item{na.last}{ \code{logical}. If \code{TRUE}, missing values in the data are placed last (default for \code{forder}); if \code{FALSE} they are placed first (default for \code{forderv}); if \code{NA} they are removed. } \item{decreasing}{ \code{logical} defaults to \code{FALSE}. 
When \code{TRUE} then it finds descending order in data, or when combined with \code{"-"} prefixes it will invert the order. Scalar will be recycled to match the number of arguments passed to \dots. } \item{method}{ \code{character} defaults to \code{"radix"}, any other value will redirect computation to \code{base::order}. } - \item{x}{ A \code{data.table}. } + \item{x}{ A \code{data.table}, or atomic types. } \item{by}{ A \code{character} vector of column names of \code{x} by which to order. By default, find order over all columns; Do not add \code{"-"} prefixes here, use \code{order} argument instead. } \item{retGrp}{ \code{logical} defaults to \code{FALSE}. When \code{TRUE} then resulting object will carry extra attributes \code{starts} and \code{maxgrpn}. } \item{sort}{ \code{logical} defaults to \code{TRUE}. When \code{FALSE} then results will be only identifying the groups and not returning the order. } @@ -35,7 +35,7 @@ \code{bit64::integer64} type is also supported for finding order. - Queries like \code{x[order(...)]} are optimised internally to use \code{data.table}'s fast order. Moreover queries \code{x[order(..., na.last=FALSE)]} can re-use existing indices. + Queries like \code{x[order(...)]} are optimised internally to use \code{data.table}'s fast order. Moreover queries \code{x[order(..., na.last=FALSE)]} can re-use existing indices, and queries \code{x[order(...)]} can re-use existing indices if there are no missing values in columns passed to order by. } \note{ Using \emph{C-locale} makes the behaviour of ordering in \code{data.table} more consistent across sessions and locales. The behaviour of \code{base::order} depends on assumptions about the locale of the R session. 
In English locales, \code{"america" < "BRAZIL"} is \code{TRUE} by default but false if you either type \code{Sys.setlocale(locale="C")} or the R session has been started in a C locale for you -- which can happen on servers/services since the locale comes from the environment the R session was started in. By contrast, \code{"america" < "BRAZIL"} is always \code{FALSE} in \code{data.table} regardless of the way your R session was started.