Rdatatable · mattdowle · May 29, 2019 · May 26, 2019 · May 29, 2019
@@ -90,6 +90,9 @@
 
 14. `setkey` to an existing index now uses the index, [#2889](https://github.com/Rdatatable/data.table/issues/2889). Thanks to @MichaelChirico for suggesting and @saraswatmks for the PR.
 
+15. `DT[order(col)[1:5], ...]` (i.e. where `i` is a compound expression involving `order()`) is now optimized to use `data.table`'s multithreaded `forder`, [#1921](https://github.com/Rdatatable/data.table/issues/1921). This example is still not a fully optimal top-N query, since the full ordering is computed before the first 5 rows are selected. The improvement is that the call to `order()` itself is now computed faster wherever it appears in `i`.
+
+
 #### BUG FIXES
 
 1. `first`, `last`, `head` and `tail` by group no longer error in some cases, [#2030](https://github.com/Rdatatable/data.table/issues/2030) [#3462](https://github.com/Rdatatable/data.table/issues/3462). Thanks to @franknarf1 for reporting.

@@ -218,6 +218,26 @@ replace_dot_alias = function(e) {
   }
 }
 
+# Rewrite calls to order() found inside the i expression into forder(x, ...).
+#
+# isub    : the unevaluated i expression. Anything that is not a call, and any call
+#           of length 1, is returned untouched.
+# verbose : when TRUE, a message is printed for each order() call that gets rewritten.
+# env     : environment in which the flag `eval_forder` is set to TRUE as soon as one
+#           rewrite has happened, so the caller can tell that a rewrite took place.
+#
+# Returns isub with each order(...) call that the scan reaches replaced by
+# forder(x, ...). The arguments of a rewritten order() call are carried over as-is;
+# they are not scanned again.
+replace_order = function(isub, verbose, env) {
+  # only calls can contain order(); returning early also avoids an element-by-element
+  # R loop over a long atomic vector or list should one be passed as isub
+  if (!is.call(isub) || length(isub) == 1L) return(isub)
+  for (ii in seq_along(isub)) {
+    isub_el = isub[[ii]]
+    # an empty argument, as in m[, 1]; touching it below would error, so stop scanning here
+    if (missing(isub_el)) break
+    if (is.name(isub_el)) {
+      # stop base::order from becoming forder(x, base, order)
+      if (isub_el == '::') break
+      # only the function position counts: a symbol `order` that is merely an argument or
+      # a member name (f(order), lst$order(...), pkg:::order) is not a call to order()
+      if (ii == 1L && isub_el == 'order') {
+        if (verbose) cat("order optimisation is on, changed 'order(...)' in i to 'forder(x, ...)'.\n")
+        env$eval_forder = TRUE
+        return(as.call(c(list(quote(forder), quote(x)), as.list(isub)[-1L])))
+      }
+    }
+    if (is.call(isub_el)) isub[[ii]] = replace_order(isub_el, verbose, env)
+  }
+  return(isub)
+}
+
 "[.data.table" = function (x, i, j, by, keyby, with=TRUE, nomatch=getOption("datatable.nomatch"), mult="all", roll=FALSE, rollends=if (roll=="nearest") c(TRUE,TRUE) else if (roll>=0) c(FALSE,TRUE) else c(TRUE,FALSE), which=FALSE, .SDcols, verbose=getOption("datatable.verbose"), allow.cartesian=getOption("datatable.allow.cartesian"), drop=NULL, on=NULL)
 {
   # ..selfcount <<- ..selfcount+1  # in dev, we check no self calls, each of which doubles overhead, or could
@@ -417,31 +437,34 @@ replace_dot_alias = function(e) {
       if (is.call(isub) && isub[[1L]] == "(" && !is.name(isub[[2L]]))
         isub = isub[[2L]]
     }
-    if (is.call(isub) && isub[[1L]] == as.name("order") && getOption("datatable.optimize") >= 1) { # optimize here so that we can switch it off if needed
-      if (verbose) cat("order optimisation is on, i changed from 'order(...)' to 'forder(DT, ...)'.\n")
-      isub = as.list(isub)
-      isub = as.call(c(list(quote(forder), quote(x)), isub[-1L]))
-    }
+
     if (is.null(isub)) return( null.data.table() )
-    if (is.call(isub) && isub[[1L]] == quote(forder)) {
+
+    # optimize here so that we can switch it off if needed
+    check_eval_env = environment()
+    check_eval_env$eval_forder = FALSE
+    if (getOption("datatable.optimize") >= 1) {
+      isub = replace_order(isub, verbose, check_eval_env)
+    } 
+    if (check_eval_env$eval_forder) {
       order_env = new.env(parent=parent.frame())            # until 'forder' is exported
       assign("forder", forder, order_env)
       assign("x", x, order_env)
-      i = eval(isub, order_env, parent.frame())             # for optimisation of 'order' to 'forder'
+      i = eval(.massagei(isub), order_env, parent.frame())             # for optimisation of 'order' to 'forder'
       # that forder returns empty integer() is taken care of internally within forder
     } else if (length(o <- .prepareFastSubset(isub = isub, x = x,
                                               enclos =  parent.frame(),
                                               notjoin = notjoin, verbose = verbose))){
-        ## redirect to the is.data.table(x) == TRUE branch.
-        ## Additional flag to adapt things after bmerge:
-        optimizedSubset = TRUE
-        notjoin = o$notjoin
-        i = o$i
-        on = o$on
-        ## the following two are ignored if i is not a data.table.
-        ## Since we are converting i to data.table, it is important to set them properly.
-        nomatch = 0L
-        mult = "all"
+      ## redirect to the is.data.table(x) == TRUE branch.
+      ## Additional flag to adapt things after bmerge:
+      optimizedSubset = TRUE
+      notjoin = o$notjoin
+      i = o$i
+      on = o$on
+      ## the following two are ignored if i is not a data.table.
+      ## Since we are converting i to data.table, it is important to set them properly.
+      nomatch = 0L
+      mult = "all"
     }
     else if (!is.name(isub)) {
       i = tryCatch(eval(.massagei(isub), x, parent.frame()),
@@ -453,8 +476,8 @@ replace_dot_alias = function(e) {
         # must be "not found" since isub is a mere symbol
         col = try(eval(isub, x), silent=TRUE)  # is it a column name?
         msg = if (inherits(col,"try-error")) " and it is not a column name either."
-              else paste0(" but it is a column of type ", typeof(col),". If you wish to select rows where that column contains TRUE",
-                          ", or perhaps that column contains row numbers of itself to select, try DT[(col)], DT[DT$col], or DT[col==TRUE] is particularly clear and is optimized.")
+        else paste0(" but it is a column of type ", typeof(col),". If you wish to select rows where that column contains TRUE",
+                    ", or perhaps that column contains row numbers of itself to select, try DT[(col)], DT[DT$col], or DT[col==TRUE] is particularly clear and is optimized.")
         stop(as.character(isub), " is not found in calling scope", msg,
              " When the first argument inside DT[...] is a single symbol (e.g. DT[var]), data.table looks for var in calling scope.")
       }

@@ -13505,7 +13505,7 @@ test(1967.69, !any(grepl('order optimization', verbose_output, fixed = TRUE)))
 test(1967.70, any(grepl('[1] 5', verbose_output, fixed = TRUE)))
 options('datatable.optimize' = 1L)
 test(1967.71, x[order(a), .N, verbose = TRUE], 5L,
-     output = "i changed from 'order(...)' to 'forder(")
+     output = "changed 'order(...)' in i to 'forder(")
 setkey(x)
 test(1967.72, x[x, .N, on = 'a', verbose = TRUE], 5L,
      output = "on= matches existing key")
@@ -14965,6 +14965,15 @@ DF = data.frame(date = as.IDate(0L))
 test(2053.1, storage.mode(rbind(DF, DF)$date), 'integer')
 test(2053.2, DF$date[1L] <- integer(), integer())
 
+# order() nested inside a larger i expression (not only as the outermost call) is rewritten to forder, #1921
+DT = data.table(
+  A = c(2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 1L),
+  B = c("b", "c", "a", "b", "b", "b", "c", "a", "b", "a"),
+  C = c(2L, 3L, 5L, 8L, 6L, 1L, 4L, 9L, 10L, 7L)  # the five smallest values 1..5 sit in rows 6, 1, 2, 7, 3
+)
+test(2054, DT[order(C)[1:5], B, verbose=TRUE], c('b', 'b', 'c', 'c', 'a'),  # B taken from rows 6, 1, 2, 7, 3
+           output = 'order optimisation is on')  # the verbose message confirms the rewrite took place
+
 
 ###################################
 #  Add new tests above this line  #