Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,9 @@

14. `setkey` to an existing index now uses the index, [#2889](https://github.com/Rdatatable/data.table/issues/2889). Thanks to @MichaelChirico for suggesting and @saraswatmks for the PR.

15. `DT[order(col)[1:5], ...]` (i.e. where `i` is a compound expression involving `order()`) is now optimized to use `data.table`'s multithreaded `forder`, [#1921](https://github.com/Rdatatable/data.table/issues/1921). This example is not a fully optimal top-N query since the full ordering is still computed. The improvement is that the call to `order()` is computed faster for any `i` expression using `order`.


#### BUG FIXES

1. `first`, `last`, `head` and `tail` by group no longer error in some cases, [#2030](https://github.com/Rdatatable/data.table/issues/2030) [#3462](https://github.com/Rdatatable/data.table/issues/3462). Thanks to @franknarf1 for reporting.
Expand Down
61 changes: 42 additions & 19 deletions R/data.table.R
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,26 @@ replace_dot_alias = function(e) {
}
}

# Rewrite calls to order(...) found inside the i expression as forder(x, ...).
#
# Arguments:
#   isub    - unevaluated i expression, or one of its sub-expressions while recursing
#   verbose - when TRUE, a message is printed each time a call is rewritten
#   env     - environment in which eval_forder is set to TRUE once a rewrite happens,
#             so the caller knows the result has to be evaluated with forder available
# Value:
#   isub with the order(...) calls reached by the scan replaced by forder(x, ...).
#
# A symbol is only treated as a call to order when it sits in function position
# (first element of the call). A symbol named 'order' that is merely an argument,
# e.g. `order > 5` or `sapply(l, order)`, is left untouched.
replace_order = function(isub, verbose, env) {
  # a bare symbol or constant cannot contain a call to rewrite
  if (length(isub) == 1L) return(isub)
  for (ii in seq_along(isub)) {
    isub_el = isub[[ii]]
    # an empty argument such as the trailing one in m[1, ] ends the scan of this call
    if (missing(isub_el)) break
    if (ii == 1L && is.name(isub_el)) {
      # stop base::order from becoming forder(x, base, order)
      if (isub_el == '::') break
      if (isub_el == 'order') {
        if (verbose) cat("order optimisation is on, changed 'order(...)' in i to 'forder(x, ...)'.\n")
        env$eval_forder = TRUE
        # keep the original arguments, prepend x, swap the function symbol
        return(as.call(c(list(quote(forder), quote(x)), as.list(isub)[-1L])))
      }
    }
    # descend into nested calls, including a call in function position such as foo$bar(...)
    if (is.call(isub_el)) isub[[ii]] = replace_order(isub_el, verbose, env)
  }
  return(isub)
}

"[.data.table" = function (x, i, j, by, keyby, with=TRUE, nomatch=getOption("datatable.nomatch"), mult="all", roll=FALSE, rollends=if (roll=="nearest") c(TRUE,TRUE) else if (roll>=0) c(FALSE,TRUE) else c(TRUE,FALSE), which=FALSE, .SDcols, verbose=getOption("datatable.verbose"), allow.cartesian=getOption("datatable.allow.cartesian"), drop=NULL, on=NULL)
{
# ..selfcount <<- ..selfcount+1 # in dev, we check no self calls, each of which doubles overhead, or could
Expand Down Expand Up @@ -417,31 +437,34 @@ replace_dot_alias = function(e) {
if (is.call(isub) && isub[[1L]] == "(" && !is.name(isub[[2L]]))
isub = isub[[2L]]
}
if (is.call(isub) && isub[[1L]] == as.name("order") && getOption("datatable.optimize") >= 1) { # optimize here so that we can switch it off if needed
if (verbose) cat("order optimisation is on, i changed from 'order(...)' to 'forder(DT, ...)'.\n")
isub = as.list(isub)
isub = as.call(c(list(quote(forder), quote(x)), isub[-1L]))
}

if (is.null(isub)) return( null.data.table() )
if (is.call(isub) && isub[[1L]] == quote(forder)) {

# optimize here so that we can switch it off if needed
check_eval_env = environment()
check_eval_env$eval_forder = FALSE
if (getOption("datatable.optimize") >= 1) {
isub = replace_order(isub, verbose, check_eval_env)
}
if (check_eval_env$eval_forder) {
order_env = new.env(parent=parent.frame()) # until 'forder' is exported
assign("forder", forder, order_env)
assign("x", x, order_env)
i = eval(isub, order_env, parent.frame()) # for optimisation of 'order' to 'forder'
i = eval(.massagei(isub), order_env, parent.frame()) # for optimisation of 'order' to 'forder'
# that forder returns empty integer() is taken care of internally within forder
} else if (length(o <- .prepareFastSubset(isub = isub, x = x,
enclos = parent.frame(),
notjoin = notjoin, verbose = verbose))){
## redirect to the is.data.table(x) == TRUE branch.
## Additional flag to adapt things after bmerge:
optimizedSubset = TRUE
notjoin = o$notjoin
i = o$i
on = o$on
## the following two are ignored if i is not a data.table.
## Since we are converting i to data.table, it is important to set them properly.
nomatch = 0L
mult = "all"
## redirect to the is.data.table(x) == TRUE branch.
## Additional flag to adapt things after bmerge:
optimizedSubset = TRUE
notjoin = o$notjoin
i = o$i
on = o$on
## the following two are ignored if i is not a data.table.
## Since we are converting i to data.table, it is important to set them properly.
nomatch = 0L
mult = "all"
}
else if (!is.name(isub)) {
i = tryCatch(eval(.massagei(isub), x, parent.frame()),
Expand All @@ -453,8 +476,8 @@ replace_dot_alias = function(e) {
# must be "not found" since isub is a mere symbol
col = try(eval(isub, x), silent=TRUE) # is it a column name?
msg = if (inherits(col,"try-error")) " and it is not a column name either."
else paste0(" but it is a column of type ", typeof(col),". If you wish to select rows where that column contains TRUE",
", or perhaps that column contains row numbers of itself to select, try DT[(col)], DT[DT$col], or DT[col==TRUE] is particularly clear and is optimized.")
else paste0(" but it is a column of type ", typeof(col),". If you wish to select rows where that column contains TRUE",
", or perhaps that column contains row numbers of itself to select, try DT[(col)], DT[DT$col], or DT[col==TRUE] is particularly clear and is optimized.")
stop(as.character(isub), " is not found in calling scope", msg,
" When the first argument inside DT[...] is a single symbol (e.g. DT[var]), data.table looks for var in calling scope.")
}
Expand Down
11 changes: 10 additions & 1 deletion inst/tests/tests.Rraw
Original file line number Diff line number Diff line change
Expand Up @@ -13505,7 +13505,7 @@ test(1967.69, !any(grepl('order optimization', verbose_output, fixed = TRUE)))
test(1967.70, any(grepl('[1] 5', verbose_output, fixed = TRUE)))
options('datatable.optimize' = 1L)
test(1967.71, x[order(a), .N, verbose = TRUE], 5L,
output = "i changed from 'order(...)' to 'forder(")
output = "changed 'order(...)' in i to 'forder(")
setkey(x)
test(1967.72, x[x, .N, on = 'a', verbose = TRUE], 5L,
output = "on= matches existing key")
Expand Down Expand Up @@ -14965,6 +14965,15 @@ DF = data.frame(date = as.IDate(0L))
test(2053.1, storage.mode(rbind(DF, DF)$date), 'integer')
test(2053.2, DF$date[1L] <- integer(), integer())

# forder detected more generally in i, #1921
# order() is nested inside a larger i expression here (its result is subset with [1:5]),
# rather than being the whole of i. C is a permutation of 1:10, so the expected B values
# are those of the rows holding C = 1..5.
DT = data.table(
A = c(2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 1L),
B = c("b", "c", "a", "b", "b", "b", "c", "a", "b", "a"),
C = c(2L, 3L, 5L, 8L, 6L, 1L, 4L, 9L, 10L, 7L)
)
# verbose=TRUE so the output can be checked for the message printed when order is rewritten
test(2054, DT[order(C)[1:5], B, verbose=TRUE], c('b', 'b', 'c', 'c', 'a'),
output = 'order optimisation is on')


###################################
# Add new tests above this line #
Expand Down