From c46d908b9ae9c1a6afd0754a5e1e252f5e0cc601 Mon Sep 17 00:00:00 2001 From: Cole Miller <57992489+ColeMiller1@users.noreply.github.com> Date: Wed, 29 Jan 2020 23:23:29 -0500 Subject: [PATCH 1/6] Prevent get reordering on assignment This also includes slight refactoring to eliminate one *TODO* comment --- R/data.table.R | 57 +++++++++++++++++++++++++++++--------------------- 1 file changed, 33 insertions(+), 24 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index 1fee0396a9..352f8b142f 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -968,7 +968,39 @@ replace_dot_alias = function(e) { } # fix for long standing FR/bug, #495 and #484 allcols = c(names_x, xdotprefix, names_i, idotprefix) - if ( length(non_sdvars <- setdiff(intersect(av, allcols), c(bynames, ansvars))) ) { + non_sdvars <- setdiff(intersect(av, allcols), c(bynames, ansvars)) + + # added 'mget' - fix for #994 + if (any(c("get", "mget") %chin% av)){ + # get('varname') is too difficult to detect which columns are used in general + # eval(macro) column names are detected via the if jsub[[1]]==eval switch earlier above. + b_warn = FALSE + if (is.call(jsub) && length(jsub[[1L]]) == 1L && jsub[[1L]] == ":="){ + if (is.symbol(jsub[[2L]])) { + jsub_lhs_symbol = as.character(jsub[[2L]]) + if (jsub_lhs_symbol %chin% non_sdvars) { + sdvars = setdiff(sdvars, jsub_lhs_symbol) + } + } else { + # fix for #4089 - if all names are already accounted for, we don't need to re-extract the names for the (m)get call + b_warn = is.null(names_i) && all(names_x %chin% c(ansvars, allbyvars)) + } + } + if (!b_warn) { + if (verbose) + cat("'(m)get' found in j. ansvars being set to all columns. Use .SDcols or a single j=eval(macro) instead. Both will detect the columns used which is important for efficiency.\nOld:", paste(ansvars,collapse=","),"\n") + ansvars = setdiff(allcols, bynames) # fix for bug #5443 + non_sdvars = setdiff(ansvars, sdvars) + ansvals = chmatch(ansvars, names_x) + if (verbose) + cat("New:",paste(ansvars,collapse=","),"\n") + } else { ## TODO remove this else statement in Jan. 2023 + newvars = setdiff(allcols, bynames) # checking if old cols c("y", "x") became c("x", "y") + if (length(newvars) == length(ansvars) && !all(newvars == ansvars)) { + warning("(m)get found in j with update assignment (i.e., `(cols) := ...`). j previously was re-ordered. See Issue #4089") + } + } + } else if (length(non_sdvars)) { # we've a situation like DT[, c(sum(V1), lapply(.SD, mean)), by=., .SDcols=...] or # DT[, lapply(.SD, function(x) x *v1), by=, .SDcols=...] etc., ansvars = union(ansvars, non_sdvars) @@ -988,29 +1020,6 @@ replace_dot_alias = function(e) { } # if (!length(ansvars)) Leave ansvars empty. Important for test 607. - # TODO remove as (m)get is now folded in above. - # added 'mget' - fix for #994 - if (any(c("get", "mget") %chin% av)) { - if (verbose) { - cat("'(m)get' found in j. ansvars being set to all columns. Use .SDcols or a single j=eval(macro) instead. Both will detect the columns used which is important for efficiency.\nOld:", paste(ansvars,collapse=","),"\n") - # get('varname') is too difficult to detect which columns are used in general - # eval(macro) column names are detected via the if jsub[[1]]==eval switch earlier above. - } - - # Do not include z in .SD when dt[, z := {.SD; get("x")}, .SDcols = "y"] (#2326, #2338) - if (is.call(jsub) && length(jsub[[1L]]) == 1L && jsub[[1L]] == ":=" && is.symbol(jsub[[2L]])) { - jsub_lhs_symbol = as.character(jsub[[2L]]) - if (jsub_lhs_symbol %chin% non_sdvars) { - sdvars = setdiff(sdvars, jsub_lhs_symbol) - } - } - allcols = c(names_x, xdotprefix, names_i, idotprefix) - ansvars = setdiff(allcols, bynames) # fix for bug #5443 - non_sdvars = setdiff(ansvars, sdvars) - ansvals = chmatch(ansvars, names_x) - if (verbose) cat("New:",paste(ansvars,collapse=","),"\n") - } - lhs = NULL newnames = NULL suppPrint = identity From e1ccde01491056847ee13a482e77579e1c338a73 Mon Sep 17 00:00:00 2001 From: Cole Miller <57992489+ColeMiller1@users.noreply.github.com> Date: Wed, 29 Jan 2020 23:30:30 -0500 Subject: [PATCH 2/6] Update NEWS.md --- NEWS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/NEWS.md b/NEWS.md index d1b6b90bcd..facde8c113 100644 --- a/NEWS.md +++ b/NEWS.md @@ -79,6 +79,7 @@ unit = "s") 3. Dispatch of `first` and `last` functions now properly works again for `xts` objects, [#4053](https://github.com/Rdatatable/data.table/issues/4053). Thanks to @ethanbsmith for reporting. +4. When updating by reference, the use of `get` could result in columns being re-ordered silently, [#4089](https://github.com/Rdatatable/data.table/issues/4089). Thanks to @dmongin for reporting and Cole Miller for the fix. ## NOTES 1. `as.IDate`, `as.ITime`, `second`, `minute`, and `hour` now recognize UTC equivalents for speed: GMT, GMT-0, GMT+0, GMT0, Etc/GMT, and Etc/UTC, [#4116](https://github.com/Rdatatable/data.table/issues/4116). From fad5b9713530338b0b905f2903572f5a5b39444a Mon Sep 17 00:00:00 2001 From: Cole Miller <57992489+ColeMiller1@users.noreply.github.com> Date: Wed, 29 Jan 2020 23:32:39 -0500 Subject: [PATCH 3/6] better warning --- R/data.table.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/data.table.R b/R/data.table.R index 352f8b142f..157192a93d 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -997,7 +997,7 @@ replace_dot_alias = function(e) { } else { ## TODO remove this else statement in Jan. 2023 newvars = setdiff(allcols, bynames) # checking if old cols c("y", "x") became c("x", "y") if (length(newvars) == length(ansvars) && !all(newvars == ansvars)) { - warning("(m)get found in j with update assignment (i.e., `(cols) := ...`). j previously was re-ordered. See Issue #4089") + warning("(m)get found in j with update assignment (i.e., `(cols) := ...`). j column order was re-ordered prior to version 1.12.9. See Issue #4089") } } } else if (length(non_sdvars)) { From 47e53eacb8ef747f30148c7f9c943e08cbd48acf Mon Sep 17 00:00:00 2001 From: Cole Miller <57992489+ColeMiller1@users.noreply.github.com> Date: Sat, 1 Feb 2020 19:20:49 -0500 Subject: [PATCH 4/6] Update data.table.R --- R/data.table.R | 42 +++++++++++++++++++----------------------- 1 file changed, 19 insertions(+), 23 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index 1fc39cc40b..8ea33d28d7 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -968,39 +968,35 @@ replace_dot_alias = function(e) { } # fix for long standing FR/bug, #495 and #484 allcols = c(names_x, xdotprefix, names_i, idotprefix) - non_sdvars <- setdiff(intersect(av, allcols), c(bynames, ansvars)) - + non_sdvars = setdiff(intersect(av, allcols), c(bynames, ansvars)) + # added 'mget' - fix for #994 if (any(c("get", "mget") %chin% av)){ - # get('varname') is too difficult to detect which columns are used in general - # eval(macro) column names are detected via the if jsub[[1]]==eval switch earlier above. - b_warn = FALSE + if (verbose) + cat(gettextf("'(m)get' found in j. ansvars being set to all columns. Use .SDcols or a single j=eval(macro) instead. Both will detect the columns used which is important for efficiency.\nOld ansvars: %s \n", paste(ansvars,collapse=","), domain = "R-data.table")) + # get('varname') is too difficult to detect which columns are used in general + # eval(macro) column names are detected via the if jsub[[1]]==eval switch earlier above. + + # Do not include z in .SD when dt[, z := {.SD; get("x")}, .SDcols = "y"] (#2326, #2338) if (is.call(jsub) && length(jsub[[1L]]) == 1L && jsub[[1L]] == ":="){ if (is.symbol(jsub[[2L]])) { jsub_lhs_symbol = as.character(jsub[[2L]]) if (jsub_lhs_symbol %chin% non_sdvars) { sdvars = setdiff(sdvars, jsub_lhs_symbol) } - } else { - # fix for #4089 - if all names are already accounted for, we don't need to re-extract the names for the (m)get call - b_warn = is.null(names_i) && all(names_x %chin% c(ansvars, allbyvars)) - } + } } - if (!b_warn) { - if (verbose) - cat("'(m)get' found in j. ansvars being set to all columns. Use .SDcols or a single j=eval(macro) instead. Both will detect the columns used which is important for efficiency.\nOld:", paste(ansvars,collapse=","),"\n") + + if (missing(.SDcols)) { ansvars = setdiff(allcols, bynames) # fix for bug #5443 - non_sdvars = setdiff(ansvars, sdvars) - ansvals = chmatch(ansvars, names_x) - if (verbose) - cat("New:",paste(ansvars,collapse=","),"\n") - } else { ## TODO remove this else statement in Jan. 2023 - newvars = setdiff(allcols, bynames) # checking if old cols c("y", "x") became c("x", "y") - if (length(newvars) == length(ansvars) && !all(newvars == ansvars)) { - warning("(m)get found in j with update assignment (i.e., `(cols) := ...`). j column order was re-ordered prior to version 1.12.9. See Issue #4089") - } - } - } else if (length(non_sdvars)) { + } else { + # fixes #4089 - if .SDcols was already evaluated, we do not want the order of the columns to change. + ansvars = union(ansvars, setdiff(setdiff(allcols, ansvars), bynames)) + } + non_sdvars = setdiff(ansvars, sdvars) + ansvals = chmatch(ansvars, names_x) + if (verbose) cat(gettextf("New ansvars: %s \n",paste(ansvars,collapse=","), domain = "R-data.table")) + } else if (length(non_sdvars)) { # we've a situation like DT[, c(sum(V1), lapply(.SD, mean)), by=., .SDcols=...] or # DT[, lapply(.SD, function(x) x *v1), by=, .SDcols=...] etc., ansvars = union(ansvars, non_sdvars) From 767b20e682d31cadd9494059c6e2f6fc7df591bf Mon Sep 17 00:00:00 2001 From: Cole Miller <57992489+ColeMiller1@users.noreply.github.com> Date: Sat, 1 Feb 2020 19:32:07 -0500 Subject: [PATCH 5/6] changed to brackify --- R/data.table.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index 8ea33d28d7..4ed2083f55 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -973,7 +973,7 @@ replace_dot_alias = function(e) { # added 'mget' - fix for #994 if (any(c("get", "mget") %chin% av)){ if (verbose) - cat(gettextf("'(m)get' found in j. ansvars being set to all columns. Use .SDcols or a single j=eval(macro) instead. Both will detect the columns used which is important for efficiency.\nOld ansvars: %s \n", paste(ansvars,collapse=","), domain = "R-data.table")) + cat(gettextf("'(m)get' found in j. ansvars being set to all columns. Use .SDcols or a single j=eval(macro) instead. Both will detect the columns used which is important for efficiency.\nOld ansvars: %s \n", brackify(ansvars), domain = "R-data.table")) # get('varname') is too difficult to detect which columns are used in general # eval(macro) column names are detected via the if jsub[[1]]==eval switch earlier above. @@ -995,7 +995,7 @@ replace_dot_alias = function(e) { } non_sdvars = setdiff(ansvars, sdvars) ansvals = chmatch(ansvars, names_x) - if (verbose) cat(gettextf("New ansvars: %s \n",paste(ansvars,collapse=","), domain = "R-data.table")) + if (verbose) cat(gettextf("New ansvars: %s \n", brackify(ansvars), domain = "R-data.table")) } else if (length(non_sdvars)) { # we've a situation like DT[, c(sum(V1), lapply(.SD, mean)), by=., .SDcols=...] or # DT[, lapply(.SD, function(x) x *v1), by=, .SDcols=...] etc., From 4d11fe0984b9c380b1dc598a27a4ef616a1cfe5c Mon Sep 17 00:00:00 2001 From: Cole Miller <57992489+ColeMiller1@users.noreply.github.com> Date: Sun, 2 Feb 2020 07:04:24 -0500 Subject: [PATCH 6/6] Update tests.Rraw --- inst/tests/tests.Rraw | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index f6d17a0762..62f7a66433 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -16770,6 +16770,10 @@ test(2132.2, fifelse(TRUE, 1, s2), error = "S4 class objects (except nanot test(2132.3, fcase(TRUE, s1, FALSE, s2), error = "S4 class objects (except nanotime) are not supported. Please see https://github.com/Rdatatable/data.table/issues/4131.") rm(s1, s2, class2132) +# Use of (m)get does not re-order when using .SDcols, #4089 +dt = data.table(x = 1L, y = 2L, z = 3L) +cols = c('x', 'y') +test(2133.1, dt[, (cols) := lapply(.SD[get("x") == 1],function(x){x + 2L}), .SDcols = cols ,by = z], data.table(x = 1L + 2L, y = 2L + 2L, z = 3L)) ######################## # Add new tests here #