From 5b8f98baf994c867300d37c2c0765a052d53aa88 Mon Sep 17 00:00:00 2001 From: jangorecki Date: Sat, 14 Mar 2020 00:27:11 +0530 Subject: [PATCH 01/37] working draft of substitute names --- R/wrappers.R | 7 +++++++ src/data.table.h | 3 +++ src/init.c | 1 + src/programming.c | 34 ++++++++++++++++++++++++++++++++++ 4 files changed, 45 insertions(+) create mode 100644 src/programming.c diff --git a/R/wrappers.R b/R/wrappers.R index 5fec33a92f..e2d6b0c6f9 100644 --- a/R/wrappers.R +++ b/R/wrappers.R @@ -12,3 +12,10 @@ colnamesInt = function(x, cols, check_dups=FALSE) .Call(CcolnamesInt, x, cols, c coerceFill = function(x) .Call(CcoerceFillR, x) testMsg = function(status=0L, nx=2L, nk=2L) .Call(CtestMsgR, as.integer(status)[1L], as.integer(nx)[1L], as.integer(nk)[1L]) + +replace_names = function(expr, env) { + #replace_names(quote(.(fvar1=fun(var1, arg1=TRUE), charhead=head(var2, 1L))), sapply(list(var1="myIntCol", fvar1="a_col", var2="myCharCol", fun="sum", arg1="na.rm"), as.symbol)) + stopifnot(is.list(env), as.logical(length(env)), !is.null(names(env)), #sapply(env, is.character), sapply(env, nzchar), sapply(env, length)==1L, + is.language(expr)) + .Call(Creplace_namesR, expr, env) +} diff --git a/src/data.table.h b/src/data.table.h index 90ff7fb6fc..4ca5a9d0d4 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -245,3 +245,6 @@ SEXP testMsgR(SEXP status, SEXP x, SEXP k); //fifelse.c SEXP fifelseR(SEXP l, SEXP a, SEXP b, SEXP na); SEXP fcaseR(SEXP na, SEXP rho, SEXP args); + +// programming.c +SEXP replace_namesR(SEXP expr, SEXP env); diff --git a/src/init.c b/src/init.c index aed2da3dbd..68c62febbe 100644 --- a/src/init.c +++ b/src/init.c @@ -211,6 +211,7 @@ R_CallMethodDef callMethods[] = { {"CfrollapplyR", (DL_FUNC) &frollapplyR, -1}, {"CtestMsgR", (DL_FUNC) &testMsgR, -1}, {"C_allNAR", (DL_FUNC) &allNAR, -1}, +{"Creplace_namesR", (DL_FUNC) &replace_namesR, -1}, {NULL, NULL, 0} }; diff --git a/src/programming.c b/src/programming.c new file mode 100644 index 0000000000..1342c46ff3 --- /dev/null +++ b/src/programming.c @@ -0,0 +1,34 @@ +#include "data.table.h" + +void replace_names(SEXP expr, SEXP env) { + R_len_t len = length(expr); + if (!isNull(expr) && len && isLanguage(expr)) { // isLanguage is R's is.call + SEXP exprnames = getAttrib(expr, R_NamesSymbol); + if (!isNull(exprnames)) { + SEXP envnames = getAttrib(env, R_NamesSymbol); + SEXP matches = PROTECT(chmatch(exprnames, envnames, 0)); + int *imatches = INTEGER(matches); + const SEXP *sexpr = SEXPPTR_RO(exprnames); + const SEXP *senv = SEXPPTR_RO(env); + SEXP tmp = expr; + for (int i=0; i %s\n", CHAR(sexpr[i]), CHAR(PRINTNAME(senv[imatches[i]-1]))); + SET_TAG(tmp, senv[imatches[i]-1]); + } + tmp = CDR(tmp); + } + UNPROTECT(1); // matches + // update also nested calls + for (SEXP t=expr; t!=R_NilValue; t=CDR(t)) + replace_names(CADR(t), env); + } + } +} +SEXP replace_namesR(SEXP expr, SEXP env) { + // move R's checks here, escape for 0 length, etc + SEXP ans = PROTECT(MAYBE_REFERENCED(expr) ? duplicate(expr) : expr); + replace_names(ans, env); // updates in-place + UNPROTECT(1); + return ans; +} From 838f00d6d2994941ea098c4b69e8801d7ce6aac5 Mon Sep 17 00:00:00 2001 From: jangorecki Date: Sat, 14 Mar 2020 12:24:35 +0530 Subject: [PATCH 02/37] substitute2 draft --- R/programming.R | 31 +++++++++++++++++++++++++++++++ R/wrappers.R | 7 ------- src/data.table.h | 2 +- src/init.c | 2 +- src/programming.c | 38 +++++++++++++++++--------------------- tests/programming.R | 23 +++++++++++++++++++++++ 6 files changed, 73 insertions(+), 30 deletions(-) create mode 100644 R/programming.R create mode 100644 tests/programming.R diff --git a/R/programming.R b/R/programming.R new file mode 100644 index 0000000000..9c61893d87 --- /dev/null +++ b/R/programming.R @@ -0,0 +1,31 @@ +substitute2 = function(expr, env, char.as.name=FALSE, sub.names=TRUE) { + if (missing(env)) { + stop("TODO") + } else if (is.environment(env)) { + env = as.list(env, all.names=TRUE) ## todo: try to use environment rather than list + } else if (!is.list(env)) { + stop("'env' must be a list of an environment") + } + env.names = names(env) + if (is.null(env.names)) { + stop("'env' argument does not have names") + } else if (any(!nzchar(env.names))) { + stop("'env' argument has an zero char names") + } + if (isTRUE(char.as.name)) { + char = vapply(env, is.character, FALSE) + if (any(char)) { + if (any(non.scalar.char <- lengths(env[char])!=1L)) { + stop("'char.as.name' was used but the following character elements provided in 'env' are not scalar: ", + paste(names(non.scalar.char)[non.scalar.char], collapse=", ")) + } + env[char] = lapply(env[char], as.name) + } + } + expr.sub = eval(substitute(substitute(expr, env))) + if (isTRUE(sub.names)) { + .Call(Csubstitute_call_arg_namesR, expr.sub, env) + } else { + expr.sub + } +} diff --git a/R/wrappers.R b/R/wrappers.R index e2d6b0c6f9..5fec33a92f 100644 --- a/R/wrappers.R +++ b/R/wrappers.R @@ -12,10 +12,3 @@ colnamesInt = function(x, cols, check_dups=FALSE) .Call(CcolnamesInt, x, cols, c coerceFill = function(x) .Call(CcoerceFillR, x) testMsg = function(status=0L, nx=2L, nk=2L) .Call(CtestMsgR, as.integer(status)[1L], as.integer(nx)[1L], as.integer(nk)[1L]) - -replace_names = function(expr, env) { - #replace_names(quote(.(fvar1=fun(var1, arg1=TRUE), charhead=head(var2, 1L))), sapply(list(var1="myIntCol", fvar1="a_col", var2="myCharCol", fun="sum", arg1="na.rm"), as.symbol)) - stopifnot(is.list(env), as.logical(length(env)), !is.null(names(env)), #sapply(env, is.character), sapply(env, nzchar), sapply(env, length)==1L, - is.language(expr)) - .Call(Creplace_namesR, expr, env) -} diff --git a/src/data.table.h b/src/data.table.h index 4ca5a9d0d4..bfa2701ad7 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -247,4 +247,4 @@ SEXP fifelseR(SEXP l, SEXP a, SEXP b, SEXP na); SEXP fcaseR(SEXP na, SEXP rho, SEXP args); // programming.c -SEXP replace_namesR(SEXP expr, SEXP env); +SEXP substitute_call_arg_namesR(SEXP expr, SEXP env); diff --git a/src/init.c b/src/init.c index 68c62febbe..6cb06a37ac 100644 --- a/src/init.c +++ b/src/init.c @@ -211,7 +211,7 @@ R_CallMethodDef callMethods[] = { {"CfrollapplyR", (DL_FUNC) &frollapplyR, -1}, {"CtestMsgR", (DL_FUNC) &testMsgR, -1}, {"C_allNAR", (DL_FUNC) &allNAR, -1}, -{"Creplace_namesR", (DL_FUNC) &replace_namesR, -1}, +{"Csubstitute_call_arg_namesR", (DL_FUNC) &substitute_call_arg_namesR, -1}, {NULL, NULL, 0} }; diff --git a/src/programming.c b/src/programming.c index 1342c46ff3..e9009c5ef0 100644 --- a/src/programming.c +++ b/src/programming.c @@ -1,34 +1,30 @@ #include "data.table.h" -void replace_names(SEXP expr, SEXP env) { +static void substitute_call_arg_names(SEXP expr, SEXP env) { R_len_t len = length(expr); - if (!isNull(expr) && len && isLanguage(expr)) { // isLanguage is R's is.call - SEXP exprnames = getAttrib(expr, R_NamesSymbol); - if (!isNull(exprnames)) { - SEXP envnames = getAttrib(env, R_NamesSymbol); - SEXP matches = PROTECT(chmatch(exprnames, envnames, 0)); - int *imatches = INTEGER(matches); - const SEXP *sexpr = SEXPPTR_RO(exprnames); - const SEXP *senv = SEXPPTR_RO(env); - SEXP tmp = expr; - for (int i=0; i %s\n", CHAR(sexpr[i]), CHAR(PRINTNAME(senv[imatches[i]-1]))); - SET_TAG(tmp, senv[imatches[i]-1]); + //Rprintf("substitute names: %s -> %s\n", CHAR(expr_arg_names[i]), CHAR(PRINTNAME(env_sub[imatches[i]-1]))); // to be removed + SET_TAG(tmp, env_sub[imatches[i]-1]); } - tmp = CDR(tmp); + i++; + substitute_call_arg_names(CADR(tmp), env); // try substitute names in child calls } - UNPROTECT(1); // matches - // update also nested calls - for (SEXP t=expr; t!=R_NilValue; t=CDR(t)) - replace_names(CADR(t), env); + UNPROTECT(1); // chmatch } } } -SEXP replace_namesR(SEXP expr, SEXP env) { - // move R's checks here, escape for 0 length, etc +SEXP substitute_call_arg_namesR(SEXP expr, SEXP env) { SEXP ans = PROTECT(MAYBE_REFERENCED(expr) ? duplicate(expr) : expr); - replace_names(ans, env); // updates in-place + substitute_call_arg_names(ans, env); // updates in-place UNPROTECT(1); return ans; } diff --git a/tests/programming.R b/tests/programming.R new file mode 100644 index 0000000000..fbfb1ccc33 --- /dev/null +++ b/tests/programming.R @@ -0,0 +1,23 @@ +cc(F) + +substitute2( + .(fun_ans_var = fun(farg1, farg2=farg2val), timestamp=Sys.time(), col_head = head(head_arg, n=1L)), + list( + fun_ans_var = "my_mean_res", + fun = "mean", + farg1 = "my_x_col", + farg2 = "na.rm", + farg2val = TRUE, + col_head = "first_y", + head_arg = "y" + ), + char.as.name=TRUE +) + +const1 = function() 1L +substitute2(list(nm = fun()), env=list(a="b", fun="const1", nm="int1"), char.as.name=TRUE) +substitute2(.(), env=list(a="b", fun="const1", nm="int1"), char.as.name=TRUE) + +substitute2(.("TRUE" = 1L, "FALSE" = 2L, "1" = 3L, "2" = 4L), + env=list("FALSE"="col2", "2"="col4"), + char.as.name=TRUE) From a2b876b2e6729c1f89c0b7bf60da6d332ba1a9f5 Mon Sep 17 00:00:00 2001 From: jangorecki Date: Sat, 14 Mar 2020 12:44:06 +0530 Subject: [PATCH 03/37] add example from #2655 --- tests/programming.R | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/tests/programming.R b/tests/programming.R index fbfb1ccc33..ff817db05f 100644 --- a/tests/programming.R +++ b/tests/programming.R @@ -21,3 +21,16 @@ substitute2(.(), env=list(a="b", fun="const1", nm="int1"), char.as.name=TRUE) substitute2(.("TRUE" = 1L, "FALSE" = 2L, "1" = 3L, "2" = 4L), env=list("FALSE"="col2", "2"="col4"), char.as.name=TRUE) + +substitute2( + .(out_col_name = fun(in_col_name, fun_arg1=fun_arg1val)), + env = list( + in_col_name = "x", + fun = "sum", + fun_arg1 = "na.rm", + fun_arg1val = TRUE, + out_col_name = "sum_x" + ), + char.as.name = TRUE +) +#.(sum_x = sum(x, na.rm = TRUE)) \ No newline at end of file From 994ce4acca9c9a8771efedad921533040762421a Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Sat, 14 Mar 2020 18:01:15 +0800 Subject: [PATCH 04/37] ! on scalar not vector --- R/programming.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/programming.R b/R/programming.R index 9c61893d87..92b06f8ded 100644 --- a/R/programming.R +++ b/R/programming.R @@ -9,7 +9,7 @@ substitute2 = function(expr, env, char.as.name=FALSE, sub.names=TRUE) { env.names = names(env) if (is.null(env.names)) { stop("'env' argument does not have names") - } else if (any(!nzchar(env.names))) { + } else if (!all(nzchar(env.names))) { stop("'env' argument has an zero char names") } if (isTRUE(char.as.name)) { From 7816631fceb2807865191b7cb2a6e7e976c5ffde Mon Sep 17 00:00:00 2001 From: jangorecki Date: Tue, 17 Mar 2020 12:25:13 +0530 Subject: [PATCH 05/37] handle AsIs so char and symbols can be both used in env arg --- R/programming.R | 25 +++++++++++++++++------ src/programming.c | 7 ++++--- tests/programming.R | 50 ++++++++++++++++++++++++++++++++++++--------- 3 files changed, 63 insertions(+), 19 deletions(-) diff --git a/R/programming.R b/R/programming.R index 92b06f8ded..d8ca8e4347 100644 --- a/R/programming.R +++ b/R/programming.R @@ -1,4 +1,8 @@ -substitute2 = function(expr, env, char.as.name=FALSE, sub.names=TRUE) { +is.AsIs = function(x) { + inherits(x, "AsIs") +} + +substitute2 = function(expr, env, char.as.name=!is.AsIs(env), sub.names=TRUE) { if (missing(env)) { stop("TODO") } else if (is.environment(env)) { @@ -13,17 +17,26 @@ substitute2 = function(expr, env, char.as.name=FALSE, sub.names=TRUE) { stop("'env' argument has an zero char names") } if (isTRUE(char.as.name)) { + asis = vapply(env, is.AsIs, FALSE) char = vapply(env, is.character, FALSE) - if (any(char)) { - if (any(non.scalar.char <- lengths(env[char])!=1L)) { - stop("'char.as.name' was used but the following character elements provided in 'env' are not scalar: ", + toname = !asis & char + if (any(toname)) { + lens = vapply(env, length, 0L) + if (any(non.scalar.char <- lens[toname]!=1L)) { + stop("'char.as.name' was used but the following character objects provided in 'env' are not scalar, if you need them as character vector rather a name, then use 'I' function: ", paste(names(non.scalar.char)[non.scalar.char], collapse=", ")) } - env[char] = lapply(env[char], as.name) + env[toname] = lapply(env[toname], as.name) } } - expr.sub = eval(substitute(substitute(expr, env))) + # R substitute + expr.sub = eval(substitute( + substitute(.expr, env), + env = list(.expr = substitute(expr)) + )) + # new arg names substitute if (isTRUE(sub.names)) { + #cat("entering substitute_call_arg_namesR\n") .Call(Csubstitute_call_arg_namesR, expr.sub, env) } else { expr.sub diff --git a/src/programming.c b/src/programming.c index e9009c5ef0..715d10fb63 100644 --- a/src/programming.c +++ b/src/programming.c @@ -7,16 +7,16 @@ static void substitute_call_arg_names(SEXP expr, SEXP env) { if (!isNull(arg_names)) { SEXP env_names = getAttrib(env, R_NamesSymbol); int *imatches = INTEGER(PROTECT(chmatch(arg_names, env_names, 0))); - //const SEXP *expr_arg_names = SEXPPTR_RO(arg_names); // only for print, to be removed + //const SEXP *expr_arg_names = SEXPPTR_RO(arg_names); // debug const SEXP *env_sub = SEXPPTR_RO(env); int i = 0; for (SEXP tmp=expr; tmp!=R_NilValue; tmp=CDR(tmp)) { if (imatches[i]) { - //Rprintf("substitute names: %s -> %s\n", CHAR(expr_arg_names[i]), CHAR(PRINTNAME(env_sub[imatches[i]-1]))); // to be removed + //Rprintf("substitute names: %s -> %s\n", CHAR(expr_arg_names[i]), CHAR(PRINTNAME(env_sub[imatches[i]-1]))); // debug SET_TAG(tmp, env_sub[imatches[i]-1]); } i++; - substitute_call_arg_names(CADR(tmp), env); // try substitute names in child calls + substitute_call_arg_names(CADR(tmp), env); // substitute arg names in child calls } UNPROTECT(1); // chmatch } @@ -24,6 +24,7 @@ static void substitute_call_arg_names(SEXP expr, SEXP env) { } SEXP substitute_call_arg_namesR(SEXP expr, SEXP env) { SEXP ans = PROTECT(MAYBE_REFERENCED(expr) ? duplicate(expr) : expr); + //Rprintf("entering substitute_call_arg_names\n"); substitute_call_arg_names(ans, env); // updates in-place UNPROTECT(1); return ans; diff --git a/tests/programming.R b/tests/programming.R index ff817db05f..608fe2f8a3 100644 --- a/tests/programming.R +++ b/tests/programming.R @@ -1,5 +1,18 @@ -cc(F) +if (exists("cc")) { ## dev mode + cc() +} else { + library(data.table) + substitute2 = data.table:::substitute2 +} +# simple +substitute2(list(var = val), env = list(var="my_var", val=5L)) + +# AsIs way to handle char.to.name argument +substitute2(list(var = val), env = I(list(var=as.name("my_var"), val="my_val"))) +substitute2(list(var = val), env = list(var="my_var", val=I("my_val"))) + +# complex use case substitute2( .(fun_ans_var = fun(farg1, farg2=farg2val), timestamp=Sys.time(), col_head = head(head_arg, n=1L)), list( @@ -10,18 +23,19 @@ substitute2( farg2val = TRUE, col_head = "first_y", head_arg = "y" - ), - char.as.name=TRUE + ) ) +# calls of length 0 args const1 = function() 1L -substitute2(list(nm = fun()), env=list(a="b", fun="const1", nm="int1"), char.as.name=TRUE) -substitute2(.(), env=list(a="b", fun="const1", nm="int1"), char.as.name=TRUE) +substitute2(list(nm = fun()), env=list(a="b", fun="const1", nm="int1")) +substitute2(.(), env=list(a="b", fun="const1", nm="int1")) +# some special names substitute2(.("TRUE" = 1L, "FALSE" = 2L, "1" = 3L, "2" = 4L), - env=list("FALSE"="col2", "2"="col4"), - char.as.name=TRUE) + env=list("FALSE"="col2", "2"="col4")) +# PR example substitute2( .(out_col_name = fun(in_col_name, fun_arg1=fun_arg1val)), env = list( @@ -30,7 +44,23 @@ substitute2( fun_arg1 = "na.rm", fun_arg1val = TRUE, out_col_name = "sum_x" - ), - char.as.name = TRUE + ) +) + +# re-use inside another function +f = function(expr, env) { + eval(substitute( + substitute2(.expr, env), + list(.expr = substitute(expr)) + )) +} +f( + .(out_col_name = fun(in_col_name, fun_arg1=fun_arg1val)), + env = list( + in_col_name = "x", + fun = "sum", + fun_arg1 = "na.rm", + fun_arg1val = TRUE, + out_col_name = "sum_x" + ) ) -#.(sum_x = sum(x, na.rm = TRUE)) \ No newline at end of file From 6084f281b0312410c90999718fbee81292f97f47 Mon Sep 17 00:00:00 2001 From: jangorecki Date: Tue, 17 Mar 2020 13:00:00 +0530 Subject: [PATCH 06/37] non-symbol handling --- R/programming.R | 9 ++++----- src/programming.c | 7 +++++-- tests/programming.R | 11 ++++++++++- 3 files changed, 19 insertions(+), 8 deletions(-) diff --git a/R/programming.R b/R/programming.R index d8ca8e4347..c5fe227d0d 100644 --- a/R/programming.R +++ b/R/programming.R @@ -19,14 +19,13 @@ substitute2 = function(expr, env, char.as.name=!is.AsIs(env), sub.names=TRUE) { if (isTRUE(char.as.name)) { asis = vapply(env, is.AsIs, FALSE) char = vapply(env, is.character, FALSE) - toname = !asis & char - if (any(toname)) { - lens = vapply(env, length, 0L) - if (any(non.scalar.char <- lens[toname]!=1L)) { + to.name = !asis & char + if (any(to.name)) { + if (any(non.scalar.char <- vapply(env[to.name], length, 0L)!=1L)) { stop("'char.as.name' was used but the following character objects provided in 'env' are not scalar, if you need them as character vector rather a name, then use 'I' function: ", paste(names(non.scalar.char)[non.scalar.char], collapse=", ")) } - env[toname] = lapply(env[toname], as.name) + env[to.name] = lapply(env[to.name], as.name) } } # R substitute diff --git a/src/programming.c b/src/programming.c index 715d10fb63..c71116dbfa 100644 --- a/src/programming.c +++ b/src/programming.c @@ -12,8 +12,11 @@ static void substitute_call_arg_names(SEXP expr, SEXP env) { int i = 0; for (SEXP tmp=expr; tmp!=R_NilValue; tmp=CDR(tmp)) { if (imatches[i]) { - //Rprintf("substitute names: %s -> %s\n", CHAR(expr_arg_names[i]), CHAR(PRINTNAME(env_sub[imatches[i]-1]))); // debug - SET_TAG(tmp, env_sub[imatches[i]-1]); + SEXP sym = env_sub[imatches[i]-1]; + if (!isSymbol(sym)) + error("Attempting to substitute '%s' element with object of type '%s' but it has to be symbol type when substituting name of the call argument, functions 'as.name' and 'I' can be used to work out proper substitution, see ?substitute2", CHAR(STRING_ELT(arg_names, i)), type2char(TYPEOF(sym))); + //Rprintf("substitute names: %s -> %s\n", CHAR(expr_arg_names[i]), CHAR(PRINTNAME(sym))); // debug + SET_TAG(tmp, sym); } i++; substitute_call_arg_names(CADR(tmp), env); // substitute arg names in child calls diff --git a/tests/programming.R b/tests/programming.R index 608fe2f8a3..c550aca0fb 100644 --- a/tests/programming.R +++ b/tests/programming.R @@ -9,8 +9,17 @@ if (exists("cc")) { ## dev mode substitute2(list(var = val), env = list(var="my_var", val=5L)) # AsIs way to handle char.to.name argument -substitute2(list(var = val), env = I(list(var=as.name("my_var"), val="my_val"))) substitute2(list(var = val), env = list(var="my_var", val=I("my_val"))) +substitute2(list(var = val), env = I(list(var=as.name("my_var"), val="my_val"))) + +# test non-scalar char +substitute2(list(var = val), env = list(var="my_var", val=c("a","b"))) +substitute2(list(var = val), env = list(var="my_var", val=I(c("a","b")))) +substitute2(list(var = val), env = I(list(var=as.name("my_var"), val=c("a","b")))) + +# test non-symbol +substitute2(list(var = val), env = list(var=I("my_var"), val="my_val")) +substitute2(list(var = val), env = I(list(var="my_var", val="my_val"))) # complex use case substitute2( From 472d6d62d7fbb2de2ff412be07d2b9f50822a56b Mon Sep 17 00:00:00 2001 From: jangorecki Date: Tue, 17 Mar 2020 21:32:07 +0530 Subject: [PATCH 07/37] data.table query env support, rm AsIs class from env input --- R/data.table.R | 37 ++++++++++++++++++++++++++++++++----- R/programming.R | 7 ++++++- tests/programming.R | 21 +++++++++++++++++++++ 3 files changed, 59 insertions(+), 6 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index 9292ee940b..17dc2b55da 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -125,7 +125,7 @@ replace_dot_alias = function(e) { } } -"[.data.table" = function (x, i, j, by, keyby, with=TRUE, nomatch=getOption("datatable.nomatch", NA), mult="all", roll=FALSE, rollends=if (roll=="nearest") c(TRUE,TRUE) else if (roll>=0) c(FALSE,TRUE) else c(TRUE,FALSE), which=FALSE, .SDcols, verbose=getOption("datatable.verbose"), allow.cartesian=getOption("datatable.allow.cartesian"), drop=NULL, on=NULL) +"[.data.table" = function (x, i, j, by, keyby, with=TRUE, nomatch=getOption("datatable.nomatch", NA), mult="all", roll=FALSE, rollends=if (roll=="nearest") c(TRUE,TRUE) else if (roll>=0) c(FALSE,TRUE) else c(TRUE,FALSE), which=FALSE, .SDcols, verbose=getOption("datatable.verbose"), allow.cartesian=getOption("datatable.allow.cartesian"), drop=NULL, on=NULL, env=NULL) { # ..selfcount <<- ..selfcount+1 # in dev, we check no self calls, each of which doubles overhead, or could # test explicitly if the caller is [.data.table (even stronger test. TO DO.) @@ -151,12 +151,26 @@ replace_dot_alias = function(e) { if (!missing(keyby)) { if (!missing(by)) stop("Provide either by= or keyby= but not both") if (missing(j)) { warning("Ignoring keyby= because j= is not supplied"); keyby=NULL; } - by=bysub=substitute(keyby) + if (is.null(env)) by=bysub=substitute(keyby) else { + by = bysub = eval(substitute( + substitute2(.keyby, env), + list(.keyby = substitute(keyby)) + )) + if (verbose) cat("Argument 'by' after substitute: ", paste(deparse(bysub, width.cutoff=500L), collapse=" "), "\n", sep="") + } keyby=TRUE # Assign to 'by' so that by is no longer missing and we can proceed as if there were one by } else { if (!missing(by) && missing(j)) { warning("Ignoring by= because j= is not supplied"); by=NULL; } - by=bysub= if (missing(by)) NULL else substitute(by) + if (missing(by)) by=bysub=NULL else { + if (is.null(env)) by=bysub=substitute(by) else { + by = bysub = eval(substitute( + substitute2(.by, env), + list(.by = substitute(by)) + )) + if (verbose) cat("Argument 'by' after substitute: ", paste(deparse(bysub, width.cutoff=500L), collapse=" "), "\n", sep="") + } + } keyby=FALSE } bynull = !missingby && is.null(by) #3530 @@ -213,7 +227,14 @@ replace_dot_alias = function(e) { av = NULL jsub = NULL if (!missing(j)) { - jsub = replace_dot_alias(substitute(j)) + if (is.null(env)) jsub = substitute(j) else { + jsub = eval(substitute( + substitute2(.j, env), + list(.j = substitute(j)) + )) + if (verbose) cat("Argument 'j' after substitute: ", paste(deparse(jsub, width.cutoff=500L), collapse=" "), "\n", sep="") + } + jsub = replace_dot_alias(jsub) root = if (is.call(jsub)) as.character(jsub[[1L]])[1L] else "" if (root == ":" || (root %chin% c("-","!") && jsub[[2L]] %iscall% '(' && jsub[[2L]][[2L]] %iscall% ':') || @@ -291,8 +312,14 @@ replace_dot_alias = function(e) { dupdiff = function(x, y) x[!x %chin% y] if (!missing(i)) { + if (is.null(env)) isub = substitute(i) else { + isub = eval(substitute( + substitute2(.i, env), + list(.i = substitute(i)) + )) + if (verbose) cat("Argument 'i' after substitute: ", paste(deparse(isub, width.cutoff=500L), collapse=" "), "\n", sep="") + } xo = NULL - isub = substitute(i) if (identical(isub, NA)) { # only possibility *isub* can be NA (logical) is the symbol NA itself; i.e. DT[NA] # replace NA in this case with NA_integer_ as that's almost surely what user intended to diff --git a/R/programming.R b/R/programming.R index c5fe227d0d..0f66b155c3 100644 --- a/R/programming.R +++ b/R/programming.R @@ -20,12 +20,17 @@ substitute2 = function(expr, env, char.as.name=!is.AsIs(env), sub.names=TRUE) { asis = vapply(env, is.AsIs, FALSE) char = vapply(env, is.character, FALSE) to.name = !asis & char - if (any(to.name)) { + if (any(to.name)) { ## turns "my_name" character scalar into `my_name` symbol, for convenience if (any(non.scalar.char <- vapply(env[to.name], length, 0L)!=1L)) { stop("'char.as.name' was used but the following character objects provided in 'env' are not scalar, if you need them as character vector rather a name, then use 'I' function: ", paste(names(non.scalar.char)[non.scalar.char], collapse=", ")) } env[to.name] = lapply(env[to.name], as.name) + env[asis] = lapply(env[asis], function(x) { ## removes any AsIs class + cl = oldClass(x) + oldClass(x) = cl[cl!="AsIs"] + x + }) } } # R substitute diff --git a/tests/programming.R b/tests/programming.R index c550aca0fb..b1fab0e26b 100644 --- a/tests/programming.R +++ b/tests/programming.R @@ -73,3 +73,24 @@ f( out_col_name = "sum_x" ) ) + +# data.table i, j, by +d = data.table(a = 2:1, b = 1:4) +d[var3%in%values, .(var1 = f(var2)), by=var3, + env=list(var1="res", var2="b", f="sum", var3="a", values=0:3), + verbose=TRUE] + +# data.table symbols and chars +d = data.table(a = c("b","a"), b = 1:4) +d[var3%in%values, .(var1 = f(var2)), keyby=var3, + env=list(var1="res", var2="b", f="sum", var3="a", values=I(c("a","b","c"))), + verbose=TRUE] +d[var3%in%values, .(var1 = f(var2)), keyby=var3, + env=I(list(var1=as.name("res"), var2=as.name("b"), f=as.name("sum"), var3=as.name("a"), values=c("a","b","c"))), + verbose=TRUE] + +# test that AsIs class removed +class(substitute2(var3%in%values, list(var3="a", values=I(c("a","b","c"))))[[3L]]) == "character" +class(substitute2(var3%in%values, I(list(var3=as.name("a"), values=c("a","b","c"))))[[3L]]) == "character" +class(substitute2(var3%in%values, list(var3="a", values=I(1:3)))[[3L]]) == "integer" +class(substitute2(var3%in%values, I(list(var3=as.name("a"), values=c(1:3))))[[3L]]) == "integer" From 8167ef22ceb2437e9131aa1780f75223ae672662 Mon Sep 17 00:00:00 2001 From: jangorecki Date: Tue, 17 Mar 2020 22:18:20 +0530 Subject: [PATCH 08/37] manual and tests --- .Rbuildignore | 1 + R/programming.R | 2 +- man/data.table.Rd | 4 +++- man/substitute2.Rd | 46 +++++++++++++++++++++++++++++++++++++++++++++ tests/programming.R | 29 +++++++++++++++++++--------- 5 files changed, 71 insertions(+), 11 deletions(-) create mode 100644 man/substitute2.Rd diff --git a/.Rbuildignore b/.Rbuildignore index 2b3483fa0e..8828e4dc59 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -25,6 +25,7 @@ ^.*\.Rproj$ ^\.Rproj\.user$ ^\.idea$ +^\.libs$ ^.*\.dll$ diff --git a/R/programming.R b/R/programming.R index 0f66b155c3..44607d33a3 100644 --- a/R/programming.R +++ b/R/programming.R @@ -4,7 +4,7 @@ is.AsIs = function(x) { substitute2 = function(expr, env, char.as.name=!is.AsIs(env), sub.names=TRUE) { if (missing(env)) { - stop("TODO") + stop("TODO, as of now 'env' should not be missing") } else if (is.environment(env)) { env = as.list(env, all.names=TRUE) ## todo: try to use environment rather than list } else if (!is.list(env)) { diff --git a/man/data.table.Rd b/man/data.table.Rd index 8c8e0d5375..1d6b0d37dc 100644 --- a/man/data.table.Rd +++ b/man/data.table.Rd @@ -31,7 +31,7 @@ data.table(\dots, keep.rownames=FALSE, check.names=FALSE, key=NULL, stringsAsFac .SDcols, verbose = getOption("datatable.verbose"), # default: FALSE allow.cartesian = getOption("datatable.allow.cartesian"), # default: FALSE - drop = NULL, on = NULL) + drop = NULL, on = NULL, env = NULL) } \arguments{ \item{\dots}{ Just as \code{\dots} in \code{\link{data.frame}}. Usual recycling rules are applied to vectors of different lengths to create a list of equal length vectors.} @@ -170,6 +170,8 @@ data.table(\dots, keep.rownames=FALSE, check.names=FALSE, key=NULL, stringsAsFac } See examples as well as \href{../doc/datatable-secondary-indices-and-auto-indexing.html}{\code{vignette("datatable-secondary-indices-and-auto-indexing")}}. } + + \item{env}{ Environment or a list, passed to \code{\link{substitute2}} for substition of parameters in \code{i}, \code{j}, \code{by} (or \code{keyby}). Use \code{verbose} to preview constructed expressions. } } \details{ \code{data.table} builds on base \R functionality to reduce 2 types of time:\cr diff --git a/man/substitute2.Rd b/man/substitute2.Rd new file mode 100644 index 0000000000..6da31b584e --- /dev/null +++ b/man/substitute2.Rd @@ -0,0 +1,46 @@ +\name{substitute2} +\alias{substitute2} +\alias{is.AsIs} +\title{ Substitute expression } +\description{ + Experimental, improved version of base R \code{\link[base]{substitute}} function. \code{is.AsIs} is just a helper function to check if \code{x} inherits \emph{AsIs} class. +} +\usage{ +substitute2(expr, env, char.as.name=!is.AsIs(env), sub.names=TRUE) + +is.AsIs(x) +} +\arguments{ +\item{expr}{ Unevaluated expression in which substitution has to take place. } +\item{env}{ Environment or a list from which variables will be taken to inject into \code{expr}. } +\item{char.as.name}{ Logical, for convenience it will automatically turn \code{"my_name"} character objects into \code{`my_name`} symbols. Feature can be escaped by using \code{\link[base]{I}} function. } +\item{sub.names}{ Logical, should be names of call arguments be substituted as well, if set to \code{FALSE} it will basically fall back to base R substitute. } +\item{x}{ Any object passed to \code{is.AsIS} to be tested for inheritance of \emph{AsIs} class. } +} +\note{ + By default function will turn any character objects into symbols. In case character is of length 2 or more, it will raise an error. Behaviour can be changed by setting \code{char.as.name} to \code{FALSE}, but then any symbol must be explicitly created, for example using \code{as.name} function. Additionally using base R \code{\link[base]{I}} function make it convenient to control \code{char.as.name} behaviour globally for all elements in \code{env}, or locally for each single one, without the need to use \code{char.as.name} argument explicitly. See example below. +} +\value{ +Quoted expression having variables substituted. +} +\seealso{ \code{\link[base]{substitute}}, \code{\link[base]{I}} } +\examples{ +substitute2 = data.table:::substitute2 + +substitute(list(var1 = var2), list(var1 = "c1", var2 = 5L)) +substitute2(list(var1 = var2), list(var1 = "c1", var2 = 5L)) + +# mix symbols and characters, both lines will yield same output +substitute2(list(var1 = var2), list(var1 = "c1", var2 = I("some_character"))) +substitute2(list(var1 = var2), I(list(var1 = as.name("c1"), var2 = "some_character"))) + +# using from inside a function +f = function(expr, env) { + eval(substitute( + substitute2(.expr, env), + list(.expr = substitute(expr)) + )) +} +f(list(var1 = var2), list(var1 = "c1", var2 = 5L)) +} +\keyword{ data } diff --git a/tests/programming.R b/tests/programming.R index b1fab0e26b..a6239bdabb 100644 --- a/tests/programming.R +++ b/tests/programming.R @@ -1,9 +1,7 @@ -if (exists("cc")) { ## dev mode - cc() -} else { - library(data.table) - substitute2 = data.table:::substitute2 -} +library(data.table) +substitute2 = data.table:::substitute2 + +# cc() ## dev mode # simple substitute2(list(var = val), env = list(var="my_var", val=5L)) @@ -13,13 +11,19 @@ substitute2(list(var = val), env = list(var="my_var", val=I("my_val"))) substitute2(list(var = val), env = I(list(var=as.name("my_var"), val="my_val"))) # test non-scalar char -substitute2(list(var = val), env = list(var="my_var", val=c("a","b"))) +try( + substitute2(list(var = val), env = list(var="my_var", val=c("a","b"))) +) substitute2(list(var = val), env = list(var="my_var", val=I(c("a","b")))) substitute2(list(var = val), env = I(list(var=as.name("my_var"), val=c("a","b")))) # test non-symbol -substitute2(list(var = val), env = list(var=I("my_var"), val="my_val")) -substitute2(list(var = val), env = I(list(var="my_var", val="my_val"))) +try( + substitute2(list(var = val), env = list(var=I("my_var"), val="my_val")) +) +try( + substitute2(list(var = val), env = I(list(var="my_var", val="my_val"))) +) # complex use case substitute2( @@ -94,3 +98,10 @@ class(substitute2(var3%in%values, list(var3="a", values=I(c("a","b","c"))))[[3L] class(substitute2(var3%in%values, I(list(var3=as.name("a"), values=c("a","b","c"))))[[3L]]) == "character" class(substitute2(var3%in%values, list(var3="a", values=I(1:3)))[[3L]]) == "integer" class(substitute2(var3%in%values, I(list(var3=as.name("a"), values=c(1:3))))[[3L]]) == "integer" + +# NA tests, especially NA_character_ +#TODO + +# char.as.name=TRUE but env is AsIs +#TODO +#remove char.as.name arg from API and keep I() interface only? From fd4434649f519ec6bc64ad9170d5242398914b5d Mon Sep 17 00:00:00 2001 From: jangorecki Date: Tue, 17 Mar 2020 22:27:52 +0530 Subject: [PATCH 09/37] tests roadmap --- tests/programming.R | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/programming.R b/tests/programming.R index a6239bdabb..a2105298ed 100644 --- a/tests/programming.R +++ b/tests/programming.R @@ -105,3 +105,9 @@ class(substitute2(var3%in%values, I(list(var3=as.name("a"), values=c(1:3))))[[3L # char.as.name=TRUE but env is AsIs #TODO #remove char.as.name arg from API and keep I() interface only? + +# get and mget use cases +#TODO + +# use DT[, var, env=list(var=quote(.(sum_x=sum(x)))] rather than dt[, eval(var)]? +#TODO From 247ed7a5b3b3afbaf25d701c71109eb885509cb4 Mon Sep 17 00:00:00 2001 From: jangorecki Date: Wed, 18 Mar 2020 08:16:16 +0530 Subject: [PATCH 10/37] export substitute2 and is.AsIs, minor manual improvements --- NAMESPACE | 2 ++ man/substitute2.Rd | 10 +++++++--- tests/programming.R | 1 - 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index c2c095a1d8..eb2c5d5b24 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -56,6 +56,8 @@ export(nafill) export(setnafill) export(.Last.updated) export(fcoalesce) +export(substitute2) +export(is.AsIs) S3method("[", data.table) S3method("[<-", data.table) diff --git a/man/substitute2.Rd b/man/substitute2.Rd index 6da31b584e..cb613118f5 100644 --- a/man/substitute2.Rd +++ b/man/substitute2.Rd @@ -3,11 +3,10 @@ \alias{is.AsIs} \title{ Substitute expression } \description{ - Experimental, improved version of base R \code{\link[base]{substitute}} function. \code{is.AsIs} is just a helper function to check if \code{x} inherits \emph{AsIs} class. + Experimental, more robust version of base R \code{\link[base]{substitute}} function. \code{is.AsIs} is a helper function to check if \code{x} inherits from \emph{AsIs} class. } \usage{ substitute2(expr, env, char.as.name=!is.AsIs(env), sub.names=TRUE) - is.AsIs(x) } \arguments{ @@ -25,7 +24,12 @@ Quoted expression having variables substituted. } \seealso{ \code{\link[base]{substitute}}, \code{\link[base]{I}} } \examples{ -substitute2 = data.table:::substitute2 +## base R substitute vs substitute2 +substitute(var1, list(var1 = "c1")) +substitute2(var1, list(var1 = I("c1"))) + +substitute(var1, list(var1 = as.name("c1"))) +substitute(var1, list(var1 = "c1")) substitute(list(var1 = var2), list(var1 = "c1", var2 = 5L)) substitute2(list(var1 = var2), list(var1 = "c1", var2 = 5L)) diff --git a/tests/programming.R b/tests/programming.R index a2105298ed..ed41711fae 100644 --- a/tests/programming.R +++ b/tests/programming.R @@ -1,5 +1,4 @@ library(data.table) -substitute2 = data.table:::substitute2 # cc() ## dev mode From 048e37061fb482ff31b48d56266a12f7f264bba0 Mon Sep 17 00:00:00 2001 From: jangorecki Date: Wed, 18 Mar 2020 11:31:09 +0530 Subject: [PATCH 11/37] move and improve tests --- R/data.table.R | 4 +- R/programming.R | 18 +++-- inst/tests/programming.Rraw | 157 ++++++++++++++++++++++++++++++++++++ man/substitute2.Rd | 3 + src/programming.c | 5 +- tests/programming.R | 114 +------------------------- 6 files changed, 177 insertions(+), 124 deletions(-) create mode 100644 inst/tests/programming.Rraw diff --git a/R/data.table.R b/R/data.table.R index 17dc2b55da..f8282c15cd 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -232,7 +232,7 @@ replace_dot_alias = function(e) { substitute2(.j, env), list(.j = substitute(j)) )) - if (verbose) cat("Argument 'j' after substitute: ", paste(deparse(jsub, width.cutoff=500L), collapse=" "), "\n", sep="") + if (verbose) cat("Argument 'j' after substitute: ", paste(deparse(jsub, width.cutoff=500L), collapse=" "), "\n", sep="") } jsub = replace_dot_alias(jsub) root = if (is.call(jsub)) as.character(jsub[[1L]])[1L] else "" @@ -317,7 +317,7 @@ replace_dot_alias = function(e) { substitute2(.i, env), list(.i = substitute(i)) )) - if (verbose) cat("Argument 'i' after substitute: ", paste(deparse(isub, width.cutoff=500L), collapse=" "), "\n", sep="") + if (verbose) cat("Argument 'i' after substitute: ", paste(deparse(isub, width.cutoff=500L), collapse=" "), "\n", sep="") } xo = NULL if (identical(isub, NA)) { diff --git a/R/programming.R b/R/programming.R index 44607d33a3..13746e4078 100644 --- a/R/programming.R +++ b/R/programming.R @@ -8,13 +8,17 @@ substitute2 = function(expr, env, char.as.name=!is.AsIs(env), sub.names=TRUE) { } else if (is.environment(env)) { env = as.list(env, all.names=TRUE) ## todo: try to use environment rather than list } else if (!is.list(env)) { - stop("'env' must be a list of an environment") + stop("'env' must be a list or an environment") } env.names = names(env) if (is.null(env.names)) { stop("'env' argument does not have names") } else if (!all(nzchar(env.names))) { - stop("'env' argument has an zero char names") + stop("'env' argument has zero char names") + } else if (anyNA(env.names)) { + stop("'env' argument has NA names") + } else if (anyDuplicated(env.names)) { + stop("'env' argument has duplicated names") } if (isTRUE(char.as.name)) { asis = vapply(env, is.AsIs, FALSE) @@ -22,15 +26,18 @@ substitute2 = function(expr, env, char.as.name=!is.AsIs(env), sub.names=TRUE) { to.name = !asis & char if (any(to.name)) { ## turns "my_name" character scalar into `my_name` symbol, for convenience if (any(non.scalar.char <- vapply(env[to.name], length, 0L)!=1L)) { - stop("'char.as.name' was used but the following character objects provided in 'env' are not scalar, if you need them as character vector rather a name, then use 'I' function: ", + stop("'char.as.name' was used but the following character objects provided in 'env' are not scalar objects, if you need them as character vector rather a name, then use 'I' function: ", paste(names(non.scalar.char)[non.scalar.char], collapse=", ")) } env[to.name] = lapply(env[to.name], as.name) - env[asis] = lapply(env[asis], function(x) { ## removes any AsIs class + } + if (any(asis)) { + rm.AsIs = function(x) { ## removes any AsIs class cl = oldClass(x) oldClass(x) = cl[cl!="AsIs"] x - }) + } + env[asis] = lapply(env[asis], rm.AsIs) } } # R substitute @@ -40,7 +47,6 @@ substitute2 = function(expr, env, char.as.name=!is.AsIs(env), sub.names=TRUE) { )) # new arg names substitute if (isTRUE(sub.names)) { - #cat("entering substitute_call_arg_namesR\n") .Call(Csubstitute_call_arg_namesR, expr.sub, env) } else { expr.sub diff --git a/inst/tests/programming.Rraw b/inst/tests/programming.Rraw new file mode 100644 index 0000000000..df8f54bb92 --- /dev/null +++ b/inst/tests/programming.Rraw @@ -0,0 +1,157 @@ +require(methods) +if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) { + if ((tt<-compiler::enableJIT(-1))>0) + cat("This is dev mode and JIT is enabled (level ", tt, ") so there will be a brief pause around the first test.\n", sep="") +} else { + require(data.table) + test = data.table:::test +} + +# AsIs +test(1.01, is.AsIs(1L), FALSE) +test(1.02, is.AsIs(I(1L)), TRUE) +test(1.03, is.AsIs("a"), FALSE) +test(1.04, is.AsIs(I("a")), TRUE) +test(1.05, is.AsIs(list(1L)), FALSE) +test(1.06, is.AsIs(I(list(1L))), TRUE) +test(1.07, is.AsIs(structure(list(NULL), class="an_S3")), FALSE) ## S3 +test(1.08, is.AsIs(I(structure(list(NULL), class="an_S3"))), TRUE) +test(1.09, is.AsIs(getClass("MethodDefinition")), FALSE) ## S4 +test(1.10, is.AsIs(I(getClass("MethodDefinition"))), TRUE) + +# substitute2 simple +test(2.01, substitute2(list(var = val), env = list(var="my_var", val=5L)), quote(list(my_var = 5L))) +# substitute2 + I to handle char and symbol +test(2.02, substitute2(list(var = val), env = list(var="my_var", val=I("my_val"))), quote(list(my_var="my_val"))) +test(2.03, substitute2(list(var = val), env = I(list(var=as.name("my_var"), val="my_val"))), quote(list(my_var="my_val"))) +# substitute2 handle symbol anyway +test(2.04, substitute2(list(var = val), env = list(var=as.name("my_var"), val=I("my_val"))), quote(list(my_var="my_val"))) +# substitute2 complex use case +test(2.11, substitute2( + .(fun_ans_var = fun(farg1, farg2=farg2val), timestamp=Sys.time(), col_head = head(head_arg, n=1L)), + list( + fun_ans_var = "my_mean_res", + fun = "mean", + farg1 = "my_x_col", + farg2 = "na.rm", + farg2val = TRUE, + col_head = "first_y", + head_arg = "y" + ) +), quote(.(my_mean_res=mean(my_x_col, na.rm=TRUE), timestamp=Sys.time(), first_y=head(y, n=1L)))) +# substitute2 PR example +test(2.12, substitute2( + .(out_col_name = fun(in_col_name, fun_arg1=fun_arg1val)), + env = list( + in_col_name = "x", + fun = "sum", + fun_arg1 = "na.rm", + fun_arg1val = TRUE, + out_col_name = "sum_x" + ) +), quote(.(sum_x = sum(x, na.rm=TRUE)))) +# substitute2 nested calls +test(2.13, substitute2(f1(a1 = f2(a2 = f3(a3 = f4(a4 = v1, extra=v2), v3, a3b = v4)), a1b=c("a","b")), list(f1="fun1", f2="fun2", f3="fun3", f4="fun4", a1="arg1", a2="arg2", a3="arg3", a4="arg4", v1="col1", extra="n", v2=6L, v3="col2", a3b="arg3b", v4=c(3.5,4.5), a1b="arg1b")), substitute(fun1(arg1 = fun2(arg2 = fun3(arg3 = fun4(arg4 = col1, n=6L), col2, arg3b = v4)), arg1b=c("a","b")), list(v4=c(3.5,4.5)))) +# calls of length 0 args +const1 = function() 1L +test(2.21, substitute2(list(nm = fun()), env=list(a="b", fun="const1", nm="int1")), quote(list(int1=const1()))) +test(2.22, substitute2(.(), env=list(a="b", fun="const1", nm="int1")), quote(.())) +# substitute2 AsIs class properly removed or kept +test(2.31, class(substitute2(var3%in%values, list(var3="a", values=I(c("a","b","c"))))[[3L]]), "character") +test(2.32, class(substitute2(var3%in%values, I(list(var3=as.name("a"), values=c("a","b","c"))))[[3L]]), "character") +test(2.33, class(substitute2(var3%in%values, list(var3="a", values=I(1:3)))[[3L]]), "integer") +test(2.34, class(substitute2(var3%in%values, I(list(var3=as.name("a"), values=c(1:3))))[[3L]]), "integer") +qc = substitute2(var3%in%values, I(list(var3=as.name("a"), values=I(c("a","b","c"))))) ## keeping AsIs by extra I on whole env arg +test(2.35, qc, substitute(a %in% .v, list(.v=I(c("a","b","c"))))) +test(2.36, class(qc[[3L]]), "AsIs") +qc = substitute2(var3%in%values, I(list(var3="a", values=I(1:3)))) +test(2.37, qc, substitute("a" %in% .v, list(.v=I(1:3)))) +test(2.38, class(qc[[3L]]), "AsIs") +# substitute2 non-scalar char +test(2.41, substitute2(list(var = val), env = list(var="my_var", val=c("a","b"))), error="are not scalar") +test(2.42, substitute2(list(var = val), env = list(var="my_var", val=I(c("a","b")))), substitute(list(my_var=.v), list(.v=c("a","b")))) ## note that quote(list(my_var=c("a","b")))) will not work because 'c("a","b")' will be a 'language' class (a 'c()' call), but we need to have it as 'character' class instead +test(2.43, substitute2(list(var = val), env = I(list(var=as.name("my_var"), val=c("a","b")))), substitute(list(my_var=.v), list(.v=c("a","b")))) +# substitute2 non-symbol as name +test(2.44, substitute2(list(var = val), env = list(var=I("my_var"), val="my_val")), error="type 'character' but it has to be 'symbol'") +test(2.45, substitute2(list(var = val), env = I(list(var="my_var", val="my_val"))), error="type 'character' but it has to be 'symbol'") +test(2.46, substitute2(.(v1=v2), list(v1=1L, v2=2L)), error="type 'integer' but it has to be 'symbol'") +test(2.47, substitute2(.(v1=v2), list(v1=FALSE, v2=2L)), error="type 'logical' but it has to be 'symbol'") +# substitute2 env as environment class +env = as.environment(list(v=1L, .v=2L)) +test(2.81, substitute2(.(v, .v), env), quote(.(1L, 2L))) +# substitute2 invalid 'env' error coverage +test(2.91, substitute2(.()), error="TODO, as of now 'env' should not be missing") ## TODO +test(2.92, substitute2(v, c(v=1L)), error="'env' must be a list or an environment") +test(2.93, substitute2(.(v), list(1L, 2L)), error="'env' argument does not have names") +test(2.94, substitute2(.(v), structure(list(1L,2L), names=c("","v"))), error="'env' argument has zero char names") +test(2.95, substitute2(.(v), structure(list(1,2), names=c(NA,"v"))), error="'env' argument has NA names") +test(2.96, substitute2(.(v), list(v=1,v=2)), error="'env' argument has duplicated names") + +# char.as.name=TRUE but env is AsIs ## TODO consider removing `char.as.name` arg from API and keep I() interface only +#TODO +# sub.names escape, fall back to R substitute ## TODO consider removing `sub.names` arg from API and document to use base R substitute instead +test(2.99, substitute2(list(nm = v), list(nm = "new_name", v = "col"), sub.names=FALSE), quote(list(nm = col))) + +# substitute2 re-use inside another function +f = function(expr, env) { + eval(substitute( + substitute2(.expr, env), + list(.expr = substitute(expr)) + )) +} +qc = f( + .(out_col_name = fun(in_col_name, fun_arg1=fun_arg1val)), + env = list( + in_col_name = "x", + fun = "sum", + fun_arg1 = "na.rm", + fun_arg1val = TRUE, + out_col_name = "sum_x" + ) +) +test(3.01, qc, quote(.(sum_x = sum(x, na.rm = TRUE)))) +# substitute2 nested re-use inside another function +qc = substitute2(list(nm = fun(.(out_col_name = fun(in_col_name, fun_arg1=fun_arg1val)), + env = list( + in_col_name = "x", + fun = "sum", + fun_arg1 = "na.rm", + fun_arg1val = tf_var, ## note a parameter here + out_col_name = "sum_x" +))), list(nm="my_call", fun="f", tf_var=FALSE)) +test(3.02, eval(qc), list(my_call = quote(.(sum_x = sum(x, na.rm = FALSE))))) + +# data.table i, j, by +d = data.table(a = 2:1, b = 1:4) +test(4.01, d[var3%in%values, .(var1 = f(var2)), by=var3, + env=list(var1="res", var2="b", f="sum", var3="a", values=0:3), + verbose=TRUE], data.table(a=c(2L,1L), res=c(4L,6L)), output=c("Argument 'by' after substitute: a","Argument 'j' after substitute: .(res = sum(b))","Argument 'i' after substitute: a %in% 0:3")) # 0:3 is not expanded even when we use c(0L,1L,2L,3L), see `substitute(v+x, list(x=c(1L,2L)))` vs `substitute(v+x, list(x=c(0L,2L)))` +# data.table symbols and chars +d = data.table(a = c("b","a"), b = 1:4) +out = capture.output(ans <- d[var3%in%values, .(var1 = f(var2)), keyby=var3, + env=list(var1="res", var2="b", f="sum", var3="a", values=I(c("a","b","c"))), + verbose=TRUE]) # could not use output arg in test, so test it manually +test(4.02, ans, data.table(a=c("a","b"), res=c(6L,4L), key="a")) +out = grep("Argument.*substitute", out, value=TRUE) +test(4.021, length(out), 3L) # we expect i, j, by only here, ensure about that +test(4.022, as.logical(grep("Argument 'by' after substitute: a", out, fixed=TRUE)), TRUE) +test(4.023, as.logical(grep("Argument 'j' after substitute: .(res = sum(b))", out, fixed=TRUE)), TRUE) +test(4.024, as.logical(grep("Argument 'i' after substitute: a %in% c(\"a\", \"b\", \"c\")", out, fixed=TRUE)), TRUE) +out = capture.output(ans <- d[var3%in%values, .(var1 = f(var2)), keyby=var3, + env=I(list(var1=as.name("res"), var2=as.name("b"), f=as.name("sum"), var3=as.name("a"), values=c("b","c"))), + verbose=TRUE]) +out = grep("Argument.*substitute", out, value=TRUE) +test(4.03, ans, data.table(a=c("b"), res=c(4L), key="a")) +test(4.031, length(out), 3L) +test(4.032, as.logical(grep("Argument 'by' after substitute: a", out, fixed=TRUE)), TRUE) +test(4.033, as.logical(grep("Argument 'j' after substitute: .(res = sum(b))", out, fixed=TRUE)), TRUE) +test(4.034, as.logical(grep("Argument 'i' after substitute: a %in% c(\"b\", \"c\")", out, fixed=TRUE)), TRUE) + +# get and mget use cases +#TODO + +# use DT[, var, env=list(var=quote(.(sum_x=sum(x)))] rather than dt[, eval(var)]? +#TODO + +# contributed use cases +#TODO diff --git a/man/substitute2.Rd b/man/substitute2.Rd index cb613118f5..6f274fecf9 100644 --- a/man/substitute2.Rd +++ b/man/substitute2.Rd @@ -1,6 +1,9 @@ \name{substitute2} \alias{substitute2} +\alias{substitute} \alias{is.AsIs} +\alias{AsIs} +\alias{I} \title{ Substitute expression } \description{ Experimental, more robust version of base R \code{\link[base]{substitute}} function. \code{is.AsIs} is a helper function to check if \code{x} inherits from \emph{AsIs} class. diff --git a/src/programming.c b/src/programming.c index c71116dbfa..62a85bba2b 100644 --- a/src/programming.c +++ b/src/programming.c @@ -7,15 +7,13 @@ static void substitute_call_arg_names(SEXP expr, SEXP env) { if (!isNull(arg_names)) { SEXP env_names = getAttrib(env, R_NamesSymbol); int *imatches = INTEGER(PROTECT(chmatch(arg_names, env_names, 0))); - //const SEXP *expr_arg_names = SEXPPTR_RO(arg_names); // debug const SEXP *env_sub = SEXPPTR_RO(env); int i = 0; for (SEXP tmp=expr; tmp!=R_NilValue; tmp=CDR(tmp)) { if (imatches[i]) { SEXP sym = env_sub[imatches[i]-1]; if (!isSymbol(sym)) - error("Attempting to substitute '%s' element with object of type '%s' but it has to be symbol type when substituting name of the call argument, functions 'as.name' and 'I' can be used to work out proper substitution, see ?substitute2", CHAR(STRING_ELT(arg_names, i)), type2char(TYPEOF(sym))); - //Rprintf("substitute names: %s -> %s\n", CHAR(expr_arg_names[i]), CHAR(PRINTNAME(sym))); // debug + error("Attempting to substitute '%s' element with object of type '%s' but it has to be 'symbol' type when substituting name of the call argument, functions 'as.name' and 'I' can be used to work out proper substitution, see ?substitute2", CHAR(STRING_ELT(arg_names, i)), type2char(TYPEOF(sym))); SET_TAG(tmp, sym); } i++; @@ -27,7 +25,6 @@ static void substitute_call_arg_names(SEXP expr, SEXP env) { } SEXP substitute_call_arg_namesR(SEXP expr, SEXP env) { SEXP ans = PROTECT(MAYBE_REFERENCED(expr) ? duplicate(expr) : expr); - //Rprintf("entering substitute_call_arg_names\n"); substitute_call_arg_names(ans, env); // updates in-place UNPROTECT(1); return ans; diff --git a/tests/programming.R b/tests/programming.R index ed41711fae..319898901b 100644 --- a/tests/programming.R +++ b/tests/programming.R @@ -1,112 +1,2 @@ -library(data.table) - -# cc() ## dev mode - -# simple -substitute2(list(var = val), env = list(var="my_var", val=5L)) - -# AsIs way to handle char.to.name argument -substitute2(list(var = val), env = list(var="my_var", val=I("my_val"))) -substitute2(list(var = val), env = I(list(var=as.name("my_var"), val="my_val"))) - -# test non-scalar char -try( - substitute2(list(var = val), env = list(var="my_var", val=c("a","b"))) -) -substitute2(list(var = val), env = list(var="my_var", val=I(c("a","b")))) -substitute2(list(var = val), env = I(list(var=as.name("my_var"), val=c("a","b")))) - -# test non-symbol -try( - substitute2(list(var = val), env = list(var=I("my_var"), val="my_val")) -) -try( - substitute2(list(var = val), env = I(list(var="my_var", val="my_val"))) -) - -# complex use case -substitute2( - .(fun_ans_var = fun(farg1, farg2=farg2val), timestamp=Sys.time(), col_head = head(head_arg, n=1L)), - list( - fun_ans_var = "my_mean_res", - fun = "mean", - farg1 = "my_x_col", - farg2 = "na.rm", - farg2val = TRUE, - col_head = "first_y", - head_arg = "y" - ) -) - -# calls of length 0 args -const1 = function() 1L -substitute2(list(nm = fun()), env=list(a="b", fun="const1", nm="int1")) -substitute2(.(), env=list(a="b", fun="const1", nm="int1")) - -# some special names -substitute2(.("TRUE" = 1L, "FALSE" = 2L, "1" = 3L, "2" = 4L), - env=list("FALSE"="col2", "2"="col4")) - -# PR example -substitute2( - .(out_col_name = fun(in_col_name, fun_arg1=fun_arg1val)), - env = list( - in_col_name = "x", - fun = "sum", - fun_arg1 = "na.rm", - fun_arg1val = TRUE, - out_col_name = "sum_x" - ) -) - -# re-use inside another function -f = function(expr, env) { - eval(substitute( - substitute2(.expr, env), - list(.expr = substitute(expr)) - )) -} -f( - .(out_col_name = fun(in_col_name, fun_arg1=fun_arg1val)), - env = list( - in_col_name = "x", - fun = "sum", - fun_arg1 = "na.rm", - fun_arg1val = TRUE, - out_col_name = "sum_x" - ) -) - -# data.table i, j, by -d = data.table(a = 2:1, b = 1:4) -d[var3%in%values, .(var1 = f(var2)), by=var3, - env=list(var1="res", var2="b", f="sum", var3="a", values=0:3), - verbose=TRUE] - -# data.table symbols and chars -d = data.table(a = c("b","a"), b = 1:4) -d[var3%in%values, .(var1 = f(var2)), keyby=var3, - env=list(var1="res", var2="b", f="sum", var3="a", values=I(c("a","b","c"))), - verbose=TRUE] -d[var3%in%values, .(var1 = f(var2)), keyby=var3, - env=I(list(var1=as.name("res"), var2=as.name("b"), f=as.name("sum"), var3=as.name("a"), values=c("a","b","c"))), - verbose=TRUE] - -# test that AsIs class removed -class(substitute2(var3%in%values, list(var3="a", values=I(c("a","b","c"))))[[3L]]) == "character" -class(substitute2(var3%in%values, I(list(var3=as.name("a"), values=c("a","b","c"))))[[3L]]) == "character" -class(substitute2(var3%in%values, list(var3="a", values=I(1:3)))[[3L]]) == "integer" -class(substitute2(var3%in%values, I(list(var3=as.name("a"), values=c(1:3))))[[3L]]) == "integer" - -# NA tests, especially NA_character_ -#TODO - -# char.as.name=TRUE but env is AsIs -#TODO -#remove char.as.name arg from API and keep I() interface only? - -# get and mget use cases -#TODO - -# use DT[, var, env=list(var=quote(.(sum_x=sum(x)))] rather than dt[, eval(var)]? -#TODO +require(data.table) +test.data.table(script="programming.Rraw") From d60152dee974f0073186f5c732abb9e257616a85 Mon Sep 17 00:00:00 2001 From: jangorecki Date: Wed, 18 Mar 2020 11:34:52 +0530 Subject: [PATCH 12/37] manual typo --- man/data.table.Rd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/man/data.table.Rd b/man/data.table.Rd index 1d6b0d37dc..5533fe7f72 100644 --- a/man/data.table.Rd +++ b/man/data.table.Rd @@ -171,7 +171,7 @@ data.table(\dots, keep.rownames=FALSE, check.names=FALSE, key=NULL, stringsAsFac See examples as well as \href{../doc/datatable-secondary-indices-and-auto-indexing.html}{\code{vignette("datatable-secondary-indices-and-auto-indexing")}}. } - \item{env}{ Environment or a list, passed to \code{\link{substitute2}} for substition of parameters in \code{i}, \code{j}, \code{by} (or \code{keyby}). Use \code{verbose} to preview constructed expressions. } + \item{env}{ Environment or a list, passed to \code{\link{substitute2}} for substitution of parameters in \code{i}, \code{j} and \code{by} (or \code{keyby}). Use \code{verbose} to preview constructed expressions. } } \details{ \code{data.table} builds on base \R functionality to reduce 2 types of time:\cr From fed255383889ea12a9d88bcaa1bf276fe4b6a152 Mon Sep 17 00:00:00 2001 From: jangorecki Date: Sun, 22 Mar 2020 08:13:01 +0530 Subject: [PATCH 13/37] simplifies API, export less --- NAMESPACE | 1 - R/programming.R | 12 ++++-------- inst/tests/programming.Rraw | 6 +----- man/substitute2.Rd | 12 +++--------- 4 files changed, 8 insertions(+), 23 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index eb2c5d5b24..f061eebdeb 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -57,7 +57,6 @@ export(setnafill) export(.Last.updated) export(fcoalesce) export(substitute2) -export(is.AsIs) S3method("[", data.table) S3method("[<-", data.table) diff --git a/R/programming.R b/R/programming.R index 13746e4078..2449012b51 100644 --- a/R/programming.R +++ b/R/programming.R @@ -2,7 +2,7 @@ is.AsIs = function(x) { inherits(x, "AsIs") } -substitute2 = function(expr, env, char.as.name=!is.AsIs(env), sub.names=TRUE) { +substitute2 = function(expr, env) { if (missing(env)) { stop("TODO, as of now 'env' should not be missing") } else if (is.environment(env)) { @@ -20,7 +20,7 @@ substitute2 = function(expr, env, char.as.name=!is.AsIs(env), sub.names=TRUE) { } else if (anyDuplicated(env.names)) { stop("'env' argument has duplicated names") } - if (isTRUE(char.as.name)) { + if (!is.AsIs(env)) { asis = vapply(env, is.AsIs, FALSE) char = vapply(env, is.character, FALSE) to.name = !asis & char @@ -45,10 +45,6 @@ substitute2 = function(expr, env, char.as.name=!is.AsIs(env), sub.names=TRUE) { substitute(.expr, env), env = list(.expr = substitute(expr)) )) - # new arg names substitute - if (isTRUE(sub.names)) { - .Call(Csubstitute_call_arg_namesR, expr.sub, env) - } else { - expr.sub - } + # call arg names substitute + .Call(Csubstitute_call_arg_namesR, expr.sub, env) } diff --git a/inst/tests/programming.Rraw b/inst/tests/programming.Rraw index df8f54bb92..713fb0e1b0 100644 --- a/inst/tests/programming.Rraw +++ b/inst/tests/programming.Rraw @@ -5,6 +5,7 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) { } else { require(data.table) test = data.table:::test + is.AsIs = data.table:::is.AsIs } # AsIs @@ -87,11 +88,6 @@ test(2.94, substitute2(.(v), structure(list(1L,2L), names=c("","v"))), error="'e test(2.95, substitute2(.(v), structure(list(1,2), names=c(NA,"v"))), error="'env' argument has NA names") test(2.96, substitute2(.(v), list(v=1,v=2)), error="'env' argument has duplicated names") -# char.as.name=TRUE but env is AsIs ## TODO consider removing `char.as.name` arg from API and keep I() interface only -#TODO -# sub.names escape, fall back to R substitute ## TODO consider removing `sub.names` arg from API and document to use base R substitute instead -test(2.99, substitute2(list(nm = v), list(nm = "new_name", v = "col"), sub.names=FALSE), quote(list(nm = col))) - # substitute2 re-use inside another function f = function(expr, env) { eval(substitute( diff --git a/man/substitute2.Rd b/man/substitute2.Rd index 6f274fecf9..de9f0ace48 100644 --- a/man/substitute2.Rd +++ b/man/substitute2.Rd @@ -1,26 +1,20 @@ \name{substitute2} \alias{substitute2} \alias{substitute} -\alias{is.AsIs} -\alias{AsIs} \alias{I} \title{ Substitute expression } \description{ - Experimental, more robust version of base R \code{\link[base]{substitute}} function. \code{is.AsIs} is a helper function to check if \code{x} inherits from \emph{AsIs} class. + Experimental, more robust version of base R \code{\link[base]{substitute}}. } \usage{ -substitute2(expr, env, char.as.name=!is.AsIs(env), sub.names=TRUE) -is.AsIs(x) +substitute2(expr, env) } \arguments{ \item{expr}{ Unevaluated expression in which substitution has to take place. } \item{env}{ Environment or a list from which variables will be taken to inject into \code{expr}. } -\item{char.as.name}{ Logical, for convenience it will automatically turn \code{"my_name"} character objects into \code{`my_name`} symbols. Feature can be escaped by using \code{\link[base]{I}} function. } -\item{sub.names}{ Logical, should be names of call arguments be substituted as well, if set to \code{FALSE} it will basically fall back to base R substitute. } -\item{x}{ Any object passed to \code{is.AsIS} to be tested for inheritance of \emph{AsIs} class. } } \note{ - By default function will turn any character objects into symbols. In case character is of length 2 or more, it will raise an error. Behaviour can be changed by setting \code{char.as.name} to \code{FALSE}, but then any symbol must be explicitly created, for example using \code{as.name} function. Additionally using base R \code{\link[base]{I}} function make it convenient to control \code{char.as.name} behaviour globally for all elements in \code{env}, or locally for each single one, without the need to use \code{char.as.name} argument explicitly. See example below. + By default function will turn any character objects into symbols. In case character is of length 2 or more, it will raise an error. Behaviour can be changed by wrapping \code{env} into \code{\link[base]{I}} call. In such a case any symbols must be explicitly created, for example using \code{as.name} function. Alternatively it is possible to wrap single elements of \code{env} into \code{\link[base]{I}} call, then only those elements will retain their original class. See examples below. } \value{ Quoted expression having variables substituted. From fd5582ed65dc896499100390e4b1bf0447c22e4e Mon Sep 17 00:00:00 2001 From: jangorecki Date: Sun, 22 Mar 2020 12:57:56 +0530 Subject: [PATCH 14/37] rework inner loop, tests, docs --- R/programming.R | 12 ++++++------ inst/tests/programming.Rraw | 39 ++++++++++++++++++++++++++++++++++--- man/substitute2.Rd | 30 ++++++++++++++++++++-------- src/programming.c | 11 ++++++----- 4 files changed, 70 insertions(+), 22 deletions(-) diff --git a/R/programming.R b/R/programming.R index 2449012b51..40f3fa87e5 100644 --- a/R/programming.R +++ b/R/programming.R @@ -1,6 +1,11 @@ is.AsIs = function(x) { inherits(x, "AsIs") } +rm.AsIs = function(x) { + cl = oldClass(x) + oldClass(x) = cl[cl!="AsIs"] + x +} substitute2 = function(expr, env) { if (missing(env)) { @@ -26,17 +31,12 @@ substitute2 = function(expr, env) { to.name = !asis & char if (any(to.name)) { ## turns "my_name" character scalar into `my_name` symbol, for convenience if (any(non.scalar.char <- vapply(env[to.name], length, 0L)!=1L)) { - stop("'char.as.name' was used but the following character objects provided in 'env' are not scalar objects, if you need them as character vector rather a name, then use 'I' function: ", + stop("Character objects provided in 'env' are not scalar objects, if you need them as character vector rather than a name, then use wrap it into 'I' call: ", paste(names(non.scalar.char)[non.scalar.char], collapse=", ")) } env[to.name] = lapply(env[to.name], as.name) } if (any(asis)) { - rm.AsIs = function(x) { ## removes any AsIs class - cl = oldClass(x) - oldClass(x) = cl[cl!="AsIs"] - x - } env[asis] = lapply(env[asis], rm.AsIs) } } diff --git a/inst/tests/programming.Rraw b/inst/tests/programming.Rraw index 713fb0e1b0..063442defa 100644 --- a/inst/tests/programming.Rraw +++ b/inst/tests/programming.Rraw @@ -68,15 +68,28 @@ test(2.36, class(qc[[3L]]), "AsIs") qc = substitute2(var3%in%values, I(list(var3="a", values=I(1:3)))) test(2.37, qc, substitute("a" %in% .v, list(.v=I(1:3)))) test(2.38, class(qc[[3L]]), "AsIs") -# substitute2 non-scalar char +# substitute2 non-scalar char as name test(2.41, substitute2(list(var = val), env = list(var="my_var", val=c("a","b"))), error="are not scalar") test(2.42, substitute2(list(var = val), env = list(var="my_var", val=I(c("a","b")))), substitute(list(my_var=.v), list(.v=c("a","b")))) ## note that quote(list(my_var=c("a","b")))) will not work because 'c("a","b")' will be a 'language' class (a 'c()' call), but we need to have it as 'character' class instead test(2.43, substitute2(list(var = val), env = I(list(var=as.name("my_var"), val=c("a","b")))), substitute(list(my_var=.v), list(.v=c("a","b")))) -# substitute2 non-symbol as name +# substitute2 non-symbol test(2.44, substitute2(list(var = val), env = list(var=I("my_var"), val="my_val")), error="type 'character' but it has to be 'symbol'") test(2.45, substitute2(list(var = val), env = I(list(var="my_var", val="my_val"))), error="type 'character' but it has to be 'symbol'") test(2.46, substitute2(.(v1=v2), list(v1=1L, v2=2L)), error="type 'integer' but it has to be 'symbol'") test(2.47, substitute2(.(v1=v2), list(v1=FALSE, v2=2L)), error="type 'logical' but it has to be 'symbol'") +# substitute2 NA_character_ becomes valid 'NA' name +test(2.48, substitute2(.(v1 = v2), list(v1 = NA_character_, v2 = NA_character_, "." = "list")), quote(list(`NA` = `NA`))) +qc = substitute2(.(v1 = v2), list(v1 = NA_character_, v2 = I(NA_character_), "." = "list")) +test(2.49, qc, quote(list(`NA` = NA_character_))) +test(2.50, eval(qc), list("NA" = NA_character_)) +# substitute2 duplicate matches +test(2.51, substitute2(list(v1=v2, v1=v2), env=list(v1="nm",v2=2L,v3=3L)), quote(list(nm = 2L, nm = 2L))) +test(2.52, substitute2(list(v1=v2, v1=v3), env=list(v1="nm",v2=2L,v3=3L)), quote(list(nm = 2L, nm = 3L))) +# substitute2 nested unnamed call +test(2.53, substitute2(c(list(v1=v2, v1=v2)), env=list(v1="nm",v2=2L,v3=3L)), quote(c(list(nm = 2L, nm = 2L)))) +test(2.54, substitute2(c(list(v1=v2, v1=v3)), env=list(v1="nm",v2=2L,v3=3L)), quote(c(list(nm = 2L, nm = 3L)))) +# TODO substitute2 during join + # substitute2 env as environment class env = as.environment(list(v=1L, .v=2L)) test(2.81, substitute2(.(v, .v), env), quote(.(1L, 2L))) @@ -143,8 +156,28 @@ test(4.032, as.logical(grep("Argument 'by' after substitute: a", out, fixed=TRUE test(4.033, as.logical(grep("Argument 'j' after substitute: .(res = sum(b))", out, fixed=TRUE)), TRUE) test(4.034, as.logical(grep("Argument 'i' after substitute: a %in% c(\"b\", \"c\")", out, fixed=TRUE)), TRUE) +# TODO special symbols +#".SD" +#".N", ".EACHI" +#I(list(j=as.name(".SD"))) +#I(list(j=as.name(".N"), by=as.name(".EACHI"))) + # get and mget use cases -#TODO +d = as.data.table(lapply(1:5, rep, 2L)) +setnames(d, paste0("c",1:5)) +v1 = "c1"; v2 = "c2"; v3 = "c3"; v4 = "c4"; v5 = "c5" +### TODO report/review get-mget issues +## note that d[, get(v1)] is wrong here, it translates to d[, c1] while it should be d[, "c1"] +## d[, mget(v1)] is handling that fine, compare both! might have been used for convenience but it breaks consistency +test(4.051, d[, v1, env=list(v1=v1)], c(1L,1L)) ## symbol c1 +test(4.052, d[, v1, env=list(v1=I(v1))], data.table(c1=c(1L,1L))) ## character "c1" +test(4.053, d[, v1, env=list(v1=I(v1))], d[, mget(v1)]) ## character "c1", works against mget only +test(4.054, d[, v1v2, env=list(v1v2=I(c(v1,v2)))], d[, mget(c(v1, v2))]) ## character c("c1","c2") +test(4.055, d[, .(v1), env=list(v1=v1)], data.table(c1=c(1L,1L))) ## d[, .(get(v1))] - (m)get would return unnamed columns +test(4.056, d[, .(v1, v2), env=list(v1=v1, v2=v2)], data.table(c1=c(1L,1L),c2=c(2L,2L))) ## d[, .(get(v1), get(v2))] +test(4.057, d[, .(sum(v1)), env=list(v1=v1)], d[, .(sum(get(v1)))]) +test(4.058, d[, lapply(vN, sum), env=list(vN=substitute2(list(v1=v1, v3=v3), list(v1=v1, v3=v3)))], d[, lapply(mget(c(v1,v3)), sum)]) +test(4.059, d[, c(list(c1=c1, c2=c2), list(v3=v3), list(v4=v4, v5=v5)), env=list(v3=v3,v4=v4,v5=v5)], d) ## d[, c(list(c1, c2), list(get(v3)), mget(c(v4,v5)))] - some are unnamed # use DT[, var, env=list(var=quote(.(sum_x=sum(x)))] rather than dt[, eval(var)]? #TODO diff --git a/man/substitute2.Rd b/man/substitute2.Rd index de9f0ace48..13635344ea 100644 --- a/man/substitute2.Rd +++ b/man/substitute2.Rd @@ -13,7 +13,7 @@ substitute2(expr, env) \item{expr}{ Unevaluated expression in which substitution has to take place. } \item{env}{ Environment or a list from which variables will be taken to inject into \code{expr}. } } -\note{ +\details{ By default function will turn any character objects into symbols. In case character is of length 2 or more, it will raise an error. Behaviour can be changed by wrapping \code{env} into \code{\link[base]{I}} call. In such a case any symbols must be explicitly created, for example using \code{as.name} function. Alternatively it is possible to wrap single elements of \code{env} into \code{\link[base]{I}} call, then only those elements will retain their original class. See examples below. } \value{ @@ -22,20 +22,34 @@ Quoted expression having variables substituted. \seealso{ \code{\link[base]{substitute}}, \code{\link[base]{I}} } \examples{ ## base R substitute vs substitute2 -substitute(var1, list(var1 = "c1")) -substitute2(var1, list(var1 = I("c1"))) +substitute(list(var1 = var2), list(var1 = "c1", var2 = 5L)) +substitute2(list(var1 = var2), list(var1 = "c1", var2 = 5L)) ## works also on names -substitute(var1, list(var1 = as.name("c1"))) substitute(var1, list(var1 = "c1")) +substitute2(var1, list(var1 = I("c1"))) ## enforce character with I -substitute(list(var1 = var2), list(var1 = "c1", var2 = 5L)) -substitute2(list(var1 = var2), list(var1 = "c1", var2 = 5L)) +substitute(var1, list(var1 = as.name("c1"))) +substitute2(var1, list(var1 = "c1")) ## turn character into symbol, for convenience -# mix symbols and characters, both lines will yield same output +## mix symbols and characters using 'I' function, both lines will yield same result substitute2(list(var1 = var2), list(var1 = "c1", var2 = I("some_character"))) substitute2(list(var1 = var2), I(list(var1 = as.name("c1"), var2 = "some_character"))) -# using from inside a function +## character to symbol coerce will not be done for character stored inside a list +substitute2(lapply(some_list, sum), list(some_list = list("V1","V2"))) +## those has to be substituted before +some_list = substitute2(list(v1, v2), list(v1="V1", v2="V2")) +substitute2(lapply(some_list, sum), list(some_list = some_list)) +## in case of a character list of arbitrary length +some_list = as.call(lapply(c("list", c("V1","V2")), as.name)) +substitute2(lapply(some_list, sum), list(some_list = some_list)) +## or from R 4.0.0 also bquote splice functionality +if (getRversion() >= "4.0.0") { + some_list = bquote(list(..(lapply(c("V1","V2"), as.name))), splice=TRUE) + substitute2(lapply(some_list, sum), list(some_list = some_list)) +} + +## using substitute2 from inside a function f = function(expr, env) { eval(substitute( substitute2(.expr, env), diff --git a/src/programming.c b/src/programming.c index 62a85bba2b..4f6cf1a19f 100644 --- a/src/programming.c +++ b/src/programming.c @@ -8,19 +8,20 @@ static void substitute_call_arg_names(SEXP expr, SEXP env) { SEXP env_names = getAttrib(env, R_NamesSymbol); int *imatches = INTEGER(PROTECT(chmatch(arg_names, env_names, 0))); const SEXP *env_sub = SEXPPTR_RO(env); - int i = 0; - for (SEXP tmp=expr; tmp!=R_NilValue; tmp=CDR(tmp)) { + SEXP tmp = expr; + for (int i=0; i Date: Sun, 22 Mar 2020 13:34:31 +0530 Subject: [PATCH 15/37] tests --- inst/tests/programming.Rraw | 47 +++++++++++++++++++++++-------------- 1 file changed, 30 insertions(+), 17 deletions(-) diff --git a/inst/tests/programming.Rraw b/inst/tests/programming.Rraw index 063442defa..11dbe73282 100644 --- a/inst/tests/programming.Rraw +++ b/inst/tests/programming.Rraw @@ -88,7 +88,6 @@ test(2.52, substitute2(list(v1=v2, v1=v3), env=list(v1="nm",v2=2L,v3=3L)), quote # substitute2 nested unnamed call test(2.53, substitute2(c(list(v1=v2, v1=v2)), env=list(v1="nm",v2=2L,v3=3L)), quote(c(list(nm = 2L, nm = 2L)))) test(2.54, substitute2(c(list(v1=v2, v1=v3)), env=list(v1="nm",v2=2L,v3=3L)), quote(c(list(nm = 2L, nm = 3L)))) -# TODO substitute2 during join # substitute2 env as environment class env = as.environment(list(v=1L, .v=2L)) @@ -155,13 +154,27 @@ test(4.031, length(out), 3L) test(4.032, as.logical(grep("Argument 'by' after substitute: a", out, fixed=TRUE)), TRUE) test(4.033, as.logical(grep("Argument 'j' after substitute: .(res = sum(b))", out, fixed=TRUE)), TRUE) test(4.034, as.logical(grep("Argument 'i' after substitute: a %in% c(\"b\", \"c\")", out, fixed=TRUE)), TRUE) - -# TODO special symbols -#".SD" -#".N", ".EACHI" -#I(list(j=as.name(".SD"))) -#I(list(j=as.name(".N"), by=as.name(".EACHI"))) - +# substitute2 during join +d1 = data.table(id1=1:4, v1=5) +d2 = data.table(id1=c(0L,2:3), v1=6) +out = capture.output(ans <- d1[d2, on="id1<=id1", .(c1, c2, c3, c4), env=list(c1="x.id1", c2="i.id1", c3="x.v1", c4="i.v1"), verbose=TRUE]) +out = grep("Argument.*substitute", out, value=TRUE) +test(4.041, ans, data.table(x.id1=c(NA,1:2,1:3), i.id1=c(0L,2L,2L,3L,3L,3L), x.v1=c(NA,rep(5,5)), i.v1=rep(6,6))) +test(4.042, as.logical(grep("Argument 'j' after substitute: .(x.id1, i.id1, x.v1, i.v1)", out, fixed=TRUE)), TRUE) +d1 = data.table(id1=c(2L,4L,2L,4L), v1=5) +d2 = data.table(id1=c(0L,2:3), v1=6) +out = capture.output(ans <- d1[dd, on="id1<=id1", .(sum(c3), sum(c4)), by=by, env=list(dd="d2", c3="x.v1", c4="i.v1", by=".EACHI"), verbose=TRUE]) +out = grep("Argument.*substitute", out, value=TRUE) +test(4.043, ans, data.table(id1=c(0L,2L,3L), V1=c(NA,10,10), V2=c(6,6,6))) +test(4.044, as.logical(grep("Argument 'by' after substitute: .EACHI", out, fixed=TRUE)), TRUE) +test(4.045, as.logical(grep("Argument 'j' after substitute: .(sum(x.v1), sum(i.v1))", out, fixed=TRUE)), TRUE) +test(4.046, as.logical(grep("Argument 'i' after substitute: d2", out, fixed=TRUE)), TRUE) +# substitute special symbols +d = data.table(V1=1:2, V2=1:4) +test(4.051, d[, j, by, env=list(j=".N", by="V1")], data.table(V1=c(1L,2L), N=c(2L,2L))) +test(4.052, d[, j, by, env=list(j=".SD", by="V1")], data.table(V1=c(1L,1L,2L,2L), V2=c(1L,3L,2L,4L))) +test(4.053, d[, j, env=I(list(j=as.name(".N")))], 4L) +test(4.054, d[, .(op, fun(col)), by=by, env=list(op=".N", fun="sum", col="V2", by="V1")], data.table(V1=1:2, N=c(2L,2L), V2=c(4L,6L))) # get and mget use cases d = as.data.table(lapply(1:5, rep, 2L)) setnames(d, paste0("c",1:5)) @@ -169,15 +182,15 @@ v1 = "c1"; v2 = "c2"; v3 = "c3"; v4 = "c4"; v5 = "c5" ### TODO report/review get-mget issues ## note that d[, get(v1)] is wrong here, it translates to d[, c1] while it should be d[, "c1"] ## d[, mget(v1)] is handling that fine, compare both! might have been used for convenience but it breaks consistency -test(4.051, d[, v1, env=list(v1=v1)], c(1L,1L)) ## symbol c1 -test(4.052, d[, v1, env=list(v1=I(v1))], data.table(c1=c(1L,1L))) ## character "c1" -test(4.053, d[, v1, env=list(v1=I(v1))], d[, mget(v1)]) ## character "c1", works against mget only -test(4.054, d[, v1v2, env=list(v1v2=I(c(v1,v2)))], d[, mget(c(v1, v2))]) ## character c("c1","c2") -test(4.055, d[, .(v1), env=list(v1=v1)], data.table(c1=c(1L,1L))) ## d[, .(get(v1))] - (m)get would return unnamed columns -test(4.056, d[, .(v1, v2), env=list(v1=v1, v2=v2)], data.table(c1=c(1L,1L),c2=c(2L,2L))) ## d[, .(get(v1), get(v2))] -test(4.057, d[, .(sum(v1)), env=list(v1=v1)], d[, .(sum(get(v1)))]) -test(4.058, d[, lapply(vN, sum), env=list(vN=substitute2(list(v1=v1, v3=v3), list(v1=v1, v3=v3)))], d[, lapply(mget(c(v1,v3)), sum)]) -test(4.059, d[, c(list(c1=c1, c2=c2), list(v3=v3), list(v4=v4, v5=v5)), env=list(v3=v3,v4=v4,v5=v5)], d) ## d[, c(list(c1, c2), list(get(v3)), mget(c(v4,v5)))] - some are unnamed +test(4.061, d[, v1, env=list(v1=v1)], c(1L,1L)) ## symbol c1 +test(4.062, d[, v1, env=list(v1=I(v1))], data.table(c1=c(1L,1L))) ## character "c1" +test(4.063, d[, v1, env=list(v1=I(v1))], d[, mget(v1)]) ## character "c1", works against mget only +test(4.064, d[, v1v2, env=list(v1v2=I(c(v1,v2)))], d[, mget(c(v1, v2))]) ## character c("c1","c2") +test(4.065, d[, .(v1), env=list(v1=v1)], data.table(c1=c(1L,1L))) ## d[, .(get(v1))] - (m)get would return unnamed columns +test(4.066, d[, .(v1, v2), env=list(v1=v1, v2=v2)], data.table(c1=c(1L,1L),c2=c(2L,2L))) ## d[, .(get(v1), get(v2))] +test(4.067, d[, .(sum(v1)), env=list(v1=v1)], d[, .(sum(get(v1)))]) +test(4.068, d[, lapply(vN, sum), env=list(vN=substitute2(list(v1=v1, v3=v3), list(v1=v1, v3=v3)))], d[, lapply(mget(c(v1,v3)), sum)]) +test(4.069, d[, c(list(c1=c1, c2=c2), list(v3=v3), list(v4=v4, v5=v5)), env=list(v3=v3,v4=v4,v5=v5)], d) ## d[, c(list(c1, c2), list(get(v3)), mget(c(v4,v5)))] - some are unnamed # use DT[, var, env=list(var=quote(.(sum_x=sum(x)))] rather than dt[, eval(var)]? #TODO From ffb97d42f289a6fdad3a805cdc25a1291281be42 Mon Sep 17 00:00:00 2001 From: jangorecki Date: Sun, 22 Mar 2020 16:45:23 +0530 Subject: [PATCH 16/37] man wording --- man/substitute2.Rd | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/man/substitute2.Rd b/man/substitute2.Rd index 13635344ea..4f56861c02 100644 --- a/man/substitute2.Rd +++ b/man/substitute2.Rd @@ -4,7 +4,7 @@ \alias{I} \title{ Substitute expression } \description{ - Experimental, more robust version of base R \code{\link[base]{substitute}}. + Experimental, more robust and more user-friendly version of base R \code{\link[base]{substitute}}. } \usage{ substitute2(expr, env) @@ -14,7 +14,7 @@ substitute2(expr, env) \item{env}{ Environment or a list from which variables will be taken to inject into \code{expr}. } } \details{ - By default function will turn any character objects into symbols. In case character is of length 2 or more, it will raise an error. Behaviour can be changed by wrapping \code{env} into \code{\link[base]{I}} call. In such a case any symbols must be explicitly created, for example using \code{as.name} function. Alternatively it is possible to wrap single elements of \code{env} into \code{\link[base]{I}} call, then only those elements will retain their original class. See examples below. + Function will turn any character elements of \code{env} argument into symbols. In case character is of length 2 or more, it will raise an error. Behaviour can be changed by wrapping \code{env} into \code{\link[base]{I}} call. In such case any symbols must be explicitly created, for example using \code{as.name} function. Alternatively it is possible to wrap elements of \code{env} into \code{\link[base]{I}} call, not the whole list, then only those elements will retain their original class. } \value{ Quoted expression having variables substituted. @@ -49,7 +49,7 @@ if (getRversion() >= "4.0.0") { substitute2(lapply(some_list, sum), list(some_list = some_list)) } -## using substitute2 from inside a function +## using substitute2 from another function f = function(expr, env) { eval(substitute( substitute2(.expr, env), From e6f22034048a46a741d10a3d62bfb23ca1477c7d Mon Sep 17 00:00:00 2001 From: jangorecki Date: Sun, 22 Mar 2020 19:19:44 +0530 Subject: [PATCH 17/37] minor tests adjustments --- inst/tests/programming.Rraw | 33 ++++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/inst/tests/programming.Rraw b/inst/tests/programming.Rraw index 11dbe73282..e43a9e93f8 100644 --- a/inst/tests/programming.Rraw +++ b/inst/tests/programming.Rraw @@ -6,6 +6,7 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) { require(data.table) test = data.table:::test is.AsIs = data.table:::is.AsIs + rm.AsIs = data.table:::rm.AsIs } # AsIs @@ -19,6 +20,10 @@ test(1.07, is.AsIs(structure(list(NULL), class="an_S3")), FALSE) ## S3 test(1.08, is.AsIs(I(structure(list(NULL), class="an_S3"))), TRUE) test(1.09, is.AsIs(getClass("MethodDefinition")), FALSE) ## S4 test(1.10, is.AsIs(I(getClass("MethodDefinition"))), TRUE) +test(1.11, is.AsIs(rm.AsIs(1L)), FALSE) +test(1.12, is.AsIs(rm.AsIs(I(1L))), FALSE) +test(1.13, is.AsIs(rm.AsIs(list(1L))), FALSE) +test(1.14, is.AsIs(rm.AsIs(I(list(1L)))), FALSE) # substitute2 simple test(2.01, substitute2(list(var = val), env = list(var="my_var", val=5L)), quote(list(my_var = 5L))) @@ -142,33 +147,35 @@ out = capture.output(ans <- d[var3%in%values, .(var1 = f(var2)), keyby=var3, test(4.02, ans, data.table(a=c("a","b"), res=c(6L,4L), key="a")) out = grep("Argument.*substitute", out, value=TRUE) test(4.021, length(out), 3L) # we expect i, j, by only here, ensure about that -test(4.022, as.logical(grep("Argument 'by' after substitute: a", out, fixed=TRUE)), TRUE) -test(4.023, as.logical(grep("Argument 'j' after substitute: .(res = sum(b))", out, fixed=TRUE)), TRUE) -test(4.024, as.logical(grep("Argument 'i' after substitute: a %in% c(\"a\", \"b\", \"c\")", out, fixed=TRUE)), TRUE) +test(4.022, "Argument 'by' after substitute: a" %in% out, TRUE) +test(4.023, "Argument 'j' after substitute: .(res = sum(b))" %in% out, TRUE) +test(4.024, "Argument 'i' after substitute: a %in% c(\"a\", \"b\", \"c\")" %in% out, TRUE) out = capture.output(ans <- d[var3%in%values, .(var1 = f(var2)), keyby=var3, env=I(list(var1=as.name("res"), var2=as.name("b"), f=as.name("sum"), var3=as.name("a"), values=c("b","c"))), verbose=TRUE]) -out = grep("Argument.*substitute", out, value=TRUE) test(4.03, ans, data.table(a=c("b"), res=c(4L), key="a")) +out = grep("Argument.*substitute", out, value=TRUE) test(4.031, length(out), 3L) -test(4.032, as.logical(grep("Argument 'by' after substitute: a", out, fixed=TRUE)), TRUE) -test(4.033, as.logical(grep("Argument 'j' after substitute: .(res = sum(b))", out, fixed=TRUE)), TRUE) -test(4.034, as.logical(grep("Argument 'i' after substitute: a %in% c(\"b\", \"c\")", out, fixed=TRUE)), TRUE) +test(4.032, "Argument 'by' after substitute: a" %in% out, TRUE) +test(4.033, "Argument 'j' after substitute: .(res = sum(b))" %in% out, TRUE) +test(4.034, "Argument 'i' after substitute: a %in% c(\"b\", \"c\")" %in% out, TRUE) # substitute2 during join d1 = data.table(id1=1:4, v1=5) d2 = data.table(id1=c(0L,2:3), v1=6) out = capture.output(ans <- d1[d2, on="id1<=id1", .(c1, c2, c3, c4), env=list(c1="x.id1", c2="i.id1", c3="x.v1", c4="i.v1"), verbose=TRUE]) -out = grep("Argument.*substitute", out, value=TRUE) test(4.041, ans, data.table(x.id1=c(NA,1:2,1:3), i.id1=c(0L,2L,2L,3L,3L,3L), x.v1=c(NA,rep(5,5)), i.v1=rep(6,6))) -test(4.042, as.logical(grep("Argument 'j' after substitute: .(x.id1, i.id1, x.v1, i.v1)", out, fixed=TRUE)), TRUE) +out = grep("Argument.*substitute", out, value=TRUE) +test(4.042, length(out), 2L) ## 2L because i is non-missing attempt to substitute is made +test(4.043, "Argument 'j' after substitute: .(x.id1, i.id1, x.v1, i.v1)" %in% out, TRUE) d1 = data.table(id1=c(2L,4L,2L,4L), v1=5) d2 = data.table(id1=c(0L,2:3), v1=6) out = capture.output(ans <- d1[dd, on="id1<=id1", .(sum(c3), sum(c4)), by=by, env=list(dd="d2", c3="x.v1", c4="i.v1", by=".EACHI"), verbose=TRUE]) +test(4.044, ans, data.table(id1=c(0L,2L,3L), V1=c(NA,10,10), V2=c(6,6,6))) out = grep("Argument.*substitute", out, value=TRUE) -test(4.043, ans, data.table(id1=c(0L,2L,3L), V1=c(NA,10,10), V2=c(6,6,6))) -test(4.044, as.logical(grep("Argument 'by' after substitute: .EACHI", out, fixed=TRUE)), TRUE) -test(4.045, as.logical(grep("Argument 'j' after substitute: .(sum(x.v1), sum(i.v1))", out, fixed=TRUE)), TRUE) -test(4.046, as.logical(grep("Argument 'i' after substitute: d2", out, fixed=TRUE)), TRUE) +test(4.045, length(out), 3L) +test(4.046, "Argument 'by' after substitute: .EACHI" %in% out, TRUE) +test(4.047, "Argument 'j' after substitute: .(sum(x.v1), sum(i.v1))" %in% out, TRUE) +test(4.048, "Argument 'i' after substitute: d2" %in% out, TRUE) # substitute special symbols d = data.table(V1=1:2, V2=1:4) test(4.051, d[, j, by, env=list(j=".N", by="V1")], data.table(V1=c(1L,2L), N=c(2L,2L))) From 507e2535e8f573b94551fe27a6358566e098af6d Mon Sep 17 00:00:00 2001 From: jangorecki Date: Sun, 22 Mar 2020 19:20:25 +0530 Subject: [PATCH 18/37] add use cases contributed by @renkun-ken --- inst/tests/programming.Rraw | 144 +++++++++++++++++++++++++++++++++++- 1 file changed, 143 insertions(+), 1 deletion(-) diff --git a/inst/tests/programming.Rraw b/inst/tests/programming.Rraw index e43a9e93f8..8238a6f86d 100644 --- a/inst/tests/programming.Rraw +++ b/inst/tests/programming.Rraw @@ -203,4 +203,146 @@ test(4.069, d[, c(list(c1=c1, c2=c2), list(v3=v3), list(v4=v4, v5=v5)), env=list #TODO # contributed use cases -#TODO +# renkun-ken +dt = as.data.table(list( ## RNGversion("3.5.0"); set.seed(108); round(numeric(), 4) + symbol = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), + date = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L), + grp1 = c(1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 1L), + grp2 = c(3L, 3L, 3L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 3L, 2L, 1L, 3L, 3L, 1L, 1L, 3L, 3L, 1L, 3L, 3L, 2L, 1L, 2L, 2L, 3L, 2L), + x0 = c(1.1396, -0.2706, -2.2801, -0.1572, -1.0671, -0.9666, -0.8071, -0.23, -0.1626, 1.4347, -0.2234, 0.5613, -0.7084, 0.2598, -0.2023, 1.8624, 0.5209, -1.561, -1.2297, -1.0064, -0.9782, -0.1291, -2.275, 0.5268, -0.5316, 2.3234, 0.0556, -0.3623, -0.5695, -0.0142), + x1 = c(1.3553, 1.2909, -0.8958, -0.3677, 1.0041, 1.1247, -0.0595, 0.7503, 0.3503, -1.559, -1.6823, -0.0906, 0.7874, 0.2785, -0.1712, -1.5325, 0.408, 0.5981, -1.1464, -0.2233, -0.0635, 0.4461, -1.9813, -0.7281, 1.1216, -0.0516, 1.373, 0.2388, 0.6257, -0.0551), + x2 = c(-0.2457, -0.9797, 0.3957, -1.094, -1.1973, 0.3137, 0.2004, -1.9404, 1.6927, -0.4063, 0.0731, -0.3338, -2.2683, -1.1105, 0.2115, -0.0163, 0.2139, 0.5016, 0.2296, 0.4189, 0.3295, 0.0408, 1.4633, -0.7118, 0.4811, 0.4499, -0.4214, 0.1503, -0.2222, 0.4573), + x3 = c(1.3439, 0.3841, -0.4787, -0.6312, -0.5481, -0.8703, -1.2684, -1.4851, 0.6789, 0.1575, 2.7873, -1.1201, 0.1337, -0.6053, -0.6538, 0.4597, -0.8955, 0.1625, 1.3767, 0.6024, -1.2141, -1.3534, -0.6583, -0.095, 1.1923, 0.3062, -0.6818, 0.2407, -0.8534, -1.4521), + y1 = c(-0.2159, 0.8934, 0.0216, -1.0682, 1.2549, -0.1517, 1.4404, 1.3436, -2.1388, -0.2453, -1.4628, -1.7654, 0.6437, -0.9685, -0.9393, 0.0962, -0.2041, 1.1007, -1.8705, 0.2053, -0.9238, -0.6301, 1.9876, 1.2862, 0.3363, -0.334, -1.5149, -1.3254, 0.5716, -0.7165), + y2 = c(-0.5962, 0.3394, -0.2971, -0.6241, -0.5279, 1.1945, -0.152, 0.8207, 0.8731, 0.2281, 0.3466, -1.4862, -0.4694, 0.0435, 0.9888, -0.0797, 0.7109, -0.6636, -0.4402, 1.0093, -0.0655, 0.5099, 1.5415, 1.8833, -1.2365, 0.5085, 0.7073, -0.2191, 0.2442, 0.1501), + y3 = c(0.6222, -0.7174, -1.9616, -0.0117, -0.114, 0.1313, -1.3854, 1.5021, -0.7115, 0.4822, 1.8474, 1.1742, 0.8192, 0.2819, -1.3365, -0.6179, -0.9706, 0.2179, -1.2654, 1.0065, -2.2514, -0.7161, 0.9578, -0.0335, 0.3166, 0.0471, -0.9983, -0.6455, 1.4064, 0.2954))) +xs = c("x", "y") ## apply same formula to different set of columns +out = vector("list", length(xs)) +names(out) = xs +for (x in xs) { + out[[x]] = capture.output(invisible(dt[, RATIO := (R3 - R2) * (R2 - R1) * (R3 - R1) / sqrt(R1^2 + R2^2 + R3^2), + env = list(RATIO = paste0(x, "_ratio"), R1 = paste0(x, 1), R2 = paste0(x, 2), R3 = paste0(x, 3)), + verbose = TRUE])) # assign to nul, other +} +x_rat = c(0.0150761734954921, 1.68603966340262, -0.432117480975587, 0.0673302370985585, +1.3396117186265, -1.31542975195976, 0.358990921654875, 1.07137398842599, -0.240804570258909, 0.689134697166349, 6.53944855876942, -0.167936293758913, 1.99518595021054, 0.478886131900058, 0.225672526235629, 0.898595029001403, -0.278725254056844, -0.0178774591562397, 2.20493313305713, 0.126869315798536, 0.554130827073314, -0.713268530169861, -3.79227895596263, 0.00622410754980975, -0.0188758915276097, -0.0471688415642347, -0.60391972591766, -4.09856489441073e-05, -0.732101471917737, 0.897197218930381) +y_rat = c(-0.437137931952723, -0.789182136098114, -0.530238437504097, 0.232242653273211, 0.739369921650875, -0.334413400872578, -2.76908561851941, -0.0259528361203494, -2.81810697204509, 0.149050554297973, 3.77409495341661, 0.84329199487865, -0.220290266022232, 0.298795199314652, 0.932599183107379, -0.107238527606129, 0.966425089066359, 1.05320054480325, -0.310406226974414, -0.00125245906648534, 1.02314586034282, 0.111130598215941, -0.0996278782862306, 0.66222170820334, 0.0364570881136429, -0.242779893874194, -1.00552326863148, -0.215191768368067, -0.206580227824426, 0.16140646232964) +test(101.01, dt$x_ratio, x_rat) +test(101.02, dt$y_ratio, y_rat) +test(101.03, length(grep("Argument.*substitute", out[["x"]], value=TRUE)), 1L) +test(101.04, length(grep("Argument.*substitute", out[["y"]], value=TRUE)), 1L) +test(101.05, "Argument 'j' after substitute: `:=`(x_ratio, (x3 - x2) * (x2 - x1) * (x3 - x1)/sqrt(x1^2 + x2^2 + x3^2))" %in% out[["x"]], TRUE) +test(101.06, "Argument 'j' after substitute: `:=`(y_ratio, (y3 - y2) * (y2 - y1) * (y3 - y1)/sqrt(y1^2 + y2^2 + y3^2))" %in% out[["y"]], TRUE) +daily_cor = function(data, x, y) { ## daily correlation of user input features + data[, .(cor = cor(x, y)), + keyby = date, + env = list(x = x, y = y), + verbose = TRUE] +} +out = capture.output(ans <- daily_cor(dt, "x0", "y2")) +test(101.07, length(grep("Argument.*substitute", out, value=TRUE)), 2L) ## 'by' (or 'keyby') is not substituted here but it still goes via substitute2 because it is non-missing +test(101.08, "Argument 'by' after substitute: date" %in% out, TRUE) +test(101.09, "Argument 'j' after substitute: .(cor = cor(x0, y2))" %in% out, TRUE) +group_cor = function(data, x, y, g) { ## group cor comparison of user input features + cor_dt = data[, lapply(.SD, function(x) cor(x, Y)), + keyby = .(group = GROUP), + .SDcols = x, + env = list(Y = y, GROUP = g), + verbose = TRUE] + melt.data.table(cor_dt, id.vars = "group", measure.vars = x, variable.name = "x", value.name = "cor", variable.factor = FALSE) ## not relevant but lets keep it for completeness +} +out = capture.output(dt1 <- group_cor(dt, c("x0", "x1", "x2"), "y1", "grp1")) +test(101.10, length(grep("Argument.*substitute", out, value=TRUE)), 2L) +test(101.11, "Argument 'by' after substitute: .(group = grp1)" %in% out, TRUE) +test(101.12, "Argument 'j' after substitute: lapply(.SD, function(x) cor(x, y1))" %in% out, TRUE) +out = capture.output(dt2 <- group_cor(dt, c("x0", "x1", "x2"), "y1", "grp2")) +test(101.13, length(grep("Argument.*substitute", out, value=TRUE)), 2L) +test(101.14, "Argument 'by' after substitute: .(group = grp2)" %in% out, TRUE) +test(101.15, "Argument 'j' after substitute: lapply(.SD, function(x) cor(x, y1))" %in% out, TRUE) +stats_dt1 = as.data.table(list( + x = c("x0", "x1", "x2"), + min = c(-0.325967794724422, -0.126026585686073, -0.398950077203113), + mean = c(-0.277318407860876, -0.0164428001010045, -0.220868266148565), + max = c(-0.22866902099733, 0.0931409854840638, -0.0427864550940165) +), key="x") +test(101.16, dt1[, .(min = min(cor), mean = mean(cor), max = max(cor)), keyby = x], stats_dt1) ## post aggregation with known colnames, not relevant but lets keep it for completeness +stats_dt2 = as.data.table(list( + x = c("x0", "x1", "x2"), + min = c(-0.392714958827804, -0.339274985404091, -0.45937864657761), + mean = c(-0.279968323960171, 0.150866984990403, 0.0838779176840593), + max = c(-0.180337725136444, 0.697473394580653, 0.714679537878464) +), key="x") +test(101.17, dt2[, .(min = min(cor), mean = mean(cor), max = max(cor)), keyby = x], stats_dt2) +set.seed(108) ## to many values to hardcode +yn = c(1, 5, 10, 20) +ycols = paste0("y", yn) +ydt = data.table(symbol = rep(1:3, each = 100)) +ydt[, date := seq_len(.N), by = symbol] +ydt[, ret := rnorm(.N)] +ydt[, (ycols) := shift(ret, yn, type = "lead"), by = symbol] +xdt = data.table(symbol = rep(1:2, each = 20)) +xdt[, date := seq_len(.N), by = symbol] +xdt[, `:=`(x1 = rnorm(.N), x2 = rnorm(.N))] +cor_xy = function(xdt, ydt, x, y) { ## cor between each x and a single y + xdt[ydt, y := Y, on = .(symbol, date), + env = list(Y = y), + verbose = TRUE] + on.exit(xdt[, y := NULL]) + xdt[, lapply(.SD, cor, y = y), keyby = symbol, .SDcols = x] +} +out = capture.output(ans <- cor_xy(xdt, ydt, c("x1", "x2"), "y10")) +exp = as.data.table(list(symbol = 1:2, x1 = c(0.529292252112253, 0.0301956035638738 +), x2 = c(0.287076866252898, -0.335969587268599)), key="symbol") +test(102.01, ans, exp) +test(102.02, length(grep("Argument.*substitute", out, value=TRUE)), 2L) +test(102.03, "Argument 'j' after substitute: `:=`(y, y10)" %in% out, TRUE) +test(102.04, "Argument 'i' after substitute: ydt" %in% out, TRUE) +cor_xy2 = function(xdt, ydt, x, y) { ## cor between each pair of x and y + rbindlist(lapply(y, function(yi) { + xdt[ydt, y := Y, on = .(symbol, date), + env = list(Y = yi)] + on.exit(xdt[, y := NULL]) + rbindlist(lapply(x, function(xi) { + xdt[, .(x = xi, y = yi, cor = cor(X, y)), keyby = symbol, + env = list(X = xi)] + })) + })) +} +cor_dt = cor_xy2(xdt, ydt, c("x1", "x2"), ycols) +exp = as.data.table(list( + symbol = c(1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L), + x = c("x1", "x1", "x2", "x2", "x1", "x1", "x2", "x2", "x1", "x1", "x2", "x2", "x1", "x1", "x2", "x2"), + y = c("y1", "y1", "y1", "y1", "y5", "y5", "y5", "y5", "y10", "y10", "y10", "y10", "y20", "y20", "y20", "y20"), + cor = c(0.0963296961360529, -0.155702586981777, 0.45855688298414, -0.0867798048307359, -0.272158447799069, 0.0969909109333228, -0.172091337596075, -0.231918279862371, 0.529292252112253, 0.0301956035638738, 0.287076866252898, -0.335969587268599, 0.489259093604126, 0.190094143537513, 0.382176633086643, -0.0481151265706696) +)) +test(102.05, cor_dt, exp) +cor_xy3 = function(xdt, ydt, x, y) { ## cor matrix of existing columns and dynamically in-place merged columns + # still can't easily program without constructing a call + xdt[ydt, `:=`( + y1 = y1, + y5 = y5, + y10 = y10, + y20 = y20 + ), on = .(symbol, date)] + on.exit(xdt[, (ycols) := NULL]) + xdt[, cor(.SD), .SDcols = c(x, y)] +} +cor_mx = cor_xy3(xdt, ydt, c("x1", "x2"), ycols) +exp = structure(c( + 1, 0.242249239102964, -0.0286729531730845, -0.0936087330415663, 0.245575245812681, 0.323778522797129, 0.242249239102964, 1, 0.199165327684089, -0.160954354243643, 0.0034174556771777, 0.185518712777259, -0.0286729531730845, 0.199165327684089, 1, -0.164047186655086, -0.0689536633998918, -0.0326400434160486, -0.0936087330415663, -0.160954354243643, -0.164047186655086, 1, -0.0810998892055976, -0.106457956110047, 0.245575245812681, 0.0034174556771777, -0.0689536633998918, -0.0810998892055976, 1, 0.324977066952494, 0.323778522797129, 0.185518712777259, -0.0326400434160486, -0.106457956110047, 0.324977066952494, 1 + ), .Dim = c(6L, 6L), .Dimnames = list( + c("x1", "x2", "y1", "y5", "y10", "y20"), + c("x1", "x2", "y1", "y5", "y10", "y20") +)) +test(102.06, cor_mx, exp) +nadt = data.table(x1 = c(1, 2, NA, Inf), x2 = c(2, NA, 3, Inf), x3 = c(NA, 1, 2, 0)) ## fill abnormal values of multiple columns +dt_fill = function(data, columns, selector, fill) { + selector = match.fun(selector) + for (col in columns) { + data[selector(X), X := fill, env = list(X = col)] + } +} +dt_fill(nadt, c("x1", "x2", "x3"), is.na, 0) +test(103.01, nadt, data.table(x1 = c(1, 2, 0, Inf), x2 = c(2, 0, 3, Inf), x3 = c(0, 1, 2, 0))) +dt_fill(nadt, c("x1", "x2", "x3"), is.infinite, 0) +test(103.02, nadt, data.table(x1 = c(1, 2, 0, 0), x2 = c(2, 0, 3, 0), x3 = c(0, 1, 2, 0))) From e99ae001737c853f31d56fc6895ed066289b4db4 Mon Sep 17 00:00:00 2001 From: jangorecki Date: Sun, 22 Mar 2020 20:37:49 +0530 Subject: [PATCH 19/37] solve known issues about get --- inst/tests/programming.Rraw | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/inst/tests/programming.Rraw b/inst/tests/programming.Rraw index 8238a6f86d..4a95adcd62 100644 --- a/inst/tests/programming.Rraw +++ b/inst/tests/programming.Rraw @@ -186,9 +186,6 @@ test(4.054, d[, .(op, fun(col)), by=by, env=list(op=".N", fun="sum", col="V2", b d = as.data.table(lapply(1:5, rep, 2L)) setnames(d, paste0("c",1:5)) v1 = "c1"; v2 = "c2"; v3 = "c3"; v4 = "c4"; v5 = "c5" -### TODO report/review get-mget issues -## note that d[, get(v1)] is wrong here, it translates to d[, c1] while it should be d[, "c1"] -## d[, mget(v1)] is handling that fine, compare both! might have been used for convenience but it breaks consistency test(4.061, d[, v1, env=list(v1=v1)], c(1L,1L)) ## symbol c1 test(4.062, d[, v1, env=list(v1=I(v1))], data.table(c1=c(1L,1L))) ## character "c1" test(4.063, d[, v1, env=list(v1=I(v1))], d[, mget(v1)]) ## character "c1", works against mget only @@ -198,9 +195,26 @@ test(4.066, d[, .(v1, v2), env=list(v1=v1, v2=v2)], data.table(c1=c(1L,1L),c2=c( test(4.067, d[, .(sum(v1)), env=list(v1=v1)], d[, .(sum(get(v1)))]) test(4.068, d[, lapply(vN, sum), env=list(vN=substitute2(list(v1=v1, v3=v3), list(v1=v1, v3=v3)))], d[, lapply(mget(c(v1,v3)), sum)]) test(4.069, d[, c(list(c1=c1, c2=c2), list(v3=v3), list(v4=v4, v5=v5)), env=list(v3=v3,v4=v4,v5=v5)], d) ## d[, c(list(c1, c2), list(get(v3)), mget(c(v4,v5)))] - some are unnamed +# addresses: weird exception when by contains get #1985 +tb = data.table(x=c(1,2), y=c(3,4), z=c(5,6), w=c("a","b")) +test(4.101, tb[w != "b", .(x=sum(x)), by=.(y, zz=.z), env=list(.z="z")], data.table(y=3, zz=5, x=1)) +dtIris = as.data.table(iris) +speciesVar = "Species" +test(4.102, dtIris[Sepal.Length > 4, .N, by = .(var = .speciesVar, Petal.Width), env = list(.speciesVar = speciesVar)], dtIris[Sepal.Length > 4, .N, by = .(var = Species, Petal.Width)]) # use DT[, var, env=list(var=quote(.(sum_x=sum(x)))] rather than dt[, eval(var)]? #TODO +#2432 Add Programmable NSE +#1180 eval doesn't work on join columns +#dt1 = data.table(a = 1, key = 'a') +#dt2 = data.table(a = 1, b = list(quote(5*a)), key = 'a') +#this works: +#dt1[dt2][, eval(b[[1]], .SD)] +##[1] 5 +# this doesn't (but should produce exact same result as above) +#dt1[dt2, eval(b[[1]], .SD)] +#Error in eval(b[[1]], .SD) : object 'b' not found +#dt1[dt2, eval(b[[1]], mget(names(dt2)))] # contributed use cases # renkun-ken From 9e6102562db9b3c8b4ca3b26a2c8694a34e2ea61 Mon Sep 17 00:00:00 2001 From: jangorecki Date: Sun, 22 Mar 2020 20:38:22 +0530 Subject: [PATCH 20/37] improve test for renkun-ken use case --- inst/tests/programming.Rraw | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/inst/tests/programming.Rraw b/inst/tests/programming.Rraw index 4a95adcd62..f05bc0835e 100644 --- a/inst/tests/programming.Rraw +++ b/inst/tests/programming.Rraw @@ -331,13 +331,9 @@ exp = as.data.table(list( )) test(102.05, cor_dt, exp) cor_xy3 = function(xdt, ydt, x, y) { ## cor matrix of existing columns and dynamically in-place merged columns - # still can't easily program without constructing a call - xdt[ydt, `:=`( - y1 = y1, - y5 = y5, - y10 = y10, - y20 = y20 - ), on = .(symbol, date)] + qc = as.call(lapply(setNames(c(":=", ycols), c("",ycols)), as.name)) + xdt[ydt, j, on = .(symbol, date), + env = list(j=qc)] on.exit(xdt[, (ycols) := NULL]) xdt[, cor(.SD), .SDcols = c(x, y)] } From b498263fb9f22fd878be3f30855af550e36b27ca Mon Sep 17 00:00:00 2001 From: jangorecki Date: Sun, 22 Mar 2020 21:55:42 +0530 Subject: [PATCH 21/37] minor fix for proper scoping --- inst/tests/programming.Rraw | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/inst/tests/programming.Rraw b/inst/tests/programming.Rraw index f05bc0835e..ae82624159 100644 --- a/inst/tests/programming.Rraw +++ b/inst/tests/programming.Rraw @@ -331,10 +331,10 @@ exp = as.data.table(list( )) test(102.05, cor_dt, exp) cor_xy3 = function(xdt, ydt, x, y) { ## cor matrix of existing columns and dynamically in-place merged columns - qc = as.call(lapply(setNames(c(":=", ycols), c("",ycols)), as.name)) + qc = as.call(lapply(setNames(c(":=", y), c("", y)), as.name)) xdt[ydt, j, on = .(symbol, date), env = list(j=qc)] - on.exit(xdt[, (ycols) := NULL]) + on.exit(xdt[, (y) := NULL]) xdt[, cor(.SD), .SDcols = c(x, y)] } cor_mx = cor_xy3(xdt, ydt, c("x1", "x2"), ycols) From b0094f7598e4ea3ed171e11ce57be583255e122c Mon Sep 17 00:00:00 2001 From: jangorecki Date: Sun, 22 Mar 2020 22:46:04 +0530 Subject: [PATCH 22/37] more tests addressing existing issues --- inst/tests/programming.Rraw | 43 ++++++++++++++++++++++++++----------- 1 file changed, 30 insertions(+), 13 deletions(-) diff --git a/inst/tests/programming.Rraw b/inst/tests/programming.Rraw index ae82624159..8edc6dc0a2 100644 --- a/inst/tests/programming.Rraw +++ b/inst/tests/programming.Rraw @@ -201,20 +201,37 @@ test(4.101, tb[w != "b", .(x=sum(x)), by=.(y, zz=.z), env=list(.z="z")], data.ta dtIris = as.data.table(iris) speciesVar = "Species" test(4.102, dtIris[Sepal.Length > 4, .N, by = .(var = .speciesVar, Petal.Width), env = list(.speciesVar = speciesVar)], dtIris[Sepal.Length > 4, .N, by = .(var = Species, Petal.Width)]) - -# use DT[, var, env=list(var=quote(.(sum_x=sum(x)))] rather than dt[, eval(var)]? -#TODO +#2589 Need an easier way to use dynamically determined symbols +dt = data.table(x1 = 1:10, x2 = 10:1, x3 = 1:10) +s1 = "x2"; s2 = "x3" +test(4.103, dt[, s1 * s2, env=list(s1=s1,s2=s2)], c(10L, 18L, 24L, 28L, 30L, 30L, 28L, 24L, 18L, 10L)) +#2884 Alternative way to dynamic symbol usage in `j` +dt = data.table(id = rep(1:2, 5), x1 = rnorm(10), x2 = rnorm(10), y1 = rnorm(10), y2 = rnorm(10)) +test(4.104, dt[, .(xsum = sum(x), ysum = sum(y)), by = id, env = list(x = "x1", y = "y2")], dt[, .(xsum=sum(x1), ysum=sum(y2)), by=id]) +#2816 Possible regression for programmatic use in `j` +dt = data.table(x=1:3) +var = "x" +dt[, var := var+1L, env=list(var="x")] +test(4.105, dt, data.table(x=2:4)) +# injecting quoted expressions +#750 `by=list(eval(as.name("colA")))` renames column +DT = data.table(colA=1:4, colB=5:8, colC=9:12) +test(4.106, DT[, sum(colA), by=list(grp_name=grp), env=list(grp_name="colA", grp="colA")], data.table(colA=1:4, V1=1:4)) #2432 Add Programmable NSE -#1180 eval doesn't work on join columns -#dt1 = data.table(a = 1, key = 'a') -#dt2 = data.table(a = 1, b = list(quote(5*a)), key = 'a') -#this works: -#dt1[dt2][, eval(b[[1]], .SD)] -##[1] 5 -# this doesn't (but should produce exact same result as above) -#dt1[dt2, eval(b[[1]], .SD)] -#Error in eval(b[[1]], .SD) : object 'b' not found -#dt1[dt2, eval(b[[1]], mget(names(dt2)))] +co2 = as.data.table(CO2) +Jexp1 = quote(max(conc)) +Jexp2 = quote(mean(conc)) +Jexp = substitute(list(Jexp1, round(Jexp2)), list(Jexp1=Jexp1, Jexp2=Jexp2)) +out = capture.output(ans <- co2[, j, by=Type, env=list(j=Jexp), verbose=TRUE]) +test(4.107, ans, data.table(Type=factor(c("Quebec","Mississippi"), levels=c("Quebec","Mississippi")), V1=c(1000,1000), V2=c(435,435))) +out = grep("Argument.*substitute", out, value=TRUE) +test(4.108, length(out), 2L) +test(4.109, "Argument 'by' after substitute: Type" %in% out, TRUE) +test(4.110, "Argument 'j' after substitute: list(max(conc), round(mean(conc)))" %in% out, TRUE) +#628 Change j=list(xout=eval(...))'s eval to eval within scope of DT +dat = data.table(x_one=1:10, x_two=1:10, y_one=1:10, y_two=1:10) +f = function(vars) as.call(c(as.name("list"), lapply(setNames(vars, paste(vars,"out",sep="_")), function(var) substitute2(one-two, list(one=paste(var,"one",sep="_"), two=paste(var,"two",sep="_")))))) +test(4.111, dat[, j, env=list(j = f(c("x","y")))], dat[, list(x_out = x_one - x_two, y_out = y_one - y_two)]) # contributed use cases # renkun-ken From 9878dbfd0fa416c095ab2137003984d0e52499b4 Mon Sep 17 00:00:00 2001 From: jangorecki Date: Mon, 23 Mar 2020 05:49:06 +0530 Subject: [PATCH 23/37] add use case by #@tdeenes --- inst/tests/programming.Rraw | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/inst/tests/programming.Rraw b/inst/tests/programming.Rraw index 8edc6dc0a2..bbe0f3b5c1 100644 --- a/inst/tests/programming.Rraw +++ b/inst/tests/programming.Rraw @@ -1,3 +1,4 @@ +#grep -Enr "#[0-9]" programming.Rraw require(methods) if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) { if ((tt<-compiler::enableJIT(-1))>0) @@ -176,6 +177,15 @@ test(4.045, length(out), 3L) test(4.046, "Argument 'by' after substitute: .EACHI" %in% out, TRUE) test(4.047, "Argument 'j' after substitute: .(sum(x.v1), sum(i.v1))" %in% out, TRUE) test(4.048, "Argument 'i' after substitute: d2" %in% out, TRUE) +dt1 = data.table(x = letters[1:5], y = 1:5) +dt2 = data.table(x = letters[1:3], y = 11:13) +target_v = "y" +source_v = paste0("i.", target_v) +on_v = "x" +out = capture.output(invisible(dt1[dt2, target_v := source_v, on = on_v, env = list(target_v = target_v, source_v = source_v), verbose=TRUE])) +out = grep("Argument.*substitute", out, value=TRUE) +test(4.049, length(out), 2L) +test(4.050, dt1, data.table(x = c("a", "b", "c", "d", "e"), y = c(11L, 12L, 13L, 4L, 5L))) # substitute special symbols d = data.table(V1=1:2, V2=1:4) test(4.051, d[, j, by, env=list(j=".N", by="V1")], data.table(V1=c(1L,2L), N=c(2L,2L))) From 57514ed888d6a456973f2f78cc3f516de9cd4195 Mon Sep 17 00:00:00 2001 From: jangorecki Date: Mon, 23 Mar 2020 22:05:38 +0530 Subject: [PATCH 24/37] missing env --- R/programming.R | 7 ++++++- inst/tests/programming.Rraw | 7 +++++-- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/R/programming.R b/R/programming.R index 40f3fa87e5..500e76120a 100644 --- a/R/programming.R +++ b/R/programming.R @@ -9,12 +9,17 @@ rm.AsIs = function(x) { substitute2 = function(expr, env) { if (missing(env)) { - stop("TODO, as of now 'env' should not be missing") + stop("'env' must not be missing") + } else if (is.null(env)) { + # null is fine, will be escaped few lines below } else if (is.environment(env)) { env = as.list(env, all.names=TRUE) ## todo: try to use environment rather than list } else if (!is.list(env)) { stop("'env' must be a list or an environment") } + if (!length(env)) { + return(substitute(expr)) + } env.names = names(env) if (is.null(env.names)) { stop("'env' argument does not have names") diff --git a/inst/tests/programming.Rraw b/inst/tests/programming.Rraw index bbe0f3b5c1..88e8d379e9 100644 --- a/inst/tests/programming.Rraw +++ b/inst/tests/programming.Rraw @@ -98,8 +98,11 @@ test(2.54, substitute2(c(list(v1=v2, v1=v3)), env=list(v1="nm",v2=2L,v3=3L)), qu # substitute2 env as environment class env = as.environment(list(v=1L, .v=2L)) test(2.81, substitute2(.(v, .v), env), quote(.(1L, 2L))) -# substitute2 invalid 'env' error coverage -test(2.91, substitute2(.()), error="TODO, as of now 'env' should not be missing") ## TODO +# substitute2 env various corner cases +test(2.901, substitute2(.(v), NULL), quote(.(v))) +test(2.902, substitute2(.(v), list()), quote(.(v))) +test(2.903, substitute2(.(v), emptyenv()), quote(.(v))) +test(2.91, substitute2(.()), error="'env' must not be missing") test(2.92, substitute2(v, c(v=1L)), error="'env' must be a list or an environment") test(2.93, substitute2(.(v), list(1L, 2L)), error="'env' argument does not have names") test(2.94, substitute2(.(v), structure(list(1L,2L), names=c("","v"))), error="'env' argument has zero char names") From 4c1001d6be8818788cc170addca688690d90cc2c Mon Sep 17 00:00:00 2001 From: jangorecki Date: Wed, 25 Mar 2020 17:53:05 +0000 Subject: [PATCH 25/37] env test and man --- R/programming.R | 4 ++-- inst/tests/programming.Rraw | 23 +++++++++++++++++++---- man/substitute2.Rd | 12 ++++++++++-- 3 files changed, 31 insertions(+), 8 deletions(-) diff --git a/R/programming.R b/R/programming.R index 500e76120a..50fa92b0b3 100644 --- a/R/programming.R +++ b/R/programming.R @@ -13,7 +13,7 @@ substitute2 = function(expr, env) { } else if (is.null(env)) { # null is fine, will be escaped few lines below } else if (is.environment(env)) { - env = as.list(env, all.names=TRUE) ## todo: try to use environment rather than list + env = as.list(env, all.names=TRUE) ## todo: try to use environment rather than list, then we don't have to evaluate env at start, see test 2.80 } else if (!is.list(env)) { stop("'env' must be a list or an environment") } @@ -36,7 +36,7 @@ substitute2 = function(expr, env) { to.name = !asis & char if (any(to.name)) { ## turns "my_name" character scalar into `my_name` symbol, for convenience if (any(non.scalar.char <- vapply(env[to.name], length, 0L)!=1L)) { - stop("Character objects provided in 'env' are not scalar objects, if you need them as character vector rather than a name, then use wrap it into 'I' call: ", + stop("Character objects provided in 'env' are not scalar objects, if you need them as character vector rather than a name, then use wrap each into 'I' call: ", paste(names(non.scalar.char)[non.scalar.char], collapse=", ")) } env[to.name] = lapply(env[to.name], as.name) diff --git a/inst/tests/programming.Rraw b/inst/tests/programming.Rraw index 88e8d379e9..734c815524 100644 --- a/inst/tests/programming.Rraw +++ b/inst/tests/programming.Rraw @@ -96,8 +96,20 @@ test(2.53, substitute2(c(list(v1=v2, v1=v2)), env=list(v1="nm",v2=2L,v3=3L)), qu test(2.54, substitute2(c(list(v1=v2, v1=v3)), env=list(v1="nm",v2=2L,v3=3L)), quote(c(list(nm = 2L, nm = 3L)))) # substitute2 env as environment class -env = as.environment(list(v=1L, .v=2L)) -test(2.81, substitute2(.(v, .v), env), quote(.(1L, 2L))) +e = as.environment(list(v=1L, .v=2L)) +test(2.81, substitute2(.(v, .v), e), quote(.(1L, 2L))) +# unline in base R substitute, the env arg is always evaluated +e = new.env() +delayedAssign("a_promise", stop("I am the error"), assign.env=e) +e$x = 5L +promises = function(env) { + f = function(x, env) eval(substitute(substitute(.x, env), list(.x=x))) + sym = lapply(setNames(nm=ls(env)), as.name) + lapply(sym, f, env) +} +test(2.820, promises(e), list(a_promise=quote(stop("I am the error")), x=5L)) +test(2.821, substitute(x + 1L, e), quote(5L + 1L)) +test(2.822, substitute2(x + 1L, e), error="I am the error", ignore.warning="restarting interrupted promise evaluation") # substitute2 env various corner cases test(2.901, substitute2(.(v), NULL), quote(.(v))) test(2.902, substitute2(.(v), list()), quote(.(v))) @@ -199,9 +211,9 @@ test(4.054, d[, .(op, fun(col)), by=by, env=list(op=".N", fun="sum", col="V2", b d = as.data.table(lapply(1:5, rep, 2L)) setnames(d, paste0("c",1:5)) v1 = "c1"; v2 = "c2"; v3 = "c3"; v4 = "c4"; v5 = "c5" -test(4.061, d[, v1, env=list(v1=v1)], c(1L,1L)) ## symbol c1 +test(4.061, d[, v1, env=list(v1=v1)], d[, get(v1)]) ## symbol c1 test(4.062, d[, v1, env=list(v1=I(v1))], data.table(c1=c(1L,1L))) ## character "c1" -test(4.063, d[, v1, env=list(v1=I(v1))], d[, mget(v1)]) ## character "c1", works against mget only +test(4.063, d[, list(v1), env=list(v1=v1)], d[, mget(v1)]) ## symbol c1 in list test(4.064, d[, v1v2, env=list(v1v2=I(c(v1,v2)))], d[, mget(c(v1, v2))]) ## character c("c1","c2") test(4.065, d[, .(v1), env=list(v1=v1)], data.table(c1=c(1L,1L))) ## d[, .(get(v1))] - (m)get would return unnamed columns test(4.066, d[, .(v1, v2), env=list(v1=v1, v2=v2)], data.table(c1=c(1L,1L),c2=c(2L,2L))) ## d[, .(get(v1), get(v2))] @@ -246,7 +258,10 @@ dat = data.table(x_one=1:10, x_two=1:10, y_one=1:10, y_two=1:10) f = function(vars) as.call(c(as.name("list"), lapply(setNames(vars, paste(vars,"out",sep="_")), function(var) substitute2(one-two, list(one=paste(var,"one",sep="_"), two=paste(var,"two",sep="_")))))) test(4.111, dat[, j, env=list(j = f(c("x","y")))], dat[, list(x_out = x_one - x_two, y_out = y_one - y_two)]) +####################### # contributed use cases +####################### + # renkun-ken dt = as.data.table(list( ## RNGversion("3.5.0"); set.seed(108); round(numeric(), 4) symbol = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), diff --git a/man/substitute2.Rd b/man/substitute2.Rd index 4f56861c02..4e35a68f32 100644 --- a/man/substitute2.Rd +++ b/man/substitute2.Rd @@ -11,10 +11,18 @@ substitute2(expr, env) } \arguments{ \item{expr}{ Unevaluated expression in which substitution has to take place. } -\item{env}{ Environment or a list from which variables will be taken to inject into \code{expr}. } +\item{env}{ List or an environment from which variables will be taken to inject into \code{expr}. } } \details{ - Function will turn any character elements of \code{env} argument into symbols. In case character is of length 2 or more, it will raise an error. Behaviour can be changed by wrapping \code{env} into \code{\link[base]{I}} call. In such case any symbols must be explicitly created, for example using \code{as.name} function. Alternatively it is possible to wrap elements of \code{env} into \code{\link[base]{I}} call, not the whole list, then only those elements will retain their original class. + Function will turn any character elements of \code{env} argument into symbols. In case character is of length 2 or more, it will raise an error. Behaviour can be changed by wrapping \code{env} into \code{\link[base]{I}} call. In such case any symbols must be explicitly created, for example using \code{as.name} function. Alternatively it is possible to wrap elements of \code{env} into \code{\link[base]{I}} call individually, not the whole list, then only those elements will retain their original class. + +Comparing to base R \code{\link[base]{substitute}}, \code{substitute2} function: +\enumerate{ +\item substitutes calls argument names as well. +\item automatically converts character elements of \code{env} argument to symbols. +\item does not accept missing \code{env} argument. +\item evaluates \code{env} argument. +} } \value{ Quoted expression having variables substituted. From 24f4a07575131b804044bfcd7c4642677738f441 Mon Sep 17 00:00:00 2001 From: jangorecki Date: Thu, 26 Mar 2020 13:28:44 +0000 Subject: [PATCH 26/37] fix test num reference --- R/programming.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/programming.R b/R/programming.R index 50fa92b0b3..77da6c9457 100644 --- a/R/programming.R +++ b/R/programming.R @@ -13,7 +13,7 @@ substitute2 = function(expr, env) { } else if (is.null(env)) { # null is fine, will be escaped few lines below } else if (is.environment(env)) { - env = as.list(env, all.names=TRUE) ## todo: try to use environment rather than list, then we don't have to evaluate env at start, see test 2.80 + env = as.list(env, all.names=TRUE) ## todo: try to use environment rather than list, then we don't have to evaluate env at start, see test 2.822 } else if (!is.list(env)) { stop("'env' must be a list or an environment") } From 783f3a9b8f33b95e34951d8250379e1ec3286f97 Mon Sep 17 00:00:00 2001 From: jangorecki Date: Fri, 27 Mar 2020 10:07:51 +0000 Subject: [PATCH 27/37] news entry and minor doc changes --- NEWS.md | 31 +++++++++++++++++++++++++++++++ man/data.table.Rd | 2 +- man/substitute2.Rd | 2 +- 3 files changed, 33 insertions(+), 2 deletions(-) diff --git a/NEWS.md b/NEWS.md index 71fd76aa65..a71d92c1b8 100644 --- a/NEWS.md +++ b/NEWS.md @@ -81,6 +81,37 @@ unit = "s") 14. Added support for `round()` and `trunc()` to extend functionality of `ITime`. `round()` and `trunc()` can be used with argument units: "hours" or "minutes". Thanks to @JensPederM for the suggestion and PR. +15. New interface for _programming on data.table_ has been added. It is built using base R `substitute`-like interface via new `env` argument to `[.data.table`. For details of substitution see new vignette *programming on data.table* and `?substitute2` manual. + +```r +DT = data.table(x = 1:5, y = 5:1) + +# parameters +in_col_name = "x" +fun = "sum" +fun_arg1 = "na.rm" +fun_arg1val = TRUE +out_col_name = "sum_x" + +# parametrized query +#DT[, .(out_col_name = fun(in_col_name, fun_arg1=fun_arg1val))] + +# desired query +DT[, .(sum_x = sum(x, na.rm=TRUE))] + +# new interface +DT[, .(out_col_name = fun(in_col_name, fun_arg1=fun_arg1val)), + env = list( + in_col_name = "x", + fun = "sum", + fun_arg1 = "na.rm", + fun_arg1val = TRUE, + out_col_name = "sum_x" + )] +``` + +Addresses [#2655](https://github.com/Rdatatable/data.table/issues/2655) any many other linked issues. Thanks to numerous users for filling requests for a better flexibility in parametrizing data.table queries. + ## BUG FIXES 1. A NULL timezone on POSIXct was interpreted by `as.IDate` and `as.ITime` as UTC rather than the session's default timezone (`tz=""`) , [#4085](https://github.com/Rdatatable/data.table/issues/4085). diff --git a/man/data.table.Rd b/man/data.table.Rd index 5533fe7f72..bfaa1dc042 100644 --- a/man/data.table.Rd +++ b/man/data.table.Rd @@ -171,7 +171,7 @@ data.table(\dots, keep.rownames=FALSE, check.names=FALSE, key=NULL, stringsAsFac See examples as well as \href{../doc/datatable-secondary-indices-and-auto-indexing.html}{\code{vignette("datatable-secondary-indices-and-auto-indexing")}}. } - \item{env}{ Environment or a list, passed to \code{\link{substitute2}} for substitution of parameters in \code{i}, \code{j} and \code{by} (or \code{keyby}). Use \code{verbose} to preview constructed expressions. } + \item{env}{ List or an environment, passed to \code{\link{substitute2}} for substitution of parameters in \code{i}, \code{j} and \code{by} (or \code{keyby}). Use \code{verbose} to preview constructed expressions. } } \details{ \code{data.table} builds on base \R functionality to reduce 2 types of time:\cr diff --git a/man/substitute2.Rd b/man/substitute2.Rd index 4e35a68f32..d54662856d 100644 --- a/man/substitute2.Rd +++ b/man/substitute2.Rd @@ -21,7 +21,7 @@ Comparing to base R \code{\link[base]{substitute}}, \code{substitute2} function: \item substitutes calls argument names as well. \item automatically converts character elements of \code{env} argument to symbols. \item does not accept missing \code{env} argument. -\item evaluates \code{env} argument. +\item evaluates elements of \code{env} argument. } } \value{ From 9a31604f0d5d6539e7f1abcce1b1a2f3e7284b5c Mon Sep 17 00:00:00 2001 From: jangorecki Date: Sun, 29 Mar 2020 19:21:02 +0100 Subject: [PATCH 28/37] test can now test itself --- R/test.data.table.R | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/R/test.data.table.R b/R/test.data.table.R index 14d5ae83bf..58279ef31b 100644 --- a/R/test.data.table.R +++ b/R/test.data.table.R @@ -160,7 +160,7 @@ test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=F timings = env$timings DT = head(timings[-1L][order(-time)], 10L) # exclude id 1 as in dev that includes JIT if ((x<-sum(timings[["nTest"]])) != ntest) { - warning("Timings count mismatch:",x,"vs",ntest) # nocov + warning("Timings count mismatch: ",x," vs ",ntest) # nocov } cat("10 longest running tests took ", as.integer(tt<-DT[, sum(time)]), "s (", as.integer(100*tt/(ss<-timings[,sum(time)])), "% of ", as.integer(ss), "s)\n", sep="") print(DT, class=FALSE) @@ -243,6 +243,7 @@ test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,no # iv) if warning is supplied, y is checked to equal x, and x should result in a warning message matching the pattern # v) if output is supplied, x is evaluated and printed and the output is checked to match the pattern # num just needs to be numeric and unique. We normally increment integers at the end, but inserts can be made using decimals e.g. 10,11,11.1,11.2,12,13,... + # num=0 to escape global failure tracking so we can test behaviour of test function itself: test(1.1, test(0, TRUE, FALSE), FALSE, output="1 element mismatch") # Motivations: # 1) we'd like to know all tests that fail not just stop at the first. This often helps by revealing a common feature across a set of # failing tests @@ -256,7 +257,7 @@ test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,no prevtest = get("prevtest", parent.frame()) nfail = get("nfail", parent.frame()) # to cater for both test.data.table() and stepping through tests in dev whichfail = get("whichfail", parent.frame()) - assign("ntest", get("ntest", parent.frame()) + 1L, parent.frame(), inherits=TRUE) # bump number of tests run + assign("ntest", get("ntest", parent.frame()) + if (num>0) 1L else 0L, parent.frame(), inherits=TRUE) # bump number of tests run lasttime = get("lasttime", parent.frame()) timings = get("timings", parent.frame()) memtest = get("memtest", parent.frame()) @@ -265,7 +266,7 @@ test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,no foreign = get("foreign", parent.frame()) showProgress = get("showProgress", parent.frame()) time = nTest = NULL # to avoid 'no visible binding' note - on.exit( { + if (num>0) on.exit( { now = proc.time()[3L] took = now-lasttime # so that prep time between tests is attributed to the following test assign("lasttime", now, parent.frame(), inherits=TRUE) @@ -326,7 +327,7 @@ test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,no fwrite(mem, "memtest.csv", append=TRUE, verbose=FALSE) # nocov } fail = FALSE - if (.test.data.table) { + if (.test.data.table && num>0) { if (num0) { # nocov start assign("nfail", nfail+1L, parent.frame(), inherits=TRUE) assign("whichfail", c(whichfail, numStr), parent.frame(), inherits=TRUE) From 5d3475380c69bc5da533f4374d843e425cd6dd3a Mon Sep 17 00:00:00 2001 From: jangorecki Date: Tue, 31 Mar 2020 20:32:49 +0100 Subject: [PATCH 29/37] enlist, recursive enlisting --- R/programming.R | 55 ++++-- inst/tests/programming.Rraw | 348 +++++++++++++++++++++++++++--------- man/substitute2.Rd | 58 +++--- 3 files changed, 337 insertions(+), 124 deletions(-) diff --git a/R/programming.R b/R/programming.R index 77da6c9457..772c680554 100644 --- a/R/programming.R +++ b/R/programming.R @@ -6,6 +6,40 @@ rm.AsIs = function(x) { oldClass(x) = cl[cl!="AsIs"] x } +list2lang = function(x) { + if (!is.list(x)) + stop("'x' must be a list") + if (is.AsIs(x)) + return(rm.AsIs(x)) + asis = vapply(x, is.AsIs, FALSE) + char = vapply(x, is.character, FALSE) + to.name = !asis & char + if (any(to.name)) { ## turns "my_name" character scalar into `my_name` symbol, for convenience + if (any(non.scalar.char <- vapply(x[to.name], length, 0L)!=1L)) { + stop("Character objects provided in the input are not scalar objects, if you need them as character vector rather than a name, then wrap each into 'I' call: ", + paste(names(non.scalar.char)[non.scalar.char], collapse=", ")) + } + x[to.name] = lapply(x[to.name], as.name) + } + if (isTRUE(getOption("datatable.enlist", TRUE))) { ## recursively enlist for nested lists, see note section in substitute2 manual + islt = vapply(x, is.list, FALSE) + to.enlist = !asis & islt + if (any(to.enlist)) { + x[to.enlist] = lapply(x[to.enlist], enlist) + } + } + if (any(asis)) { + x[asis] = lapply(x[asis], rm.AsIs) + } + x +} +enlist = function(x) { + if (!is.list(x)) + stop("'x' must be a list") + if (is.AsIs(x)) + return(rm.AsIs(x)) + as.call(c(quote(list), list2lang(x))) +} substitute2 = function(expr, env) { if (missing(env)) { @@ -13,7 +47,7 @@ substitute2 = function(expr, env) { } else if (is.null(env)) { # null is fine, will be escaped few lines below } else if (is.environment(env)) { - env = as.list(env, all.names=TRUE) ## todo: try to use environment rather than list, then we don't have to evaluate env at start, see test 2.822 + env = as.list(env, all.names=TRUE, sorted=TRUE) } else if (!is.list(env)) { stop("'env' must be a list or an environment") } @@ -30,26 +64,13 @@ substitute2 = function(expr, env) { } else if (anyDuplicated(env.names)) { stop("'env' argument has duplicated names") } - if (!is.AsIs(env)) { - asis = vapply(env, is.AsIs, FALSE) - char = vapply(env, is.character, FALSE) - to.name = !asis & char - if (any(to.name)) { ## turns "my_name" character scalar into `my_name` symbol, for convenience - if (any(non.scalar.char <- vapply(env[to.name], length, 0L)!=1L)) { - stop("Character objects provided in 'env' are not scalar objects, if you need them as character vector rather than a name, then use wrap each into 'I' call: ", - paste(names(non.scalar.char)[non.scalar.char], collapse=", ")) - } - env[to.name] = lapply(env[to.name], as.name) - } - if (any(asis)) { - env[asis] = lapply(env[asis], rm.AsIs) - } - } + # character to name/symbol, and list to list call + env = list2lang(env) # R substitute expr.sub = eval(substitute( substitute(.expr, env), env = list(.expr = substitute(expr)) )) - # call arg names substitute + # substitute call argument names .Call(Csubstitute_call_arg_namesR, expr.sub, env) } diff --git a/inst/tests/programming.Rraw b/inst/tests/programming.Rraw index 734c815524..be8cddedc0 100644 --- a/inst/tests/programming.Rraw +++ b/inst/tests/programming.Rraw @@ -1,4 +1,3 @@ -#grep -Enr "#[0-9]" programming.Rraw require(methods) if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) { if ((tt<-compiler::enableJIT(-1))>0) @@ -8,23 +7,31 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) { test = data.table:::test is.AsIs = data.table:::is.AsIs rm.AsIs = data.table:::rm.AsIs + enlist = data.table:::enlist + list2lang = data.table:::list2lang } +# test that 'test' catches the difference in language object +cl1 = substitute(f(1L, list(2L))) +cl2 = substitute(f(1L, .v), list(.v=list(2L))) +test(1.01, all.equal(cl1, cl2), TRUE) +test(1.02, identical(cl1, cl2), FALSE) +test(1.03, test(0, cl1, cl2), FALSE, output="f(1L, list(2L))") # AsIs -test(1.01, is.AsIs(1L), FALSE) -test(1.02, is.AsIs(I(1L)), TRUE) -test(1.03, is.AsIs("a"), FALSE) -test(1.04, is.AsIs(I("a")), TRUE) -test(1.05, is.AsIs(list(1L)), FALSE) -test(1.06, is.AsIs(I(list(1L))), TRUE) -test(1.07, is.AsIs(structure(list(NULL), class="an_S3")), FALSE) ## S3 -test(1.08, is.AsIs(I(structure(list(NULL), class="an_S3"))), TRUE) -test(1.09, is.AsIs(getClass("MethodDefinition")), FALSE) ## S4 -test(1.10, is.AsIs(I(getClass("MethodDefinition"))), TRUE) -test(1.11, is.AsIs(rm.AsIs(1L)), FALSE) -test(1.12, is.AsIs(rm.AsIs(I(1L))), FALSE) -test(1.13, is.AsIs(rm.AsIs(list(1L))), FALSE) -test(1.14, is.AsIs(rm.AsIs(I(list(1L)))), FALSE) +test(1.11, is.AsIs(1L), FALSE) +test(1.12, is.AsIs(I(1L)), TRUE) +test(1.13, is.AsIs("a"), FALSE) +test(1.14, is.AsIs(I("a")), TRUE) +test(1.15, is.AsIs(list(1L)), FALSE) +test(1.16, is.AsIs(I(list(1L))), TRUE) +test(1.17, is.AsIs(structure(list(NULL), class="an_S3")), FALSE) ## S3 +test(1.18, is.AsIs(I(structure(list(NULL), class="an_S3"))), TRUE) +test(1.19, is.AsIs(getClass("MethodDefinition")), FALSE) ## S4 +test(1.20, is.AsIs(I(getClass("MethodDefinition"))), TRUE) +test(1.21, is.AsIs(rm.AsIs(1L)), FALSE) +test(1.22, is.AsIs(rm.AsIs(I(1L))), FALSE) +test(1.23, is.AsIs(rm.AsIs(list(1L))), FALSE) +test(1.24, is.AsIs(rm.AsIs(I(list(1L)))), FALSE) # substitute2 simple test(2.01, substitute2(list(var = val), env = list(var="my_var", val=5L)), quote(list(my_var = 5L))) @@ -57,8 +64,14 @@ test(2.12, substitute2( out_col_name = "sum_x" ) ), quote(.(sum_x = sum(x, na.rm=TRUE)))) -# substitute2 nested calls -test(2.13, substitute2(f1(a1 = f2(a2 = f3(a3 = f4(a4 = v1, extra=v2), v3, a3b = v4)), a1b=c("a","b")), list(f1="fun1", f2="fun2", f3="fun3", f4="fun4", a1="arg1", a2="arg2", a3="arg3", a4="arg4", v1="col1", extra="n", v2=6L, v3="col2", a3b="arg3b", v4=c(3.5,4.5), a1b="arg1b")), substitute(fun1(arg1 = fun2(arg2 = fun3(arg3 = fun4(arg4 = col1, n=6L), col2, arg3b = v4)), arg1b=c("a","b")), list(v4=c(3.5,4.5)))) +# substitute2 nested calls argument names substitute +test(2.13, substitute2( + f1(a1 = f2(a2 = f3(a3 = f4(a4 = v1, extra=v2), v3, a3b = v4)), a1b=c("a","b")), + list(f1="fun1", f2="fun2", f3="fun3", f4="fun4", a1="arg1", a2="arg2", a3="arg3", a4="arg4", v1="col1", extra="n", v2=6L, v3="col2", a3b="arg3b", v4=c(3.5,4.5), a1b="arg1b") +), substitute( + fun1(arg1 = fun2(arg2 = fun3(arg3 = fun4(arg4 = col1, n=6L), col2, arg3b = v4)), arg1b=c("a","b")), + list(v4=c(3.5,4.5)) +)) # calls of length 0 args const1 = function() 1L test(2.21, substitute2(list(nm = fun()), env=list(a="b", fun="const1", nm="int1")), quote(list(int1=const1()))) @@ -68,12 +81,12 @@ test(2.31, class(substitute2(var3%in%values, list(var3="a", values=I(c("a","b"," test(2.32, class(substitute2(var3%in%values, I(list(var3=as.name("a"), values=c("a","b","c"))))[[3L]]), "character") test(2.33, class(substitute2(var3%in%values, list(var3="a", values=I(1:3)))[[3L]]), "integer") test(2.34, class(substitute2(var3%in%values, I(list(var3=as.name("a"), values=c(1:3))))[[3L]]), "integer") -qc = substitute2(var3%in%values, I(list(var3=as.name("a"), values=I(c("a","b","c"))))) ## keeping AsIs by extra I on whole env arg -test(2.35, qc, substitute(a %in% .v, list(.v=I(c("a","b","c"))))) -test(2.36, class(qc[[3L]]), "AsIs") -qc = substitute2(var3%in%values, I(list(var3="a", values=I(1:3)))) -test(2.37, qc, substitute("a" %in% .v, list(.v=I(1:3)))) -test(2.38, class(qc[[3L]]), "AsIs") +cl = substitute2(var3%in%values, I(list(var3=as.name("a"), values=I(c("a","b","c"))))) ## keeping AsIs by extra I on whole env arg +test(2.35, cl, substitute(a %in% .v, list(.v=I(c("a","b","c"))))) +test(2.36, class(cl[[3L]]), "AsIs") +cl = substitute2(var3%in%values, I(list(var3="a", values=I(1:3)))) +test(2.37, cl, substitute("a" %in% .v, list(.v=I(1:3)))) +test(2.38, class(cl[[3L]]), "AsIs") # substitute2 non-scalar char as name test(2.41, substitute2(list(var = val), env = list(var="my_var", val=c("a","b"))), error="are not scalar") test(2.42, substitute2(list(var = val), env = list(var="my_var", val=I(c("a","b")))), substitute(list(my_var=.v), list(.v=c("a","b")))) ## note that quote(list(my_var=c("a","b")))) will not work because 'c("a","b")' will be a 'language' class (a 'c()' call), but we need to have it as 'character' class instead @@ -85,9 +98,9 @@ test(2.46, substitute2(.(v1=v2), list(v1=1L, v2=2L)), error="type 'integer' but test(2.47, substitute2(.(v1=v2), list(v1=FALSE, v2=2L)), error="type 'logical' but it has to be 'symbol'") # substitute2 NA_character_ becomes valid 'NA' name test(2.48, substitute2(.(v1 = v2), list(v1 = NA_character_, v2 = NA_character_, "." = "list")), quote(list(`NA` = `NA`))) -qc = substitute2(.(v1 = v2), list(v1 = NA_character_, v2 = I(NA_character_), "." = "list")) -test(2.49, qc, quote(list(`NA` = NA_character_))) -test(2.50, eval(qc), list("NA" = NA_character_)) +cl = substitute2(.(v1 = v2), list(v1 = NA_character_, v2 = I(NA_character_), "." = "list")) +test(2.49, cl, quote(list(`NA` = NA_character_))) +test(2.50, eval(cl), list("NA" = NA_character_)) # substitute2 duplicate matches test(2.51, substitute2(list(v1=v2, v1=v2), env=list(v1="nm",v2=2L,v3=3L)), quote(list(nm = 2L, nm = 2L))) test(2.52, substitute2(list(v1=v2, v1=v3), env=list(v1="nm",v2=2L,v3=3L)), quote(list(nm = 2L, nm = 3L))) @@ -128,7 +141,7 @@ f = function(expr, env) { list(.expr = substitute(expr)) )) } -qc = f( +cl = f( .(out_col_name = fun(in_col_name, fun_arg1=fun_arg1val)), env = list( in_col_name = "x", @@ -138,9 +151,9 @@ qc = f( out_col_name = "sum_x" ) ) -test(3.01, qc, quote(.(sum_x = sum(x, na.rm = TRUE)))) +test(3.01, cl, quote(.(sum_x = sum(x, na.rm = TRUE)))) # substitute2 nested re-use inside another function -qc = substitute2(list(nm = fun(.(out_col_name = fun(in_col_name, fun_arg1=fun_arg1val)), +cl = substitute2(list(nm = fun(.(out_col_name = fun(in_col_name, fun_arg1=fun_arg1val)), env = list( in_col_name = "x", fun = "sum", @@ -148,50 +161,133 @@ qc = substitute2(list(nm = fun(.(out_col_name = fun(in_col_name, fun_arg1=fun_ar fun_arg1val = tf_var, ## note a parameter here out_col_name = "sum_x" ))), list(nm="my_call", fun="f", tf_var=FALSE)) -test(3.02, eval(qc), list(my_call = quote(.(sum_x = sum(x, na.rm = FALSE))))) +test(3.02, eval(cl), list(my_call = quote(.(sum_x = sum(x, na.rm = FALSE))))) + +# enlist +test(4.01, enlist(c("a")), error="'x' must be a list") +test(4.02, enlist(list("V1","V2")), quote(list(V1, V2))) +test(4.03, enlist(list(V1="V1", V2="V2")), quote(list(V1=V1, V2=V2))) +test(4.04, enlist(I(list(V1="V1", V2="V2"))), list(V1="V1", V2="V2")) +test(4.05, enlist(list(V1=I("V1"), V2=I("V2"))), quote(list(V1="V1", V2="V2"))) +test(4.06, enlist(list(V1="V1", V2=I("V2"))), quote(list(V1=V1, V2="V2"))) +test(4.07, enlist(list(V1="V1", V2=I("V2"), V3=list("X1", "X2"))), quote(list(V1=V1, V2="V2", V3=list(X1, X2)))) +test(4.08, enlist(list(V1="V1", V2=I("V2"), V3=list(X1="X1", X2=I("X2")))), quote(list(V1=V1, V2="V2", V3=list(X1=X1, X2="X2")))) +test(4.09, enlist(list(V1="V1", V2=I("V2"), V3=enlist(list("X1","X2")))), quote(list(V1 = V1, V2 = "V2", V3 = list(X1, X2)))) +test(4.10, enlist(list(V1="V1", V2=I("V2"), V3=I(enlist(list("X1","X2"))))), quote(list(V1 = V1, V2 = "V2", V3 = list(X1, X2)))) +test(4.11, enlist(list(V1="V1", V2=I("V2"), V3=enlist(I(list("X1","X2"))))), quote(list(V1 = V1, V2 = "V2", V3 = list(X1, X2)))) +test(4.12, enlist(list(V1="V1", V2=I("V2"), V3=I(enlist(I(list("X1","X2")))))), substitute(list(V1 = V1, V2 = "V2", V3 = lst), list(lst = list("X1", "X2")))) +test(4.13, enlist(list(V1="V1", V2=I("V2"), V3=I(enlist(list(I("X1"),I("X2")))))), quote(list(V1 = V1, V2 = "V2", V3 = list("X1", "X2")))) +test(4.14, enlist(I(list(V1="V1", V2=list("V2")))), list(V1="V1", V2=list("V2"))) +test(4.15, enlist(I(list(V1="V1", V2=I(list("V2"))))), list(V1="V1", V2=I(list("V2")))) + +# list2lang +test(5.01, list2lang(c("a")), error="'x' must be a list") +test(5.02, list2lang(list("a", 1L)), list(as.name("a"), 1L)) +test(5.03, list2lang(I(list("a", 1L))), list("a", 1L)) +test(5.04, list2lang(list(I("a"), 1L)), list("a", 1L)) +test(5.05, list2lang(list("a", 1L, list("b"))), list(as.name("a"), 1L, call("list", as.name("b")))) +test(5.06, list2lang(list("a", 1L, list(I("b")))), list(as.name("a"), 1L, call("list", "b"))) +test(5.07, list2lang(list("a", 1L, I(list("b")))), list(as.name("a"), 1L, list("b"))) +test(5.08, list2lang(I(list("a", 1L, list("b")))), list("a", 1L, list("b"))) +test(5.09, list2lang(I(list("a", 1L, I(list("b"))))), list("a", 1L, I(list("b")))) +test(5.10, list2lang(list("a", 1L, c(1L, 2L))), list(as.name("a"), 1L, c(1L,2L))) ## no 'enlist' like feature for 'c()' function, see next test +test(5.11, list2lang(list("a", 1L, call("c", 1L, 2L))), list(as.name("a"), 1L, quote(c(1L, 2L)))) + +# datatable.enlist +op = options(datatable.enlist=NULL) +test(6.01, + substitute2(list(v1 = v2, v3 = v4), list(v1 = "int", v2 = 1L, v3 = "lst", v4 = list("a", "b", list("c", "d")))), + quote(list(int = 1L, lst = list(a, b, list(c, d))))) +options(datatable.enlist=FALSE) +test(6.02, + substitute2(list(v1 = v2, v3 = v4), list(v1 = "int", v2 = 1L, v3 = "lst", v4 = list("a", "b", list("c", "d")))), + substitute(list(int = 1L, lst = lst), list(lst = list("a", "b", list("c", "d"))))) +options(datatable.enlist=NULL) +test(6.03, + enlist(list(v1 = 1L, v2 = list(v3 = "b", v4 = list(v5 = "c")))), + quote(list(v1 = 1L, v2 = list(v3 = b, v4 = list(v5 = c))))) +options(datatable.enlist=FALSE) +test(6.04, + enlist(list(v1 = 1L, v2 = list(v3 = "b", v4 = list(v5 = "c")))), + substitute(list(v1 = 1L, v2 = lst), list(lst=list(v3 = "b", v4 = list(v5 = "c"))))) +options(datatable.enlist=NULL) +test(6.05, + substitute2(list(v1, v2, v3), list(v1="V1", v2="V2", v3=enlist(list("V4","V5")))), + quote(list(V1, V2, list(V4, V5)))) +options(datatable.enlist=FALSE) +test(6.06, + substitute2(list(v1, v2, v3), list(v1="V1", v2="V2", v3=enlist(list("V4","V5")))), + quote(list(V1, V2, list(V4, V5)))) +test(6.07, + substitute2(list(v1, v2, v3), list(v1="V1", v2="V2", v3=enlist(list("V4","V5", list("V6"))))), + substitute(list(V1, V2, list(V4, V5, lst)), list(lst=list("V6")))) +test(6.08, + substitute2(list(v1, v2, v3), list(v1="V1", v2="V2", v3=enlist(list("V4","V5", enlist(list("V6")))))), + quote(list(V1, V2, list(V4, V5, list(V6))))) +options(op) + +# documentation examples +test(7.01, substitute2(list(var1 = var2), list(var1 = "c1", var2 = 5L)), quote(list(c1 = 5L))) ## works also on names +test(7.02, substitute2(var1, list(var1 = I("c1"))), "c1") ## enforce character with I +test(7.03, substitute2(var1, list(var1 = "c1")), quote(c1)) ## turn character into symbol, for convenience +test(7.04, substitute2(list(var1 = var2), list(var1 = "c1", var2 = I("some_character"))), quote(list(c1 = "some_character"))) ## mix symbols and characters +test(7.05, substitute2(list(var1 = var2), I(list(var1 = as.name("c1"), var2 = "some_character"))), quote(list(c1 = "some_character"))) +test(7.06, substitute2(f(lst), I(list(lst = list(1L, 2L)))), substitute(f(lst), list(lst=list(1L,2L)))) ## list elements are enlist'ed into list calls +test(7.07, substitute2(f(lst), list(lst = I(list(1L, 2L)))), substitute(f(lst), list(lst=list(1L,2L)))) +test(7.08, substitute2(f(lst), list(lst = call("list", 1L, 2L))), quote(f(list(1L, 2L)))) +test(7.09, substitute2(f(lst), list(lst = list(1L, 2L))), quote(f(list(1L, 2L)))) +test(7.10, substitute2(f(lst), list(lst = list(1L, list(2L)))), quote(f(list(1L, list(2L))))) ## character to name and list into list calls works recursively +test(7.11, substitute2(f(lst), I(list(lst = list(1L, list(2L))))), substitute(f(lst), list(lst=list(1L, list(2L))))) +f = function(expr, env) { ## using substitute2 from another function + eval(substitute( + substitute2(.expr, env), + list(.expr = substitute(expr)) + )) +} +test(7.12, f(list(var1 = var2), list(var1 = "c1", var2 = 5L)), quote(list(c1 = 5L))) # data.table i, j, by d = data.table(a = 2:1, b = 1:4) -test(4.01, d[var3%in%values, .(var1 = f(var2)), by=var3, +test(11.01, d[var3%in%values, .(var1 = f(var2)), by=var3, env=list(var1="res", var2="b", f="sum", var3="a", values=0:3), - verbose=TRUE], data.table(a=c(2L,1L), res=c(4L,6L)), output=c("Argument 'by' after substitute: a","Argument 'j' after substitute: .(res = sum(b))","Argument 'i' after substitute: a %in% 0:3")) # 0:3 is not expanded even when we use c(0L,1L,2L,3L), see `substitute(v+x, list(x=c(1L,2L)))` vs `substitute(v+x, list(x=c(0L,2L)))` + verbose=TRUE], data.table(a=c(2L,1L), res=c(4L,6L)), output=c("Argument 'by' after substitute: a","Argument 'j' after substitute: .(res = sum(b))","Argument 'i' after substitute: a %in% 0:3")) # data.table symbols and chars d = data.table(a = c("b","a"), b = 1:4) out = capture.output(ans <- d[var3%in%values, .(var1 = f(var2)), keyby=var3, env=list(var1="res", var2="b", f="sum", var3="a", values=I(c("a","b","c"))), verbose=TRUE]) # could not use output arg in test, so test it manually -test(4.02, ans, data.table(a=c("a","b"), res=c(6L,4L), key="a")) +test(11.02, ans, data.table(a=c("a","b"), res=c(6L,4L), key="a")) out = grep("Argument.*substitute", out, value=TRUE) -test(4.021, length(out), 3L) # we expect i, j, by only here, ensure about that -test(4.022, "Argument 'by' after substitute: a" %in% out, TRUE) -test(4.023, "Argument 'j' after substitute: .(res = sum(b))" %in% out, TRUE) -test(4.024, "Argument 'i' after substitute: a %in% c(\"a\", \"b\", \"c\")" %in% out, TRUE) +test(11.021, length(out), 3L) # we expect i, j, by only here, ensure about that +test(11.022, "Argument 'by' after substitute: a" %in% out, TRUE) +test(11.023, "Argument 'j' after substitute: .(res = sum(b))" %in% out, TRUE) +test(11.024, "Argument 'i' after substitute: a %in% c(\"a\", \"b\", \"c\")" %in% out, TRUE) out = capture.output(ans <- d[var3%in%values, .(var1 = f(var2)), keyby=var3, env=I(list(var1=as.name("res"), var2=as.name("b"), f=as.name("sum"), var3=as.name("a"), values=c("b","c"))), verbose=TRUE]) -test(4.03, ans, data.table(a=c("b"), res=c(4L), key="a")) +test(11.03, ans, data.table(a=c("b"), res=c(4L), key="a")) out = grep("Argument.*substitute", out, value=TRUE) -test(4.031, length(out), 3L) -test(4.032, "Argument 'by' after substitute: a" %in% out, TRUE) -test(4.033, "Argument 'j' after substitute: .(res = sum(b))" %in% out, TRUE) -test(4.034, "Argument 'i' after substitute: a %in% c(\"b\", \"c\")" %in% out, TRUE) +test(11.031, length(out), 3L) +test(11.032, "Argument 'by' after substitute: a" %in% out, TRUE) +test(11.033, "Argument 'j' after substitute: .(res = sum(b))" %in% out, TRUE) +test(11.034, "Argument 'i' after substitute: a %in% c(\"b\", \"c\")" %in% out, TRUE) # substitute2 during join d1 = data.table(id1=1:4, v1=5) d2 = data.table(id1=c(0L,2:3), v1=6) out = capture.output(ans <- d1[d2, on="id1<=id1", .(c1, c2, c3, c4), env=list(c1="x.id1", c2="i.id1", c3="x.v1", c4="i.v1"), verbose=TRUE]) -test(4.041, ans, data.table(x.id1=c(NA,1:2,1:3), i.id1=c(0L,2L,2L,3L,3L,3L), x.v1=c(NA,rep(5,5)), i.v1=rep(6,6))) +test(11.041, ans, data.table(x.id1=c(NA,1:2,1:3), i.id1=c(0L,2L,2L,3L,3L,3L), x.v1=c(NA,rep(5,5)), i.v1=rep(6,6))) out = grep("Argument.*substitute", out, value=TRUE) -test(4.042, length(out), 2L) ## 2L because i is non-missing attempt to substitute is made -test(4.043, "Argument 'j' after substitute: .(x.id1, i.id1, x.v1, i.v1)" %in% out, TRUE) +test(11.042, length(out), 2L) ## 2L because i is non-missing attempt to substitute is made +test(11.043, "Argument 'j' after substitute: .(x.id1, i.id1, x.v1, i.v1)" %in% out, TRUE) d1 = data.table(id1=c(2L,4L,2L,4L), v1=5) d2 = data.table(id1=c(0L,2:3), v1=6) out = capture.output(ans <- d1[dd, on="id1<=id1", .(sum(c3), sum(c4)), by=by, env=list(dd="d2", c3="x.v1", c4="i.v1", by=".EACHI"), verbose=TRUE]) -test(4.044, ans, data.table(id1=c(0L,2L,3L), V1=c(NA,10,10), V2=c(6,6,6))) +test(11.044, ans, data.table(id1=c(0L,2L,3L), V1=c(NA,10,10), V2=c(6,6,6))) out = grep("Argument.*substitute", out, value=TRUE) -test(4.045, length(out), 3L) -test(4.046, "Argument 'by' after substitute: .EACHI" %in% out, TRUE) -test(4.047, "Argument 'j' after substitute: .(sum(x.v1), sum(i.v1))" %in% out, TRUE) -test(4.048, "Argument 'i' after substitute: d2" %in% out, TRUE) +test(11.045, length(out), 3L) +test(11.046, "Argument 'by' after substitute: .EACHI" %in% out, TRUE) +test(11.047, "Argument 'j' after substitute: .(sum(x.v1), sum(i.v1))" %in% out, TRUE) +test(11.048, "Argument 'i' after substitute: d2" %in% out, TRUE) dt1 = data.table(x = letters[1:5], y = 1:5) dt2 = data.table(x = letters[1:3], y = 11:13) target_v = "y" @@ -199,64 +295,153 @@ source_v = paste0("i.", target_v) on_v = "x" out = capture.output(invisible(dt1[dt2, target_v := source_v, on = on_v, env = list(target_v = target_v, source_v = source_v), verbose=TRUE])) out = grep("Argument.*substitute", out, value=TRUE) -test(4.049, length(out), 2L) -test(4.050, dt1, data.table(x = c("a", "b", "c", "d", "e"), y = c(11L, 12L, 13L, 4L, 5L))) +test(11.049, length(out), 2L) +test(11.050, dt1, data.table(x = c("a", "b", "c", "d", "e"), y = c(11L, 12L, 13L, 4L, 5L))) # substitute special symbols d = data.table(V1=1:2, V2=1:4) -test(4.051, d[, j, by, env=list(j=".N", by="V1")], data.table(V1=c(1L,2L), N=c(2L,2L))) -test(4.052, d[, j, by, env=list(j=".SD", by="V1")], data.table(V1=c(1L,1L,2L,2L), V2=c(1L,3L,2L,4L))) -test(4.053, d[, j, env=I(list(j=as.name(".N")))], 4L) -test(4.054, d[, .(op, fun(col)), by=by, env=list(op=".N", fun="sum", col="V2", by="V1")], data.table(V1=1:2, N=c(2L,2L), V2=c(4L,6L))) +test(11.051, d[, j, by, env=list(j=".N", by="V1")], data.table(V1=c(1L,2L), N=c(2L,2L))) +test(11.052, d[, j, by, env=list(j=".SD", by="V1")], data.table(V1=c(1L,1L,2L,2L), V2=c(1L,3L,2L,4L))) +test(11.053, d[, j, env=I(list(j=as.name(".N")))], 4L) +test(11.054, d[, .(op, fun(col)), by=by, env=list(op=".N", fun="sum", col="V2", by="V1")], data.table(V1=1:2, N=c(2L,2L), V2=c(4L,6L))) # get and mget use cases d = as.data.table(lapply(1:5, rep, 2L)) setnames(d, paste0("c",1:5)) v1 = "c1"; v2 = "c2"; v3 = "c3"; v4 = "c4"; v5 = "c5" -test(4.061, d[, v1, env=list(v1=v1)], d[, get(v1)]) ## symbol c1 -test(4.062, d[, v1, env=list(v1=I(v1))], data.table(c1=c(1L,1L))) ## character "c1" -test(4.063, d[, list(v1), env=list(v1=v1)], d[, mget(v1)]) ## symbol c1 in list -test(4.064, d[, v1v2, env=list(v1v2=I(c(v1,v2)))], d[, mget(c(v1, v2))]) ## character c("c1","c2") -test(4.065, d[, .(v1), env=list(v1=v1)], data.table(c1=c(1L,1L))) ## d[, .(get(v1))] - (m)get would return unnamed columns -test(4.066, d[, .(v1, v2), env=list(v1=v1, v2=v2)], data.table(c1=c(1L,1L),c2=c(2L,2L))) ## d[, .(get(v1), get(v2))] -test(4.067, d[, .(sum(v1)), env=list(v1=v1)], d[, .(sum(get(v1)))]) -test(4.068, d[, lapply(vN, sum), env=list(vN=substitute2(list(v1=v1, v3=v3), list(v1=v1, v3=v3)))], d[, lapply(mget(c(v1,v3)), sum)]) -test(4.069, d[, c(list(c1=c1, c2=c2), list(v3=v3), list(v4=v4, v5=v5)), env=list(v3=v3,v4=v4,v5=v5)], d) ## d[, c(list(c1, c2), list(get(v3)), mget(c(v4,v5)))] - some are unnamed -# addresses: weird exception when by contains get #1985 +test(11.061, d[, v1, env=list(v1=v1)], d[, get(v1)]) ## symbol c1 +test(11.062, d[, v1, env=list(v1=I(v1))], data.table(c1=c(1L,1L))) ## character "c1" +test(11.063, d[, list(v1), env=list(v1=v1)], d[, mget(v1)]) ## symbol c1 in list +test(11.064, d[, v1v2, env=list(v1v2=I(c(v1,v2)))], d[, mget(c(v1, v2))]) ## character c("c1","c2") +test(11.065, d[, v1v2, env=list(v1v2=as.list(c(v1,v2)))], d[, mget(c(v1, v2))]) ## call list(c1,c2) ## auto-enlist +test(11.066, d[, .(v1), env=list(v1=v1)], data.table(c1=c(1L,1L))) ## d[, .(get(v1))] - (m)get would return unnamed columns +test(11.067, d[, .(v1, v2), env=list(v1=v1, v2=v2)], data.table(c1=c(1L,1L),c2=c(2L,2L))) ## d[, .(get(v1), get(v2))] +test(11.068, d[, .(sum(v1)), env=list(v1=v1)], d[, .(sum(get(v1)))]) +test(11.069, d[, lapply(vN, sum), env=list(vN=as.list(setNames(nm = c(v1, v3))))], d[, lapply(mget(c(v1,v3)), sum)]) +test(11.070, d[, c(list(c1=c1, c2=c2), list(v3=v3), list(v4=v4, v5=v5)), env=list(v3=v3,v4=v4,v5=v5)], d) ## d[, c(list(c1, c2), list(get(v3)), mget(c(v4,v5)))] - some are unnamed +#1985 weird exception when by contains get tb = data.table(x=c(1,2), y=c(3,4), z=c(5,6), w=c("a","b")) -test(4.101, tb[w != "b", .(x=sum(x)), by=.(y, zz=.z), env=list(.z="z")], data.table(y=3, zz=5, x=1)) +test(11.101, tb[w != "b", .(x=sum(x)), by=.(y, zz=.z), env=list(.z="z")], data.table(y=3, zz=5, x=1)) dtIris = as.data.table(iris) speciesVar = "Species" -test(4.102, dtIris[Sepal.Length > 4, .N, by = .(var = .speciesVar, Petal.Width), env = list(.speciesVar = speciesVar)], dtIris[Sepal.Length > 4, .N, by = .(var = Species, Petal.Width)]) +test(11.102, dtIris[Sepal.Length > 4, .N, by = .(var = .speciesVar, Petal.Width), env = list(.speciesVar = speciesVar)], dtIris[Sepal.Length > 4, .N, by = .(var = Species, Petal.Width)]) #2589 Need an easier way to use dynamically determined symbols dt = data.table(x1 = 1:10, x2 = 10:1, x3 = 1:10) s1 = "x2"; s2 = "x3" -test(4.103, dt[, s1 * s2, env=list(s1=s1,s2=s2)], c(10L, 18L, 24L, 28L, 30L, 30L, 28L, 24L, 18L, 10L)) +test(11.103, dt[, s1 * s2, env=list(s1=s1,s2=s2)], c(10L, 18L, 24L, 28L, 30L, 30L, 28L, 24L, 18L, 10L)) #2884 Alternative way to dynamic symbol usage in `j` dt = data.table(id = rep(1:2, 5), x1 = rnorm(10), x2 = rnorm(10), y1 = rnorm(10), y2 = rnorm(10)) -test(4.104, dt[, .(xsum = sum(x), ysum = sum(y)), by = id, env = list(x = "x1", y = "y2")], dt[, .(xsum=sum(x1), ysum=sum(y2)), by=id]) +test(11.104, dt[, .(xsum = sum(x), ysum = sum(y)), by = id, env = list(x = "x1", y = "y2")], dt[, .(xsum=sum(x1), ysum=sum(y2)), by=id]) #2816 Possible regression for programmatic use in `j` dt = data.table(x=1:3) var = "x" dt[, var := var+1L, env=list(var="x")] -test(4.105, dt, data.table(x=2:4)) +test(11.105, dt, data.table(x=2:4)) # injecting quoted expressions #750 `by=list(eval(as.name("colA")))` renames column DT = data.table(colA=1:4, colB=5:8, colC=9:12) -test(4.106, DT[, sum(colA), by=list(grp_name=grp), env=list(grp_name="colA", grp="colA")], data.table(colA=1:4, V1=1:4)) +test(11.106, DT[, sum(colA), by=list(grp_name=grp), env=list(grp_name="colA", grp="colA")], data.table(colA=1:4, V1=1:4)) #2432 Add Programmable NSE co2 = as.data.table(CO2) Jexp1 = quote(max(conc)) Jexp2 = quote(mean(conc)) Jexp = substitute(list(Jexp1, round(Jexp2)), list(Jexp1=Jexp1, Jexp2=Jexp2)) out = capture.output(ans <- co2[, j, by=Type, env=list(j=Jexp), verbose=TRUE]) -test(4.107, ans, data.table(Type=factor(c("Quebec","Mississippi"), levels=c("Quebec","Mississippi")), V1=c(1000,1000), V2=c(435,435))) +test(11.107, ans, data.table(Type=factor(c("Quebec","Mississippi"), levels=c("Quebec","Mississippi")), V1=c(1000,1000), V2=c(435,435))) out = grep("Argument.*substitute", out, value=TRUE) -test(4.108, length(out), 2L) -test(4.109, "Argument 'by' after substitute: Type" %in% out, TRUE) -test(4.110, "Argument 'j' after substitute: list(max(conc), round(mean(conc)))" %in% out, TRUE) +test(11.108, length(out), 2L) +test(11.109, "Argument 'by' after substitute: Type" %in% out, TRUE) +test(11.110, "Argument 'j' after substitute: list(max(conc), round(mean(conc)))" %in% out, TRUE) #628 Change j=list(xout=eval(...))'s eval to eval within scope of DT dat = data.table(x_one=1:10, x_two=1:10, y_one=1:10, y_two=1:10) -f = function(vars) as.call(c(as.name("list"), lapply(setNames(vars, paste(vars,"out",sep="_")), function(var) substitute2(one-two, list(one=paste(var,"one",sep="_"), two=paste(var,"two",sep="_")))))) -test(4.111, dat[, j, env=list(j = f(c("x","y")))], dat[, list(x_out = x_one - x_two, y_out = y_one - y_two)]) +f = function(vars) as.call(c(quote(list), lapply(setNames(vars, paste(vars,"out",sep="_")), function(var) substitute2(one-two, list(one=paste(var,"one",sep="_"), two=paste(var,"two",sep="_")))))) +test(11.111, dat[, j, env=list(j = f(c("x","y")))], dat[, list(x_out = x_one - x_two, y_out = y_one - y_two)]) + +# vignette examples +square = function(x) x^2 +test(12.01, + substitute2(outer(inner(var1) + inner(var2)), env = list(outer = "sqrt", inner = "square", var1 = "a", var2 = "b")), + quote(sqrt(square(a) + square(b)))) +DT = as.data.table(iris) +test(12.02, + DT[, outer(inner(var1) + inner(var2)), env = list(outer = "sqrt", inner = "square", var1 = "Sepal.Length", var2 = "Sepal.Width")], + DT[, sqrt(square(Sepal.Length) + square(Sepal.Width))]) +test(12.03, # return as data.table, substitute call argument name + DT[, .(Species, var1, var2, out = outer(inner(var1) + inner(var2))), env = list(outer = "sqrt", inner = "square", var1 = "Sepal.Length", var2 = "Sepal.Width", out = "Sepal.Hypotenuse")], + DT[, .(Species, Sepal.Length, Sepal.Width, Sepal.Hypotenuse = sqrt(square(Sepal.Length) + square(Sepal.Width)))]) +test(12.04, # i, j, by + DT[filter_col %in% filter_val, .(var1, var2, out = outer(inner(var1) + inner(var2))), by = by_col, env = list(outer = "sqrt", inner = "square", var1 = "Sepal.Length", var2 = "Sepal.Width", out = "Sepal.Hypotenuse", filter_col = "Species", filter_val = I(c("versicolor", "virginica")), by_col = "Species")], + DT[Species %in% c("versicolor","virginica"), .(Sepal.Length, Sepal.Width, Sepal.Hypotenuse = sqrt(square(Sepal.Length) + square(Sepal.Width))), by = Species]) +test(12.05, # like base R, env AsIs class + substitute2(rank(input, ties.method = ties), env = I(list(input = as.name("Sepal.Width"), ties = "first"))), + quote(rank(Sepal.Width, ties.method = "first"))) +test(12.06, # only particular elements of env are AsIs class + substitute2(rank(input, ties.method = ties), env = list(input = "Sepal.Width", ties = I("first"))), + quote(rank(Sepal.Width, ties.method = "first"))) +test(12.07, # all are symbols + substitute2(f(v1, v2), list(v1 = "a", v2 = list("b", list("c", "d")))), + quote(f(a, list(b, list(c, d))))) +test(12.08, # 'a' and 'd' should stay as character + substitute2(f(v1, v2), list(v1 = I("a"), v2 = list("b", list("c", I("d"))))), + quote(f("a", list(b, list(c, "d"))))) +cols = c("Sepal.Length", "Sepal.Width") +test(12.09, # data.table automatically enlist nested lists into list calls + DT[, j, env = list(j = as.list(cols))], + DT[, list(Sepal.Length, Sepal.Width)]) +test(12.10, # turning above 'j' list into a list call + DT[, j, env = list(j = quote(list(Sepal.Length, Sepal.Width)))], + DT[, list(Sepal.Length, Sepal.Width)]) +test(12.11, # the same as above but accepts character vector + DT[, j, env = list(j = as.call(c(quote(list), lapply(cols, as.name))))], + DT[, list(Sepal.Length, Sepal.Width)]) +test(12.12, # list of symbols + DT[, j, env = I(list(j = lapply(cols, as.name))), verbose = TRUE], + error = "j-argument should be", + output = "list(Sepal.Length, Sepal.Width)") +test(12.13, substitute2(j, env = I(list(j = lapply(cols, as.name)))), lapply(cols, as.name)) +test(12.14, substitute2(j, env = list(j = as.list(cols))), as.call(c(quote(list), lapply(cols, as.name)))) +outer = "sqrt"; inner = "square"; vars = c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width") +syms = lapply(vars, as.name) +to_inner_call = function(var, fun) call(fun, var) +inner_calls = lapply(syms, to_inner_call, inner) +test(12.15, inner_calls, list(quote(square(Sepal.Length)), quote(square(Sepal.Width)), quote(square(Petal.Length)), quote(square(Petal.Width)))) +to_add_call = function(x, y) call("+", x, y) +add_calls = Reduce(to_add_call, inner_calls) +test(12.16, add_calls, quote(square(Sepal.Length) + square(Sepal.Width) + square(Petal.Length) + square(Petal.Width))) +rms = substitute2(expr = outer((add_calls) / len), env = list(outer = outer, add_calls = add_calls, len = length(vars))) +test(12.17, rms, quote(sqrt((square(Sepal.Length) + square(Sepal.Width) + square(Petal.Length) + square(Petal.Width))/4L))) +test(12.18, + DT[, j, env = list(j = rms)], + DT[, sqrt((square(Sepal.Length) + square(Sepal.Width) + square(Petal.Length) + square(Petal.Width))/4L)]) +test(12.19, # same but skipping last substitute2 call and using add_calls directly + DT[, outer((add_calls) / len), env = list(outer = outer, add_calls = add_calls, len = length(vars))], + DT[, sqrt((square(Sepal.Length) + square(Sepal.Width) + square(Petal.Length) + square(Petal.Width))/4L)]) +j = substitute2(j, list(j = as.list(setNames(nm = c(vars, "Species", "rms"))))) # return as data.table +j[["rms"]] = rms +test(12.20, + DT[, j, env = list(j = j)], + DT[, .(Sepal.Length=Sepal.Length, Sepal.Width=Sepal.Width, Petal.Length=Petal.Length, Petal.Width=Petal.Width, Species, rms = sqrt((square(Sepal.Length) + square(Sepal.Width) + square(Petal.Length) + square(Petal.Width))/4L))]) +j = as.call(c( # alternatively + quote(list), + lapply(setNames(nm = vars), as.name), + list(Species = as.name("Species")), + list(rms = rms) +)) +test(12.21, + DT[, j, env = list(j = j)], + DT[, .(Sepal.Length=Sepal.Length, Sepal.Width=Sepal.Width, Petal.Length=Petal.Length, Petal.Width=Petal.Width, Species, rms = sqrt((square(Sepal.Length) + square(Sepal.Width) + square(Petal.Length) + square(Petal.Width))/4L))]) +v1 = "Petal.Width" # get +v2 = "Sepal.Width" +test(12.22, + DT[, .(total = sum(v1, v2)), env = list(v1 = v1, v2 = v2)], + DT[, .(total = sum(get(v1), get(v2)))]) +v = c("Petal.Width", "Sepal.Width") # mget +test(12.23, + DT[, lapply(v, mean), env = list(v = as.list(v))], + DT[, lapply(list(Petal.Width, Sepal.Width), mean)]) +test(12.24, + DT[, lapply(v, mean), env = list(v = as.list(setNames(nm = v)))], + DT[, lapply(mget(v), mean)]) +cl = quote(.(Petal.Width = mean(Petal.Width), Sepal.Width = mean(Sepal.Width))) +test(12.25, DT[, cl, env = list(cl = cl)], DT[, eval(cl)]) ####################### # contributed use cases @@ -350,8 +535,7 @@ cor_xy = function(xdt, ydt, x, y) { ## cor between each x and a single y xdt[, lapply(.SD, cor, y = y), keyby = symbol, .SDcols = x] } out = capture.output(ans <- cor_xy(xdt, ydt, c("x1", "x2"), "y10")) -exp = as.data.table(list(symbol = 1:2, x1 = c(0.529292252112253, 0.0301956035638738 -), x2 = c(0.287076866252898, -0.335969587268599)), key="symbol") +exp = as.data.table(list(symbol = 1:2, x1 = c(0.529292252112253, 0.0301956035638738), x2 = c(0.287076866252898, -0.335969587268599)), key="symbol") test(102.01, ans, exp) test(102.02, length(grep("Argument.*substitute", out, value=TRUE)), 2L) test(102.03, "Argument 'j' after substitute: `:=`(y, y10)" %in% out, TRUE) @@ -376,9 +560,9 @@ exp = as.data.table(list( )) test(102.05, cor_dt, exp) cor_xy3 = function(xdt, ydt, x, y) { ## cor matrix of existing columns and dynamically in-place merged columns - qc = as.call(lapply(setNames(c(":=", y), c("", y)), as.name)) + cl = as.call(lapply(setNames(c(":=", y), c("", y)), as.name)) xdt[ydt, j, on = .(symbol, date), - env = list(j=qc)] + env = list(j=cl)] on.exit(xdt[, (y) := NULL]) xdt[, cor(.SD), .SDcols = c(x, y)] } diff --git a/man/substitute2.Rd b/man/substitute2.Rd index d54662856d..3b8d536141 100644 --- a/man/substitute2.Rd +++ b/man/substitute2.Rd @@ -4,30 +4,34 @@ \alias{I} \title{ Substitute expression } \description{ - Experimental, more robust and more user-friendly version of base R \code{\link[base]{substitute}}. + Experimental, more robust, and more user-friendly version of base R \code{\link[base]{substitute}}. } \usage{ -substitute2(expr, env) + substitute2(expr, env) } \arguments{ -\item{expr}{ Unevaluated expression in which substitution has to take place. } -\item{env}{ List or an environment from which variables will be taken to inject into \code{expr}. } + \item{expr}{ Unevaluated expression in which substitution has to take place. } + \item{env}{ List, or an environment that will be coerced to list, from which variables will be taken to inject into \code{expr}. } } \details{ - Function will turn any character elements of \code{env} argument into symbols. In case character is of length 2 or more, it will raise an error. Behaviour can be changed by wrapping \code{env} into \code{\link[base]{I}} call. In such case any symbols must be explicitly created, for example using \code{as.name} function. Alternatively it is possible to wrap elements of \code{env} into \code{\link[base]{I}} call individually, not the whole list, then only those elements will retain their original class. + For convenience function will turn any character elements of \code{env} argument into symbols. In case if character is of length 2 or more, it will raise an error. It will also turn any list elements into list calls instead. Behaviour can be changed by wrapping \code{env} into \code{\link[base]{I}} call. In such case any symbols must be explicitly created, for example using \code{as.name} function. Alternatively it is possible to wrap particular elements of \code{env} into \code{\link[base]{I}} call, then only those elements will retain their original class. -Comparing to base R \code{\link[base]{substitute}}, \code{substitute2} function: + Comparing to base R \code{\link[base]{substitute}}, \code{substitute2} function: \enumerate{ -\item substitutes calls argument names as well. -\item automatically converts character elements of \code{env} argument to symbols. -\item does not accept missing \code{env} argument. -\item evaluates elements of \code{env} argument. + \item substitutes calls argument names as well + \item by default converts character elements of \code{env} argument to symbols + \item by default converts list elements of \code{env} argument to list calls + \item does not accept missing \code{env} argument + \item evaluates elements of \code{env} argument } } +\note{ + Conversion of \emph{character to symbol} and \emph{list to list call} works recursively for each list element in \code{env} list. If this behaviour is not desired for your use case, we would like to hear about that via our issue tracker. For the present moment there is an option to disable that: \code{options(datatable.enlist=FALSE)}. This option is provided only for debugging and will be removed in future. Please do not write code that depends on it, but use \code{\link[base]{I}} calls instead. +} \value{ -Quoted expression having variables substituted. + Quoted expression having variables and call argument names substituted. } -\seealso{ \code{\link[base]{substitute}}, \code{\link[base]{I}} } +\seealso{ \code{\link[base]{substitute}}, \code{\link[base]{I}}, \code{\link[base]{call}}, \code{\link[base]{name}}, \code{\link[base]{eval}} } \examples{ ## base R substitute vs substitute2 substitute(list(var1 = var2), list(var1 = "c1", var2 = 5L)) @@ -43,19 +47,23 @@ substitute2(var1, list(var1 = "c1")) ## turn character into symbol, for convenie substitute2(list(var1 = var2), list(var1 = "c1", var2 = I("some_character"))) substitute2(list(var1 = var2), I(list(var1 = as.name("c1"), var2 = "some_character"))) -## character to symbol coerce will not be done for character stored inside a list -substitute2(lapply(some_list, sum), list(some_list = list("V1","V2"))) -## those has to be substituted before -some_list = substitute2(list(v1, v2), list(v1="V1", v2="V2")) -substitute2(lapply(some_list, sum), list(some_list = some_list)) -## in case of a character list of arbitrary length -some_list = as.call(lapply(c("list", c("V1","V2")), as.name)) -substitute2(lapply(some_list, sum), list(some_list = some_list)) -## or from R 4.0.0 also bquote splice functionality -if (getRversion() >= "4.0.0") { - some_list = bquote(list(..(lapply(c("V1","V2"), as.name))), splice=TRUE) - substitute2(lapply(some_list, sum), list(some_list = some_list)) -} +## list elements are enlist'ed into list calls +(cl1 = substitute(f(lst), list(lst = list(1L, 2L)))) +(cl2 = substitute2(f(lst), I(list(lst = list(1L, 2L))))) +(cl3 = substitute2(f(lst), list(lst = I(list(1L, 2L))))) +(cl4 = substitute2(f(lst), list(lst = quote(list(1L, 2L))))) +(cl5 = substitute2(f(lst), list(lst = list(1L, 2L)))) +cl1[[2L]] ## base R substitute with list element +cl2[[2L]] ## same +cl3[[2L]] ## same +cl4[[2L]] ## desired +cl5[[2L]] ## automatically + +## character to name and list into list calls works recursively +(cl1 = substitute2(f(lst), list(lst = list(1L, list(2L))))) +(cl2 = substitute2(f(lst), I(list(lst = list(1L, list(2L)))))) ## unless I() used +last(cl1[[2L]]) ## enlisted recursively +last(cl2[[2L]]) ## AsIs ## using substitute2 from another function f = function(expr, env) { From 28e240ff9926cd883d5c37ac1fb3dd010c3af582 Mon Sep 17 00:00:00 2001 From: jangorecki Date: Tue, 31 Mar 2020 20:33:14 +0100 Subject: [PATCH 30/37] programming on data.table vignette --- vignettes/datatable-programming.Rmd | 413 ++++++++++++++++++++++++++++ 1 file changed, 413 insertions(+) create mode 100644 vignettes/datatable-programming.Rmd diff --git a/vignettes/datatable-programming.Rmd b/vignettes/datatable-programming.Rmd new file mode 100644 index 0000000000..0106a37319 --- /dev/null +++ b/vignettes/datatable-programming.Rmd @@ -0,0 +1,413 @@ +--- +title: "Programming on data.table" +date: "`r Sys.Date()`" +output: + rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Programming data.table} + %\VignetteEngine{knitr::rmarkdown} + \usepackage[utf8]{inputenc} +--- + +```{r init, include = FALSE} +require(data.table) +knitr::opts_chunk$set( + comment = "#", + error = FALSE, + tidy = FALSE, + cache = FALSE, + collapse = TRUE +) +``` + + +## Introduction + +`data.table` from the very first releases enabled interface of `subset` and `with` (or `within`) functions to be used inside `[.data.frame` by defining `[.data.table` method. `subset` and `with` are base R functions that are useful to reduce repetition in code, to make it more easily readable, and usually reduce number of total characters user has to type. Those functions are possible in R because of its quite unique feature called *lazy evaluation*. Feature that allows for a function to catch its arguments before they are evaluated, and for example to evaluate them in a scope different than scope in which they were called. Let's recap interface of `subset` function. + +```{r opt_max_print_10, include = FALSE} +options(max.print = 10L) # 2 rows +``` + +```{r subset} +subset(iris, Species == "setosa") +``` + +It takes the second argument and evaluates it inside the scope of the first argument, a data.frame. It removes variable repetition, makes code more readable, making it less prone to errors. + +## Problem description + +Problem of this kind of interface is that we cannot easily parameterize the code that uses it. It is because expressions passed to those functions are substituted before being evaluated. Those substituted expressions are different when they are passed by user interactivel from those that passed as arguments of another function. + +### Example + +```{r subset_error, error=TRUE} +my_subset = function(data, col, val) { + subset(data, col == val) +} +my_subset(iris, Species, "setosa") +``` + +### Approaching the problem + +There are multiple ways to work around the problem. + +#### Avoid *lazy evaluation* + +The easiest workaround is to avoid *lazy evaluation* in the first place, and fall back to less intuitive, more error-prone machnisms that uses `df[["variable"]]`, etc. + +```{r subset_nolazy} +my_subset = function(data, col, val) { + data[data[[col]] == val, ] +} +my_subset(iris, col = "Species", val = "setosa") +``` + +We basically compute logical vector (of length of nrow of our dataset) first, then logical vector is supplied to `i` argument of `[.data.frame` to perform ordinary subsetting by logical vector. It works well for this simple example, but it lacks flexibility, incorporates variable repetition, and requires user to change the interface to pass column name as character rather than unquoted symbol. The more complex is the expression we need to parameterize the less practical is this approach. + +#### Use of `eval parse` + +This method is usually preferred by newcomers to R language. Conceptually it is the most straightforward way. It is to produce required statement by using string concatenation, then parse it and evaluate. + +```{r subset_parse} +my_subset = function(data, col, val) { + data = deparse(substitute(data)) + col = deparse(substitute(col)) + val = paste0("'", val, "'") + text = paste0("subset(", data, ", ", col, " == ", val, ")") + eval(parse(text = text)[[1L]]) +} +my_subset(iris, Species, "setosa") +``` + +We have to use `deparse(substitute(.))` to catch the actual names of objects passed to function, so we can construct the `subset` function call using those original names. Although It gives unlimited flexibility with a relatively low complexity, **use of `eval(parse(.))` should be avoided**. The main reasons are: + +- lack of syntax validation +- [vulnerability to code injection](https://github.com/Rdatatable/data.table/issues/2655#issuecomment-376781159) +- better alternatives + +Martin Machler, R Project Core Developer, [once said](https://stackoverflow.com/a/40164111/2490497): + +> Sorry but I don't understand why too many people even think a string was something that could be evaluated. You must change your mindset, really. Forget all connections between strings on one side and expressions, calls, evaluation on the other side. +The (possibly) only connection is via `parse(text = ....)` and all good R programmers should know that this is rarely an efficient or safe means to construct expressions (or calls). Rather learn more about `substitute()`, `quote()`, and possibly the power of using `do.call(substitute, ......)`. + +#### Use *computing one the language* + +Mentioned functions, along with some others (including `as.call`, `as.name`/`as.symbol`, `bquote`, `eval`), can be categorized as functions to *compute on the language*, as they operate on language objects (`call`, `name`/`symbol`). + +```{r subset_substitute} +my_subset = function(data, col, val) { + eval(substitute(subset(data, col == val))) +} +my_subset(iris, Species, "setosa") +``` + +We used base R `substitute` function to transform call `subset(data, col == val)` into `subset(iris, Species == "setosa")` by substituting `data`, `col` and `val` with their original names (or values) from their parent environment. We can see this solution is superior to former ones. Note that we operate on the language objects layer not touching string manipulation routines, thus we refer to that process as *computing on the language*. There is a dedicated chapter on *Computing on the language* in [R language manual](https://cloud.r-project.org/doc/manuals/r-release/R-lang.html). Although it is not necessary for *programming on data.table*, we encourage readers to read this chapter, for the sake of better understanding of the R's powerful and pretty unique feature. + +#### Use third party packages + +There are third party packages that can achieve what base R computing on the language routines do. To name a few `pryr`, `lazyeval` or `rlang`. We will not discuss them here as they are not solving any problem that could not be addressed by base R routines. + +## Programming on data.table + +Now, once we established the proper way to parameterize code that uses *lazy evaluation*, we can move on to the main discussion of this vignette, *programming on data.table*. + +Starting from version 1.12.10 data.table provides robust mechanism for parameterizing expressions passed to `i`, `j` and `by` (or `keyby`) arguments of `[.data.table`. It is built upon base R `substitute` function, and mimics its interface. For that purpose `substitute2` has been added as a more robust, and more user-friendly, version of base R `substitute`. For a complete list of differences between `base::substitute` and `data.table::substitute2` please read [`substitute2` manual](https://rdatatable.gitlab.io/data.table/library/data.table/html/substitute2.html). + +### Substitute variables and names + +Let's assume we want to have a general function that applies a function to sum of two arguments that has been applied another function. Code of the particular example function will make this example clearer. Below we have a function to compute length of hypotenuse in a right-angled triangle, knowing length of its legs. + +${\displaystyle c = \sqrt{a^2 + b^2}}$ + +```{r hypotenuse} +square = function(x) x^2 +quote( + sqrt(square(a) + square(b)) +) +``` + +The goal is the make every name in the above call to be able to be passed as parameter. + +```{r hypotenuse_substitute2} +substitute2( + outer(inner(var1) + inner(var2)), + env = list( + outer = "sqrt", + inner = "square", + var1 = "a", + var2 = "b" + ) +) +``` + +We can see that both functions names has been replaced, as well names of the variables passed to functions. We used `substitute2` for convenience. In this simple case, base R `substitute` could be used as well, although it requires extra `lapply(env, as.name)`. + +Now, to use substitution inside `[.data.table` we don't need to call `substitute2` function. It is being used internally, all we have to do is to provide `env` argument, the same way as we provide it to `substitute2` function. Substitution is applied to `i`, `j` and `by` (or `keyby`) arguments of `[.data.table` method. Note that `verbose = TRUE` argument can be used to print expressions after substitution is applied. + +```{r opt_max_print_8, include = FALSE} +options(max.print = 8L) # 2 rows +``` + +```{r hypotenuse_datatable} +DT = as.data.table(iris) + +DT[, outer(inner(var1) + inner(var2)), + env = list( + outer = "sqrt", + inner = "square", + var1 = "Sepal.Length", + var2 = "Sepal.Width" + )] + +# return as data.table, substitute call argument name +DT[, .(Species, var1, var2, out = outer(inner(var1) + inner(var2))), + env = list( + outer = "sqrt", + inner = "square", + var1 = "Sepal.Length", + var2 = "Sepal.Width", + out = "Sepal.Hypotenuse" + )] +``` + +In the last call we added another parameter `out = "Sepal.Hypotenuse"` that conveys the name of output column. Unlike base R, `substitute2` will handle substitution of call arguments names as well. + +Substitution works on `i` and `by` (or `keyby`) as well. + +```{r hypotenuse_datatable_i_j_by} +DT[filter_col %in% filter_val, + .(var1, var2, out = outer(inner(var1) + inner(var2))), + by = by_col, + env = list( + outer = "sqrt", + inner = "square", + var1 = "Sepal.Length", + var2 = "Sepal.Width", + out = "Sepal.Hypotenuse", + filter_col = "Species", + filter_val = I(c("versicolor", "virginica")), + by_col = "Species" + )] +``` + +### Substitute variables and character values + +In the above example we have seen convenient feature of `substitute2` to automatically converts character to names/symbols. Obvious question arises, what if we actually want to substitute parameter with a character value, so to have base R `substitute` behaviour. We provide mechanism to escape automatic conversion by wrapping elements into base R `I()` call. The `I` function marks an object as *AsIs*, preventing conversion to take place, read `?AsIs` for more details. If base R behaviour is desired for a whole `env` argument, then best to wrap `env` argument into `I()`, otherwise each list element can be wrapped into `I()` individually, see both use cases below. + +```{r rank} +substitute( # base R + rank(input, ties.method = ties), + env = list(input = as.name("Sepal.Width"), ties = "first") +) + +substitute2( # like base R, env AsIs class + rank(input, ties.method = ties), + env = I(list(input = as.name("Sepal.Width"), ties = "first")) +) + +substitute2( # only particular elements of env are AsIs class + rank(input, ties.method = ties), + env = list(input = "Sepal.Width", ties = I("first")) +) +``` + +Note that conversion works recursively on each list element, including of course the escape mechanism. + +```{r substitute2_recursive} +substitute2( # all are symbols + f(v1, v2), + list(v1 = "a", v2 = list("b", list("c", "d"))) +) +substitute2( # 'a' and 'd' should stay as character + f(v1, v2), + list(v1 = I("a"), v2 = list("b", list("c", I("d")))) +) +``` + +### Substitute list of arbitrary length + +Example discussed above presents neat and powerful way to make your code more dynamic. Although there are many other, much more complex cases that developer might have to deal with. One of the common problems is to handle arbitrary length list of arguments. + +An obvious use case could be to mimic `.SD` functionality by injecting a `list` call into `j` argument. + +```{r opt_max_print_4, include = FALSE} +options(max.print = 4L) # 2 rows +``` + +```{r splice_sd} +cols = c("Sepal.Length", "Sepal.Width") +DT[, .SD, .SDcols = cols] +``` + +Having `cols` parameter we want to splice it into a `list` call making `j` argument look like below. + +```{r splice_tobe} +DT[, list(Sepal.Length, Sepal.Width)] +``` + +*Splice* is an operation where list of objects has to be inlined into expression as a sequence of arguments to call. +In base R splice `cols` into a `list` call can be achieved using `as.call(c(quote(list), cols))`. Additionally starting from R 4.0.0, there is new interface for such operation in `bquote` function. +In data.table we make it easier, by automatically _enlist_-ing list of objects into list call to those objects. It means that any `list` object inside `env` list argument will be turned into list `call`. Making the API for that as simple as presented below. + +```{r splice_datatable} +# this works +DT[, j, + env = list(j = as.list(cols)), + verbose = TRUE] + +# this will not work +#DT[, list(cols), +# env = list(cols = cols)] +``` + +It is important to provide a call to list, rather than a list, inside the `env` list argument. It is exactly what is happening in the above example, let's explain _enlist_ list into list call more in details. + +```{r splice_enlist} +DT[, j, # data.table automatically enlist nested lists into list calls + env = list(j = as.list(cols)), + verbose = TRUE] + +DT[, j, # turning above 'j' list into a list call + env = list(j = quote(list(Sepal.Length, Sepal.Width))), + verbose = TRUE] + +DT[, j, # the same as above but accepts character vector + env = list(j = as.call(c(quote(list), lapply(cols, as.name)))), + verbose = TRUE] +``` + +Now let's try to pass a list of symbols, rather than list call to those symbols. We will use `I()` to escape automatic _enlist_-ing, it will also turn off character to symbol conversion, so we also have to use `as.name`. + +```{r splice_not, error=TRUE} +DT[, j, # list of symbols + env = I(list(j = lapply(cols, as.name))), + verbose = TRUE] + +DT[, j, # again the proper way, enlist list to list call automatically + env = list(j = as.list(cols)), + verbose = TRUE] +``` + +Note that both expressions visually look the same, although they are not identical. + +```{r splice_substitute2_not} +str(substitute2(j, env = I(list(j = lapply(cols, as.name))))) + +str(substitute2(j, env = list(j = as.list(cols)))) +``` + +For more detailed explanation on that matter please see examples in [`substitute2` manual](https://rdatatable.gitlab.io/data.table/library/data.table/html/substitute2.html). + +### Substitution a complex query + +Let's take as an example more complex function that calculates root mean square. + +${\displaystyle x_{\text{RMS}}={\sqrt{{\frac{1}{n}}\left(x_{1}^{2}+x_{2}^{2}+\cdots +x_{n}^{2}\right)}}}$ + +It takes arbitrary number of variables on input, but now we cannot just *splice* list of arguments into a list call because each of those arguments has to be wrapped into `square` call. In this case we have to *splice* by hand rather than relying on data.table automatic _enlist_. + +First we have to construct calls to `square` function for each of the variables (see `inner_calls`). Then we have to reduce list of calls into a single call, having nested sequence of `+` calls (see `add_calls`). Lastly we have to substitute constructed call into surrounding expression (see `rms`). + +```{r opt_max_print_12, include = FALSE} +options(max.print = 12L) # 2 rows +``` + +```{r complex} +outer = "sqrt" +inner = "square" +vars = c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width") + +syms = lapply(vars, as.name) +to_inner_call = function(var, fun) call(fun, var) +inner_calls = lapply(syms, to_inner_call, inner) +print(inner_calls) + +to_add_call = function(x, y) call("+", x, y) +add_calls = Reduce(to_add_call, inner_calls) +print(add_calls) + +rms = substitute2( + expr = outer((add_calls) / len), + env = list( + outer = outer, + add_calls = add_calls, + len = length(vars) + ) +) +print(rms) + +DT[, j, env = list(j = rms)] + +# same but skipping last substitute2 call and using add_calls directly +DT[, outer((add_calls) / len), + env = list( + outer = outer, + add_calls = add_calls, + len = length(vars) + )] + +# return as data.table +j = substitute2(j, list(j = as.list(setNames(nm = c(vars, "Species", "rms"))))) +j[["rms"]] = rms +print(j) +DT[, j, env = list(j = j)] + +# alternatively +j = as.call(c( + quote(list), + lapply(setNames(nm = vars), as.name), + list(Species = as.name("Species")), + list(rms = rms) +)) +print(j) +DT[, j, env = list(j = j)] +``` + +## Retired interfaces + +In `[.data.table` it is also possible to use other interfaces for variable substitution, or passing quoted expressions. Those are `get` and `mget` for inline injection of variables by providing their names as character, and `eval` that tells `[.data.table` that expression we passed into an argument is a quoted expression, so should be handled differently. Those interfaces are now considered as retired and use of the new `env` argument is recommended. + +### `get` + +```{r opt_max_print_4b, include = FALSE} +options(max.print = 4L) # 2 rows +``` + +```{r old_get} +v1 = "Petal.Width" +v2 = "Sepal.Width" + +DT[, .(total = sum(get(v1), get(v2)))] + +DT[, .(total = sum(v1, v2)), + env = list(v1 = v1, v2 = v2)] +``` + +### `mget` + +```{r old_mget} +v = c("Petal.Width", "Sepal.Width") + +DT[, lapply(mget(v), mean)] + +DT[, lapply(v, mean), + env = list(v = as.list(v))] + +DT[, lapply(v, mean), + env = list(v = as.list(setNames(nm = v)))] +``` + +### `eval` + +Here instead of using `eval` function we just provide quoted expression into the element of `env` argument. + +```{r old_eval} +cl = quote( + .(Petal.Width = mean(Petal.Width), Sepal.Width = mean(Sepal.Width)) +) + +DT[, eval(cl)] + +DT[, cl, env = list(cl = cl)] +``` From d056afe0b46bcdfb1f1544680e74562092d2747e Mon Sep 17 00:00:00 2001 From: jangorecki Date: Tue, 31 Mar 2020 20:47:49 +0100 Subject: [PATCH 31/37] NEWS wording --- NEWS.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/NEWS.md b/NEWS.md index a71d92c1b8..2cd1d53baf 100644 --- a/NEWS.md +++ b/NEWS.md @@ -93,7 +93,7 @@ fun_arg1 = "na.rm" fun_arg1val = TRUE out_col_name = "sum_x" -# parametrized query +# parameterized query #DT[, .(out_col_name = fun(in_col_name, fun_arg1=fun_arg1val))] # desired query @@ -110,7 +110,7 @@ DT[, .(out_col_name = fun(in_col_name, fun_arg1=fun_arg1val)), )] ``` -Addresses [#2655](https://github.com/Rdatatable/data.table/issues/2655) any many other linked issues. Thanks to numerous users for filling requests for a better flexibility in parametrizing data.table queries. +Addresses [#2655](https://github.com/Rdatatable/data.table/issues/2655) any many other linked issues. Thanks to numerous users for filling requests for a better flexibility in parameterizing data.table queries. ## BUG FIXES From a42e3d2a03e3e56cd256a976c1d1a45ec0e12efc Mon Sep 17 00:00:00 2001 From: jangorecki Date: Wed, 1 Apr 2020 09:37:09 +0100 Subject: [PATCH 32/37] fix rmd yaml index entry --- vignettes/datatable-programming.Rmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vignettes/datatable-programming.Rmd b/vignettes/datatable-programming.Rmd index 0106a37319..9d98128793 100644 --- a/vignettes/datatable-programming.Rmd +++ b/vignettes/datatable-programming.Rmd @@ -4,7 +4,7 @@ date: "`r Sys.Date()`" output: rmarkdown::html_vignette vignette: > - %\VignetteIndexEntry{Programming data.table} + %\VignetteIndexEntry{Programming on data.table} %\VignetteEngine{knitr::rmarkdown} \usepackage[utf8]{inputenc} --- From 5c47c099a3b90ccab989b3b6973a48e50d1baf5b Mon Sep 17 00:00:00 2001 From: jangorecki Date: Wed, 8 Apr 2020 19:47:06 +0100 Subject: [PATCH 33/37] typos and wording --- vignettes/datatable-programming.Rmd | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vignettes/datatable-programming.Rmd b/vignettes/datatable-programming.Rmd index 9d98128793..9f904138a8 100644 --- a/vignettes/datatable-programming.Rmd +++ b/vignettes/datatable-programming.Rmd @@ -37,7 +37,7 @@ It takes the second argument and evaluates it inside the scope of the first argu ## Problem description -Problem of this kind of interface is that we cannot easily parameterize the code that uses it. It is because expressions passed to those functions are substituted before being evaluated. Those substituted expressions are different when they are passed by user interactivel from those that passed as arguments of another function. +Problem of this kind of interface is that we cannot easily parameterize the code that uses it. It is because expressions passed to those functions are substituted before being evaluated. Those substituted expressions are different when they are passed by user interactively from those that passed as arguments of another function. ### Example @@ -299,7 +299,7 @@ str(substitute2(j, env = list(j = as.list(cols)))) For more detailed explanation on that matter please see examples in [`substitute2` manual](https://rdatatable.gitlab.io/data.table/library/data.table/html/substitute2.html). -### Substitution a complex query +### Substitution of a complex query Let's take as an example more complex function that calculates root mean square. @@ -366,7 +366,7 @@ DT[, j, env = list(j = j)] ## Retired interfaces -In `[.data.table` it is also possible to use other interfaces for variable substitution, or passing quoted expressions. Those are `get` and `mget` for inline injection of variables by providing their names as character, and `eval` that tells `[.data.table` that expression we passed into an argument is a quoted expression, so should be handled differently. Those interfaces are now considered as retired and use of the new `env` argument is recommended. +In `[.data.table` it is also possible to use other interfaces for variable substitution, or passing quoted expressions. Those are `get` and `mget` for inline injection of variables by providing their names as character, and `eval` that tells `[.data.table` that expression we passed into an argument is a quoted expression, so should be handled differently. Those interfaces should be considered as retired and we recommended to use new `env` argument instead. ### `get` @@ -400,7 +400,7 @@ DT[, lapply(v, mean), ### `eval` -Here instead of using `eval` function we just provide quoted expression into the element of `env` argument. +Instead of using `eval` function we can provide quoted expression into the element of `env` argument, no extra `eval` call is needed then. ```{r old_eval} cl = quote( From e869026b30bc4ea077df48e757dfdac33fa37a54 Mon Sep 17 00:00:00 2001 From: jangorecki Date: Wed, 27 May 2020 13:48:20 +0100 Subject: [PATCH 34/37] empty input to substitute2 --- R/programming.R | 2 ++ inst/tests/programming.Rraw | 1 + 2 files changed, 3 insertions(+) diff --git a/R/programming.R b/R/programming.R index 772c680554..ee4f4c0b3d 100644 --- a/R/programming.R +++ b/R/programming.R @@ -42,6 +42,8 @@ enlist = function(x) { } substitute2 = function(expr, env) { + if (missing(expr)) + return(substitute()) if (missing(env)) { stop("'env' must not be missing") } else if (is.null(env)) { diff --git a/inst/tests/programming.Rraw b/inst/tests/programming.Rraw index be8cddedc0..959b965153 100644 --- a/inst/tests/programming.Rraw +++ b/inst/tests/programming.Rraw @@ -76,6 +76,7 @@ test(2.13, substitute2( const1 = function() 1L test(2.21, substitute2(list(nm = fun()), env=list(a="b", fun="const1", nm="int1")), quote(list(int1=const1()))) test(2.22, substitute2(.(), env=list(a="b", fun="const1", nm="int1")), quote(.())) +test(2.23, identical(substitute2(), substitute())) # substitute2 AsIs class properly removed or kept test(2.31, class(substitute2(var3%in%values, list(var3="a", values=I(c("a","b","c"))))[[3L]]), "character") test(2.32, class(substitute2(var3%in%values, I(list(var3=as.name("a"), values=c("a","b","c"))))[[3L]]), "character") From 09e0499b55545ae15a83d4e9a0536b632f85a892 Mon Sep 17 00:00:00 2001 From: jangorecki Date: Wed, 27 May 2020 15:17:18 +0100 Subject: [PATCH 35/37] empty input to env more robust --- R/data.table.R | 14 +++++++++----- R/programming.R | 2 ++ inst/tests/programming.Rraw | 12 ++++++++++++ 3 files changed, 23 insertions(+), 5 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index f8282c15cd..d1ed7650e1 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -156,7 +156,7 @@ replace_dot_alias = function(e) { substitute2(.keyby, env), list(.keyby = substitute(keyby)) )) - if (verbose) cat("Argument 'by' after substitute: ", paste(deparse(bysub, width.cutoff=500L), collapse=" "), "\n", sep="") + if (missing(by)) {missingby=TRUE; by=bysub=NULL} else if (verbose) cat("Argument 'by' after substitute: ", paste(deparse(bysub, width.cutoff=500L), collapse=" "), "\n", sep="") } keyby=TRUE # Assign to 'by' so that by is no longer missing and we can proceed as if there were one by @@ -168,7 +168,7 @@ replace_dot_alias = function(e) { substitute2(.by, env), list(.by = substitute(by)) )) - if (verbose) cat("Argument 'by' after substitute: ", paste(deparse(bysub, width.cutoff=500L), collapse=" "), "\n", sep="") + if (missing(by)) {missingby=TRUE; by=bysub=NULL} else if (verbose) cat("Argument 'by' after substitute: ", paste(deparse(bysub, width.cutoff=500L), collapse=" "), "\n", sep="") } } keyby=FALSE @@ -232,8 +232,10 @@ replace_dot_alias = function(e) { substitute2(.j, env), list(.j = substitute(j)) )) - if (verbose) cat("Argument 'j' after substitute: ", paste(deparse(jsub, width.cutoff=500L), collapse=" "), "\n", sep="") + if (missing(jsub)) {j = substitute(); jsub=NULL} else if (verbose) cat("Argument 'j' after substitute: ", paste(deparse(jsub, width.cutoff=500L), collapse=" "), "\n", sep="") } + } + if (!missing(j)) { jsub = replace_dot_alias(jsub) root = if (is.call(jsub)) as.character(jsub[[1L]])[1L] else "" if (root == ":" || @@ -310,15 +312,17 @@ replace_dot_alias = function(e) { # setdiff removes duplicate entries, which'll create issues with duplicated names. Use %chin% instead. dupdiff = function(x, y) x[!x %chin% y] - + isub = NULL if (!missing(i)) { if (is.null(env)) isub = substitute(i) else { isub = eval(substitute( substitute2(.i, env), list(.i = substitute(i)) )) - if (verbose) cat("Argument 'i' after substitute: ", paste(deparse(isub, width.cutoff=500L), collapse=" "), "\n", sep="") + if (missing(isub)) {i = substitute(); isub=NULL} else if (verbose) cat("Argument 'i' after substitute: ", paste(deparse(isub, width.cutoff=500L), collapse=" "), "\n", sep="") } + } + if (!missing(i)) { xo = NULL if (identical(isub, NA)) { # only possibility *isub* can be NA (logical) is the symbol NA itself; i.e. DT[NA] diff --git a/R/programming.R b/R/programming.R index ee4f4c0b3d..b4d25012f8 100644 --- a/R/programming.R +++ b/R/programming.R @@ -73,6 +73,8 @@ substitute2 = function(expr, env) { substitute(.expr, env), env = list(.expr = substitute(expr)) )) + if (missing(expr.sub)) + return(substitute()) ## nested emptiness # substitute call argument names .Call(Csubstitute_call_arg_namesR, expr.sub, env) } diff --git a/inst/tests/programming.Rraw b/inst/tests/programming.Rraw index 959b965153..3d8a056e3a 100644 --- a/inst/tests/programming.Rraw +++ b/inst/tests/programming.Rraw @@ -318,6 +318,18 @@ test(11.067, d[, .(v1, v2), env=list(v1=v1, v2=v2)], data.table(c1=c(1L,1L),c2=c test(11.068, d[, .(sum(v1)), env=list(v1=v1)], d[, .(sum(get(v1)))]) test(11.069, d[, lapply(vN, sum), env=list(vN=as.list(setNames(nm = c(v1, v3))))], d[, lapply(mget(c(v1,v3)), sum)]) test(11.070, d[, c(list(c1=c1, c2=c2), list(v3=v3), list(v4=v4, v5=v5)), env=list(v3=v3,v4=v4,v5=v5)], d) ## d[, c(list(c1, c2), list(get(v3)), mget(c(v4,v5)))] - some are unnamed +# empty input +d = data.table(x=1:2, y=1:4) +test(11.081, d[.i, env=list(.i=substitute()), verbose=TRUE], d, notOutput="after substitute") +test(11.082, d[.i, .j, .by, env=list(.i=substitute(), .j=substitute(), .by=substitute()), verbose=TRUE], d, notOutput="after substitute") +f = function(x, i, j, by) { + x[.i, .j, .by, env=list(.i=substitute(i), .j=substitute(j), .by=substitute(by)), verbose=TRUE] +} +test(11.083, f(d), d) +test(11.084, f(d, 1), d[1], output="Argument 'i' after substitute", notOutput="Argument 'j' after substitute") +test(11.085, f(d,, 1), d[,1], output="Argument 'j' after substitute", notOutput="Argument 'i' after substitute") +test(11.086, f(d, 1, 1), d[1, 1], output="Argument 'j' after substitute.*Argument 'i' after substitute") + #1985 weird exception when by contains get tb = data.table(x=c(1,2), y=c(3,4), z=c(5,6), w=c("a","b")) test(11.101, tb[w != "b", .(x=sum(x)), by=.(y, zz=.z), env=list(.z="z")], data.table(y=3, zz=5, x=1)) From c67d75f5890ddceb0f19031a63d081e4e583d563 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Sun, 9 May 2021 16:25:01 -0600 Subject: [PATCH 36/37] move news item up --- NEWS.md | 62 +++++++++++++++++++++++++++------------------------------ 1 file changed, 29 insertions(+), 33 deletions(-) diff --git a/NEWS.md b/NEWS.md index b0ecf1bc6c..bcaf69f92c 100644 --- a/NEWS.md +++ b/NEWS.md @@ -58,6 +58,35 @@ 9. `melt()` now supports multiple output variable columns via the `variable_table` attribute of `measure.vars`, [#3396](https://github.com/Rdatatable/data.table/issues/3396) [#2575](https://github.com/Rdatatable/data.table/issues/2575) [#2551](https://github.com/Rdatatable/data.table/issues/2551). It should be a `data.table` with one row that describes each element of the `measure.vars` vector(s). These data/columns are copied to the output instead of the usual variable column. This is backwards compatible since the previous behavior (one output variable column) is used when there is no `variable_table`. New function `measure()` which uses either a separator or a regex to create a `measure.vars` list/vector with `variable_table` attribute; useful for melting data that has several distinct pieces of information encoded in each column name. See new `?measure` and new section in reshape vignette. Thanks to Matthias Gomolka, Ananda Mahto, Hugh Parsonage for reporting, and to @tdhock for implementing. +10. A new interface for _programming on data.table_ has been added, [#2655](https://github.com/Rdatatable/data.table/issues/2655) any many other linked issues. It is built using base R's `substitute`-like interface via a new `env` argument to `[.data.table`. For details see the new vignette *programming on data.table*, and the new `?substitute2` manual page. Thanks to numerous users for filing requests, and Jan Gorecki for implementing. + + ```R + DT = data.table(x = 1:5, y = 5:1) + + # parameters + in_col_name = "x" + fun = "sum" + fun_arg1 = "na.rm" + fun_arg1val = TRUE + out_col_name = "sum_x" + + # parameterized query + #DT[, .(out_col_name = fun(in_col_name, fun_arg1=fun_arg1val))] + + # desired query + DT[, .(sum_x = sum(x, na.rm=TRUE))] + + # new interface + DT[, .(out_col_name = fun(in_col_name, fun_arg1=fun_arg1val)), + env = list( + in_col_name = "x", + fun = "sum", + fun_arg1 = "na.rm", + fun_arg1val = TRUE, + out_col_name = "sum_x" + )] + ``` + ## BUG FIXES 1. `by=.EACHI` when `i` is keyed but `on=` different columns than `i`'s key could create an invalidly keyed result, [#4603](https://github.com/Rdatatable/data.table/issues/4603) [#4911](https://github.com/Rdatatable/data.table/issues/4911). Thanks to @myoung3 and @adamaltmejd for reporting, and @ColeMiller1 for the PR. An invalid key is where a `data.table` is marked as sorted by the key columns but the data is not sorted by those columns, leading to incorrect results from subsequent queries. @@ -119,39 +148,6 @@ The community was consulted in [this tweet](https://twitter.com/MattDowle/status/1358011599336931328) before release. -## NEW FEATURES - -1. New interface for _programming on data.table_ has been added. It is built using base R `substitute`-like interface via new `env` argument to `[.data.table`. For details of substitution see new vignette *programming on data.table* and `?substitute2` manual. - -```r -DT = data.table(x = 1:5, y = 5:1) - -# parameters -in_col_name = "x" -fun = "sum" -fun_arg1 = "na.rm" -fun_arg1val = TRUE -out_col_name = "sum_x" - -# parameterized query -#DT[, .(out_col_name = fun(in_col_name, fun_arg1=fun_arg1val))] - -# desired query -DT[, .(sum_x = sum(x, na.rm=TRUE))] - -# new interface -DT[, .(out_col_name = fun(in_col_name, fun_arg1=fun_arg1val)), - env = list( - in_col_name = "x", - fun = "sum", - fun_arg1 = "na.rm", - fun_arg1val = TRUE, - out_col_name = "sum_x" - )] -``` - -Addresses [#2655](https://github.com/Rdatatable/data.table/issues/2655) any many other linked issues. Thanks to numerous users for filling requests for a better flexibility in parameterizing data.table queries. - ## BUG FIXES 1. If `fread()` discards a single line footer, the warning message which includes the discarded text now displays any non-ASCII characters correctly on Windows, [#4747](https://github.com/Rdatatable/data.table/issues/4747). Thanks to @shrektan for reporting and the PR. From d00f292596d82fd21aac59edc34a9f8b388450f4 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Sun, 9 May 2021 16:45:58 -0600 Subject: [PATCH 37/37] merge follow-up to pass programming.Rraw --- R/data.table.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/R/data.table.R b/R/data.table.R index 26948d6db0..c651759ad7 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -162,7 +162,8 @@ replace_dot_alias = function(e) { else if (!isTRUEorFALSE(keyby)) stop("When by and keyby are both provided, keyby must be TRUE or FALSE") } - if (verbose) cat("Argument 'by' after substitute: ", paste(deparse(bysub, width.cutoff=500L), collapse=" "), "\n", sep="") + if (missing(by)) { missingby=TRUE; by=bysub=NULL } # possible when env is used, PR#4304 + else if (verbose) cat("Argument 'by' after substitute: ", paste(deparse(bysub, width.cutoff=500L), collapse=" "), "\n", sep="") } bynull = !missingby && is.null(by) #3530 byjoin = !is.null(by) && is.symbol(bysub) && bysub==".EACHI"