From 0e08e615e22ffff9c07302bfe302a3b1c8b915f5 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Date: Tue, 19 Oct 2021 12:47:11 +0200 Subject: [PATCH 1/8] init fix gprod --- src/gsumm.c | 84 ++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 61 insertions(+), 23 deletions(-) diff --git a/src/gsumm.c b/src/gsumm.c index 5bb2620243..2dd460f413 100644 --- a/src/gsumm.c +++ b/src/gsumm.c @@ -1116,13 +1116,15 @@ SEXP gprod(SEXP x, SEXP narmArg) { //clock_t start = clock(); SEXP ans; if (nrow != n) error(_("nrow [%d] != length(x) [%d] in %s"), nrow, n, "gprod"); - long double *s = malloc(ngrp * sizeof(long double)); - if (!s) error(_("Unable to allocate %d * %d bytes for gprod"), ngrp, sizeof(long double)); - for (int i=0; i DBL_MAX) ansd[i] = R_PosInf; + else if (s[i] < -DBL_MAX) ansd[i] = R_NegInf; + else ansd[i] = (double)s[i]; + } + free(s); + } break; case REALSXP: { - const double *xd = REAL(x); - for (int i=0; i max_bd) ansd[i] = R_PosInf; + else if (s[i] < min_bd) ansd[i] = R_NegInf; + else ansd[i] = (int64_t)s[i]; + } + free(s); + } else { + long double *s = malloc(ngrp * sizeof(long double)); + for (int i=0; i DBL_MAX) ansd[i] = R_PosInf; + else if (s[i] < -DBL_MAX) ansd[i] = R_NegInf; + else ansd[i] = (double)s[i]; + } + free(s); + } + } break; default: - free(s); error(_("Type '%s' is not supported by GForce %s. Either add the prefix %s or turn off GForce optimization using options(datatable.optimize=1)"), type2char(TYPEOF(x)), "prod (gprod)", "base::prod(.)"); } - for (int i=0; i DBL_MAX) ansd[i] = R_PosInf; - else if (s[i] < -DBL_MAX) ansd[i] = R_NegInf; - else ansd[i] = (double)s[i]; - } - free(s); copyMostAttrib(x, ans); UNPROTECT(1); // Rprintf(_("this gprod took %8.3f\n"), 1.0*(clock()-start)/CLOCKS_PER_SEC); From cfbd9e3b1b0cee95358a36995ecabe87a9e430a7 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Date: Thu, 21 Oct 2021 17:59:37 +0200 Subject: [PATCH 2/8] finish bit64 --- src/gsumm.c | 35 +++++++++++++---------------------- 1 file changed, 13 insertions(+), 22 deletions(-) diff --git a/src/gsumm.c b/src/gsumm.c index 2dd460f413..6944ddbad0 100644 --- a/src/gsumm.c +++ b/src/gsumm.c @@ -1116,14 +1116,13 @@ SEXP gprod(SEXP x, SEXP narmArg) { //clock_t start = clock(); SEXP ans; if (nrow != n) error(_("nrow [%d] != length(x) [%d] in %s"), nrow, n, "gprod"); + long double *s = malloc(ngrp * sizeof(long double)); + if (!s) error(_("Unable to allocate %d * %d bytes for gprod"), ngrp, sizeof(long double)); + for (int i=0; i DBL_MAX) ansd[i] = R_PosInf; - else if (s[i] < -DBL_MAX) ansd[i] = R_NegInf; + if (s[i] > max_val) ansd[i] = R_PosInf; + else if (s[i] < min_val) ansd[i] = R_NegInf; else ansd[i] = (double)s[i]; } free(s); } break; case REALSXP: { if (INHERITS(x, char_integer64)) { - int64_t *s = malloc(ngrp * sizeof(int64_t)); - for (int i=0; i max_bd) ansd[i] = R_PosInf; - else if (s[i] < min_bd) ansd[i] = R_NegInf; + if (s[i] > max_val) ansd[i] = R_PosInf; + else if (s[i] < min_val) ansd[i] = R_NegInf; else ansd[i] = (int64_t)s[i]; } free(s); } else { - long double *s = malloc(ngrp * sizeof(long double)); - for (int i=0; i DBL_MAX) ansd[i] = R_PosInf; - else if (s[i] < -DBL_MAX) ansd[i] = R_NegInf; + if (s[i] > max_val) ansd[i] = R_PosInf; + else if (s[i] < min_val) ansd[i] = R_NegInf; else ansd[i] = (double)s[i]; } free(s); From 7562da1e5bbcacde47587f441bafbb503458093a Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Date: Thu, 21 Oct 2021 18:27:46 +0200 Subject: [PATCH 3/8] move free outside of cases --- src/gsumm.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/gsumm.c b/src/gsumm.c index 786357c6b7..c3ebc3411a 100644 --- a/src/gsumm.c +++ b/src/gsumm.c @@ -1139,7 +1139,6 @@ SEXP gprod(SEXP x, SEXP narmArg) { else if (s[i] < min_val) ansd[i] = R_NegInf; else ansd[i] = (double)s[i]; } - free(s); } break; case REALSXP: { if (INHERITS(x, char_integer64)) { @@ -1160,7 +1159,6 @@ SEXP gprod(SEXP x, SEXP narmArg) { else if (s[i] < min_val) ansd[i] = R_NegInf; else ansd[i] = (int64_t)s[i]; } - free(s); } else { double min_val = -DBL_MAX, max_val = DBL_MAX; double *ansd = REAL(ans); @@ -1179,12 +1177,13 @@ SEXP gprod(SEXP x, SEXP narmArg) { else if (s[i] < min_val) ansd[i] = R_NegInf; else ansd[i] = (double)s[i]; } - free(s); } } break; default: + free(s); error(_("Type '%s' is not supported by GForce %s. Either add the prefix %s or turn off GForce optimization using options(datatable.optimize=1)"), type2char(TYPEOF(x)), "prod (gprod)", "base::prod(.)"); } + free(s); copyMostAttrib(x, ans); UNPROTECT(1); // Rprintf(_("this gprod took %8.3f\n"), 1.0*(clock()-start)/CLOCKS_PER_SEC); From ae0975a398c5b84ed65e225a1c1cf9ca38affdb1 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Date: Thu, 21 Oct 2021 18:46:29 +0200 Subject: [PATCH 4/8] add test --- inst/tests/tests.Rraw | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 6382a13a85..1813e6349d 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18348,3 +18348,10 @@ test(2225.1, groupingsets(data.table(iris), j=sum(Sepal.Length), by=c('Sp'='Spec test(2225.2, groupingsets(data.table(iris), j=mean(Sepal.Length), by=c('Sp'='Species'), sets=list('Species')), groupingsets(data.table(iris), j=mean(Sepal.Length), by=c('Species'), sets=list('Species'))) +# make gprod work for bit64 # 5225 +if (test_bit64) { + DT = data.table(x=c(lim.integer64(), 1, 1, NA, NA), g=1:2) + test(2226.1, DT[, prod(x), g], DT[, base::prod(x), g]) + test(2226.2, DT[, prod(x, na.rm=TRUE), g], DT[, base::prod(x, na.rm=TRUE), g]) +} + From 2ad25545d64c90462ee2ef2fbfc6658ad44deaf7 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Date: Thu, 21 Oct 2021 18:46:49 +0200 Subject: [PATCH 5/8] delete newline --- inst/tests/tests.Rraw | 1 - 1 file changed, 1 deletion(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 1813e6349d..915024ea8c 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18354,4 +18354,3 @@ if (test_bit64) { test(2226.1, DT[, prod(x), g], DT[, base::prod(x), g]) test(2226.2, DT[, prod(x, na.rm=TRUE), g], DT[, base::prod(x, na.rm=TRUE), g]) } - From 15a6b5cea6e3f0c8a4b5c132927f868999f67cf2 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Date: Thu, 21 Oct 2021 19:06:51 +0200 Subject: [PATCH 6/8] add NEWS --- NEWS.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 5faf40723f..358100f1f9 100644 --- a/NEWS.md +++ b/NEWS.md @@ -448,9 +448,11 @@ # 2: 2021-02-03 # was 18661 # 3: 4611686018427387906 # was error 'please use as.character' ``` - + 47. `tables()` failed with `argument "..." is missing` when called from within a function taking `...`; e.g. `function(...) { tables() }`, [#5197](https://github.com/Rdatatable/data.table/issues/5197). Thanks @greg-minshall for the report and @michaelchirico for the fix. +48. `DT[, prod(int64Col), by=grp]` produced wrong results for `bit64::integer64`, [#5225](https://github.com/Rdatatable/data.table/issues/5225). Thanks to Benjamin Schwendinger for reporting and fixing. + ## NOTES 1. New feature 29 in v1.12.4 (Oct 2019) introduced zero-copy coercion. Our thinking is that requiring you to get the type right in the case of `0` (type double) vs `0L` (type integer) is too inconvenient for you the user. So such coercions happen in `data.table` automatically without warning. Thanks to zero-copy coercion there is no speed penalty, even when calling `set()` many times in a loop, so there's no speed penalty to warn you about either. However, we believe that assigning a character value such as `"2"` into an integer column is more likely to be a user mistake that you would like to be warned about. The type difference (character vs integer) may be the only clue that you have selected the wrong column, or typed the wrong variable to be assigned to that column. For this reason we view character to numeric-like coercion differently and will warn about it. If it is correct, then the warning is intended to nudge you to wrap the RHS with `as.()` so that it is clear to readers of your code that a coercion from character to that type is intended. For example : From 609af689231ada496b7d31c1fc688dfa87b1c130 Mon Sep 17 00:00:00 2001 From: mattdowle Date: Mon, 25 Oct 2021 13:00:02 -0600 Subject: [PATCH 7/8] change fix to keep double being returned --- NEWS.md | 2 +- inst/tests/tests.Rraw | 12 ++++++++---- src/gsumm.c | 39 ++++++++++++--------------------------- 3 files changed, 21 insertions(+), 32 deletions(-) diff --git a/NEWS.md b/NEWS.md index 358100f1f9..e89778c544 100644 --- a/NEWS.md +++ b/NEWS.md @@ -451,7 +451,7 @@ 47. `tables()` failed with `argument "..." is missing` when called from within a function taking `...`; e.g. `function(...) { tables() }`, [#5197](https://github.com/Rdatatable/data.table/issues/5197). Thanks @greg-minshall for the report and @michaelchirico for the fix. -48. `DT[, prod(int64Col), by=grp]` produced wrong results for `bit64::integer64`, [#5225](https://github.com/Rdatatable/data.table/issues/5225). Thanks to Benjamin Schwendinger for reporting and fixing. +48. `DT[, prod(int64Col), by=grp]` produced wrong results for `bit64::integer64` due to incorrect optimization, [#5225](https://github.com/Rdatatable/data.table/issues/5225). Thanks to Benjamin Schwendinger for reporting and fixing. ## NOTES diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 915024ea8c..255a3282bb 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18348,9 +18348,13 @@ test(2225.1, groupingsets(data.table(iris), j=sum(Sepal.Length), by=c('Sp'='Spec test(2225.2, groupingsets(data.table(iris), j=mean(Sepal.Length), by=c('Sp'='Species'), sets=list('Species')), groupingsets(data.table(iris), j=mean(Sepal.Length), by=c('Species'), sets=list('Species'))) -# make gprod work for bit64 # 5225 +# make gprod work for bit64, #5225 if (test_bit64) { - DT = data.table(x=c(lim.integer64(), 1, 1, NA, NA), g=1:2) - test(2226.1, DT[, prod(x), g], DT[, base::prod(x), g]) - test(2226.2, DT[, prod(x, na.rm=TRUE), g], DT[, base::prod(x, na.rm=TRUE), g]) + # base::prod always returns double; we like that and follow that lead in data.table + # despite bit64 having its own prod method which returns integer64 currently + test(2226.1, base::prod(2147483647L,2L), 4294967294) # just to demonstrate + DT = data.table(x=c(lim.integer64(), 2, 2, NA, NA, -2, 4), g=INT(1,2,1,2,1,2,3,3)) + test(2226.2, DT[, prod(x), g], data.table(g=1:3, V1=c(NA,NA,-8))) + test(2226.3, signif(DT[, prod(x,na.rm=TRUE), g]$V1, 6), c(-1.84467e19, +1.84467e19, -8)) } + diff --git a/src/gsumm.c b/src/gsumm.c index c3ebc3411a..6a96cad829 100644 --- a/src/gsumm.c +++ b/src/gsumm.c @@ -1114,16 +1114,12 @@ SEXP gprod(SEXP x, SEXP narmArg) { const bool nosubset = irowslen==-1; const int n = nosubset ? length(x) : irowslen; //clock_t start = clock(); - SEXP ans; if (nrow != n) error(_("nrow [%d] != length(x) [%d] in %s"), nrow, n, "gprod"); long double *s = malloc(ngrp * sizeof(long double)); if (!s) error(_("Unable to allocate %d * %d bytes for gprod"), ngrp, sizeof(long double)); for (int i=0; i max_val) ansd[i] = R_PosInf; - else if (s[i] < min_val) ansd[i] = R_NegInf; - else ansd[i] = (double)s[i]; - } - } break; + }} + break; case REALSXP: { if (INHERITS(x, char_integer64)) { - int64_t min_val = INT64_MIN, max_val = INT64_MAX; - int64_t *ansd = (int64_t *)REAL(ans); const int64_t *xd = (const int64_t *)REAL(x); for (int i=0; i max_val) ansd[i] = R_PosInf; - else if (s[i] < min_val) ansd[i] = R_NegInf; - else ansd[i] = (int64_t)s[i]; - } } else { - double min_val = -DBL_MAX, max_val = DBL_MAX; - double *ansd = REAL(ans); const double *xd = REAL(x); for (int i=0; i max_val) ansd[i] = R_PosInf; - else if (s[i] < min_val) ansd[i] = R_NegInf; - else ansd[i] = (double)s[i]; - } } } break; default: free(s); error(_("Type '%s' is not supported by GForce %s. Either add the prefix %s or turn off GForce optimization using options(datatable.optimize=1)"), type2char(TYPEOF(x)), "prod (gprod)", "base::prod(.)"); } + SEXP ans = PROTECT(allocVector(REALSXP, ngrp)); + double *ansd = REAL(ans); + for (int i=0; i DBL_MAX) ansd[i] = R_PosInf; + else if (s[i] < -DBL_MAX) ansd[i] = R_NegInf; + else ansd[i] = (double)s[i]; + } free(s); - copyMostAttrib(x, ans); + if (!INHERITS(x, char_integer64)) + copyMostAttrib(x, ans); UNPROTECT(1); // Rprintf(_("this gprod took %8.3f\n"), 1.0*(clock()-start)/CLOCKS_PER_SEC); return(ans); From b4f83489398ab6539bbd7deee015b8f52f757090 Mon Sep 17 00:00:00 2001 From: mattdowle Date: Tue, 16 Nov 2021 04:01:29 -0700 Subject: [PATCH 8/8] back to returning integer64 --- inst/tests/tests.Rraw | 10 ++++------ src/gsumm.c | 22 ++++++++++++++-------- 2 files changed, 18 insertions(+), 14 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 255a3282bb..0bd814862a 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18350,11 +18350,9 @@ test(2225.2, groupingsets(data.table(iris), j=mean(Sepal.Length), by=c('Sp'='Spe # make gprod work for bit64, #5225 if (test_bit64) { - # base::prod always returns double; we like that and follow that lead in data.table - # despite bit64 having its own prod method which returns integer64 currently - test(2226.1, base::prod(2147483647L,2L), 4294967294) # just to demonstrate - DT = data.table(x=c(lim.integer64(), 2, 2, NA, NA, -2, 4), g=INT(1,2,1,2,1,2,3,3)) - test(2226.2, DT[, prod(x), g], data.table(g=1:3, V1=c(NA,NA,-8))) - test(2226.3, signif(DT[, prod(x,na.rm=TRUE), g]$V1, 6), c(-1.84467e19, +1.84467e19, -8)) + test(2226.1, base::prod(2147483647L,2L), 4294967294) # just to illustrate that base returns double + DT = data.table(x=c(lim.integer64(), 2, 1, NA, NA, -2, 4), g=INT(1,2,1,2,1,2,3,3)) + test(2226.2, DT[, prod(x), g], data.table(g=1:3, V1=as.integer64(c(NA,NA,-8L)))) + test(2226.3, DT[, prod(x,na.rm=TRUE), g], data.table(g=1:3, V1=as.integer64(c(NA,"9223372036854775807",-8L)))) } diff --git a/src/gsumm.c b/src/gsumm.c index 6a96cad829..2a2c0cdd4e 100644 --- a/src/gsumm.c +++ b/src/gsumm.c @@ -1161,18 +1161,24 @@ SEXP gprod(SEXP x, SEXP narmArg) { error(_("Type '%s' is not supported by GForce %s. Either add the prefix %s or turn off GForce optimization using options(datatable.optimize=1)"), type2char(TYPEOF(x)), "prod (gprod)", "base::prod(.)"); } SEXP ans = PROTECT(allocVector(REALSXP, ngrp)); - double *ansd = REAL(ans); - for (int i=0; i DBL_MAX) ansd[i] = R_PosInf; - else if (s[i] < -DBL_MAX) ansd[i] = R_NegInf; - else ansd[i] = (double)s[i]; + if (INHERITS(x, char_integer64)) { + int64_t *ansd = (int64_t *)REAL(ans); + for (int i=0; iINT64_MAX || s[i]<=INT64_MIN) ? NA_INTEGER64 : (int64_t)s[i]; + } + } else { + double *ansd = REAL(ans); + for (int i=0; i DBL_MAX) ansd[i] = R_PosInf; + else if (s[i] < -DBL_MAX) ansd[i] = R_NegInf; + else ansd[i] = (double)s[i]; + } } free(s); - if (!INHERITS(x, char_integer64)) - copyMostAttrib(x, ans); + copyMostAttrib(x, ans); UNPROTECT(1); // Rprintf(_("this gprod took %8.3f\n"), 1.0*(clock()-start)/CLOCKS_PER_SEC); - return(ans); + return ans; } SEXP gshift(SEXP x, SEXP nArg, SEXP fillArg, SEXP typeArg) {