From ae93abdfeee2b15cdaa5e6b92f66307da6dc533d Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Sun, 8 Sep 2019 05:16:07 +0800 Subject: [PATCH 1/3] make units explicit on implicit difftime by group --- NEWS.md | 2 ++ R/data.table.R | 13 +++++++++++++ inst/tests/tests.Rraw | 6 ++++++ 3 files changed, 21 insertions(+) diff --git a/NEWS.md b/NEWS.md index e3820812ee..c18b71e1b4 100644 --- a/NEWS.md +++ b/NEWS.md @@ -284,6 +284,8 @@ 38. `rbindlist` now returns correct idcols for lists with different length vectors, [#3785](https://github.com/Rdatatable/data.table/issues/3785), [#3786](https://github.com/Rdatatable/data.table/pull/3786). Thanks to @shrektan for the report and fix. +39. Subtracting two `POSIXt` objects by group could lead to incorrect results because the `base` method internally calls `difftime` with `units='auto'`; `data.table` does not notice if the chosen units differ by group and only the last group's `units` attribute was retained, [#3694](https://github.com/Rdatatable/data.table/issues/3694) and [#761](https://github.com/Rdatatable/data.table/issues/761). To surmount this, we now internally force `units='secs'` on all `POSIXt-POSIXt` calls (reported when `verbose=TRUE`); generally we recommend calling `difftime` directly instead. Thanks @oliver-oliver and @boethian for the reports. + #### NOTES 1. `rbindlist`'s `use.names="check"` now emits its message for automatic column names (`"V[0-9]+"`) too, [#3484](https://github.com/Rdatatable/data.table/pull/3484). See news item 5 of v1.12.2 below. diff --git a/R/data.table.R b/R/data.table.R index 91ede5d212..08d7d82d79 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -1322,6 +1322,19 @@ replace_dot_alias = function(e) { SDenv$.N = vector("integer", 1L) # explicit new vector (not 0L or as.integer() which might return R's internal small-integer global) SDenv$.GRP = vector("integer", 1L) # because written to by reference at C level (one write per group). TODO: move this alloc to C level + # #3694/#761 common gotcha -- doing t1-t0 by group, but -.POSIXt uses units='auto' + # independently by group & attr mismatch among groups is ignored. The latter + # is a more general issue but the former can be fixed by forcing units='secs' + SDenv$`-.POSIXt` = function(e1, e2) { + if (inherits(e2, 'POSIXt')) { + if (verbose && !exists('done_units_report', parent.frame())) { + cat('\nNote: forcing units="secs" on implicit difftime by group; call difftime explicitly to choose custom units') + assign('done_units_report', TRUE, parent.frame()) + } + return(difftime(e1, e2, units='secs')) + } else return(base::`-.POSIXt`(e1, e2)) + } + if (byjoin) { # The groupings come instead from each row of the i data.table. # Much faster for a few known groups vs a 'by' for all followed by a subset diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 93aa828bf2..87ad39d573 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -15936,6 +15936,12 @@ test(2100.14, fifelse(c(T,F,NA),c(1,1,1),c(2,2,2),NA), c(1,2,NA)) DT = data.table(id=1:3, v=4:6, key="id") test(2101, DT[.(logical())], data.table(id=logical(), v=integer(), key="id")) +# implicit difftime by group can choose different units but are later ignored, #3694 & #761 (among others) +del = c(0, 60, 3600, 86400) +DT = data.table(ID = 1:4, t0 = .POSIXct(0), t1 = .POSIXct(del)) +test(2102.1, DT[ , t1-t0, by=ID], data.table(ID=1:4, V1 = as.difftime(del, units='secs'))) +test(2102.2, DT[ , t1-t0, by=ID, verbose=TRUE], output='Note: forcing units="secs"') + ################################### # Add new tests above this line # From 8eda955ceb669525ced93d9b688be22373df0991 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Sun, 8 Sep 2019 05:20:35 +0800 Subject: [PATCH 2/3] one more test for good measure --- inst/tests/tests.Rraw | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 87ad39d573..6752666e20 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -15938,9 +15938,10 @@ test(2101, DT[.(logical())], data.table(id=logical(), v=integer(), key="id")) # implicit difftime by group can choose different units but are later ignored, #3694 & #761 (among others) del = c(0, 60, 3600, 86400) -DT = data.table(ID = 1:4, t0 = .POSIXct(0), t1 = .POSIXct(del)) -test(2102.1, DT[ , t1-t0, by=ID], data.table(ID=1:4, V1 = as.difftime(del, units='secs'))) +DT = data.table(ID=1:4, t0=.POSIXct(0), t1=.POSIXct(del)) +test(2102.1, DT[ , t1-t0, by=ID], data.table(ID=1:4, V1=as.difftime(del, units='secs'))) test(2102.2, DT[ , t1-t0, by=ID, verbose=TRUE], output='Note: forcing units="secs"') +test(2102.3, DT[ , t1-5, by=ID], data.table(ID, V1=.POSIXct(del-5))) ################################### From 44d82c2081fb25937a72622426b756a62e452925 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Sun, 8 Sep 2019 19:14:16 +0800 Subject: [PATCH 3/3] fix test --- inst/tests/tests.Rraw | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 6752666e20..f26729f1b9 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -15941,7 +15941,7 @@ del = c(0, 60, 3600, 86400) DT = data.table(ID=1:4, t0=.POSIXct(0), t1=.POSIXct(del)) test(2102.1, DT[ , t1-t0, by=ID], data.table(ID=1:4, V1=as.difftime(del, units='secs'))) test(2102.2, DT[ , t1-t0, by=ID, verbose=TRUE], output='Note: forcing units="secs"') -test(2102.3, DT[ , t1-5, by=ID], data.table(ID, V1=.POSIXct(del-5))) +test(2102.3, DT[ , t1-5, by=ID], data.table(ID=1:4, V1=.POSIXct(del-5))) ###################################