From 8ab9bea333630da92b00bbdcd62370f6ec4ba9f4 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Sun, 7 Jul 2019 18:52:35 +0800 Subject: [PATCH 1/2] Closes #3683 -- test.data.table handles pre-declared datatable.integer64 option correctly --- NEWS.md | 2 ++ inst/tests/tests.Rraw | 61 +++++++++++++++++++++++++++---------------- 2 files changed, 41 insertions(+), 22 deletions(-) diff --git a/NEWS.md b/NEWS.md index 3e46498067..110185772f 100644 --- a/NEWS.md +++ b/NEWS.md @@ -220,6 +220,8 @@ 11. We intend to deprecate the `datatable.nomatch` option, [more info](https://github.com/Rdatatable/data.table/pull/3578/files). A message is now printed upon use of the option (once per session) as a first step. It asks you to please stop using the option and to pass `nomatch=NULL` explicitly if you require inner join. Outer join (`nomatch=NA`) has always been the default because it is safer; it does not drop missing data silently. The problem is that the option is global; i.e., if a user changes the default using this option for their own use, that can change the behavior of joins inside packages that use `data.table` too. This is the only `data.table` option with this concern. +12. `test.data.table()` wasn't careful about the user option `datatable.integer64` which allows users to specify how `fread` treats columns it thinks could be 64-bit integers, [#3683](https://github.com/Rdatatable/data.table/issues/3683). Thanks @xiaguoxin for pointing this out. + ### Changes in [v1.12.2](https://github.com/Rdatatable/data.table/milestone/14?closed=1) (07 Apr 2019) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 5f2e5fceba..6896e676a2 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -2523,24 +2523,32 @@ if (test_bit64) { DT = data.table( a=sample(1:1000,n,replace=TRUE), b=sample(as.integer64(2)^35 * 1:10, n, replace=TRUE), c=sample(c("foo","bar","baz"),n,replace=TRUE) ) - fwrite(DT,f<-tempfile()) + fwrite(DT,f1<-tempfile()) + old = options(datatable.integer64 = 'integer64') test(897, class(DT$b), "integer64") - test(898, fread(f), DT) - unlink(f) - DT[,a2:=as.integer64(a)][,a3:=as.double(a)][,a4:=gsub(" ","",format(a))] - DT[,b2:=as.double(b)][,b3:=gsub(" ","",format(b))] - DT[,r:=a/100][,r2:=gsub(" ","",format(r))] + test(898, fread(f1), DT) + DT[, c('a2', 'a3', 'a4') := .(as.integer64(a), as.double(a), gsub(" ","",format(a)))] + DT[, c('b2', 'b3') := .(as.double(b), gsub(" ","",format(b)))] + DT[, c('r', 'r2') := .(a/100, gsub(" ","",format(a/100)))] DT[112, a2:=as.integer64(12345678901234)] # start on row 112 to avoid the first 100 DT[113, a3:=3.14] DT[114, a4:="123A"] DT[115, b2:=1234567890123.45] DT[116, b3:="12345678901234567890A"] # A is needed otherwise read as double with loss of precision (TO DO: should detect and bump to STR) DT[117, r2:="3.14A"] - fwrite(DT,f<-tempfile()) - test(899.1, fread(f,verbose=TRUE), DT, output="Rereading 6 columns.*out-of-sample.*Column 4.*a2.*int32.*int64.*<<12345678901234>>.*Column 10.*r2.*float64.*string.*<<3.14A>>") - test(899.2, fread(f, colClasses=list(character=c("a4","b3","r2"),integer64="a2",double=c("a3","b2")), verbose=TRUE), + fwrite(DT,f2<-tempfile()) + test(899.1, fread(f2,verbose=TRUE), DT, output="Rereading 6 columns.*out-of-sample.*Column 4.*a2.*int32.*int64.*<<12345678901234>>.*Column 10.*r2.*float64.*string.*<<3.14A>>") + test(899.2, fread(f2, colClasses=list(character=c("a4","b3","r2"),integer64="a2",double=c("a3","b2")), verbose=TRUE), DT, output="Rereading 0 columns due to out-of-sample type exceptions") - unlink(f) + + # #3683 -- add explicit tests for datatable.integer64='character' + options(datatable.integer64 = 'character') + test(899.3, fread(f1), DT[ , .(a, b = as.character(b), c)]) + # leaving integer64='character' version of 899.1,899.2 until #2749 is fixed + + options(old) + unlink(f1) + unlink(f2) } # getwd() has been set by test.data.table() to the location of this tests.Rraw file. Test files should be in the same directory. @@ -2903,7 +2911,11 @@ test(1016.1, sapply(suppressWarnings(fread(f,verbose=TRUE)),"class"), c(A="integ test(1016.2, fread(f, colClasses = c(A="numeric"), verbose=TRUE), copy(DT)[,A:=as.numeric(A)], output="Rereading 0 columns") DT[90, A:="321456789123456"] # inside the sample write.table(DT,f,sep=",",row.names=FALSE,quote=FALSE) -if (test_bit64) test(1017.1, fread(f), copy(DT)[,A:=as.integer64(A)]) +if (test_bit64) { + old = options(datatable.integer64='integer64') + test(1017.1, fread(f), copy(DT)[,A:=as.integer64(A)]) + options(old) +} test(1017.2, fread(f, integer64="character"), DT) unlink(f) @@ -6293,12 +6305,17 @@ quote"\n2,should be ok\n'), quote','should be ok'))) if (test_bit64) { - # quoted multiline (scrambled data thanks to #810) - test(1449, fread(testDir("quoted_multiline.csv"))[c(1,43:44),c(1,22:24),with=FALSE], - data.table(GPMLHTLN=as.integer64(c("3308386085360","3440245203140","1305220146734")), - BLYBZ = c(0L,4L,6L), - ZBJBLOAJAQI = c("LHCYS AYE ZLEMYA IFU HEI JG FEYE","",""), - JKCRUUBAVQ = c("",".\\YAPCNXJ\\004570_850034_757\\VWBZSS_848482_600874_487_PEKT-6-KQTVIL-7_30\\IRVQT\\HUZWLBSJYHZ\\XFWPXQ-WSPJHC-00-0770000855383.KKZ",""))) + old = options(datatable.integer64 = 'integer64') + # quoted multiline (scrambled data thanks to #810) + DT = data.table( + GPMLHTLN = as.integer64(c("3308386085360", "3440245203140", "1305220146734")), + BLYBZ = c(0L,4L,6L), + ZBJBLOAJAQI = c("LHCYS AYE ZLEMYA IFU HEI JG FEYE", "", ""), + JKCRUUBAVQ = c("", ".\\YAPCNXJ\\004570_850034_757\\VWBZSS_848482_600874_487_PEKT-6-KQTVIL-7_30\\IRVQT\\HUZWLBSJYHZ\\XFWPXQ-WSPJHC-00-0770000855383.KKZ", "") + ) + test(1449.1, fread(testDir("quoted_multiline.csv"))[c(1L, 43:44), c(1L, 22:24)], DT) + test(1449.2, fread(testDir("quoted_multiline.csv"), integer64='character', select = 'GPMLHTLN')[c(1L, 43:44)][[1L]], DT[ , as.character(GPMLHTLN)]) + options(old) } # Fix for #927 @@ -6919,14 +6936,14 @@ test(1499, ans1, ans2) # Fix for #488 if (test_bit64) { - test(1500.1, fread("x,y\n3,\n", colClasses = list(integer64 = "y")), + test(1500.1, fread("x,y\n3,\n", colClasses = list(integer64="y"), integer64='integer64'), data.table(x=3L, y=as.integer64(NA))) # more tests after new fix - test(1500.2, fread("x,y\n0,12345678901234\n0,\n0,\n0,\n0,\n,\n,\n,\n,\n,\n,\n,\n,\n,\n,\n,\n12345678901234,\n0,\n0,\n0,\n0,\n0,\n"), - data.table(x=as.integer64(c(rep(0L, 5L), rep(NA, 11), 12345678901234, rep(0L,5L))), - y=as.integer64(c(12345678901234, rep(NA,21))))) + test(1500.2, fread("x,y\n0,12345678901234\n0,\n0,\n0,\n0,\n,\n,\n,\n,\n,\n,\n,\n,\n,\n,\n,\n12345678901234,\n0,\n0,\n0,\n0,\n0,\n", integer64='integer64'), + data.table(x=as.integer64(c(rep(0L, 5L), rep(NA, 11L), 12345678901234, rep(0L, 5L))), + y=as.integer64(c(12345678901234, rep(NA, 21L))))) - x = c("12345678901234", rep("NA", 178), "a") + x = c("12345678901234", rep("NA", 178L), "a") y = sample(letters, length(x), TRUE) ll = paste(x,y, sep=",", collapse="\n") test(1500.3, fread(ll, na.strings=NULL), From ab3ef1c5ac325625eed78ef3d7a6fac3ca1c4490 Mon Sep 17 00:00:00 2001 From: mattdowle Date: Mon, 15 Jul 2019 14:26:24 -0700 Subject: [PATCH 2/2] moved datatable.integer64 up to the top section, and minimized diff --- NEWS.md | 2 +- inst/tests/tests.Rraw | 47 +++++++++++++++++-------------------------- 2 files changed, 19 insertions(+), 30 deletions(-) diff --git a/NEWS.md b/NEWS.md index b44fcbfc7f..d33e8baef2 100644 --- a/NEWS.md +++ b/NEWS.md @@ -224,7 +224,7 @@ 12. The test suite of 9k tests now runs with three R options on: `warnPartialMatchArgs`, `warnPartialMatchAttr`, and `warnPartialMatchDollar`. This ensures that we don't rely on partial argument matching in internal code, for robustness and efficiency, and so that users can turn these options on for their code in production, [#3664](https://github.com/Rdatatable/data.table/issues/3664). Thanks to Vijay Lulla for the suggestion, and Michael Chirico for fixing 48 internal calls to `attr()` which were missing `exact=TRUE`, for example. Thanks to R-core for adding these options to R 2.6.0 (Oct 2007). -13. `test.data.table()` wasn't careful about the user option `datatable.integer64` which allows users to specify how `fread` treats columns it thinks could be 64-bit integers, [#3683](https://github.com/Rdatatable/data.table/issues/3683). Thanks @xiaguoxin for pointing this out. +13. `test.data.table()` could fail if the `datatable.integer64` user option was set, [#3683](https://github.com/Rdatatable/data.table/issues/3683). Thanks @xiaguoxin for reporting. ### Changes in [v1.12.2](https://github.com/Rdatatable/data.table/milestone/14?closed=1) (07 Apr 2019) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index e4ffed4c66..bc14dba34b 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -102,6 +102,7 @@ oldOptions = options( datatable.alloccol = 1024L, datatable.print.class = FALSE, # this is TRUE in cc.R and we like TRUE. But output= tests need to be updated (they assume FALSE currently) datatable.rbindlist.check = NULL, + datatable.integer64 = "integer64", warnPartialMatchArgs = TRUE, # ensure we don't rely on partial argument matching in internal code, #3664 warnPartialMatchAttr = TRUE, warnPartialMatchDollar = TRUE @@ -2530,32 +2531,26 @@ if (test_bit64) { DT = data.table( a=sample(1:1000,n,replace=TRUE), b=sample(as.integer64(2)^35 * 1:10, n, replace=TRUE), c=sample(c("foo","bar","baz"),n,replace=TRUE) ) - fwrite(DT,f1<-tempfile()) - old = options(datatable.integer64 = 'integer64') + fwrite(DT,f<-tempfile()) test(897, class(DT$b), "integer64") - test(898, fread(f1), DT) - DT[, c('a2', 'a3', 'a4') := .(as.integer64(a), as.double(a), gsub(" ","",format(a)))] - DT[, c('b2', 'b3') := .(as.double(b), gsub(" ","",format(b)))] - DT[, c('r', 'r2') := .(a/100, gsub(" ","",format(a/100)))] + test(898, fread(f), DT) + unlink(f) + DT[,a2:=as.integer64(a)][,a3:=as.double(a)][,a4:=gsub(" ","",format(a))] + DT[,b2:=as.double(b)][,b3:=gsub(" ","",format(b))] + DT[,r:=a/100][,r2:=gsub(" ","",format(r))] DT[112, a2:=as.integer64(12345678901234)] # start on row 112 to avoid the first 100 DT[113, a3:=3.14] DT[114, a4:="123A"] DT[115, b2:=1234567890123.45] DT[116, b3:="12345678901234567890A"] # A is needed otherwise read as double with loss of precision (TO DO: should detect and bump to STR) DT[117, r2:="3.14A"] - fwrite(DT,f2<-tempfile()) - test(899.1, fread(f2,verbose=TRUE), DT, output="Rereading 6 columns.*out-of-sample.*Column 4.*a2.*int32.*int64.*<<12345678901234>>.*Column 10.*r2.*float64.*string.*<<3.14A>>") - test(899.2, fread(f2, colClasses=list(character=c("a4","b3","r2"),integer64="a2",double=c("a3","b2")), verbose=TRUE), + fwrite(DT,f<-tempfile()) + test(899.1, fread(f, verbose=TRUE), DT, output="Rereading 6 columns.*out-of-sample.*Column 4.*a2.*int32.*int64.*<<12345678901234>>.*Column 10.*r2.*float64.*string.*<<3.14A>>") + test(899.2, fread(f, colClasses=list(character=c("a4","b3","r2"), integer64="a2", double=c("a3","b2")), verbose=TRUE), DT, output="Rereading 0 columns due to out-of-sample type exceptions") - - # #3683 -- add explicit tests for datatable.integer64='character' - options(datatable.integer64 = 'character') - test(899.3, fread(f1), DT[ , .(a, b = as.character(b), c)]) + test(899.3, fread(f, integer64="character", select=c("a","b","c")), DT[, .(a, b=as.character(b), c)]) # leaving integer64='character' version of 899.1,899.2 until #2749 is fixed - - options(old) - unlink(f1) - unlink(f2) + unlink(f) } # getwd() has been set by test.data.table() to the location of this tests.Rraw file. Test files should be in the same directory. @@ -2918,11 +2913,7 @@ test(1016.1, sapply(suppressWarnings(fread(f,verbose=TRUE)),"class"), c(A="integ test(1016.2, fread(f, colClasses = c(A="numeric"), verbose=TRUE), copy(DT)[,A:=as.numeric(A)], output="Rereading 0 columns") DT[90, A:="321456789123456"] # inside the sample write.table(DT,f,sep=",",row.names=FALSE,quote=FALSE) -if (test_bit64) { - old = options(datatable.integer64='integer64') - test(1017.1, fread(f), copy(DT)[,A:=as.integer64(A)]) - options(old) -} +if (test_bit64) test(1017.1, fread(f), copy(DT)[,A:=as.integer64(A)]) test(1017.2, fread(f, integer64="character"), DT) unlink(f) @@ -6312,7 +6303,6 @@ quote"\n2,should be ok\n'), quote','should be ok'))) if (test_bit64) { - old = options(datatable.integer64 = 'integer64') # quoted multiline (scrambled data thanks to #810) DT = data.table( GPMLHTLN = as.integer64(c("3308386085360", "3440245203140", "1305220146734")), @@ -6322,7 +6312,6 @@ if (test_bit64) { ) test(1449.1, fread(testDir("quoted_multiline.csv"))[c(1L, 43:44), c(1L, 22:24)], DT) test(1449.2, fread(testDir("quoted_multiline.csv"), integer64='character', select = 'GPMLHTLN')[c(1L, 43:44)][[1L]], DT[ , as.character(GPMLHTLN)]) - options(old) } # Fix for #927 @@ -6943,14 +6932,14 @@ test(1499, ans1, ans2) # Fix for #488 if (test_bit64) { - test(1500.1, fread("x,y\n3,\n", colClasses = list(integer64="y"), integer64='integer64'), + test(1500.1, fread("x,y\n3,\n", colClasses = list(integer64 = "y")), data.table(x=3L, y=as.integer64(NA))) # more tests after new fix - test(1500.2, fread("x,y\n0,12345678901234\n0,\n0,\n0,\n0,\n,\n,\n,\n,\n,\n,\n,\n,\n,\n,\n,\n12345678901234,\n0,\n0,\n0,\n0,\n0,\n", integer64='integer64'), - data.table(x=as.integer64(c(rep(0L, 5L), rep(NA, 11L), 12345678901234, rep(0L, 5L))), - y=as.integer64(c(12345678901234, rep(NA, 21L))))) + test(1500.2, fread("x,y\n0,12345678901234\n0,\n0,\n0,\n0,\n,\n,\n,\n,\n,\n,\n,\n,\n,\n,\n,\n12345678901234,\n0,\n0,\n0,\n0,\n0,\n"), + data.table(x=as.integer64(c(rep(0L, 5L), rep(NA, 11), 12345678901234, rep(0L,5L))), + y=as.integer64(c(12345678901234, rep(NA,21))))) - x = c("12345678901234", rep("NA", 178L), "a") + x = c("12345678901234", rep("NA", 178), "a") y = sample(letters, length(x), TRUE) ll = paste(x,y, sep=",", collapse="\n") test(1500.3, fread(ll, na.strings=NULL),