From 1df0de5c3026a9a172127f1e24c762a94bc18807 Mon Sep 17 00:00:00 2001 From: Ofek Shilon Date: Thu, 4 Feb 2021 22:09:21 +0200 Subject: [PATCH 01/10] Fix #4889: setDF now deletes the index attribute --- R/data.table.R | 1 + 1 file changed, 1 insertion(+) diff --git a/R/data.table.R b/R/data.table.R index 2b010db77a..0ba0283330 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -2653,6 +2653,7 @@ setDF = function(x, rownames=NULL) { setattr(x, "class", "data.frame") setattr(x, "sorted", NULL) setattr(x, ".internal.selfref", NULL) + setattr(x, "index", NULL) # bug 4883 } else if (is.data.frame(x)) { if (!is.null(rownames)) { if (length(rownames) != nrow(x)) From b2f76d5ba3588a61d095157f3cc9d58f196410e3 Mon Sep 17 00:00:00 2001 From: Ofek Shilon Date: Mon, 8 Feb 2021 17:25:07 +0200 Subject: [PATCH 02/10] Added a test, NEWS item and DESCRIPTION, per CR comment --- DESCRIPTION | 3 ++- NEWS.md | 1 + inst/tests/tests.Rraw | 8 ++++++++ 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index c7820bbb34..63fdd97db4 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -61,7 +61,8 @@ Authors@R: c( person("Vaclav","Tlapak", role="ctb"), person("Kevin","Ushey", role="ctb"), person("Dirk","Eddelbuettel", role="ctb"), - person("Ben","Schwen", role="ctb")) + person("Ben","Schwen", role="ctb"), + person("Ofek", "Shilon", role="ctb")) Depends: R (>= 3.1.0) Imports: methods Suggests: bit64 (>= 4.0.0), bit (>= 4.0.4), curl, R.utils, xts, nanotime, zoo (>= 1.8-1), yaml, knitr, rmarkdown diff --git a/NEWS.md b/NEWS.md index e79ab4a795..63870be958 100644 --- a/NEWS.md +++ b/NEWS.md @@ -16,6 +16,7 @@ 2. `r-datatable.com` continues to be the short, canonical and long-standing URL which forwards to the current homepage. The homepage domain has changed a few times over the years but those using `r-datatable.com` did not need to change their links. For example, we use `r-datatable.com` in messages (and translated messages) in preference to the word 'homepage' to save users time in searching for the current homepage. The web forwarding was provided by Domain Monster but they do not support `https://r-datatable.com`, only `http://r-datatable.com`, despite the homepage being forwarded to being `https:` for many years. Meanwhile, CRAN submission checks now require all URLs to be `https:`, rejecting `http:`. Therefore we have moved to [gandi.net](https://www.gandi.net) who do support `https:` web forwarding and so [https://r-datatable.com](https://r-datatable.com) now forwards correctly. Thanks to Dirk Eddelbuettel for suggesting Gandi. Further, Gandi allows the web-forward to be marked 301 (permanent) or 302 (temporary). Since the very point of `https://r-datatable.com` is to be a forward, 302 is appropriate in this case. This enables us to link to it in DESCRIPTION, README, and this NEWS item. Otherwise, CRAN submission checks would require the 301 forward to be followed; i.e. the forward replaced with where it points to and the package resubmitted. Thanks to Uwe Ligges for explaining this distinction. +3. setDF now clears existing indices, as it did for other data.table-only attributes. Without this, sometimes bracket operator returned wrong results [#4889](https://github.com/Rdatatable/data.table/issues/4889). # data.table [v1.13.6](https://github.com/Rdatatable/data.table/milestone/22?closed=1) (30 Dec 2020) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 2b44b3038f..0cc2c16cda 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17261,3 +17261,11 @@ if (identical(x, enc2native(x))) { # fintersect now preserves order of first argument like intersect, #4716 test(2163, fintersect(data.table(x=c("b", "c", "a")), data.table(x=c("a","c")))$x, c("c", "a")) + +# setDF now drops index attributes, #4889 +d <- data.table(a=1:100, b=1:100) +invisible(d[a == 50]) +setDF(d) +d[1:50, "a"] <- d[51:100, "a"] +setDT(d) +test(2164, nrow(d[a==99]), 2) \ No newline at end of file From 43c738fdc47093dff189bbc26adac770f6426e65 Mon Sep 17 00:00:00 2001 From: Ofek Shilon Date: Tue, 9 Feb 2021 14:50:11 +0200 Subject: [PATCH 03/10] Fixed NEWS, tests and ?setDF. Fix buffer overrun warning. --- NEWS.md | 3 ++- inst/tests/tests.Rraw | 2 +- man/setDF.Rd | 2 +- src/utils.c | 6 +++--- 4 files changed, 7 insertions(+), 6 deletions(-) diff --git a/NEWS.md b/NEWS.md index 63870be958..443ad78bdd 100644 --- a/NEWS.md +++ b/NEWS.md @@ -10,13 +10,14 @@ 2. `fintersect()` now retains the order of the first argument as reasonably expected, rather than retaining the order of the second argument, [#4716](https://github.com/Rdatatable/data.table/issues/4716). Thanks to Michel Lang for reporting, and Ben Schwen for the PR. +3. setDF now clears existing indices, as it did for other data.table-only attributes. Without this, sometimes bracket operator returned wrong results [#4889](https://github.com/Rdatatable/data.table/issues/4889). + ## NOTES 1. Compiling from source no longer requires `zlib` header files to be available, [#4844](https://github.com/Rdatatable/data.table/pull/4844). The output suggests installing `zlib` headers, and how (e.g. `zlib1g-dev` on Ubuntu) as before, but now proceeds with `gzip` compression disabled in `fwrite`. Upon calling `fwrite(DT, "file.csv.gz")` at runtime, an error message suggests to reinstall `data.table` with `zlib` headers available. This does not apply to users on Windows or Mac who install the pre-compiled binary package from CRAN. 2. `r-datatable.com` continues to be the short, canonical and long-standing URL which forwards to the current homepage. The homepage domain has changed a few times over the years but those using `r-datatable.com` did not need to change their links. For example, we use `r-datatable.com` in messages (and translated messages) in preference to the word 'homepage' to save users time in searching for the current homepage. The web forwarding was provided by Domain Monster but they do not support `https://r-datatable.com`, only `http://r-datatable.com`, despite the homepage being forwarded to being `https:` for many years. Meanwhile, CRAN submission checks now require all URLs to be `https:`, rejecting `http:`. Therefore we have moved to [gandi.net](https://www.gandi.net) who do support `https:` web forwarding and so [https://r-datatable.com](https://r-datatable.com) now forwards correctly. Thanks to Dirk Eddelbuettel for suggesting Gandi. Further, Gandi allows the web-forward to be marked 301 (permanent) or 302 (temporary). Since the very point of `https://r-datatable.com` is to be a forward, 302 is appropriate in this case. This enables us to link to it in DESCRIPTION, README, and this NEWS item. Otherwise, CRAN submission checks would require the 301 forward to be followed; i.e. the forward replaced with where it points to and the package resubmitted. Thanks to Uwe Ligges for explaining this distinction. -3. setDF now clears existing indices, as it did for other data.table-only attributes. Without this, sometimes bracket operator returned wrong results [#4889](https://github.com/Rdatatable/data.table/issues/4889). # data.table [v1.13.6](https://github.com/Rdatatable/data.table/milestone/22?closed=1) (30 Dec 2020) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 0cc2c16cda..38e6a8ad53 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17268,4 +17268,4 @@ invisible(d[a == 50]) setDF(d) d[1:50, "a"] <- d[51:100, "a"] setDT(d) -test(2164, nrow(d[a==99]), 2) \ No newline at end of file +test(2164, nrow(d[a==99]), as.integer(2)) \ No newline at end of file diff --git a/man/setDF.Rd b/man/setDF.Rd index f50c9ae491..57cba39433 100644 --- a/man/setDF.Rd +++ b/man/setDF.Rd @@ -15,7 +15,7 @@ setDF(x, rownames=NULL) } \details{ - All \code{data.table} attributes including any keys of the input data.table are stripped off. + All \code{data.table} attributes including any keys and indices of the input data.table are stripped off. When using \code{rownames}, recall that the row names of a \code{data.frame} must be unique. By default, the assigned set of row names is simply the sequence 1, \ldots, \code{nrow(x)} (or \code{length(x)} for \code{list}s). } diff --git a/src/utils.c b/src/utils.c index b1cb3b1aed..b98f1c7586 100644 --- a/src/utils.c +++ b/src/utils.c @@ -378,11 +378,11 @@ SEXP coerceUtf8IfNeeded(SEXP x) { #include #endif SEXP dt_zlib_version() { - char out[51]; + char out[71]; #ifndef NOZLIB - snprintf(out, 50, "zlibVersion()==%s ZLIB_VERSION==%s", zlibVersion(), ZLIB_VERSION); + snprintf(out, 70, "zlibVersion()==%s ZLIB_VERSION==%s", zlibVersion(), ZLIB_VERSION); #else - snprintf(out, 50, "zlib header files were not found when data.table was compiled"); + snprintf(out, 70, "zlib header files were not found when data.table was compiled"); #endif return ScalarString(mkChar(out)); } From 42210c7a7ee1deb92ea0b6d523002c0bd30b8376 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Tue, 9 Feb 2021 19:35:13 -0800 Subject: [PATCH 04/10] wording/grammar/style in NEWS --- NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 443ad78bdd..5ca1dc98bc 100644 --- a/NEWS.md +++ b/NEWS.md @@ -10,7 +10,7 @@ 2. `fintersect()` now retains the order of the first argument as reasonably expected, rather than retaining the order of the second argument, [#4716](https://github.com/Rdatatable/data.table/issues/4716). Thanks to Michel Lang for reporting, and Ben Schwen for the PR. -3. setDF now clears existing indices, as it did for other data.table-only attributes. Without this, sometimes bracket operator returned wrong results [#4889](https://github.com/Rdatatable/data.table/issues/4889). +3. `setDF()` now clears existing indices, as it does for other `data.table`-only attributes. Doing so prevents some errors that could happen if the `setDT()` is later applied after the indices are mutated; see [#4889](https://github.com/Rdatatable/data.table/issues/4889). Thanks @OfekShilon for the report and the fix. ## NOTES From ec2051cda7012271795fdb7e129a8ded2e468f1d Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Tue, 9 Feb 2021 19:35:53 -0800 Subject: [PATCH 05/10] terminal newline --- inst/tests/tests.Rraw | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 38e6a8ad53..a4bfb281ef 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17268,4 +17268,4 @@ invisible(d[a == 50]) setDF(d) d[1:50, "a"] <- d[51:100, "a"] setDT(d) -test(2164, nrow(d[a==99]), as.integer(2)) \ No newline at end of file +test(2164, nrow(d[a==99]), as.integer(2)) From 3251bf41b68b1de62dad9516aaccb62e45eeb9be Mon Sep 17 00:00:00 2001 From: Ofek Shilon Date: Wed, 10 Feb 2021 19:55:19 +0200 Subject: [PATCH 06/10] Improve test (apply CR) --- inst/tests/tests.Rraw | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index a4bfb281ef..a2dc51f1ee 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17264,7 +17264,7 @@ test(2163, fintersect(data.table(x=c("b", "c", "a")), data.table(x=c("a","c")))$ # setDF now drops index attributes, #4889 d <- data.table(a=1:100, b=1:100) -invisible(d[a == 50]) +setindex(d, a) setDF(d) d[1:50, "a"] <- d[51:100, "a"] setDT(d) From d935887baa185bf4646083c6111e0a18e0de26a0 Mon Sep 17 00:00:00 2001 From: Ofek Shilon Date: Mon, 10 May 2021 10:42:37 +0300 Subject: [PATCH 07/10] Clean index in 2 more places --- R/data.table.R | 2 ++ 1 file changed, 2 insertions(+) diff --git a/R/data.table.R b/R/data.table.R index 7ae43dec1e..f2938c0621 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -2093,6 +2093,7 @@ as.data.frame.data.table = function(x, ...) setattr(ans,"class","data.frame") setattr(ans,"sorted",NULL) # remove so if you convert to df, do something, and convert back, it is not sorted setattr(ans,".internal.selfref",NULL) + setattr(ans, "index", NULL) # bug 4883 # leave tl intact, no harm, ans } @@ -2109,6 +2110,7 @@ as.list.data.table = function(x, ...) { setattr(ans, "row.names", NULL) setattr(ans, "sorted", NULL) setattr(ans,".internal.selfref", NULL) # needed to pass S4 tests for example + setattr(ans, "index", NULL) # bug 4883 ans } From faf4f2057d937282d693bccd7bfa086d49a79542 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Tue, 27 Jul 2021 11:54:24 -0600 Subject: [PATCH 08/10] News item moved up --- NEWS.md | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/NEWS.md b/NEWS.md index e0c244287e..34542e8a95 100644 --- a/NEWS.md +++ b/NEWS.md @@ -149,7 +149,7 @@ 26. `melt()` now outputs scalar logical `NA` instead of `NULL` in rows corresponding to missing list columns, for consistency with non-list columns when using `na.rm=TRUE`, [#5053](https://github.com/Rdatatable/data.table/pull/5053). Thanks to Toby Dylan Hocking for the PR. -27. `as.data.frame(DT)` now removes any indices in addition to removing any key, [#5042](https://github.com/Rdatatable/data.table/issues/5042). When indices were left intact, a subsequent subset or reorder of the `data.frame` would not update the indices since they are treated just like any other `data.frame` attribute, causing incorrect results if the result is later converted back to `data.table` again. +27. `as.data.frame(DT)`, `setDF(DT)` and `as.list(DT)` now remove the `"index"` attribute which contains any indices (a.k.a. secondary keys), as they already did for other `data.table`-only attributes such as the primary key stored in the `"sorted"` attribute. When indices were left intact, a subsequent subset or reorder of the `data.frame` by `data.frame`-code in base R or other packages would not update the indices, causing incorrect results if then converted back to `data.table`, [#4889](https://github.com/Rdatatable/data.table/issues/4889) [#5042](https://github.com/Rdatatable/data.table/issues/5042). Thanks @OfekShilon for the report and the PR. ## NOTES @@ -202,8 +202,6 @@ 2. `fintersect()` now retains the order of the first argument as reasonably expected, rather than retaining the order of the second argument, [#4716](https://github.com/Rdatatable/data.table/issues/4716). Thanks to Michel Lang for reporting, and Ben Schwen for the PR. -3. `setDF()` now clears existing indices, as it does for other `data.table`-only attributes. Doing so prevents some errors that could happen if the `setDT()` is later applied after the indices are mutated; see [#4889](https://github.com/Rdatatable/data.table/issues/4889). Thanks @OfekShilon for the report and the fix. - ## NOTES 1. Compiling from source no longer requires `zlib` header files to be available, [#4844](https://github.com/Rdatatable/data.table/pull/4844). The output suggests installing `zlib` headers, and how (e.g. `zlib1g-dev` on Ubuntu) as before, but now proceeds with `gzip` compression disabled in `fwrite`. Upon calling `fwrite(DT, "file.csv.gz")` at runtime, an error message suggests to reinstall `data.table` with `zlib` headers available. This does not apply to users on Windows or Mac who install the pre-compiled binary package from CRAN. From 0b0d5f3ddc69e5a39849d34144abe20d82db2f46 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Tue, 27 Jul 2021 12:22:33 -0600 Subject: [PATCH 09/10] tidy merge with #5043 --- R/data.table.R | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index 611e02b901..5a7da164be 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -2140,9 +2140,8 @@ as.data.frame.data.table = function(x, ...) setattr(ans,"row.names",.set_row_names(nrow(x))) # since R 2.4.0, data.frames can have non-character row names setattr(ans,"class","data.frame") setattr(ans,"sorted",NULL) # remove so if you convert to df, do something, and convert back, it is not sorted - setattr(ans,"index",NULL) #5042 + setattr(ans,"index",NULL) #4883 #5042 setattr(ans,".internal.selfref",NULL) - setattr(ans, "index", NULL) # bug 4883 # leave tl intact, no harm, ans } @@ -2158,8 +2157,8 @@ as.list.data.table = function(x, ...) { setattr(ans, "class", NULL) setattr(ans, "row.names", NULL) setattr(ans, "sorted", NULL) + setattr(ans, "index", NULL) #4883 #5042 setattr(ans,".internal.selfref", NULL) # needed to pass S4 tests for example - setattr(ans, "index", NULL) # bug 4883 ans } @@ -2718,8 +2717,8 @@ setDF = function(x, rownames=NULL) { setattr(x, "row.names", rn) setattr(x, "class", "data.frame") setattr(x, "sorted", NULL) + setattr(x, "index", NULL) #4883 #5042 setattr(x, ".internal.selfref", NULL) - setattr(x, "index", NULL) # bug 4883 } else if (is.data.frame(x)) { if (!is.null(rownames)) { if (length(rownames) != nrow(x)) From d38211dd44e455010a29a71838c0f71e43de30ba Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Tue, 27 Jul 2021 12:26:15 -0600 Subject: [PATCH 10/10] correct issue number 4883=>4889 --- R/data.table.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index 5a7da164be..0a5a38785b 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -2140,7 +2140,7 @@ as.data.frame.data.table = function(x, ...) setattr(ans,"row.names",.set_row_names(nrow(x))) # since R 2.4.0, data.frames can have non-character row names setattr(ans,"class","data.frame") setattr(ans,"sorted",NULL) # remove so if you convert to df, do something, and convert back, it is not sorted - setattr(ans,"index",NULL) #4883 #5042 + setattr(ans,"index",NULL) #4889 #5042 setattr(ans,".internal.selfref",NULL) # leave tl intact, no harm, ans @@ -2157,7 +2157,7 @@ as.list.data.table = function(x, ...) { setattr(ans, "class", NULL) setattr(ans, "row.names", NULL) setattr(ans, "sorted", NULL) - setattr(ans, "index", NULL) #4883 #5042 + setattr(ans, "index", NULL) #4889 #5042 setattr(ans,".internal.selfref", NULL) # needed to pass S4 tests for example ans } @@ -2717,7 +2717,7 @@ setDF = function(x, rownames=NULL) { setattr(x, "row.names", rn) setattr(x, "class", "data.frame") setattr(x, "sorted", NULL) - setattr(x, "index", NULL) #4883 #5042 + setattr(x, "index", NULL) #4889 #5042 setattr(x, ".internal.selfref", NULL) } else if (is.data.frame(x)) { if (!is.null(rownames)) {