From 8eb2dc854da72f9d6c685d30e4131c3cb1e8fa70 Mon Sep 17 00:00:00 2001 From: Cole Miller <57992489+ColeMiller1@users.noreply.github.com> Date: Sun, 8 Aug 2021 17:57:27 -0400 Subject: [PATCH 1/7] Update fwrite.R Include row.names from R --- R/fwrite.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/R/fwrite.R b/R/fwrite.R index c822b05678..74d127c875 100644 --- a/R/fwrite.R +++ b/R/fwrite.R @@ -36,6 +36,7 @@ fwrite = function(x, file="", append=FALSE, quote="auto", nThread = as.integer(nThread) # write.csv default is 'double' so fwrite follows suit. write.table's default is 'escape' # validate arguments + rn = if (row.names) row.names(x) else NULL # allocate row.names in R to address integer row.names #4957 if (is.matrix(x)) { # coerce to data.table if input object is matrix messagef("x being coerced from class: matrix to data.table") x = as.data.table(x) @@ -111,7 +112,7 @@ fwrite = function(x, file="", append=FALSE, quote="auto", file = enc2native(file) # CfwriteR cannot handle UTF-8 if that is not the native encoding, see #3078. .Call(CfwriteR, x, file, sep, sep2, eol, na, dec, quote, qmethod=="escape", append, row.names, col.names, logical01, scipen, dateTimeAs, buffMB, nThread, - showProgress, is_gzip, bom, yaml, verbose, encoding) + showProgress, is_gzip, bom, yaml, verbose, encoding, rn) invisible() } From 5a4842e80c67bd92a153eafa419cd12a637e924b Mon Sep 17 00:00:00 2001 From: Cole Miller <57992489+ColeMiller1@users.noreply.github.com> Date: Sun, 8 Aug 2021 18:00:36 -0400 Subject: [PATCH 2/7] Update fwriteR.c --- src/fwriteR.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/fwriteR.c b/src/fwriteR.c index b5f02f6a9e..c4a09bc98c 100644 --- a/src/fwriteR.c +++ b/src/fwriteR.c @@ -170,7 +170,8 @@ SEXP fwriteR( SEXP bom_Arg, SEXP yaml_Arg, SEXP verbose_Arg, - SEXP encoding_Arg + SEXP encoding_Arg, + SEXP rn_Arg // the actual row.names to be used #4957 ) { if (!isNewList(DF)) error(_("fwrite must be passed an object of type list; e.g. data.frame, data.table")); @@ -257,9 +258,7 @@ SEXP fwriteR( args.doRowNames = LOGICAL(rowNames_Arg)[0]; args.rowNames = NULL; if (args.doRowNames) { - SEXP rn = PROTECT(getAttrib(DF, R_RowNamesSymbol)); - protecti++; - args.rowNames = isString(rn) ? DATAPTR_RO(rn) : NULL; + args.rowNames = DATAPTR_RO(rn_Arg); } args.sep = *CHAR(STRING_ELT(sep_Arg, 0)); // DO NOT DO: allow multichar separator (bad idea) From 3882d8986472a7d9c63235c0602b91d59560c1b2 Mon Sep 17 00:00:00 2001 From: Cole Miller <57992489+ColeMiller1@users.noreply.github.com> Date: Sun, 8 Aug 2021 18:05:09 -0400 Subject: [PATCH 3/7] Update tests.Rraw --- inst/tests/tests.Rraw | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 9a30d6b3d4..630f0e395d 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -10709,7 +10709,7 @@ test(1734.1, capture.output(fwrite(DT,row.names=TRUE,quote=FALSE)), test(1734.2, capture.output(fwrite(DT,row.names=TRUE,quote=TRUE)), capture.output(write.csv(DT))) test(1734.3, fwrite(DT,row.names=TRUE,quote='auto'), # same other than 'foo' and 'bar' column names not quoted - output="\"\",foo,bar\n\"1\",1,1.2\n\"2\",2,9.8\n\"3\",3,-6") + output="\"\",foo,bar\n1,1,1.2\n2,2,9.8\n3,3,-6") DF = as.data.frame(DT) test(1734.4, capture.output(fwrite(DF,row.names=TRUE,quote=FALSE)), capture.output(write.csv(DF,quote=FALSE))) @@ -10721,6 +10721,9 @@ test(1734.6, capture.output(fwrite(DF,row.names=TRUE,quote=FALSE)), capture.output(write.csv(DF,quote=FALSE))) test(1734.7, capture.output(fwrite(DF,row.names=TRUE,quote=TRUE)), capture.output(write.csv(DF))) +rownames(DF) = c(10L, 20L, 30L) ## test for 4957 +test(1734.8, capture.output(fwrite(DF, row.names = TRUE, quote = TRUE)), + capture.output(write.csv(DF))) # list columns and sep2 set.seed(1) From c200f31e18be556ecf889cc6b5535d0e27a96766 Mon Sep 17 00:00:00 2001 From: Cole Miller <57992489+ColeMiller1@users.noreply.github.com> Date: Sun, 8 Aug 2021 18:18:57 -0400 Subject: [PATCH 4/7] Update NEWS.md --- NEWS.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/NEWS.md b/NEWS.md index f333ec0356..15d30ad094 100644 --- a/NEWS.md +++ b/NEWS.md @@ -161,6 +161,8 @@ 29. A segfault occurred when `nrow/throttle < nthread`, [#5077](https://github.com/Rdatatable/data.table/issues/5077). With the default throttle of 1024 rows (see `?setDTthreads`), at least 64 threads would be needed to trigger the segfault since there needed to be more than 65,535 rows too. It occurred on a server with 256 logical cores where `data.table` uses 128 threads by default. Thanks to Bennet Becker for reporting, debugging at C level, and fixing. It also occurred when the throttle was increased so as to use fewer threads; e.g. at the limit `setDTthreads(throttle=nrow(DT))`. +30. Writing row names with `fwrite()` with integer row names would incorrectly write the row number instead of the user assigned row names. For example, instead of writing `row.names(DT) <- c(10L, 20L, 30L)`, using the vector of `c(10L, 20L, 30L)`, the `fwrite(..., row.names = TRUE)` output would be the row numbers of 1, 2, and 3. This PR change also affected the beviour of auto quote for default row names and integer row names. Before the PR, `quote = 'auto'` would return the default row names in double quotes (e.g., `"1"`). Now the change aligns with how `fwrite()` handles character row names where `quote = 'auto'` does not include double quotes. Thanks to @dgarrimar for the bug report [#4957](https://github.com/Rdatatable/data.table/issues/4957) and @ColeMiller1 for the PR. + ## NOTES 1. New feature 29 in v1.12.4 (Oct 2019) introduced zero-copy coercion. Our thinking is that requiring you to get the type right in the case of `0` (type double) vs `0L` (type integer) is too inconvenient for you the user. So such coercions happen in `data.table` automatically without warning. Thanks to zero-copy coercion there is no speed penalty, even when calling `set()` many times in a loop, so there's no speed penalty to warn you about either. However, we believe that assigning a character value such as `"2"` into an integer column is more likely to be a user mistake that you would like to be warned about. The type difference (character vs integer) may be the only clue that you have selected the wrong column, or typed the wrong variable to be assigned to that column. For this reason we view character to numeric-like coercion differently and will warn about it. If it is correct, then the warning is intended to nudge you to wrap the RHS with `as.()` so that it is clear to readers of your code that a coercion from character to that type is intended. For example : From b819857912b6f258b5dc51594565ae0d86e93e07 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Mon, 9 Aug 2021 23:26:07 -0600 Subject: [PATCH 5/7] news item shorten and standard style (code/funnction first word, issue number at the end of the first sentence) --- NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 4ce37341fc..d001c495bd 100644 --- a/NEWS.md +++ b/NEWS.md @@ -163,7 +163,7 @@ 30. `fread(file=URL)` now works rather than error `does not exist or is non-readable`, [#4952](https://github.com/Rdatatable/data.table/issues/4952). `fread(URL)` and `fread(input=URL)` worked before and continue to work. Thanks to @pnacht for reporting and @ben-schwen for the PR. -31. Writing row names with `fwrite()` with integer row names would incorrectly write the row number instead of the user assigned row names. For example, instead of writing `row.names(DT) <- c(10L, 20L, 30L)`, using the vector of `c(10L, 20L, 30L)`, the `fwrite(..., row.names = TRUE)` output would be the row numbers of 1, 2, and 3. This PR change also affected the beviour of auto quote for default row names and integer row names. Before the PR, `quote = 'auto'` would return the default row names in double quotes (e.g., `"1"`). Now the change aligns with how `fwrite()` handles character row names where `quote = 'auto'` does not include double quotes. Thanks to @dgarrimar for the bug report [#4957](https://github.com/Rdatatable/data.table/issues/4957) and @ColeMiller1 for the PR. +31. `fwrite(DF, row.names=TRUE)` where `DF` has specific integer rownames (e.g. using `rownames(DF) <- c(10L,20L,30L)`) would incorrectly ignore the integer rowname and write the row number instead, [#4957](https://github.com/Rdatatable/data.table/issues/4957). Thanks to @dgarrimar for reporting and @ColeMiller1 for the PR. Further, when quote='auto' and the rownames are integers, they are no longer quoted. ## NOTES From f80ad26de7db5760d390c1739fc0ca272589eabc Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Tue, 10 Aug 2021 01:34:58 -0600 Subject: [PATCH 6/7] option 3 --- NEWS.md | 2 +- R/fwrite.R | 3 +-- inst/tests/tests.Rraw | 38 +++++++++++++++++++++----------------- src/fwrite.c | 20 ++++++++++++-------- src/fwrite.h | 3 ++- src/fwriteR.c | 19 ++++++++++++++++--- 6 files changed, 53 insertions(+), 32 deletions(-) diff --git a/NEWS.md b/NEWS.md index d001c495bd..fd75bce689 100644 --- a/NEWS.md +++ b/NEWS.md @@ -163,7 +163,7 @@ 30. `fread(file=URL)` now works rather than error `does not exist or is non-readable`, [#4952](https://github.com/Rdatatable/data.table/issues/4952). `fread(URL)` and `fread(input=URL)` worked before and continue to work. Thanks to @pnacht for reporting and @ben-schwen for the PR. -31. `fwrite(DF, row.names=TRUE)` where `DF` has specific integer rownames (e.g. using `rownames(DF) <- c(10L,20L,30L)`) would incorrectly ignore the integer rowname and write the row number instead, [#4957](https://github.com/Rdatatable/data.table/issues/4957). Thanks to @dgarrimar for reporting and @ColeMiller1 for the PR. Further, when quote='auto' and the rownames are integers, they are no longer quoted. +31. `fwrite(DF, row.names=TRUE)` where `DF` has specific integer rownames (e.g. using `rownames(DF) <- c(10L,20L,30L)`) would incorrectly ignore the integer rownames and write the row numbers instead, [#4957](https://github.com/Rdatatable/data.table/issues/4957). Thanks to @dgarrimar for reporting and @ColeMiller1 for the PR. Further, when quote='auto' and the rownames are integers (either default or specific), they are no longer quoted. ## NOTES diff --git a/R/fwrite.R b/R/fwrite.R index 74d127c875..c822b05678 100644 --- a/R/fwrite.R +++ b/R/fwrite.R @@ -36,7 +36,6 @@ fwrite = function(x, file="", append=FALSE, quote="auto", nThread = as.integer(nThread) # write.csv default is 'double' so fwrite follows suit. write.table's default is 'escape' # validate arguments - rn = if (row.names) row.names(x) else NULL # allocate row.names in R to address integer row.names #4957 if (is.matrix(x)) { # coerce to data.table if input object is matrix messagef("x being coerced from class: matrix to data.table") x = as.data.table(x) @@ -112,7 +111,7 @@ fwrite = function(x, file="", append=FALSE, quote="auto", file = enc2native(file) # CfwriteR cannot handle UTF-8 if that is not the native encoding, see #3078. .Call(CfwriteR, x, file, sep, sep2, eol, na, dec, quote, qmethod=="escape", append, row.names, col.names, logical01, scipen, dateTimeAs, buffMB, nThread, - showProgress, is_gzip, bom, yaml, verbose, encoding, rn) + showProgress, is_gzip, bom, yaml, verbose, encoding) invisible() } diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 7ba203f978..0ccda6f46b 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -10706,26 +10706,30 @@ test(1733.2, fwrite(data.table(c(1.2,-8.0,pi,67.99),1:4),dec=",",sep=";"), # fwrite implied and actual row.names DT = data.table(foo=1:3,bar=c(1.2,9.8,-6.0)) -test(1734.1, capture.output(fwrite(DT,row.names=TRUE,quote=FALSE)), - capture.output(write.csv(DT,quote=FALSE))) -test(1734.2, capture.output(fwrite(DT,row.names=TRUE,quote=TRUE)), - capture.output(write.csv(DT))) -test(1734.3, fwrite(DT,row.names=TRUE,quote='auto'), # same other than 'foo' and 'bar' column names not quoted - output="\"\",foo,bar\n1,1,1.2\n2,2,9.8\n3,3,-6") +test(1734.01, capture.output(fwrite(DT,row.names=TRUE,quote=FALSE)), + capture.output(write.csv(DT,quote=FALSE))) +test(1734.02, capture.output(fwrite(DT,row.names=TRUE,quote=TRUE)), + capture.output(write.csv(DT))) +test(1734.03, fwrite(DT,row.names=TRUE,quote='auto'), # same other than 'foo' and 'bar' column names not quoted + output="\"\",foo,bar\n1,1,1.2\n2,2,9.8\n3,3,-6") DF = as.data.frame(DT) -test(1734.4, capture.output(fwrite(DF,row.names=TRUE,quote=FALSE)), - capture.output(write.csv(DF,quote=FALSE))) -test(1734.5, capture.output(fwrite(DF,row.names=TRUE,quote=TRUE)), - capture.output(write.csv(DF))) +test(1734.04, capture.output(fwrite(DF,row.names=TRUE,quote=FALSE)), + capture.output(write.csv(DF,quote=FALSE))) +test(1734.05, capture.output(fwrite(DF,row.names=TRUE,quote=TRUE)), + capture.output(write.csv(DF))) rownames(DF)[2] = "someName" rownames(DF)[3] = "another" -test(1734.6, capture.output(fwrite(DF,row.names=TRUE,quote=FALSE)), - capture.output(write.csv(DF,quote=FALSE))) -test(1734.7, capture.output(fwrite(DF,row.names=TRUE,quote=TRUE)), - capture.output(write.csv(DF))) -rownames(DF) = c(10L, 20L, 30L) ## test for 4957 -test(1734.8, capture.output(fwrite(DF, row.names = TRUE, quote = TRUE)), - capture.output(write.csv(DF))) +test(1734.06, capture.output(fwrite(DF,row.names=TRUE,quote=FALSE)), + capture.output(write.csv(DF,quote=FALSE))) +test(1734.07, capture.output(fwrite(DF,row.names=TRUE,quote=TRUE)), + capture.output(write.csv(DF))) +rownames(DF) = c(10L, -20L, 30L) ## test for #4957 +test(1734.08, capture.output(fwrite(DF, row.names=TRUE, quote=TRUE)), + capture.output(write.csv(DF))) +test(1734.09, capture.output(fwrite(DF, row.names=TRUE, quote=FALSE)), + capture.output(write.csv(DF, quote=FALSE))) +test(1734.10, fwrite(DF, row.names=TRUE, quote='auto'), + output=c('"",foo,bar','10,1,1.2','-20,2,9.8','30,3,-6')) # list columns and sep2 set.seed(1) diff --git a/src/fwrite.c b/src/fwrite.c index f7f4003181..2d10d222fd 100644 --- a/src/fwrite.c +++ b/src/fwrite.c @@ -623,8 +623,8 @@ void fwriteMain(fwriteMainArgs args) DTPRINT(_("... ")); for (int j=args.ncol-10; j Date: Tue, 10 Aug 2021 01:43:06 -0600 Subject: [PATCH 7/7] news item tweak --- NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index fd75bce689..8bb857cef3 100644 --- a/NEWS.md +++ b/NEWS.md @@ -163,7 +163,7 @@ 30. `fread(file=URL)` now works rather than error `does not exist or is non-readable`, [#4952](https://github.com/Rdatatable/data.table/issues/4952). `fread(URL)` and `fread(input=URL)` worked before and continue to work. Thanks to @pnacht for reporting and @ben-schwen for the PR. -31. `fwrite(DF, row.names=TRUE)` where `DF` has specific integer rownames (e.g. using `rownames(DF) <- c(10L,20L,30L)`) would incorrectly ignore the integer rownames and write the row numbers instead, [#4957](https://github.com/Rdatatable/data.table/issues/4957). Thanks to @dgarrimar for reporting and @ColeMiller1 for the PR. Further, when quote='auto' and the rownames are integers (either default or specific), they are no longer quoted. +31. `fwrite(DF, row.names=TRUE)` where `DF` has specific integer rownames (e.g. using `rownames(DF) <- c(10L,20L,30L)`) would ignore the integer rownames and write the row numbers instead, [#4957](https://github.com/Rdatatable/data.table/issues/4957). Thanks to @dgarrimar for reporting and @ColeMiller1 for the PR. Further, when `quote='auto'` (default) and the rownames are integers (either default or specific), they are no longer quoted. ## NOTES