From e913598b904f7e3d97f11f909e6240eec4054463 Mon Sep 17 00:00:00 2001 From: HughParsonage Date: Mon, 23 Apr 2018 14:29:51 +1000 Subject: [PATCH 1/3] Provide backwards-compatibility for easycsv --- NEWS.md | 2 ++ R/fread.R | 5 ++++- man/fread.Rd | 4 ++-- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/NEWS.md b/NEWS.md index 0c3051ce83..82b3e99f90 100644 --- a/NEWS.md +++ b/NEWS.md @@ -20,6 +20,8 @@ getOption("datatable.logical01", TRUE) # future release ``` This option controls whether a column of all 0's and 1's is read as `integer`, or `logical` directly to avoid needing to change the type afterwards to `logical` or use `colClasses`. `0/1` is smaller and faster than `"TRUE"/"FALSE"`, which can make a significant difference to space and time the more `logical` columns there are. When the default's default changes to `TRUE` for `fread` we do not expect much impact since all arithmetic operators that are currently receiving 0's and 1's as type `integer` (think `sum()`) but instead could receive `logical`, would return exactly the same result on the 0's and 1's as `logical` type. However, code that is manipulating column types using `is.integer` or `is.logical` on `fread`'s result, could require change. It could be painful if `DT[(logical_column)]` (i.e. `DT[logical_column==TRUE]`) changed behaviour due to `logical_column` no longer being type `logical` but `integer`. But that is not the change proposed. The change is the other way around; i.e., a previously `integer` column holding only 0's and 1's would now be type `logical`. Since it's that way around, we believe the scope for breakage is limited. We think a lot of code is converting 0/1 integer columns to logical anyway, either using `colClasses=` or afterwards with an assign. For `fwrite`, the level of breakage depends on the consumer of the output file. We believe `0/1` is a better more standard default choice to move to. See notes below about improvements to `fread`'s sampling for type guessing, and automatic rereading in the rare cases of out-of-sample type surprises. + +3. The `datatable.showProgress` option now has no effect. `fread`'s default argument for `showProgress` is now `NULL` which internally sets `showProgress = interactive()`. Since `getOption` of an unset option is `NULL`, if you continue to use `getOption("datatable.showProgress")` for this argument, you will still encounter the default argument; the only breaking change would be in code that relies on a non-default value for this option. These options are meant for temporary use to aid your migration, [#2652](https://github.com/Rdatatable/data.table/pull/2652). You are not meant to set them to the old default and then not migrate your code that is dependent on the default. Either set the argument explicitly so your code is not dependent on the default, or change the code to cope with the new default. Over the next few years we will slowly start to remove these options, warning you if you are using them, and return to a simple default. See the history of NEWS and NEWS.0 for past migrations that have, generally speaking, been successfully managed in this way. For example, at the end of NOTES for this version (below in this file) is a note about the usage of `datatable.old.unique.by.key` now warning, as you were warned it would do over a year ago. When that change was introduced, the default was changed and that option provided an option to restore the old behaviour. These `fread`/`fwrite` changes are even more cautious and not even changing the default's default yet. Giving you extra warning by way of this notice to move forward. And giving you a chance to object. diff --git a/R/fread.R b/R/fread.R index 13605800e9..732fffd778 100644 --- a/R/fread.R +++ b/R/fread.R @@ -1,5 +1,5 @@ -fread <- function(input="",file,sep="auto",sep2="auto",dec=".",quote="\"",nrows=Inf,header="auto",na.strings=getOption("datatable.na.strings","NA"),stringsAsFactors=FALSE,verbose=getOption("datatable.verbose",FALSE),skip="__auto__",select=NULL,drop=NULL,colClasses=NULL,integer64=getOption("datatable.integer64","integer64"), col.names, check.names=FALSE, encoding="unknown", strip.white=TRUE, fill=FALSE, blank.lines.skip=FALSE, key=NULL, index=NULL, showProgress=interactive(), data.table=getOption("datatable.fread.datatable",TRUE), nThread=getDTthreads(), logical01=getOption("datatable.logical01", FALSE), autostart=NA) +fread <- function(input="",file,sep="auto",sep2="auto",dec=".",quote="\"",nrows=Inf,header="auto",na.strings=getOption("datatable.na.strings","NA"),stringsAsFactors=FALSE,verbose=getOption("datatable.verbose",FALSE),skip="__auto__",select=NULL,drop=NULL,colClasses=NULL,integer64=getOption("datatable.integer64","integer64"), col.names, check.names=FALSE, encoding="unknown", strip.white=TRUE, fill=FALSE, blank.lines.skip=FALSE, key=NULL, index=NULL, showProgress=NULL, data.table=getOption("datatable.fread.datatable",TRUE), nThread=getDTthreads(), logical01=getOption("datatable.logical01", FALSE), autostart=NA) { if (is.null(sep)) sep="\n" # C level knows that \n means \r\n on Windows, for example else { @@ -13,6 +13,9 @@ fread <- function(input="",file,sep="auto",sep2="auto",dec=".",quote="\"",nrows= if (length(encoding) != 1L || !encoding %in% c("unknown", "UTF-8", "Latin-1")) { stop("Argument 'encoding' must be 'unknown', 'UTF-8' or 'Latin-1'.") } + if (is.null(showProgress)) { + showProgress <- interactive() + } isTrueFalse = function(x) isTRUE(x) || identical(FALSE, x) isTrueFalseNA = function(x) isTRUE(x) || identical(FALSE, x) || identical(NA, x) stopifnot( isTrueFalse(strip.white), isTrueFalse(blank.lines.skip), isTrueFalse(fill), isTrueFalse(showProgress), diff --git a/man/fread.Rd b/man/fread.Rd index fb75bd405b..6ec8ff617a 100644 --- a/man/fread.Rd +++ b/man/fread.Rd @@ -19,7 +19,7 @@ col.names, check.names=FALSE, encoding="unknown", strip.white=TRUE, fill=FALSE, blank.lines.skip=FALSE, key=NULL, index=NULL, -showProgress=interactive(), +showProgress=NULL, data.table=getOption("datatable.fread.datatable", TRUE), nThread=getDTthreads(), logical01=getOption("datatable.logical01", FALSE), # due to change to TRUE; see NEWS @@ -51,7 +51,7 @@ autostart=NA \item{blank.lines.skip}{\code{logical}, default is \code{FALSE}. If \code{TRUE} blank lines in the input are ignored.} \item{key}{Character vector of one or more column names which is passed to \code{\link{setkey}}. It may be a single comma separated string such as \code{key="x,y,z"}, or a vector of names such as \code{key=c("x","y","z")}. Only valid when argument \code{data.table=TRUE}. Where applicable, this should refer to column names given in \code{col.names}. } \item{index}{ Character vector or list of character vectors of one or more column names which is passed to \code{\link{setindexv}}. As with \code{key}, comma-separated notation like \code{index="x,y,z"} is accepted for convenience. Only valid when argument \code{data.table=TRUE}. Where applicable, this should refer to column names given in \code{col.names}. } - \item{showProgress}{ \code{TRUE} displays progress on the console if the ETA is greater than 3 seconds. It is produced in fread's C code where the very nice (but R level) txtProgressBar and tkProgressBar are not easily available. } + \item{showProgress}{ \code{TRUE} displays progress on the console if the ETA is greater than 3 seconds. It is produced in fread's C code where the very nice (but R level) txtProgressBar and tkProgressBar are not easily available. If \code{NULL}, the default, it will be set internally to \code{interactive()}.} \item{data.table}{ TRUE returns a \code{data.table}. FALSE returns a \code{data.frame}. } \item{nThread}{The number of threads to use. Experiment to see what works best for your data on your hardware.} \item{logical01}{If TRUE a column containing only 0s and 1s will be read as logical, otherwise as integer.} From e2c603b23db0def6cffc868111aef3daed009759 Mon Sep 17 00:00:00 2001 From: HughParsonage Date: Wed, 25 Apr 2018 14:36:23 +1000 Subject: [PATCH 2/3] Add test for explicit use of fmr default option --- inst/tests/tests.Rraw | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index c87a498d43..a8b5bec7ae 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -5476,6 +5476,10 @@ f = paste("file://",testDir("russellCRCRLF.csv"),sep="") test(1378.3, fread(f, showProgress=FALSE)[19,`Value With Dividends`], 357.97) #==================================== +# easycsv uses a getOption("datatable.showProgress") which is now defunct, i.e. NULL +# so test use of this option does not now raise an error +test(1378.4, x=fread("A,B\n1,3\n2,4", showProgress=getOption("datatable.showProgress")), y=fread("A,B\n1,3\n2,4")) + oldv = options(datatable.fread.datatable = FALSE) test(1379.1, fread("A,B\n1,3\n2,4\n"), data.frame(A=1:2,B=3:4)) test(1379.2, fread("A,B\n1,3\n2,4\n",data.table=TRUE), data.table(A=1:2,B=3:4)) From 083d7be20ed06f6739fd5eebf578575f26bb0851 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Fri, 27 Apr 2018 14:00:09 -0700 Subject: [PATCH 3/3] Changed to showProgress=getOption('datatable.showProgress', default=interactive()) --- NEWS.md | 6 ++++-- R/fread.R | 5 +---- R/fwrite.R | 2 +- inst/tests/tests.Rraw | 4 ---- man/fread.Rd | 4 ++-- man/fwrite.Rd | 2 +- 6 files changed, 9 insertions(+), 14 deletions(-) diff --git a/NEWS.md b/NEWS.md index c0466ce0c0..fabeffa393 100644 --- a/NEWS.md +++ b/NEWS.md @@ -20,8 +20,7 @@ getOption("datatable.logical01", TRUE) # future release ``` This option controls whether a column of all 0's and 1's is read as `integer`, or `logical` directly to avoid needing to change the type afterwards to `logical` or use `colClasses`. `0/1` is smaller and faster than `"TRUE"/"FALSE"`, which can make a significant difference to space and time the more `logical` columns there are. When the default's default changes to `TRUE` for `fread` we do not expect much impact since all arithmetic operators that are currently receiving 0's and 1's as type `integer` (think `sum()`) but instead could receive `logical`, would return exactly the same result on the 0's and 1's as `logical` type. However, code that is manipulating column types using `is.integer` or `is.logical` on `fread`'s result, could require change. It could be painful if `DT[(logical_column)]` (i.e. `DT[logical_column==TRUE]`) changed behaviour due to `logical_column` no longer being type `logical` but `integer`. But that is not the change proposed. The change is the other way around; i.e., a previously `integer` column holding only 0's and 1's would now be type `logical`. Since it's that way around, we believe the scope for breakage is limited. We think a lot of code is converting 0/1 integer columns to logical anyway, either using `colClasses=` or afterwards with an assign. For `fwrite`, the level of breakage depends on the consumer of the output file. We believe `0/1` is a better more standard default choice to move to. See notes below about improvements to `fread`'s sampling for type guessing, and automatic rereading in the rare cases of out-of-sample type surprises. - -3. The `datatable.showProgress` option now has no effect. `fread`'s default argument for `showProgress` is now `NULL` which internally sets `showProgress = interactive()`. Since `getOption` of an unset option is `NULL`, if you continue to use `getOption("datatable.showProgress")` for this argument, you will still encounter the default argument; the only breaking change would be in code that relies on a non-default value for this option. + These options are meant for temporary use to aid your migration, [#2652](https://github.com/Rdatatable/data.table/pull/2652). You are not meant to set them to the old default and then not migrate your code that is dependent on the default. Either set the argument explicitly so your code is not dependent on the default, or change the code to cope with the new default. Over the next few years we will slowly start to remove these options, warning you if you are using them, and return to a simple default. See the history of NEWS and NEWS.0 for past migrations that have, generally speaking, been successfully managed in this way. For example, at the end of NOTES for this version (below in this file) is a note about the usage of `datatable.old.unique.by.key` now warning, as you were warned it would do over a year ago. When that change was introduced, the default was changed and that option provided an option to restore the old behaviour. These `fread`/`fwrite` changes are even more cautious and not even changing the default's default yet. Giving you extra warning by way of this notice to move forward. And giving you a chance to object. @@ -257,6 +256,9 @@ Was warning: set2key() will be deprecated in the next relase. Please use setinde Now error: set2key() is now deprecated. Please use setindex() instead. ``` +12. The option `datatable.showProgress` is no longer set to a default value when the package is loaded. Instead, the `default=` argument of `getOption` is used by both `fwrite` and `fread`. The default is the result of `interactive()` at the time of the call. Using `getOption` in this way is intended to be more helpful to users looking at `args(fread)` and `?fread`. + + ### Changes in v1.10.4-3 (on CRAN 20 Oct 2017) 1. Fixed crash/hang on MacOS when `parallel::mclapply` is used and data.table is merely loaded, [#2418](https://github.com/Rdatatable/data.table/issues/2418). Oddly, all tests including test 1705 (which tests `mclapply` with data.table) passed fine on CRAN. It appears to be some versions of MacOS or some versions of libraries on MacOS, perhaps. Many thanks to Martin Morgan for reporting and confirming this fix works. Thanks also to @asenabouth, Joe Thorley and Danton Noriega for testing, debugging and confirming that automatic parallelism inside data.table (such as `fwrite`) works well even on these MacOS installations. See also news items below for 1.10.4-1 and 1.10.4-2. diff --git a/R/fread.R b/R/fread.R index 778a5032e9..f0ac1aecc4 100644 --- a/R/fread.R +++ b/R/fread.R @@ -1,5 +1,5 @@ -fread <- function(input="",file,sep="auto",sep2="auto",dec=".",quote="\"",nrows=Inf,header="auto",na.strings=getOption("datatable.na.strings","NA"),stringsAsFactors=FALSE,verbose=getOption("datatable.verbose",FALSE),skip="__auto__",select=NULL,drop=NULL,colClasses=NULL,integer64=getOption("datatable.integer64","integer64"), col.names, check.names=FALSE, encoding="unknown", strip.white=TRUE, fill=FALSE, blank.lines.skip=FALSE, key=NULL, index=NULL, showProgress=NULL, data.table=getOption("datatable.fread.datatable",TRUE), nThread=getDTthreads(), logical01=getOption("datatable.logical01", FALSE), autostart=NA) +fread <- function(input="",file,sep="auto",sep2="auto",dec=".",quote="\"",nrows=Inf,header="auto",na.strings=getOption("datatable.na.strings","NA"),stringsAsFactors=FALSE,verbose=getOption("datatable.verbose",FALSE),skip="__auto__",select=NULL,drop=NULL,colClasses=NULL,integer64=getOption("datatable.integer64","integer64"), col.names, check.names=FALSE, encoding="unknown", strip.white=TRUE, fill=FALSE, blank.lines.skip=FALSE, key=NULL, index=NULL, showProgress=getOption("datatable.showProgress",interactive()), data.table=getOption("datatable.fread.datatable",TRUE), nThread=getDTthreads(), logical01=getOption("datatable.logical01", FALSE), autostart=NA) { if (is.null(sep)) sep="\n" # C level knows that \n means \r\n on Windows, for example else { @@ -13,9 +13,6 @@ fread <- function(input="",file,sep="auto",sep2="auto",dec=".",quote="\"",nrows= if (length(encoding) != 1L || !encoding %in% c("unknown", "UTF-8", "Latin-1")) { stop("Argument 'encoding' must be 'unknown', 'UTF-8' or 'Latin-1'.") } - if (is.null(showProgress)) { - showProgress <- interactive() - } isTrueFalse = function(x) isTRUE(x) || identical(FALSE, x) isTrueFalseNA = function(x) isTRUE(x) || identical(FALSE, x) || identical(NA, x) stopifnot( isTrueFalse(strip.white), isTrueFalse(blank.lines.skip), isTrueFalse(fill), isTrueFalse(showProgress), diff --git a/R/fwrite.R b/R/fwrite.R index 914c23c2cd..c9576a385e 100644 --- a/R/fwrite.R +++ b/R/fwrite.R @@ -6,7 +6,7 @@ fwrite <- function(x, file="", append=FALSE, quote="auto", logicalAsInt=logical01, dateTimeAs = c("ISO","squash","epoch","write.csv"), buffMB=8, nThread=getDTthreads(), - showProgress=interactive(), + showProgress=getOption("datatable.showProgress", interactive()), verbose=getOption("datatable.verbose", FALSE)) { isLOGICAL = function(x) isTRUE(x) || identical(FALSE, x) # it seems there is no isFALSE in R? na = as.character(na[1L]) # fix for #1725 diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 1104486cae..895bc1fa7f 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -5476,10 +5476,6 @@ f = paste("file://",testDir("russellCRCRLF.csv"),sep="") test(1378.3, fread(f, showProgress=FALSE)[19,`Value With Dividends`], 357.97) #==================================== -# easycsv uses a getOption("datatable.showProgress") which is now defunct, i.e. NULL -# so test use of this option does not now raise an error -test(1378.4, x=fread("A,B\n1,3\n2,4", showProgress=getOption("datatable.showProgress")), y=fread("A,B\n1,3\n2,4")) - oldv = options(datatable.fread.datatable = FALSE) test(1379.1, fread("A,B\n1,3\n2,4\n"), data.frame(A=1:2,B=3:4)) test(1379.2, fread("A,B\n1,3\n2,4\n",data.table=TRUE), data.table(A=1:2,B=3:4)) diff --git a/man/fread.Rd b/man/fread.Rd index 6ec8ff617a..3dabfb6377 100644 --- a/man/fread.Rd +++ b/man/fread.Rd @@ -19,7 +19,7 @@ col.names, check.names=FALSE, encoding="unknown", strip.white=TRUE, fill=FALSE, blank.lines.skip=FALSE, key=NULL, index=NULL, -showProgress=NULL, +showProgress=getOption("datatable.showProgress", interactive()), data.table=getOption("datatable.fread.datatable", TRUE), nThread=getDTthreads(), logical01=getOption("datatable.logical01", FALSE), # due to change to TRUE; see NEWS @@ -51,7 +51,7 @@ autostart=NA \item{blank.lines.skip}{\code{logical}, default is \code{FALSE}. If \code{TRUE} blank lines in the input are ignored.} \item{key}{Character vector of one or more column names which is passed to \code{\link{setkey}}. It may be a single comma separated string such as \code{key="x,y,z"}, or a vector of names such as \code{key=c("x","y","z")}. Only valid when argument \code{data.table=TRUE}. Where applicable, this should refer to column names given in \code{col.names}. } \item{index}{ Character vector or list of character vectors of one or more column names which is passed to \code{\link{setindexv}}. As with \code{key}, comma-separated notation like \code{index="x,y,z"} is accepted for convenience. Only valid when argument \code{data.table=TRUE}. Where applicable, this should refer to column names given in \code{col.names}. } - \item{showProgress}{ \code{TRUE} displays progress on the console if the ETA is greater than 3 seconds. It is produced in fread's C code where the very nice (but R level) txtProgressBar and tkProgressBar are not easily available. If \code{NULL}, the default, it will be set internally to \code{interactive()}.} + \item{showProgress}{ \code{TRUE} displays progress on the console if the ETA is greater than 3 seconds. It is produced in fread's C code where the very nice (but R level) txtProgressBar and tkProgressBar are not easily available. } \item{data.table}{ TRUE returns a \code{data.table}. FALSE returns a \code{data.frame}. } \item{nThread}{The number of threads to use. Experiment to see what works best for your data on your hardware.} \item{logical01}{If TRUE a column containing only 0s and 1s will be read as logical, otherwise as integer.} diff --git a/man/fwrite.Rd b/man/fwrite.Rd index 3fa8f7b38d..fbb9f5b406 100644 --- a/man/fwrite.Rd +++ b/man/fwrite.Rd @@ -16,7 +16,7 @@ fwrite(x, file = "", append = FALSE, quote = "auto", logicalAsInt = logical01, # deprecated dateTimeAs = c("ISO","squash","epoch","write.csv"), buffMB = 8L, nThread = getDTthreads(), - showProgress = interactive(), + showProgress = getOption("datatable.showProgress", interactive()), verbose = getOption("datatable.verbose", FALSE)) } \arguments{