diff --git a/.dev/.Rprofile b/.dev/.Rprofile new file mode 100644 index 0000000000..7d4ab3239d --- /dev/null +++ b/.dev/.Rprofile @@ -0,0 +1,14 @@ +# Matt's ~/.Rprofile is a link to this file at ~/GitHub/data.table/.dev/.Rprofile + +# options(repos = c(CRAN="http://cran.stat.ucla.edu")) +# options(repos = c(CRAN=c("http://cran.stat.ucla.edu", "http://cloud.r-project.org"))) # both needed for revdep checks sometimes +options(repos = c(CRAN="http://cloud.r-project.org")) + +options(help_type="html") +options(error=quote(dump.frames())) +options(width=200) +options(digits.secs=3) # for POSIXct to print milliseconds +suppressWarnings(RNGversion("3.5.0")) # so when I create tests in dev there isn't a mismatch when run by cc() + +Sys.setenv(PROJ_PATH=path.expand("~/GitHub/data.table")) +source(paste0(Sys.getenv("PROJ_PATH"),"/.dev/cc.R")) diff --git a/.dev/.bash_aliases b/.dev/.bash_aliases new file mode 100644 index 0000000000..93ea44ed5c --- /dev/null +++ b/.dev/.bash_aliases @@ -0,0 +1,21 @@ +# Matt's ~/.bash_aliases is a link to this file ~/GitHub/data.table/.dev/.bash_aliases + +# One off configure meld as difftool: +# git config --global diff.tool meld +# git config --global difftool.prompt false +alias gd='git difftool &> /dev/null' +alias gdm='git difftool master &> /dev/null' + +alias Rdevel='~/build/R-devel/bin/R --vanilla' +alias Rdevel-strict-gcc='~/build/R-devel-strict-gcc/bin/R --vanilla' +alias Rdevel-strict-clang='~/build/R-devel-strict-clang/bin/R --vanilla' +alias Rdevel32='~/build/32bit/R-devel/bin/R --vanilla' +alias R310='~/build/R-3.1.0/bin/R --vanilla' +alias revdepsh='cd ~/build/revdeplib/ && export TZ=UTC && export R_LIBS_SITE=none && export R_LIBS=~/build/revdeplib/ && export _R_CHECK_FORCE_SUGGESTS_=false' +alias revdepr='revdepsh; R_PROFILE_USER=~/GitHub/data.table/.dev/revdep.R ~/build/R-devel/bin/R' + +export R_PROFILE_USER='~/.Rprofile' +# there's a .Rprofile in ~/GitHub/data.table/ so Matt sets R_PROFILE_USER here to always use ~/.Rprofile 
+# even when starting R in ~/GitHub/data.table +# Matt's ~/.Rprofile is a link to ~/GitHub/data.table/.dev/.Rprofile + diff --git a/.dev/CRAN_Release.cmd b/.dev/CRAN_Release.cmd index e629ee980b..f9d435455e 100644 --- a/.dev/CRAN_Release.cmd +++ b/.dev/CRAN_Release.cmd @@ -304,7 +304,7 @@ cd R-devel # used for revdep testing: .dev/revdep.R. ./configure CFLAGS="-O2 -Wall -pedantic" make -# use latest available below `apt cache search gcc-` or `clang-` +# use latest available below `apt-cache search gcc-` or `clang-` cd ../R-devel-strict-clang ./configure --without-recommended-packages --disable-byte-compiled-packages --disable-openmp --enable-strict-barrier --disable-long-double CC="clang-8 -fsanitize=undefined,address -fno-sanitize=float-divide-by-zero -fno-omit-frame-pointer" make @@ -479,7 +479,7 @@ sudo apt-get -y install r-base r-base-dev sudo apt-get -y build-dep r-base-dev sudo apt-get -y build-dep qpdf sudo apt-get -y install aptitude -sudo aptitude build-dep r-cran-rgl # leads to libglu1-mesa-dev +sudo aptitude -y build-dep r-cran-rgl # leads to libglu1-mesa-dev sudo apt-get -y build-dep r-cran-rmpi sudo apt-get -y build-dep r-cran-cairodevice sudo apt-get -y build-dep r-cran-tkrplot @@ -490,8 +490,7 @@ sudo apt-get -y install libv8-dev sudo apt-get -y install gsl-bin libgsl0-dev sudo apt-get -y install libgtk2.0-dev netcdf-bin sudo apt-get -y install libcanberra-gtk-module -sudo apt-get -y install git -sudo apt-get -y install openjdk-8-jdk +sudo apt-get -y install openjdk-11-jdk # solves "fatal error: jni.h: No such file or directory"; change 11 to match "java --version" sudo apt-get -y install libnetcdf-dev udunits-bin libudunits2-dev sudo apt-get -y install tk8.6-dev sudo apt-get -y install clustalo # for package LowMACA @@ -512,7 +511,7 @@ sudo apt-get -y install libmagick++-dev # for magick sudo apt-get -y install libjq-dev libprotoc-dev libprotobuf-dev and protobuf-compiler # for protolite sudo apt-get -y install python-dev # for PythonInR sudo 
apt-get -y install gdal-bin libgeos-dev # for rgdal/raster tested via lidR -sudo apt-get build-dep r-cran-rsymphony # for Rsymphony: coinor-libcgl-dev coinor-libclp-dev coinor-libcoinutils-dev coinor-libosi-dev coinor-libsymphony-dev +sudo apt-get -y build-dep r-cran-rsymphony # for Rsymphony: coinor-libcgl-dev coinor-libclp-dev coinor-libcoinutils-dev coinor-libosi-dev coinor-libsymphony-dev sudo apt-get -y install libtesseract-dev libleptonica-dev tesseract-ocr-eng # for tesseract sudo apt-get -y install libssl-dev libsasl2-dev sudo apt-get -y install biber # for ctsem @@ -520,6 +519,8 @@ sudo apt-get -y install libopenblas-dev # for ivmte (+ local R build with defau sudo apt-get -y install libhiredis-dev # for redux used by nodbi sudo apt-get -y install libzmq3-dev # for rzmq sudo apt-get -y install libimage-exiftool-perl # for camtrapR +sudo apt-get -y install parallel # for revdepr.R +sudo apt-get -y install pandoc-citeproc # for basecallQC sudo R CMD javareconf # ENDIF diff --git a/DESCRIPTION b/DESCRIPTION index 5dd73e284c..29258612ac 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -57,7 +57,8 @@ Authors@R: c( person("David","Simons", role="ctb"), person("Elliott","Sales de Andrade", role="ctb"), person("Cole","Miller", role="ctb"), - person("@JenspederM","", role="ctb")) + person("@JenspederM","", role="ctb"), + person("Elio", "Campitelli", role="ctb")) Depends: R (>= 3.1.0) Imports: methods Suggests: bit64, curl, R.utils, knitr, xts, nanotime, zoo, yaml diff --git a/NEWS.md b/NEWS.md index 7ecd4bc5af..b11172db78 100644 --- a/NEWS.md +++ b/NEWS.md @@ -81,6 +81,9 @@ unit = "s") 14. Added support for `round()` and `trunc()` to extend functionality of `ITime`. `round()` and `trunc()` can be used with argument units: "hours" or "minutes". Thanks to @JensPederM for the suggestion and PR. +15. The `yaml` argument in `fwrite` now accepts a list which will be used to populate the yaml header of the generated csv file; include an unnamed `TRUE` element in the list to also write the standard generated header. Thanks to @eliocamp for the PR. 
+ + ## BUG FIXES 1. A NULL timezone on POSIXct was interpreted by `as.IDate` and `as.ITime` as UTC rather than the session's default timezone (`tz=""`) , [#4085](https://github.com/Rdatatable/data.table/issues/4085). diff --git a/R/fread.R b/R/fread.R index d57d2cd6fd..2b1344b0d2 100644 --- a/R/fread.R +++ b/R/fread.R @@ -284,7 +284,6 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir()) if (check.names) { setattr(ans, 'names', make.names(names(ans), unique=TRUE)) } - colClassesAs = attr(ans, "colClassesAs", exact=TRUE) # should only be present if one or more are != "" for (j in which(colClassesAs!="")) { # # 1634 v = .subset2(ans, j) @@ -305,6 +304,7 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir()) return(v) }, error = fun) + set(ans, j = j, value = new_v) # aside: new_v == v if the coercion was aborted } setattr(ans, "colClassesAs", NULL) @@ -330,7 +330,13 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir()) } setkeyv(ans, key) } - if (yaml) setattr(ans, 'yaml_metadata', yaml_header) + if (yaml) { + setattr(ans, 'yaml_metadata', yaml_header) + yaml_fields = yaml_header$schema$fields # per-column entries of the header's schema; NULL when the header has no schema + for (j in seq_len(min(length(ans), length(yaml_fields)))) { # bounded by both lengths: yaml_fields[[j]] is a subscript error once j exceeds the fields listed in the header + attributes(ans[[j]]) <- c(attributes(ans[[j]]), yaml_fields[[j]]$attributes) # re-attach the attributes stored for column j + } + } if (!is.null(index) && data.table) { if (!all(sapply(index, is.character))) stop("index argument of data.table() must be a character vector naming columns (NB: col.names are applied before this)") diff --git a/R/fwrite.R b/R/fwrite.R index 1971c0e4ea..e9d5f2a1ba 100644 --- a/R/fwrite.R +++ b/R/fwrite.R @@ -82,33 +82,42 @@ fwrite = function(x, file="", append=FALSE, quote="auto", return(invisible()) } } - yaml = if (!yaml) "" else { + write_yaml = isTRUE(yaml) || (is.list(yaml) && length(yaml) != 0L) + yaml_text = "" + if (write_yaml) { if (!requireNamespace('yaml', quietly=TRUE)) stop("'data.table' relies on the package 'yaml' to write the file header; please add this to your library with install.packages('yaml') and try again.") # nocov - schema_vec = sapply(x, class) - # multi-class objects reduced to first 
class - if (is.list(schema_vec)) schema_vec = sapply(schema_vec, `[`, 1L) - # as.vector strips names - schema_vec = list(name=names(schema_vec), type=as.vector(schema_vec)) - yaml_header = list( - source = sprintf('R[v%s.%s]::data.table[v%s]::fwrite', - R.version$major, R.version$minor, format(tryCatch(utils::packageVersion('data.table'), error=function(e) 'DEV'))), - creation_time_utc = format(Sys.time(), tz='UTC'), - schema = list( - fields = lapply( - seq_along(x), - function(i) list(name=schema_vec$name[i], type=schema_vec$type[i]) - ) - ), - header=col.names, sep=sep, sep2=sep2, eol=eol, na.strings=na, - dec=dec, qmethod=qmethod, logical01=logical01 - ) - paste0('---', eol, yaml::as.yaml(yaml_header, line.sep=eol), '---', eol) # NB: as.yaml adds trailing newline + # yaml=TRUE writes just the generated header; a list writes its named elements, preceded by the generated header when it has an unnamed TRUE + if (isTRUE(yaml)) { + yaml_text = generate_yaml(x = x, col.names = col.names, sep = sep, + sep2 = sep2, eol = eol, na = na, dec = dec, + qmethod = qmethod, logical01 = logical01) + } else { + yaml_names = names(yaml) + if (is.null(yaml_names)) { + yaml_names = rep("", length(yaml)) + } + unnamed = yaml_names == "" + true_yaml = vapply_1b(yaml, isTRUE) + + generated_yaml = unnamed & true_yaml + if (any(unnamed & !true_yaml)) { + stop("`yaml` contains unnamed elements that are not `TRUE`") + } + yaml_text = yaml[!generated_yaml] + if (any(generated_yaml)) { + yaml_header = generate_yaml(x = x, col.names = col.names, sep = sep, + sep2 = sep2, eol = eol, na = na, dec = dec, + qmethod = qmethod, logical01 = logical01) + yaml_text = c(yaml_header, yaml_text) + } + } + + yaml_text = paste0('---', eol, yaml::as.yaml(yaml_text, line.sep = eol), '---', eol) # NB: as.yaml adds trailing newline } file = enc2native(file) # CfwriteR cannot handle UTF-8 if that is not the native encoding, see #3078. 
.Call(CfwriteR, x, file, sep, sep2, eol, na, dec, quote, qmethod=="escape", append, row.names, col.names, logical01, scipen, dateTimeAs, buffMB, nThread, - showProgress, is_gzip, bom, yaml, verbose) + showProgress, is_gzip, bom, yaml_text, verbose) invisible() } - diff --git a/R/utils.R b/R/utils.R index 42e67ea8de..107d88c12d 100644 --- a/R/utils.R +++ b/R/utils.R @@ -124,6 +124,40 @@ do_patterns = function(pat_sub, all_cols) { return(matched) } +# returns the list that fwrite() turns into the csvy yaml header: source, creation time, per-column schema (name, type, attributes) and the write settings +generate_yaml = function(x, col.names = TRUE, sep = ",", sep2 = "", + eol = if (.Platform$OS.type == "windows") "\r\n" else "\n", + na = "", dec = "", qmethod = "double", + logical01 = getOption("datatable.logical01", FALSE)) { + fields = lapply(seq_along(x), function(i) { + attrs = attributes(x[[i]]) + attrs = attrs[!(names(attrs) %in% "class")] # class is recorded as `type` below + + ret = list(name = names(x)[i], # names() rather than colnames(): colnames() is NULL for a plain (non-data.frame) list + type = class(x[[i]])[[1L]]) # multi-class objects reduced to first class + + if (length(attrs) > 0L) { # length(NULL) is 0, so this also covers columns without attributes + ret = c(ret, list(attributes = attrs)) + } + ret + }) + + list( + source = sprintf('R[v%s.%s]::data.table[v%s]::fwrite', + R.version$major, R.version$minor, format(tryCatch(utils::packageVersion('data.table'), error=function(e) 'DEV'))), + creation_time_utc = format(Sys.time(), tz='UTC'), + schema = list(fields = fields), + header = col.names, + sep = sep, + sep2 = sep2, + eol = eol, + na.strings = na, + dec = dec, + qmethod = qmethod, + logical01 = logical01 + ) +} + # check UTC status is_utc = function(tz) { # via grep('UTC|GMT', OlsonNames(), value = TRUE); ordered by "prior" frequency diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index ed17470383..e32821a5a4 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -8120,7 +8120,7 @@ test(1588.7, dt[ch>"c"], dt[4:6]) # coverage of a return(NULL) in .prepareFastS # data.table operates consistently independent of locale, but it's R that changes and is sensitive to it. # Because keys/indexes depend on a sort order. 
If a data.table is stored on disk with a key -# created in a locale-sensitive order and then loaded by another R session in a different locale, the ability to re-use existing sortedness +# created in a locale-sensitive order and then loaded by another R session in a different locale, the ability to reuse existing sortedness # will break because the order would depend on the locale. Which is why data.table is deliberately C-locale only. For consistency and simpler # internals for robustness to reduce the change of errors and to avoid that class of bug. It would be possible to have locale-sensitive keys # and indexes but we've, so far, decided not to, for those reasons. @@ -8137,12 +8137,20 @@ Encoding(x1) = "latin1" x2 = iconv(x1, "latin1", "UTF-8") test(1590.01, identical(x1,x2)) test(1590.02, x1==x2) -test(1590.03, forderv( c(x2,x1,x1,x2)), integer()) # desirable consistent result given data.table's needs -test(1590.04, base::order(c(x2,x1,x1,x2)), INT(1,4,2,3)) # different result in base R under C locale even though identical(x1,x2) +test(1590.03, forderv( c(x2,x1,x1,x2)), integer()) # desirable consistent result given identical(x1, x2) + # ^^ data.table consistent over time regardless of which version of R or locale +baseR = base::order(c(x2,x1,x1,x2)) + # Even though C locale and identical(x1,x2), base R<=4.0.0 considers the encoding too; i.e. orders the encoding together x2 (UTF-8) before x1 (latin1). + # Then around May 2020, R-devel (but just on Windows) started either respecting identical() like data.table has always done, or put latin1 before UTF-8. + # Jan emailed R-devel on 23 May 2020. + # We relaxed 1590.04 and 1590.07 (tests of base R behaviour) rather than remove them, PR#4492 and its follow-up. But these two tests + # are so relaxed now that they are barely testing anything. It appears base R behaviour is undefined in this rare case of identical strings in different encodings. 
+test(1590.04, identical(baseR, INT(1,4,2,3)) || identical(baseR, INT(2,3,1,4)) || identical(baseR, 1:4)) Encoding(x2) = "unknown" test(1590.05, x1!=x2) test(1590.06, forderv( c(x2,x1,x1,x2)), INT(1,4,2,3)) # consistent with Windows-1252 result, tested further below -test(1590.07, base::order(c(x2,x1,x1,x2)), INT(2,3,1,4)) # different result; base R is encoding-sensitive in C-locale +baseR = base::order(c(x2,x1,x1,x2)) +test(1590.07, identical(baseR, INT(1,4,2,3)) || identical(baseR, INT(2,3,1,4)) || identical(baseR, 1:4)) Sys.setlocale("LC_CTYPE", ctype) Sys.setlocale("LC_COLLATE", collate) test(1590.08, Sys.getlocale(), oldlocale) # checked restored locale fully back to how it was before this test @@ -14839,6 +14847,67 @@ if (test_yaml) { # csvy; #1701 close(fcon) test(2033.14, fread(f), DT) unlink(f) + + # fwrite(yaml=<list>): named elements are written to the yaml header; an unnamed TRUE also writes the generated header + data = data.table(x = 1, y = 100) + file = tempfile() + + fwrite(data, file, yaml = list(something_else = "o hai!")) + x = fread(file, yaml = TRUE) + unlink(file) + test(2139.01, attr(x, "yaml_metadata")$something_else, "o hai!") + test(2139.02, attr(x, "yaml_metadata")$schema, NULL) + + fwrite(data, file, yaml = TRUE) + x = fread(file, yaml = TRUE) + unlink(file) + + test(2139.03, attr(x, "yaml_metadata")$schema$fields[[1]]$name, "x") + + + fwrite(data, file, yaml = list(TRUE)) + x = fread(file, yaml = TRUE) + unlink(file) + + test(2139.04, attr(x, "yaml_metadata")$schema$fields[[1]]$name, "x") + + + fwrite(data, file, yaml = list(TRUE, something_else = "o hai!")) + x = fread(file, yaml = TRUE) + unlink(file) + + test(2139.05, attr(x, "yaml_metadata")$schema$fields[[1]]$name, "x") + test(2139.06, attr(x, "yaml_metadata")$something_else, "o hai!") + + + fwrite(data, file, yaml = list(TRUE, something_else = "o hai!", this_is_true = TRUE)) + x = fread(file, yaml = TRUE) + unlink(file) + + test(2139.07, attr(x, "yaml_metadata")$schema$fields[[1]]$name, "x") + test(2139.08, attr(x, "yaml_metadata")$something_else, "o hai!") + test(2139.09, 
attr(x, "yaml_metadata")$this_is_true, TRUE) + + test(2139.10, fwrite(data, file, yaml = list(TRUE, "unnamed_stuff")), error = "unnamed") + + # column attributes (factor levels, custom attributes) are written to the schema and restored by fread(yaml=TRUE) + data = data.table(x = 1L, y = 100L, + c = factor(letters[1:3], levels = letters[1:4]), + date = Sys.Date()) + attr(data$x, "attr") <- "attribute" + fwrite(data, file, yaml = TRUE) + x = fread(file, yaml = TRUE) + + + test(2139.11, levels(x$c), letters[1:4]) + test(2139.12, attr(x$x, "attr"), "attribute") # check the object read back (x), not the one that was written (data) + + attr(data$x, "attr") <- NULL + data$c = as.character(data$c) + data$date = as.character(data$date) + + test(2139.13, fread(file, yaml = FALSE), data) + unlink(file) } # fcast coverage @@ -16642,6 +16711,7 @@ test(2125.10, capture.output(print(DT, trunc.cols=TRUE, class=TRUE)), "4 variables not shown: [a , b , c , d ]") options(old_width) + # segfault when i is NULL or zero-column, #4060 DT = data.table(A="a", key="A") test(2126.1, DT[J(NULL)], DT[0])