Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
dc4c578
Adds yaml.extra to add custom attributes to fwrite's yaml
eliocamp Dec 16, 2019
5d22788
Adds test
eliocamp Dec 17, 2019
c3bb737
Escapes test
eliocamp Dec 28, 2019
6fbca5d
Adapts to new syntax for yaml.
eliocamp Feb 16, 2020
27a9d11
whitespace
MichaelChirico Feb 17, 2020
acb6478
Merge branch 'master' into extra-yaml
MichaelChirico Feb 17, 2020
31986b6
Small refactor in generate_yaml's schema
eliocamp Feb 17, 2020
5e2b092
More clear yaml logic
eliocamp Feb 17, 2020
b809f19
Saves and reads column attributes
eliocamp Feb 20, 2020
1608fa3
Adds test
eliocamp Feb 20, 2020
2eee0de
Adds attributes to all columns, not just factors et.al.
eliocamp Feb 20, 2020
3ae164b
Skips length 0 attributes
eliocamp Feb 20, 2020
c109708
Fix previous commit
eliocamp Feb 20, 2020
d47a83f
dev script updates only: CRAN_Release, and added my .bash_aliases and…
mattdowle May 23, 2020
f6bc553
relaxed test 1590 given change in R-devel on ordering encodings (#4492)
mattdowle May 25, 2020
cacdc92
further relaxation of 1590.04 and 1590.07; base R ordering of identic…
mattdowle May 26, 2020
3aa82c5
Adds yaml.extra to add custom attributes to fwrite's yaml
eliocamp Dec 16, 2019
063a4bc
Adds test
eliocamp Dec 17, 2019
fa0d66c
Escapes test
eliocamp Dec 28, 2019
ff47461
Adapts to new syntax for yaml.
eliocamp Feb 16, 2020
f91c99f
whitespace
MichaelChirico Feb 17, 2020
dce4e2e
Small refactor in generate_yaml's schema
eliocamp Feb 17, 2020
24f446e
More clear yaml logic
eliocamp Feb 17, 2020
df5b541
Saves and reads column attributes
eliocamp Feb 20, 2020
a5d1ce8
Adds test
eliocamp Feb 20, 2020
4735509
Adds attributes to all columns, not just factors et.al.
eliocamp Feb 20, 2020
b783fe3
Skips length 0 attributes
eliocamp Feb 20, 2020
9b43bdc
Fix previous commit
eliocamp Feb 20, 2020
1942aa0
Changes News and Description
eliocamp May 26, 2020
8dae2ba
Fixes test number
eliocamp May 26, 2020
e857360
Fixes error in test
eliocamp May 26, 2020
ce7293b
Merge remote-tracking branch 'origin/yaml-attrs' into yaml-attrs
eliocamp May 26, 2020
ae58f35
Fixes bad merge
eliocamp May 26, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions .dev/.Rprofile
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Matt's ~/.Rprofile is a link to this file at ~/GitHub/data.table/.dev/.Rprofile

# options(repos = c(CRAN="http://cran.stat.ucla.edu"))
# options(repos = c(CRAN=c("http://cran.stat.ucla.edu", "http://cloud.r-project.org"))) # both needed for revdep checks sometimes
options(repos = c(CRAN="https://cloud.r-project.org")) # https so packages are not downloaded over plain http

options(help_type="html")
options(error=quote(dump.frames()))
options(width=200)
options(digits.secs=3) # for POSIXct to print milliseconds
suppressWarnings(RNGversion("3.5.0")) # so when I create tests in dev there isn't a mismatch when run by cc()

# PROJ_PATH is exported so that it is visible through Sys.getenv() to the code sourced below
Sys.setenv(PROJ_PATH=path.expand("~/GitHub/data.table"))
source(file.path(Sys.getenv("PROJ_PATH"), ".dev", "cc.R"))
21 changes: 21 additions & 0 deletions .dev/.bash_aliases
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Matt's ~/.bash_aliases is a link to this file ~/GitHub/data.table/.dev/.bash_aliases

# One off configure meld as difftool:
# git config --global diff.tool meld
# git config --global difftool.prompt false
# gd / gdm: run git difftool (gdm: against master) and discard its terminal output
alias gd='git difftool &> /dev/null'
alias gdm='git difftool master &> /dev/null'

# Locally built R binaries under ~/build, each started with --vanilla
alias Rdevel='~/build/R-devel/bin/R --vanilla'
alias Rdevel-strict-gcc='~/build/R-devel-strict-gcc/bin/R --vanilla'
alias Rdevel-strict-clang='~/build/R-devel-strict-clang/bin/R --vanilla'
alias Rdevel32='~/build/32bit/R-devel/bin/R --vanilla'
alias R310='~/build/R-3.1.0/bin/R --vanilla'
# revdepsh: cd into ~/build/revdeplib/ and export TZ, R_LIBS_SITE, R_LIBS and _R_CHECK_FORCE_SUGGESTS_
# revdepr: run revdepsh, then start R-devel with .dev/revdep.R as the user profile
alias revdepsh='cd ~/build/revdeplib/ && export TZ=UTC && export R_LIBS_SITE=none && export R_LIBS=~/build/revdeplib/ && export _R_CHECK_FORCE_SUGGESTS_=false'
alias revdepr='revdepsh; R_PROFILE_USER=~/GitHub/data.table/.dev/revdep.R ~/build/R-devel/bin/R'

export R_PROFILE_USER='~/.Rprofile'
# there's a .Rprofile in ~/GitHub/data.table/ so Matt sets R_PROFILE_USER here to always use ~/.Rprofile
# even when starting R in ~/GitHub/data.table
# Matt's ~/.Rprofile is a link to ~/GitHub/data.table/.dev/.Rprofile

11 changes: 6 additions & 5 deletions .dev/CRAN_Release.cmd
Original file line number Diff line number Diff line change
Expand Up @@ -304,7 +304,7 @@ cd R-devel # used for revdep testing: .dev/revdep.R.
./configure CFLAGS="-O2 -Wall -pedantic"
make

# use latest available below `apt cache search gcc-` or `clang-`
# use latest available below `apt-cache search gcc-` or `clang-`
cd ../R-devel-strict-clang
./configure --without-recommended-packages --disable-byte-compiled-packages --disable-openmp --enable-strict-barrier --disable-long-double CC="clang-8 -fsanitize=undefined,address -fno-sanitize=float-divide-by-zero -fno-omit-frame-pointer"
make
Expand Down Expand Up @@ -479,7 +479,7 @@ sudo apt-get -y install r-base r-base-dev
sudo apt-get -y build-dep r-base-dev
sudo apt-get -y build-dep qpdf
sudo apt-get -y install aptitude
sudo aptitude build-dep r-cran-rgl # leads to libglu1-mesa-dev
sudo aptitude -y build-dep r-cran-rgl # leads to libglu1-mesa-dev
sudo apt-get -y build-dep r-cran-rmpi
sudo apt-get -y build-dep r-cran-cairodevice
sudo apt-get -y build-dep r-cran-tkrplot
Expand All @@ -490,8 +490,7 @@ sudo apt-get -y install libv8-dev
sudo apt-get -y install gsl-bin libgsl0-dev
sudo apt-get -y install libgtk2.0-dev netcdf-bin
sudo apt-get -y install libcanberra-gtk-module
sudo apt-get -y install git
sudo apt-get -y install openjdk-8-jdk
sudo apt-get -y install openjdk-11-jdk # solves "fatal error: jni.h: No such file or directory"; change 11 to match "java --version"
sudo apt-get -y install libnetcdf-dev udunits-bin libudunits2-dev
sudo apt-get -y install tk8.6-dev
sudo apt-get -y install clustalo # for package LowMACA
Expand All @@ -512,14 +511,16 @@ sudo apt-get -y install libmagick++-dev # for magick
sudo apt-get -y install libjq-dev libprotoc-dev libprotobuf-dev protobuf-compiler # for protolite
sudo apt-get -y install python-dev # for PythonInR
sudo apt-get -y install gdal-bin libgeos-dev # for rgdal/raster tested via lidR
sudo apt-get build-dep r-cran-rsymphony # for Rsymphony: coinor-libcgl-dev coinor-libclp-dev coinor-libcoinutils-dev coinor-libosi-dev coinor-libsymphony-dev
sudo apt-get -y build-dep r-cran-rsymphony # for Rsymphony: coinor-libcgl-dev coinor-libclp-dev coinor-libcoinutils-dev coinor-libosi-dev coinor-libsymphony-dev
sudo apt-get -y install libtesseract-dev libleptonica-dev tesseract-ocr-eng # for tesseract
sudo apt-get -y install libssl-dev libsasl2-dev
sudo apt-get -y install biber # for ctsem
sudo apt-get -y install libopenblas-dev # for ivmte (+ local R build with default ./configure to pick up shared openblas)
sudo apt-get -y install libhiredis-dev # for redux used by nodbi
sudo apt-get -y install libzmq3-dev # for rzmq
sudo apt-get -y install libimage-exiftool-perl # for camtrapR
sudo apt-get -y install parallel # for revdepr.R
sudo apt-get -y install pandoc-citeproc # for basecallQC
sudo R CMD javareconf
# ENDIF

Expand Down
3 changes: 2 additions & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,8 @@ Authors@R: c(
person("David","Simons", role="ctb"),
person("Elliott","Sales de Andrade", role="ctb"),
person("Cole","Miller", role="ctb"),
person("@JenspederM","", role="ctb"))
person("@JenspederM","", role="ctb"),
person("Elio", "Campitelli", role="ctb"))
Depends: R (>= 3.1.0)
Imports: methods
Suggests: bit64, curl, R.utils, knitr, xts, nanotime, zoo, yaml
Expand Down
3 changes: 3 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,9 @@ unit = "s")

14. Added support for `round()` and `trunc()` to extend functionality of `ITime`. `round()` and `trunc()` can be used with argument units: "hours" or "minutes". Thanks to @JensPederM for the suggestion and PR.

15. The `yaml` argument in `fwrite` now accepts a list which will be used to populate the yaml header of the generated csv file. Thanks to @eliocamp for the PR.


## BUG FIXES

1. A NULL timezone on POSIXct was interpreted by `as.IDate` and `as.ITime` as UTC rather than the session's default timezone (`tz=""`) , [#4085](https://github.com/Rdatatable/data.table/issues/4085).
Expand Down
10 changes: 8 additions & 2 deletions R/fread.R
Original file line number Diff line number Diff line change
Expand Up @@ -284,7 +284,6 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir())
if (check.names) {
setattr(ans, 'names', make.names(names(ans), unique=TRUE))
}

colClassesAs = attr(ans, "colClassesAs", exact=TRUE) # should only be present if one or more are != ""
for (j in which(colClassesAs!="")) { # # 1634
v = .subset2(ans, j)
Expand All @@ -305,6 +304,7 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir())
return(v)
},
error = fun)

set(ans, j = j, value = new_v) # aside: new_v == v if the coercion was aborted
}
setattr(ans, "colClassesAs", NULL)
Expand All @@ -330,7 +330,13 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir())
}
setkeyv(ans, key)
}
if (yaml) setattr(ans, 'yaml_metadata', yaml_header)
if (yaml) {
setattr(ans, 'yaml_metadata', yaml_header)

for (j in seq_along(ans)) {
attributes(ans[[j]]) <- c(attributes(ans[[j]]), yaml_header$schema$fields[[j]]$attributes)
}
}
if (!is.null(index) && data.table) {
if (!all(sapply(index, is.character)))
stop("index argument of data.table() must be a character vector naming columns (NB: col.names are applied before this)")
Expand Down
53 changes: 31 additions & 22 deletions R/fwrite.R
Original file line number Diff line number Diff line change
Expand Up @@ -82,33 +82,42 @@ fwrite = function(x, file="", append=FALSE, quote="auto",
return(invisible())
}
}
yaml = if (!yaml) "" else {
write_yaml = isTRUE(yaml) || (is.list(yaml) && length(yaml) != 0L)
yaml_text = ""
if (write_yaml) {
if (!requireNamespace('yaml', quietly=TRUE))
stop("'data.table' relies on the package 'yaml' to write the file header; please add this to your library with install.packages('yaml') and try again.") # nocov
schema_vec = sapply(x, class)
# multi-class objects reduced to first class
if (is.list(schema_vec)) schema_vec = sapply(schema_vec, `[`, 1L)
# as.vector strips names
schema_vec = list(name=names(schema_vec), type=as.vector(schema_vec))
yaml_header = list(
source = sprintf('R[v%s.%s]::data.table[v%s]::fwrite',
R.version$major, R.version$minor, format(tryCatch(utils::packageVersion('data.table'), error=function(e) 'DEV'))),
creation_time_utc = format(Sys.time(), tz='UTC'),
schema = list(
fields = lapply(
seq_along(x),
function(i) list(name=schema_vec$name[i], type=schema_vec$type[i])
)
),
header=col.names, sep=sep, sep2=sep2, eol=eol, na.strings=na,
dec=dec, qmethod=qmethod, logical01=logical01
)
paste0('---', eol, yaml::as.yaml(yaml_header, line.sep=eol), '---', eol) # NB: as.yaml adds trailing newline

if (isTRUE(yaml)) {
yaml_text = generate_yaml(x = x, col.names = col.names, sep = sep,
sep2 = sep2, eol = eol, na = na, dec = dec,
qmethod = qmethod, logical01 = logical01)
} else {
names = names(yaml)
if (is.null(names)) {
names = rep("", length(yaml))
}
unnamed = names == ""
true_yaml = vapply_1b(yaml, isTRUE)

generated_yaml = unnamed & true_yaml
if (any(unnamed & !true_yaml)) {
stop("`yaml` contains unnamed elements that are not `TRUE`")
}
yaml_text <- yaml[!generated_yaml]
if (any(generated_yaml)) {
yaml_header = generate_yaml(x = x, col.names = col.names, sep = sep,
sep2 = sep2, eol = eol, na = na, dec = dec,
qmethod = qmethod, logical01 = logical01)
yaml_text = c(yaml_header, yaml_text)
}
}

yaml_text = paste0('---', eol, yaml::as.yaml(yaml_text, line.sep = eol), '---', eol) # NB: as.yaml adds trailing newline
}
file = enc2native(file) # CfwriteR cannot handle UTF-8 if that is not the native encoding, see #3078.
.Call(CfwriteR, x, file, sep, sep2, eol, na, dec, quote, qmethod=="escape", append,
row.names, col.names, logical01, scipen, dateTimeAs, buffMB, nThread,
showProgress, is_gzip, bom, yaml, verbose)
showProgress, is_gzip, bom, yaml_text, verbose)
invisible()
}

34 changes: 34 additions & 0 deletions R/utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,40 @@ do_patterns = function(pat_sub, all_cols) {
return(matched)
}


# Build the list that fwrite() serialises as the YAML header of the file it writes.
#
# Arguments:
#   x          list of columns being written (data.table, data.frame or plain named list)
#   col.names  stored in the header as `header`
#   na         stored in the header as `na.strings`
#   sep, sep2, eol, dec, qmethod, logical01
#              stored in the header under their own names
#
# Returns a named list with elements source, creation_time_utc, schema, header, sep, sep2,
# eol, na.strings, dec, qmethod and logical01. `schema$fields` holds one entry per column
# with `name`, `type` (the first class of the column) and, only when the column carries
# attributes other than "class", an `attributes` list of those attributes.
generate_yaml = function(x, col.names = TRUE, sep = ",", sep2 = "",
                         eol = if (.Platform$OS.type == "windows") "\r\n" else "\n",
                         na = "", dec = "", qmethod = "double",
                         logical01 = getOption("datatable.logical01", FALSE)) {
  # names() rather than colnames(): colnames() is NULL for a plain list, which dropped
  # the field names whenever x was not a data.frame/data.table
  col_names = names(x)

  fields = lapply(seq_along(x), function(i) {
    column = x[[i]]
    # the class is already recorded as `type`, so it is not repeated among the attributes
    attrs = attributes(column)
    attrs = attrs[names(attrs) != "class"]

    field = list(name = col_names[i],
                 type = class(column)[[1L]]) # multi-class objects reduced to first class

    # attributes(column) is NULL for a plain vector; length(NULL) is 0, so one scalar test suffices
    if (length(attrs) > 0L) {
      field = c(field, list(attributes = attrs))
    }
    field
  })

  list(
    source = sprintf('R[v%s.%s]::data.table[v%s]::fwrite',
                     R.version$major, R.version$minor,
                     # packageVersion() errors when data.table is not installed (e.g. dev mode)
                     format(tryCatch(utils::packageVersion('data.table'), error=function(e) 'DEV'))),
    creation_time_utc = format(Sys.time(), tz='UTC'),
    schema = list(fields = fields),
    header = col.names,
    sep = sep,
    sep2 = sep2,
    eol = eol,
    na.strings = na,
    dec = dec,
    qmethod = qmethod,
    logical01 = logical01
  )
}

# check UTC status
is_utc = function(tz) {
# via grep('UTC|GMT', OlsonNames(), value = TRUE); ordered by "prior" frequency
Expand Down
78 changes: 74 additions & 4 deletions inst/tests/tests.Rraw
Original file line number Diff line number Diff line change
Expand Up @@ -8120,7 +8120,7 @@ test(1588.7, dt[ch>"c"], dt[4:6]) # coverage of a return(NULL) in .prepareFastS

# data.table operates consistently independent of locale, but it's R that changes and is sensitive to it.
# Because keys/indexes depend on a sort order. If a data.table is stored on disk with a key
# created in a locale-sensitive order and then loaded by another R session in a different locale, the ability to re-use existing sortedness
# created in a locale-sensitive order and then loaded by another R session in a different locale, the ability to reuse existing sortedness
# will break because the order would depend on the locale. Which is why data.table is deliberately C-locale only. For consistency and simpler
# internals for robustness to reduce the chance of errors and to avoid that class of bug. It would be possible to have locale-sensitive keys
# and indexes but we've, so far, decided not to, for those reasons.
Expand All @@ -8137,12 +8137,20 @@ Encoding(x1) = "latin1"
x2 = iconv(x1, "latin1", "UTF-8")
test(1590.01, identical(x1,x2))
test(1590.02, x1==x2)
test(1590.03, forderv( c(x2,x1,x1,x2)), integer()) # desirable consistent result given data.table's needs
test(1590.04, base::order(c(x2,x1,x1,x2)), INT(1,4,2,3)) # different result in base R under C locale even though identical(x1,x2)
test(1590.03, forderv( c(x2,x1,x1,x2)), integer()) # desirable consistent result given identical(x1, x2)
# ^^ data.table consistent over time regardless of which version of R or locale
baseR = base::order(c(x2,x1,x1,x2))
# Even though C locale and identical(x1,x2), base R<=4.0.0 considers the encoding too; i.e. orders the encoding together x2 (UTF-8) before x1 (latin1).
# Then around May 2020, R-devel (but just on Windows) started either respecting identical() like data.table has always done, or put latin1 before UTF-8.
# Jan emailed R-devel on 23 May 2020.
# We relaxed 1590.04 and 1590.07 (tests of base R behaviour) rather than remove them, PR#4492 and its follow-up. But these two tests
# are so relaxed now that they are barely testing anything. It appears base R behaviour is undefined in this rare case of identical strings in different encodings.
test(1590.04, identical(baseR, INT(1,4,2,3)) || identical(baseR, INT(2,3,1,4)) || identical(baseR, 1:4))
Encoding(x2) = "unknown"
test(1590.05, x1!=x2)
test(1590.06, forderv( c(x2,x1,x1,x2)), INT(1,4,2,3)) # consistent with Windows-1252 result, tested further below
test(1590.07, base::order(c(x2,x1,x1,x2)), INT(2,3,1,4)) # different result; base R is encoding-sensitive in C-locale
baseR = base::order(c(x2,x1,x1,x2))
test(1590.07, identical(baseR, INT(1,4,2,3)) || identical(baseR, INT(2,3,1,4)) || identical(baseR, 1:4))
Sys.setlocale("LC_CTYPE", ctype)
Sys.setlocale("LC_COLLATE", collate)
test(1590.08, Sys.getlocale(), oldlocale) # checked restored locale fully back to how it was before this test
Expand Down Expand Up @@ -14839,6 +14847,67 @@ if (test_yaml) { # csvy; #1701
close(fcon)
test(2033.14, fread(f), DT)
unlink(f)

# Extra arguments in yaml
data = data.table(x = 1, y = 100)
file = tempfile() # base R's tempfile(); there is no tmpfile() function

# a fully named list is written on its own: custom entries only, no generated header
fwrite(data, file, yaml = list(something_else = "o hai!"))
x = fread(file, yaml = TRUE)
unlink(file)
test(2139.01, attr(x, "yaml_metadata")$something_else, "o hai!")
test(2139.02, attr(x, "yaml_metadata")$schema, NULL)

# yaml=TRUE writes the generated header
fwrite(data, file, yaml = TRUE)
x = fread(file, yaml = TRUE)
unlink(file)

test(2139.03, attr(x, "yaml_metadata")$schema$fields[[1]]$name, "x")


# an unnamed TRUE inside the list also requests the generated header
fwrite(data, file, yaml = list(TRUE))
x = fread(file, yaml = TRUE)
unlink(file)

test(2139.04, attr(x, "yaml_metadata")$schema$fields[[1]]$name, "x")


# generated header combined with custom entries
fwrite(data, file, yaml = list(TRUE, something_else = "o hai!"))
x = fread(file, yaml = TRUE)
unlink(file)

test(2139.05, attr(x, "yaml_metadata")$schema$fields[[1]]$name, "x")
test(2139.06, attr(x, "yaml_metadata")$something_else, "o hai!")


# a *named* TRUE is a custom entry, not a request for the generated header
fwrite(data, file, yaml = list(TRUE, something_else = "o hai!", this_is_true = TRUE))
x = fread(file, yaml = TRUE)
unlink(file)

test(2139.07, attr(x, "yaml_metadata")$schema$fields[[1]]$name, "x")
test(2139.08, attr(x, "yaml_metadata")$something_else, "o hai!")
test(2139.09, attr(x, "yaml_metadata")$this_is_true, TRUE)

# unnamed elements other than TRUE are rejected
test(2139.10, fwrite(data, file, yaml = list(TRUE, "unnamed_stuff")), error = "unnamed")


# column attributes are saved in the header and restored on read
data = data.table(x = 1L, y = 100L,
                  c = factor(letters[1:3], levels = letters[1:4]),
                  date = Sys.Date())
attr(data$x, "attr") <- "attribute"
fwrite(data, file, yaml = TRUE)
x = fread(file, yaml = TRUE)


test(2139.11, levels(x$c), letters[1:4])
# check the column that was read back (x), not the input (data), so the round trip is what is tested
test(2139.12, attr(x$x, "attr"), "attribute")

attr(data$x, "attr") <- NULL
data$c = as.character(data$c)
data$date = as.character(data$date)

# with yaml=FALSE nothing from the header is applied: plain columns, no attributes
test(2139.13, fread(file, yaml = FALSE), data)
unlink(file)
}

# fcast coverage
Expand Down Expand Up @@ -16642,6 +16711,7 @@ test(2125.10, capture.output(print(DT, trunc.cols=TRUE, class=TRUE)),
"4 variables not shown: [a <char>, b <char>, c <char>, d <char>]")
options(old_width)


# segfault when i is NULL or zero-column, #4060
DT = data.table(A="a", key="A")
test(2126.1, DT[J(NULL)], DT[0])
Expand Down