Rdatatable · mattdowle · May 2, 2019 · Apr 3, 2019 · Apr 3, 2019 · May 2, 2019
@@ -42,7 +42,7 @@ Authors@R: c(
   person("Roy","Storey",           role="ctb"))
 Depends: R (>= 3.1.0)
 Imports: methods
-Suggests: bit64, curl, R.utils, knitr, xts, nanotime, zoo
+Suggests: bit64, curl, R.utils, knitr, xts, nanotime, zoo, yaml
 SystemRequirements: zlib
 Description: Fast aggregation of large data (e.g. 100GB in RAM), fast ordered joins, fast add/modify/delete of columns by group using no copies at all, list columns, friendly and fast character-separated-value read/write. Offers a natural and flexible syntax, for faster development.
 License: MPL-2.0 | file LICENSE

@@ -11,6 +11,7 @@
     * `colClasses` now supports `'complex'`, `'raw'`, `'Date'`, `'POSIXct'`, and user-defined classes (so long as an `as.` method exists), [#491](https://github.com/Rdatatable/data.table/issues/491) [#1634](https://github.com/Rdatatable/data.table/issues/1634) [#2610](https://github.com/Rdatatable/data.table/issues/2610). Any error during coercion results in a warning and the column is left as the default type (probably `"character"`). Thanks to @hughparsonage for the PR.
     * `stringsAsFactors=0.10` will factorize any character column containing under `0.10*nrow` unique strings, [#2025](https://github.com/Rdatatable/data.table/issues/2025). Thanks to @hughparsonage for the PR.
     * `colClasses=list(numeric=20:30, numeric="ID")` will apply the `numeric` type to column numbers `20:30` as before and now also column name `"ID"`; i.e. all duplicate class names are now respected rather than only the first. This need may arise when specifying some columns by name and others by number, as in this example. Thanks to @hughparsonage for the PR.
+    * gains `yaml` (default `FALSE`) and the ability to parse CSVY-formatted input files; i.e., csv files with metadata in a header formatted as YAML (http://csvy.org/), [#1701](https://github.com/Rdatatable/data.table/issues/1701). See `?fread` and files in `/inst/tests/csvy/` for sample formats. Please provide feedback if you find this feature useful and would like extended capabilities. For now, consider it experimental, meaning the API/arguments may change. Thanks to @leeper at [`rio`](https://github.com/leeper/rio) for the inspiration and @MichaelChirico for implementing.
 
 3. `fwrite()`:
     * now writes compressed `.gz` files directly, [#2016](https://github.com/Rdatatable/data.table/issues/2016). Compression, like `fwrite()`, is multithreaded and compresses each chunk on-the-fly (a full size intermediate file is not created). Use a ".gz" extension, or the new `compress=` option. Many thanks to Philippe Chataignon for the significant PR. For example:

@@ -1,12 +1,11 @@
-
 fread <- function(
 input="", file=NULL, text=NULL, cmd=NULL, sep="auto", sep2="auto", dec=".", quote="\"", nrows=Inf, header="auto",
 na.strings=getOption("datatable.na.strings","NA"), stringsAsFactors=FALSE, verbose=getOption("datatable.verbose",FALSE),
 skip="__auto__", select=NULL, drop=NULL, colClasses=NULL, integer64=getOption("datatable.integer64","integer64"),
 col.names, check.names=FALSE, encoding="unknown", strip.white=TRUE, fill=FALSE, blank.lines.skip=FALSE, key=NULL, index=NULL,
 showProgress=getOption("datatable.showProgress",interactive()), data.table=getOption("datatable.fread.datatable",TRUE),
 nThread=getDTthreads(verbose), logical01=getOption("datatable.logical01",FALSE), keepLeadingZeros=getOption("datatable.keepLeadingZeros",FALSE),
-autostart=NA)
+yaml=FALSE, autostart=NA)
 {
   if (missing(input)+is.null(file)+is.null(text)+is.null(cmd) < 3L) stop("Used more than one of the arguments input=, file=, text= and cmd=.")
   input_has_vars = length(all.vars(substitute(input)))>0L  # see news for v1.11.6
@@ -23,7 +22,7 @@ autostart=NA)
     stop("Argument 'encoding' must be 'unknown', 'UTF-8' or 'Latin-1'.")
   }
   stopifnot( isTRUEorFALSE(strip.white), isTRUEorFALSE(blank.lines.skip), isTRUEorFALSE(fill), isTRUEorFALSE(showProgress),
-             isTRUEorFALSE(verbose), isTRUEorFALSE(check.names), isTRUEorFALSE(logical01), isTRUEorFALSE(keepLeadingZeros) )
+             isTRUEorFALSE(verbose), isTRUEorFALSE(check.names), isTRUEorFALSE(logical01), isTRUEorFALSE(keepLeadingZeros), isTRUEorFALSE(yaml) )
   stopifnot( isTRUEorFALSE(stringsAsFactors) || (is.double(stringsAsFactors) && length(stringsAsFactors)==1L && 0.0<=stringsAsFactors && stringsAsFactors<=1.0))
   stopifnot( is.numeric(nrows), length(nrows)==1L )
   if (is.na(nrows) || nrows<0) nrows=Inf   # accept -1 to mean Inf, as read.table does
@@ -134,8 +133,9 @@ autostart=NA)
     }
   }
   stopifnot(length(skip)==1L, !is.na(skip), is.character(skip) || is.numeric(skip))
-  if (skip=="__auto__") skip=-1L   # skip="string" so long as "string" is not "__auto__". Best conveys to user something is automatic there (than -1 or NA).
-  if (is.double(skip)) skip = as.integer(skip)
+  if (identical(skip,"__auto__")) skip = ifelse(yaml,0L,-1L)
+  else if (is.double(skip)) skip = as.integer(skip)
+  # else skip="string" so long as "string" is not "__auto__" (best conveys to user skip is automatic rather than user needing to know -1 or NA means auto)
   stopifnot(is.null(na.strings) || is.character(na.strings))
   tt = grep("^\\s+$", na.strings)
   if (length(tt)) {
@@ -152,6 +152,120 @@ autostart=NA)
     }
     # whitespace at the beginning or end of na.strings is checked at C level and is an error there; test 1804
   }
+  if (yaml) {
+    # for tracking which YAML elements may be overridden by being declared explicitly
+    call_args = names(match.call())
+    if (!requireNamespace('yaml', quietly = TRUE))
+      stop("'data.table' relies on the package 'yaml' to parse the file header; please add this to your library with install.packages('yaml') and try again.") # nocov
+    if (is.character(skip))
+      warning("Combining a search string as 'skip' and reading a YAML header may not work as expected -- currently, ",
+              "reading will proceed to search for 'skip' from the beginning of the file, NOT from the end of ",
+              "the metadata; please file an issue on GitHub if you'd like to see more intuitive behavior supported.")
+    # create connection to stream header lines from file:
+    #   https://stackoverflow.com/questions/9871307
+    f = base::file(input, 'r')
+    first_line = readLines(f, n=1L)
+    n_read = 1L
+    yaml_border_re = '^#?---'
+    if (!grepl(yaml_border_re, first_line)) {
+      close(f)
+      stop('Encountered <', substring(first_line, 1L, 50L), if (nchar(first_line) > 50L) '...', '> at the first ',
+           'unskipped line (', 1L+skip, '), which does not constitute the start to a valid YAML header ',
+           '(expecting something matching regex "', yaml_border_re, '"); please check your input and try again.')
+    }
+
+    yaml_comment_re = '^#'
+    yaml_string = character(0L)
+    while (TRUE) {
+      this_line = readLines(f, n=1L)
+      n_read = n_read + 1L
+      if (!length(this_line)){
+        close(f)
+        stop('Reached the end of the file before finding a completion to the YAML header. A valid YAML header is bookended by lines matching ',
+             'the regex "', yaml_border_re, '". Please double check the input file is a valid csvy.')
+      }
+      if (grepl(yaml_border_re, this_line)) break
+      if (grepl(yaml_comment_re, this_line))
+        this_line = sub(yaml_comment_re, '', this_line)
+      yaml_string = paste(yaml_string, this_line, sep='\n')
+    }
+    close(f) # when #561 is implemented, no need to close f.
+
+    yaml_header = yaml::yaml.load(yaml_string)
+    yaml_names = names(yaml_header)
+    if (verbose) cat('Processed', n_read, 'lines of YAML metadata with the following top-level fields:', brackify(yaml_names), '\n')
+    # process header first since it impacts how to handle colClasses
+    if ('header' %chin% yaml_names) {
+      if ('header' %chin% call_args) message("User-supplied 'header' will override that found in metadata.")
+      else header = as.logical(yaml_header$header)
+    }
+    if ('schema' %chin% yaml_names) {
+      new_types = sapply(yaml_header$schema$fields, `[[`, 'type')
+      if (any(null_idx <- sapply(new_types, is.null)))
+        new_types = do.call(c, new_types)
+      synonms = rbindlist(list(
+        character = list(syn = c('character', 'string')),
+        integer = list(syn = c('integer', 'int')),
+        numeric = list(syn = c('numeric', 'number', 'double')),
+        factor = list(syn = c('factor', 'categorical')),
+        integer64 = list(syn = c('integer64', 'int64'))
+      ), idcol = 'r_type')
+      setkeyv(synonms, 'syn')
+      new_types = synonms[list(new_types)]$r_type
+      new_names = sapply(yaml_header$schema$fields[!null_idx], `[[`, 'name')
+
+      if ('col.names' %chin% call_args) message("User-supplied column names in 'col.names' will override those found in YAML metadata.")
+      # resolve any conflicts with colClasses, if supplied;
+      #   colClasses (if present) is already in list form by now
+      if ('colClasses' %chin% call_args) {
+        if (any(idx_name <- new_names %chin% unlist(colClasses))) {
+          matched_name_idx = which(idx_name)
+          if (!all(idx_type <- sapply(matched_name_idx, function(ii) {
+            new_names[ii] %chin% colClasses[[ new_types[ii] ]]
+          }))) {
+            plural = sum(idx_type) > 1L
+            message('colClasses dictated by user input and those read from YAML header are in conflict (specifically, for column', if (plural) 's',
+                    ' [', paste(new_names[matched_name_idx[!idx_type]], collapse = ','), ']); the proceeding assumes the user input was ',
+                    'an intentional override and will ignore the types implied by the YAML header; please exclude ',
+                    if (plural) 'these columns' else 'this column from colClasses if this was unintentional.')
+          }
+        }
+        # only add unmentioned columns
+        for (ii in which(!idx_name)) {
+          colClasses[[ new_types[ii] ]] = c(colClasses[[ new_types[ii] ]], new_names[ii])
+        }
+      } else {
+        # there are no names to be matched in the data, which fread expects
+        #   at the C level; instead, apply these in post through col.names
+        #   and send the auto-generated V1:Vn as dummies
+        if (identical(header, FALSE)) {
+          if (!'col.names' %chin% call_args) col.names = new_names
+          new_names = paste0('V', seq_along(new_names))
+        }
+        colClasses = tapply(new_names, new_types, c, simplify=FALSE)
+      }
+    }
+    sep_syn = c('sep', 'delimiter')
+    if (any(sep_idx <- sep_syn %chin% yaml_names)) {
+      if ('sep' %chin% call_args) message("User-supplied 'sep' will override that found in metadata.")
+      else sep = yaml_header[[ sep_syn[sep_idx][1L] ]]
+    }
+    quote_syn = c('quote', 'quoteChar', 'quote_char')
+    if (any(quote_idx <- quote_syn %chin% yaml_names)) {
+      if ('quote' %chin% call_args) message("User-supplied 'quote' will override that found in metadata.")
+      else quote = yaml_header[[ quote_syn[quote_idx][1L] ]]
+    }
+    dec_syn = c('dec', 'decimal')
+    if (any(dec_idx <- dec_syn %chin% yaml_names)) {
+      if ('dec' %chin% call_args) message("User-supplied 'dec' will override that found in metadata.")
+      else dec = yaml_header[[ dec_syn[dec_idx][1L] ]]
+    }
+    if ('na.strings' %chin% yaml_names) {
+      if ('na.strings' %chin% call_args) message("User-supplied 'na.strings' will override that found in metadata.")
+      else na.strings = yaml_header$na.strings
+    }
+    if (is.integer(skip)) skip = skip + n_read
+  }
   warnings2errors = getOption("warn") >= 2
   ans = .Call(CfreadR,input,sep,dec,quote,header,nrows,skip,na.strings,strip.white,blank.lines.skip,
               fill,showProgress,nThread,verbose,warnings2errors,logical01,select,drop,colClasses,integer64,encoding,keepLeadingZeros)
@@ -185,7 +299,7 @@ autostart=NA)
              methods::as(v, new_class))
       },
       warning = fun <- function(e) {
-        warning("Column '", names(ans)[j], "' was set by colClasses to be '", new_class, "' but fread encountered the following ", 
+        warning("Column '", names(ans)[j], "' was set by colClasses to be '", new_class, "' but fread encountered the following ",
                 if (inherits(e, "error")) "error" else "warning", ":\n\t", e$message, "\nso the column has been left as type '", typeof(v), "'", call.=FALSE)
         return(v)
       },
@@ -230,6 +344,7 @@ autostart=NA)
     }
     setkeyv(ans, key)
   }
+  if (yaml) setattr(ans, 'yaml_metadata', yaml_header)
   if (!is.null(index) && data.table) {
     if (!all(sapply(index, is.character)))
       stop("index argument of data.table() must be a character vector naming columns (NB: col.names are applied before this)")

@@ -43,6 +43,7 @@ build_script:
   - travis-tool.sh r_install xts
   - travis-tool.sh r_install nanotime
   - travis-tool.sh r_install R.utils
+  - travis-tool.sh r_install yaml
 
 test_script:
   - travis-tool.sh run_tests

@@ -0,0 +1,21 @@
+---
+name: my-dataset
+source: https://github.com/leeper/csvy/tree/master/inst/examples
+schema:
+  fields:
+  - name: var1
+    title: variable 1
+    type: string
+    description: explaining var1
+    constraints:
+      - required: true
+  - name: var2
+    title: variable 2
+    type: integer
+  - name: var3
+    title: variable 3
+    type: number
+---
+var1,var2,var3
+A,1,2.5
+B,3,4.3
@@ -0,0 +1,25 @@
+---
+name:
+schema:
+  fields:
+  - name: var1
+    title: variable 1
+    type: string
+    description: a single-quoted character variable
+  - name: var2
+    title: variable 2
+    type: integer
+  - name: var3
+    title: variable 3
+    type: number
+    description: European-style numeric
+header: true
+sep: "|"
+dec: ","
+quote: "'"
+na.strings: '@'
+---
+var1|var2|var3
+'A'|1|2,5
+'B'|@|4,3
+
@@ -0,0 +1,22 @@
+#---
+#name: my-dataset
+#source: https://github.com/leeper/csvy/tree/master/inst/examples
+#schema:
+#  fields:
+#  - name: var1
+#    title: variable 1
+#    type: string
+#    description: explaining var1
+#    constraints:
+#      - required: true
+#  - name: var2
+#    title: variable 2
+#    type: integer
+#  - name: var3
+#    title: variable 3
+#    type: number
+#---
+var1,var2,var3
+A,1,2.5
+B,3,4.3
+
@@ -0,0 +1,47 @@
+---
+names:
+- Date
+- WTI
+class: data.frame
+title: Cushing, OK WTI Spot Price FOB
+filename: data.csv
+fileurl: https://raw.githubusercontent.com/jrovegno/csvy/master/data.csv
+sourceurl: http://www.eia.gov/dnav/pet/hist/LeafHandler.ashx?n=PET&s=RWTC&f=D
+source_csvy: https://github.com/leeper/csvy/tree/master/inst/examples
+item: PET
+sourcekey: RWTC
+freq: Daily
+rate: MID
+type: price
+units: Dollars per Barrel
+latestdate: '2015-08-31'
+releasedate: '2015-09-02'
+nextreleasedate: '2015-09-10'
+source: Thomson Reuters
+contactemail: infoctr@eia.doe.gov
+contactphone: (202) 586-8800
+---
+"Date","WTI"
+"1986-01-02",25.56
+"1986-01-03",26
+"1986-01-06",26.53
+"1986-01-07",25.85
+"1986-01-08",25.87
+"1986-01-09",26.03
+"1986-01-10",25.65
+"1986-01-13",25.08
+"1986-01-14",24.97
+"1986-01-15",25.18
+"1986-01-16",23.98
+"1986-01-17",23.63
+"1986-01-20",21.33
+"1986-01-21",20.61
+"1986-01-22",20.25
+"1986-01-23",19.93
+"1986-01-24",19.45
+"1986-01-27",20.87
+"1986-01-28",19.45
+"1986-01-29",19.61
+"1986-01-30",19.58
+"1986-01-31",18.95
+
@@ -0,0 +1,15 @@
+---
+name: my-dataset
+source: https://github.com/leeper/csvy/tree/master/inst/examples
+schema:
+  fields:
+  - name: var1
+  - name: var2
+    type: integer
+  - name: var3
+    type: number
+
+var1,var2,var3
+A,1,2.5
+B,3,4.3
+
@@ -0,0 +1,15 @@
+---
+name: my-dataset
+source: https://github.com/leeper/csvy/tree/master/inst/examples
+schema:
+  fields:
+  - name: var1
+  - name: var2
+    type: integer
+  - name: var3
+    type: number
+---
+var1,var2,var3
+A,1,2.5
+B,3,4.3
+
@@ -0,0 +1,25 @@
+---
+name:
+schema:
+  fields:
+  - name: var1
+    title: variable 1
+    type: string
+    description: a single-quoted character variable
+  - name: var2
+    title: variable 2
+    type: integer
+  - name: var3
+    title: variable 3
+    type: number
+    description: European-style numeric
+header: true
+sep: "|"
+dec: "."
+quote: "'"
+na.strings: '@'
+---
+var1|var2|var3
+'A'|1|2,5
+'B'|@|4,3
+
@@ -0,0 +1,24 @@
+---
+name:
+schema:
+  fields:
+  - name: var1
+    title: variable 1
+    type: string
+    description: a single-quoted character variable
+  - name: var2
+    title: variable 2
+    type: integer
+  - name: var3
+    title: variable 3
+    type: number
+    description: European-style numeric
+header: true
+sep: "|"
+dec: ","
+quote: "'"
+na.strings: '@'
+---
+'A'|1|2,5
+'B'|@|4,3
+