From 09187e63b5fbc5d91f3a3ffcee8712ba5e62cdcf Mon Sep 17 00:00:00 2001 From: Romain Francois Date: Tue, 13 Nov 2018 12:11:58 +0100 Subject: [PATCH 1/9] Expose arrow::csv::TableReader, functions csv_table_reader() + csv_read() --- r/DESCRIPTION | 1 + r/NAMESPACE | 19 ++++ r/R/RcppExports.R | 20 +++++ r/R/csv.R | 136 +++++++++++++++++++++++++++++ r/man/csv_convert_options.Rd | 14 +++ r/man/csv_parse_options.Rd | 33 +++++++ r/man/csv_read.Rd | 14 +++ r/man/csv_read_options.Rd | 16 ++++ r/man/csv_table_reader.Rd | 24 +++++ r/src/RcppExports.cpp | 63 +++++++++++++ r/src/arrow_types.h | 1 + r/src/csv.cpp | 67 ++++++++++++++ r/tests/testthat/test-arrow-csv-.R | 33 +++++++ 13 files changed, 441 insertions(+) create mode 100644 r/R/csv.R create mode 100644 r/man/csv_convert_options.Rd create mode 100644 r/man/csv_parse_options.Rd create mode 100644 r/man/csv_read.Rd create mode 100644 r/man/csv_read_options.Rd create mode 100644 r/man/csv_table_reader.Rd create mode 100644 r/src/csv.cpp create mode 100644 r/tests/testthat/test-arrow-csv-.R diff --git a/r/DESCRIPTION b/r/DESCRIPTION index 45e0f83dcbd..a2632973134 100644 --- a/r/DESCRIPTION +++ b/r/DESCRIPTION @@ -55,6 +55,7 @@ Collate: 'array.R' 'buffer.R' 'compute.R' + 'csv.R' 'dictionary.R' 'feather.R' 'io.R' diff --git a/r/NAMESPACE b/r/NAMESPACE index 65d60d846f4..dd62a545d4c 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -39,6 +39,19 @@ S3method(buffer,default) S3method(buffer,integer) S3method(buffer,numeric) S3method(buffer,raw) +S3method(buffer_reader,"arrow::Buffer") +S3method(buffer_reader,default) +S3method(csv_table_reader,"arrow::csv::TableReader") +S3method(csv_table_reader,"arrow::io::InputStream") +S3method(csv_table_reader,character) +S3method(csv_table_reader,default) +S3method(csv_table_reader,fs_path) +S3method(feather_table_reader,"arrow::io::RandomAccessFile") +S3method(feather_table_reader,"arrow::ipc::feather::TableReader") +S3method(feather_table_reader,character) 
+S3method(feather_table_reader,default) +S3method(feather_table_reader,fs_path) +S3method(feather_table_writer,"arrow::io::OutputStream") S3method(length,"arrow::Array") S3method(names,"arrow::RecordBatch") S3method(print,"arrow-enum") @@ -92,6 +105,11 @@ export(boolean) export(buffer) export(cast_options) export(chunked_array) +export(csv_convert_options) +export(csv_parse_options) +export(csv_read) +export(csv_read_options) +export(csv_table_reader) export(date32) export(date64) export(decimal) @@ -141,6 +159,7 @@ importFrom(glue,glue) importFrom(purrr,map) importFrom(purrr,map2) importFrom(purrr,map_int) +importFrom(rlang,abort) importFrom(rlang,dots_n) importFrom(rlang,list2) importFrom(rlang,warn) diff --git a/r/R/RcppExports.R b/r/R/RcppExports.R index 0310eab2027..55b9ab33ebf 100644 --- a/r/R/RcppExports.R +++ b/r/R/RcppExports.R @@ -193,6 +193,26 @@ Table__cast <- function(table, schema, options) { .Call(`_arrow_Table__cast`, table, schema, options) } +csv___ReadOptions__initialize <- function(options) { + .Call(`_arrow_csv___ReadOptions__initialize`, options) +} + +csv___ParseOptions__initialize <- function(options) { + .Call(`_arrow_csv___ParseOptions__initialize`, options) +} + +csv___ConvertOptions__initialize <- function(options) { + .Call(`_arrow_csv___ConvertOptions__initialize`, options) +} + +csv___TableReader__Make <- function(input, read_options, parse_options, convert_options) { + .Call(`_arrow_csv___TableReader__Make`, input, read_options, parse_options, convert_options) +} + +csv___TableReader__Read <- function(table_reader) { + .Call(`_arrow_csv___TableReader__Read`, table_reader) +} + shared_ptr_is_null <- function(xp) { .Call(`_arrow_shared_ptr_is_null`, xp) } diff --git a/r/R/csv.R b/r/R/csv.R new file mode 100644 index 00000000000..3e2c6e159f6 --- /dev/null +++ b/r/R/csv.R @@ -0,0 +1,136 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +#' @include R6.R + +`arrow::csv::TableReader` <- R6Class("arrow::csv::TableReader", inherit = `arrow::Object`, + public = list( + Read = function() shared_ptr(`arrow::Table`, csv___TableReader__Read(self)) + ) +) + +`arrow::csv::ReadOptions` <- R6Class("arrow::csv::ReadOptions", inherit = `arrow::Object`) +`arrow::csv::ParseOptions` <- R6Class("arrow::csv::ParseOptions", inherit = `arrow::Object`) +`arrow::csv::ConvertOptions` <- R6Class("arrow::csv::ConvertOptions", inherit = `arrow::Object`) + +#' read options for the csv reader +#' +#' @param use_threads Whether to use the global CPU thread pool +#' @param block_size Block size we request from the IO layer; also determines the size of chunks when use_threads is `TRUE` +#' +#' @export +csv_read_options <- function(use_threads = TRUE, block_size = 1048576L) { + shared_ptr(`arrow::csv::ReadOptions`, csv___ReadOptions__initialize( + list( + use_threads = use_threads, + block_size = block_size + ) + )) +} + +#' Parsing options +#' +#' @param delimiter Field delimiter +#' @param quoting Whether quoting is used +#' @param quote_char Quoting character (if `quoting` is `TRUE`) +#' @param double_quote Whether a quote inside a value is double-quoted +#' @param escaping Whether escaping is used +#' @param escape_char Escaping character (if 
`escaping` is `TRUE`) +#' @param newlines_in_values Whether values are allowed to contain CR (`0x0d`) and LF (`0x0a`) characters +#' @param ignore_empty_lines Whether empty lines are ignored. If false, an empty line represents a single empty value (assuming a one-column CSV file) +#' @param header_rows Number of header rows to skip (including the first row containing column names) +#' +#' @export +csv_parse_options <- function(delimiter = ",", quoting = TRUE, quote_char = '"', double_quote = TRUE, escaping = FALSE, escape_char = '\\', newlines_in_values = FALSE, ignore_empty_lines = TRUE, header_rows = 1L){ + shared_ptr(`arrow::csv::ParseOptions`, csv___ParseOptions__initialize( + list( + delimiter = delimiter, + quoting = quoting, + quote_char = quote_char, + double_quote = double_quote, + escaping = escaping, + escape_char = escape_char, + newlines_in_values = newlines_in_values, + ignore_empty_lines = ignore_empty_lines, + header_rows = header_rows + ) + )) +} + +#' Conversion Options for the csv reader +#' +#' @param check_utf8 Whether to check UTF8 validity of string columns +#' +#' @export +csv_convert_options <- function(check_utf8 = TRUE){ + shared_ptr(`arrow::csv::ConvertOptions`, csv___ConvertOptions__initialize( + list( + check_utf8 = check_utf8 + ) + )) +} + +#' CSV table reader +#' +#' @param file file +#' @param read_options see [csv_read_options()] +#' @param parse_options see [csv_parse_options()] +#' @param convert_options see [csv_convert_options()] +#' @param ... additional parameters. +#' +#' @export +csv_table_reader <- function(file, read_options = csv_read_options(), parse_options = csv_parse_options(), convert_options = csv_convert_options(), ...){ + UseMethod("csv_table_reader") +} + +#' @importFrom rlang abort +#' @export +csv_table_reader.default <- function(file, read_options = csv_read_options(), parse_options = csv_parse_options(), convert_options = csv_convert_options(), ...) 
{ + abort("unsupported") +} + +#' @export +`csv_table_reader.character` <- function(file, read_options = csv_read_options(), parse_options = csv_parse_options(), convert_options = csv_convert_options(), ...){ + csv_table_reader(fs::path_abs(file), read_options = read_options, parse_options = parse_options, convert_options = convert_options, ...) +} + +#' @export +`csv_table_reader.fs_path` <- function(file, read_options = csv_read_options(), parse_options = csv_parse_options(), convert_options = csv_convert_options(), mmap = TRUE, ...){ + stream <- if (isTRUE(mmap)) mmap_open(file) else file_open(file) + csv_table_reader(stream, read_options = read_options, parse_options = parse_options, convert_options = convert_options, ...) +} + +#' @export +`csv_table_reader.arrow::io::InputStream` <- function(file, read_options = csv_read_options(), parse_options = csv_parse_options(), convert_options = csv_convert_options(), ...){ + shared_ptr(`arrow::csv::TableReader`, csv___TableReader__Make(file, read_options, parse_options, convert_options)) +} + +#' @export +`csv_table_reader.arrow::csv::TableReader` <- function(file, read_options = csv_read_options(), parse_options = csv_parse_options(), convert_options = csv_convert_options(), ...){ + file +} + +#' Read csv file into an arrow::Table +#' +#' Use arrow::csv::TableReader from [csv_table_reader()] +#' +#' @param ... Used to construct an arrow::csv::TableReader +#' @export +csv_read <- function(...) 
{ + csv_table_reader(...)$Read() +} + diff --git a/r/man/csv_convert_options.Rd b/r/man/csv_convert_options.Rd new file mode 100644 index 00000000000..323c6e01970 --- /dev/null +++ b/r/man/csv_convert_options.Rd @@ -0,0 +1,14 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/csv.R +\name{csv_convert_options} +\alias{csv_convert_options} +\title{Conversion Options for the csv reader} +\usage{ +csv_convert_options(check_utf8 = TRUE) +} +\arguments{ +\item{check_utf8}{Whether to check UTF8 validity of string columns} +} +\description{ +Conversion Options for the csv reader +} diff --git a/r/man/csv_parse_options.Rd b/r/man/csv_parse_options.Rd new file mode 100644 index 00000000000..9540771437f --- /dev/null +++ b/r/man/csv_parse_options.Rd @@ -0,0 +1,33 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/csv.R +\name{csv_parse_options} +\alias{csv_parse_options} +\title{Parsing options} +\usage{ +csv_parse_options(delimiter = ",", quoting = TRUE, + quote_char = "\\"", double_quote = TRUE, escaping = FALSE, + escape_char = "\\\\", newlines_in_values = FALSE, + ignore_empty_lines = TRUE, header_rows = 1L) +} +\arguments{ +\item{delimiter}{Field delimiter} + +\item{quoting}{Whether quoting is used} + +\item{quote_char}{Quoting character (if \code{quoting} is \code{TRUE})} + +\item{double_quote}{Whether a quote inside a value is double-quoted} + +\item{escaping}{Whether escaping is used} + +\item{escape_char}{Escaping character (if \code{escaping} is \code{TRUE})} + +\item{newlines_in_values}{Whether values are allowed to contain CR (\code{0x0d}) and LF (\code{0x0a}) characters} + +\item{ignore_empty_lines}{Whether empty lines are ignored. 
If false, an empty line represents a single empty value (assuming a one-column CSV file)} + +\item{header_rows}{Number of header rows to skip (including the first row containing column names)} +} +\description{ +Parsing options +} diff --git a/r/man/csv_read.Rd b/r/man/csv_read.Rd new file mode 100644 index 00000000000..8ef2fc4b57c --- /dev/null +++ b/r/man/csv_read.Rd @@ -0,0 +1,14 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/csv.R +\name{csv_read} +\alias{csv_read} +\title{Read csv file into an arrow::Table} +\usage{ +csv_read(...) +} +\arguments{ +\item{...}{Used to construct an arrow::csv::TableReader} +} +\description{ +Use arrow::csv::TableReader from \code{\link[=csv_table_reader]{csv_table_reader()}} +} diff --git a/r/man/csv_read_options.Rd b/r/man/csv_read_options.Rd new file mode 100644 index 00000000000..3fa2d8ccbf2 --- /dev/null +++ b/r/man/csv_read_options.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/csv.R +\name{csv_read_options} +\alias{csv_read_options} +\title{read options for the csv reader} +\usage{ +csv_read_options(use_threads = TRUE, block_size = 1048576L) +} +\arguments{ +\item{use_threads}{Whether to use the global CPU thread pool} + +\item{block_size}{Block size we request from the IO layer; also determines the size of chunks when use_threads is \code{TRUE}} +} +\description{ +read options for the csv reader +} diff --git a/r/man/csv_table_reader.Rd b/r/man/csv_table_reader.Rd new file mode 100644 index 00000000000..029cd0b5923 --- /dev/null +++ b/r/man/csv_table_reader.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/csv.R +\name{csv_table_reader} +\alias{csv_table_reader} +\title{CSV table reader} +\usage{ +csv_table_reader(file, read_options = csv_read_options(), + parse_options = csv_parse_options(), + convert_options = csv_convert_options(), ...) 
+} +\arguments{ +\item{file}{file} + +\item{read_options}{see \code{\link[=csv_read_options]{csv_read_options()}}} + +\item{parse_options}{see \code{\link[=csv_parse_options]{csv_parse_options()}}} + +\item{convert_options}{see \code{\link[=csv_convert_options]{csv_convert_options()}}} + +\item{...}{additional parameters.} +} +\description{ +CSV table reader +} diff --git a/r/src/RcppExports.cpp b/r/src/RcppExports.cpp index e5a784eb70c..c752afba1c2 100644 --- a/r/src/RcppExports.cpp +++ b/r/src/RcppExports.cpp @@ -558,6 +558,64 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } +// csv___ReadOptions__initialize +std::shared_ptr csv___ReadOptions__initialize(List_ options); +RcppExport SEXP _arrow_csv___ReadOptions__initialize(SEXP optionsSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< List_ >::type options(optionsSEXP); + rcpp_result_gen = Rcpp::wrap(csv___ReadOptions__initialize(options)); + return rcpp_result_gen; +END_RCPP +} +// csv___ParseOptions__initialize +std::shared_ptr csv___ParseOptions__initialize(List_ options); +RcppExport SEXP _arrow_csv___ParseOptions__initialize(SEXP optionsSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< List_ >::type options(optionsSEXP); + rcpp_result_gen = Rcpp::wrap(csv___ParseOptions__initialize(options)); + return rcpp_result_gen; +END_RCPP +} +// csv___ConvertOptions__initialize +std::shared_ptr csv___ConvertOptions__initialize(List_ options); +RcppExport SEXP _arrow_csv___ConvertOptions__initialize(SEXP optionsSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< List_ >::type options(optionsSEXP); + rcpp_result_gen = Rcpp::wrap(csv___ConvertOptions__initialize(options)); + return rcpp_result_gen; +END_RCPP +} +// csv___TableReader__Make +std::shared_ptr csv___TableReader__Make(const std::shared_ptr& 
input, const std::shared_ptr& read_options, const std::shared_ptr& parse_options, const std::shared_ptr& convert_options); +RcppExport SEXP _arrow_csv___TableReader__Make(SEXP inputSEXP, SEXP read_optionsSEXP, SEXP parse_optionsSEXP, SEXP convert_optionsSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< const std::shared_ptr& >::type input(inputSEXP); + Rcpp::traits::input_parameter< const std::shared_ptr& >::type read_options(read_optionsSEXP); + Rcpp::traits::input_parameter< const std::shared_ptr& >::type parse_options(parse_optionsSEXP); + Rcpp::traits::input_parameter< const std::shared_ptr& >::type convert_options(convert_optionsSEXP); + rcpp_result_gen = Rcpp::wrap(csv___TableReader__Make(input, read_options, parse_options, convert_options)); + return rcpp_result_gen; +END_RCPP +} +// csv___TableReader__Read +std::shared_ptr csv___TableReader__Read(const std::shared_ptr& table_reader); +RcppExport SEXP _arrow_csv___TableReader__Read(SEXP table_readerSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< const std::shared_ptr& >::type table_reader(table_readerSEXP); + rcpp_result_gen = Rcpp::wrap(csv___TableReader__Read(table_reader)); + return rcpp_result_gen; +END_RCPP +} // shared_ptr_is_null bool shared_ptr_is_null(SEXP xp); RcppExport SEXP _arrow_shared_ptr_is_null(SEXP xpSEXP) { @@ -2200,6 +2258,11 @@ static const R_CallMethodDef CallEntries[] = { {"_arrow_ChunkedArray__cast", (DL_FUNC) &_arrow_ChunkedArray__cast, 3}, {"_arrow_RecordBatch__cast", (DL_FUNC) &_arrow_RecordBatch__cast, 3}, {"_arrow_Table__cast", (DL_FUNC) &_arrow_Table__cast, 3}, + {"_arrow_csv___ReadOptions__initialize", (DL_FUNC) &_arrow_csv___ReadOptions__initialize, 1}, + {"_arrow_csv___ParseOptions__initialize", (DL_FUNC) &_arrow_csv___ParseOptions__initialize, 1}, + {"_arrow_csv___ConvertOptions__initialize", (DL_FUNC) 
&_arrow_csv___ConvertOptions__initialize, 1}, + {"_arrow_csv___TableReader__Make", (DL_FUNC) &_arrow_csv___TableReader__Make, 4}, + {"_arrow_csv___TableReader__Read", (DL_FUNC) &_arrow_csv___TableReader__Read, 1}, {"_arrow_shared_ptr_is_null", (DL_FUNC) &_arrow_shared_ptr_is_null, 1}, {"_arrow_unique_ptr_is_null", (DL_FUNC) &_arrow_unique_ptr_is_null, 1}, {"_arrow_Int8__initialize", (DL_FUNC) &_arrow_Int8__initialize, 0}, diff --git a/r/src/arrow_types.h b/r/src/arrow_types.h index dba7a91c21e..61c22a1437f 100644 --- a/r/src/arrow_types.h +++ b/r/src/arrow_types.h @@ -28,6 +28,7 @@ #include #include #include +#include #define STOP_IF_NOT(TEST, MSG) \ do { \ diff --git a/r/src/csv.cpp b/r/src/csv.cpp new file mode 100644 index 00000000000..e979a36d56c --- /dev/null +++ b/r/src/csv.cpp @@ -0,0 +1,67 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "arrow_types.h" + +using namespace Rcpp; + +// [[Rcpp::export]] +std::shared_ptr csv___ReadOptions__initialize(List_ options){ + auto res = std::make_shared(arrow::csv::ReadOptions::Defaults()); + res->use_threads = options["use_threads"]; + res->block_size = options["block_size"]; + return res; +} + +inline char get_char(SEXP x){ + return CHAR(STRING_ELT(x, 0))[0]; +} + +// [[Rcpp::export]] +std::shared_ptr csv___ParseOptions__initialize(List_ options){ + auto res = std::make_shared(arrow::csv::ParseOptions::Defaults()); + res->delimiter = get_char(options["delimiter"]); + res->quoting = options["quoting"]; + res->quote_char = get_char(options["quote_char"]); + res->double_quote = options["double_quote"]; + res->escape_char = get_char(options["escape_char"]); + res->newlines_in_values = options["newlines_in_values"]; + res->header_rows = options["header_rows"]; + res->ignore_empty_lines = options["ignore_empty_lines"]; + return res; +} + +// [[Rcpp::export]] +std::shared_ptr csv___ConvertOptions__initialize(List_ options){ + auto res = std::make_shared(arrow::csv::ConvertOptions::Defaults()); + res->check_utf8 = options["check_utf8"]; + return res; +} + +// [[Rcpp::export]] +std::shared_ptr csv___TableReader__Make(const std::shared_ptr& input, const std::shared_ptr& read_options, const std::shared_ptr& parse_options, const std::shared_ptr& convert_options){ + std::shared_ptr table_reader; + STOP_IF_NOT_OK(arrow::csv::TableReader::Make(arrow::default_memory_pool(), input, *read_options, *parse_options, *convert_options, &table_reader)); + return table_reader; +} + +// [[Rcpp::export]] +std::shared_ptr csv___TableReader__Read(const std::shared_ptr& table_reader) { + std::shared_ptr table ; + STOP_IF_NOT_OK(table_reader->Read(&table)); + return table; +} diff --git a/r/tests/testthat/test-arrow-csv-.R b/r/tests/testthat/test-arrow-csv-.R new file mode 100644 index 00000000000..9cf08401a1b --- /dev/null +++ b/r/tests/testthat/test-arrow-csv-.R @@ -0,0 
+1,33 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +context("arrow::csv::TableReader") + +test_that("Can read csv file", { + tf <- local_tempfile() + readr::write_csv(iris, tf) + + tab1 <- csv_read(tf) + tab2 <- csv_read(mmap_open(tf)) + tab3 <- csv_read(file_open(tf)) + + iris$Species <- as.character(iris$Species) + tab0 <- table(iris) + expect_equal(tab0, tab1) + expect_equal(tab0, tab2) + expect_equal(tab0, tab3) +}) From 0ab839783a2b0c7052f67409f4d9c9e97d86089f Mon Sep 17 00:00:00 2001 From: Romain Francois Date: Tue, 13 Nov 2018 12:21:11 +0100 Subject: [PATCH 2/9] linting --- r/src/arrow_types.h | 2 +- r/src/csv.cpp | 35 ++++++++++++++++++++++------------- 2 files changed, 23 insertions(+), 14 deletions(-) diff --git a/r/src/arrow_types.h b/r/src/arrow_types.h index 61c22a1437f..6fef7997dbf 100644 --- a/r/src/arrow_types.h +++ b/r/src/arrow_types.h @@ -22,13 +22,13 @@ #undef Free #include #include +#include #include #include #include #include #include #include -#include #define STOP_IF_NOT(TEST, MSG) \ do { \ diff --git a/r/src/csv.cpp b/r/src/csv.cpp index e979a36d56c..ca293641807 100644 --- a/r/src/csv.cpp +++ b/r/src/csv.cpp @@ -20,20 +20,20 @@ using namespace Rcpp; // [[Rcpp::export]] -std::shared_ptr 
csv___ReadOptions__initialize(List_ options){ - auto res = std::make_shared(arrow::csv::ReadOptions::Defaults()); +std::shared_ptr csv___ReadOptions__initialize(List_ options) { + auto res = + std::make_shared(arrow::csv::ReadOptions::Defaults()); res->use_threads = options["use_threads"]; res->block_size = options["block_size"]; return res; } -inline char get_char(SEXP x){ - return CHAR(STRING_ELT(x, 0))[0]; -} +inline char get_char(SEXP x) { return CHAR(STRING_ELT(x, 0))[0]; } // [[Rcpp::export]] -std::shared_ptr csv___ParseOptions__initialize(List_ options){ - auto res = std::make_shared(arrow::csv::ParseOptions::Defaults()); +std::shared_ptr csv___ParseOptions__initialize(List_ options) { + auto res = + std::make_shared(arrow::csv::ParseOptions::Defaults()); res->delimiter = get_char(options["delimiter"]); res->quoting = options["quoting"]; res->quote_char = get_char(options["quote_char"]); @@ -46,22 +46,31 @@ std::shared_ptr csv___ParseOptions__initialize(List_ o } // [[Rcpp::export]] -std::shared_ptr csv___ConvertOptions__initialize(List_ options){ - auto res = std::make_shared(arrow::csv::ConvertOptions::Defaults()); +std::shared_ptr csv___ConvertOptions__initialize( + List_ options) { + auto res = std::make_shared( + arrow::csv::ConvertOptions::Defaults()); res->check_utf8 = options["check_utf8"]; return res; } // [[Rcpp::export]] -std::shared_ptr csv___TableReader__Make(const std::shared_ptr& input, const std::shared_ptr& read_options, const std::shared_ptr& parse_options, const std::shared_ptr& convert_options){ +std::shared_ptr csv___TableReader__Make( + const std::shared_ptr& input, + const std::shared_ptr& read_options, + const std::shared_ptr& parse_options, + const std::shared_ptr& convert_options) { std::shared_ptr table_reader; - STOP_IF_NOT_OK(arrow::csv::TableReader::Make(arrow::default_memory_pool(), input, *read_options, *parse_options, *convert_options, &table_reader)); + 
STOP_IF_NOT_OK(arrow::csv::TableReader::Make(arrow::default_memory_pool(), input, + *read_options, *parse_options, + *convert_options, &table_reader)); return table_reader; } // [[Rcpp::export]] -std::shared_ptr csv___TableReader__Read(const std::shared_ptr& table_reader) { - std::shared_ptr table ; +std::shared_ptr csv___TableReader__Read( + const std::shared_ptr& table_reader) { + std::shared_ptr table; STOP_IF_NOT_OK(table_reader->Read(&table)); return table; } From 258550143ec1c40df6a530ee8fbb9c6d1a0dd044 Mon Sep 17 00:00:00 2001 From: Romain Francois Date: Tue, 13 Nov 2018 15:58:30 +0100 Subject: [PATCH 3/9] line breaks for readability --- r/R/csv.R | 68 +++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 58 insertions(+), 10 deletions(-) diff --git a/r/R/csv.R b/r/R/csv.R index 3e2c6e159f6..68c933c47ee 100644 --- a/r/R/csv.R +++ b/r/R/csv.R @@ -55,7 +55,12 @@ csv_read_options <- function(use_threads = TRUE, block_size = 1048576L) { #' @param header_rows Number of header rows to skip (including the first row containing column names) #' #' @export -csv_parse_options <- function(delimiter = ",", quoting = TRUE, quote_char = '"', double_quote = TRUE, escaping = FALSE, escape_char = '\\', newlines_in_values = FALSE, ignore_empty_lines = TRUE, header_rows = 1L){ +csv_parse_options <- function( + delimiter = ",", quoting = TRUE, quote_char = '"', + double_quote = TRUE, escaping = FALSE, escape_char = '\\', + newlines_in_values = FALSE, ignore_empty_lines = TRUE, + header_rows = 1L +){ shared_ptr(`arrow::csv::ParseOptions`, csv___ParseOptions__initialize( list( delimiter = delimiter, @@ -93,34 +98,77 @@ csv_convert_options <- function(check_utf8 = TRUE){ #' @param ... additional parameters. 
#' #' @export -csv_table_reader <- function(file, read_options = csv_read_options(), parse_options = csv_parse_options(), convert_options = csv_convert_options(), ...){ +csv_table_reader <- function(file, + read_options = csv_read_options(), + parse_options = csv_parse_options(), + convert_options = csv_convert_options(), + ... +){ UseMethod("csv_table_reader") } #' @importFrom rlang abort #' @export -csv_table_reader.default <- function(file, read_options = csv_read_options(), parse_options = csv_parse_options(), convert_options = csv_convert_options(), ...) { +csv_table_reader.default <- function(file, + read_options = csv_read_options(), + parse_options = csv_parse_options(), + convert_options = csv_convert_options(), + ... +) { abort("unsupported") } #' @export -`csv_table_reader.character` <- function(file, read_options = csv_read_options(), parse_options = csv_parse_options(), convert_options = csv_convert_options(), ...){ - csv_table_reader(fs::path_abs(file), read_options = read_options, parse_options = parse_options, convert_options = convert_options, ...) +`csv_table_reader.character` <- function(file, + read_options = csv_read_options(), + parse_options = csv_parse_options(), + convert_options = csv_convert_options(), + ... +){ + csv_table_reader(fs::path_abs(file), + read_options = read_options, + parse_options = parse_options, + convert_options = convert_options, + ... + ) } #' @export -`csv_table_reader.fs_path` <- function(file, read_options = csv_read_options(), parse_options = csv_parse_options(), convert_options = csv_convert_options(), mmap = TRUE, ...){ +`csv_table_reader.fs_path` <- function(file, + read_options = csv_read_options(), + parse_options = csv_parse_options(), + convert_options = csv_convert_options(), + mmap = TRUE, + ... +){ stream <- if (isTRUE(mmap)) mmap_open(file) else file_open(file) - csv_table_reader(stream, read_options = read_options, parse_options = parse_options, convert_options = convert_options, ...) 
+ csv_table_reader(stream, + read_options = read_options, + parse_options = parse_options, + convert_options = convert_options, + ... + ) } #' @export -`csv_table_reader.arrow::io::InputStream` <- function(file, read_options = csv_read_options(), parse_options = csv_parse_options(), convert_options = csv_convert_options(), ...){ - shared_ptr(`arrow::csv::TableReader`, csv___TableReader__Make(file, read_options, parse_options, convert_options)) +`csv_table_reader.arrow::io::InputStream` <- function(file, + read_options = csv_read_options(), + parse_options = csv_parse_options(), + convert_options = csv_convert_options(), + ... +){ + shared_ptr(`arrow::csv::TableReader`, + csv___TableReader__Make(file, read_options, parse_options, convert_options) + ) } #' @export -`csv_table_reader.arrow::csv::TableReader` <- function(file, read_options = csv_read_options(), parse_options = csv_parse_options(), convert_options = csv_convert_options(), ...){ +`csv_table_reader.arrow::csv::TableReader` <- function(file, + read_options = csv_read_options(), + parse_options = csv_parse_options(), + convert_options = csv_convert_options(), + ... 
+){ file } From 6e740037d9b1095edfeca4819646a51afaa2b132 Mon Sep 17 00:00:00 2001 From: Romain Francois Date: Tue, 13 Nov 2018 16:03:31 +0100 Subject: [PATCH 4/9] going through CharacterVector makes sure this is a character vector --- r/src/csv.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/src/csv.cpp b/r/src/csv.cpp index ca293641807..0e1d09fb65e 100644 --- a/r/src/csv.cpp +++ b/r/src/csv.cpp @@ -28,7 +28,7 @@ std::shared_ptr csv___ReadOptions__initialize(List_ opt return res; } -inline char get_char(SEXP x) { return CHAR(STRING_ELT(x, 0))[0]; } +inline char get_char(CharacterVector x) { return CHAR(STRING_ELT(x, 0))[0]; } // [[Rcpp::export]] std::shared_ptr csv___ParseOptions__initialize(List_ options) { From 959020c91b5b59e1a6148120219e51637f10b62b Mon Sep 17 00:00:00 2001 From: Romain Francois Date: Tue, 13 Nov 2018 16:31:28 +0100 Subject: [PATCH 5/9] No need to special use mmap for file path method --- r/R/csv.R | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/r/R/csv.R b/r/R/csv.R index 68c933c47ee..9c312feaf98 100644 --- a/r/R/csv.R +++ b/r/R/csv.R @@ -138,11 +138,9 @@ csv_table_reader.default <- function(file, read_options = csv_read_options(), parse_options = csv_parse_options(), convert_options = csv_convert_options(), - mmap = TRUE, ... 
){ - stream <- if (isTRUE(mmap)) mmap_open(file) else file_open(file) - csv_table_reader(stream, + csv_table_reader(file_open(file), read_options = read_options, parse_options = parse_options, convert_options = convert_options, From 83b51621adc0a1813b5536031cb9297f763e5430 Mon Sep 17 00:00:00 2001 From: Romain Francois Date: Tue, 4 Dec 2018 21:25:05 +0100 Subject: [PATCH 6/9] s/file_open/ReadableFile/ --- r/NAMESPACE | 2 -- r/R/csv.R | 2 +- r/tests/testthat/test-arrow-csv-.R | 2 +- 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/r/NAMESPACE b/r/NAMESPACE index dd62a545d4c..ef0485f5f78 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -39,8 +39,6 @@ S3method(buffer,default) S3method(buffer,integer) S3method(buffer,numeric) S3method(buffer,raw) -S3method(buffer_reader,"arrow::Buffer") -S3method(buffer_reader,default) S3method(csv_table_reader,"arrow::csv::TableReader") S3method(csv_table_reader,"arrow::io::InputStream") S3method(csv_table_reader,character) diff --git a/r/R/csv.R b/r/R/csv.R index 9c312feaf98..87343a6070a 100644 --- a/r/R/csv.R +++ b/r/R/csv.R @@ -140,7 +140,7 @@ csv_table_reader.default <- function(file, convert_options = csv_convert_options(), ... 
){ - csv_table_reader(file_open(file), + csv_table_reader(ReadableFile(file), read_options = read_options, parse_options = parse_options, convert_options = convert_options, diff --git a/r/tests/testthat/test-arrow-csv-.R b/r/tests/testthat/test-arrow-csv-.R index 9cf08401a1b..ba210985afb 100644 --- a/r/tests/testthat/test-arrow-csv-.R +++ b/r/tests/testthat/test-arrow-csv-.R @@ -23,7 +23,7 @@ test_that("Can read csv file", { tab1 <- csv_read(tf) tab2 <- csv_read(mmap_open(tf)) - tab3 <- csv_read(file_open(tf)) + tab3 <- csv_read(ReadableFile(tf)) iris$Species <- as.character(iris$Species) tab0 <- table(iris) From bb13a76e06bf84db6bb96c698e2840a663259d34 Mon Sep 17 00:00:00 2001 From: Romain Francois Date: Mon, 10 Dec 2018 19:12:16 +0100 Subject: [PATCH 7/9] rebase --- r/NAMESPACE | 6 ------ 1 file changed, 6 deletions(-) diff --git a/r/NAMESPACE b/r/NAMESPACE index ef0485f5f78..01a4a3fba72 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -44,12 +44,6 @@ S3method(csv_table_reader,"arrow::io::InputStream") S3method(csv_table_reader,character) S3method(csv_table_reader,default) S3method(csv_table_reader,fs_path) -S3method(feather_table_reader,"arrow::io::RandomAccessFile") -S3method(feather_table_reader,"arrow::ipc::feather::TableReader") -S3method(feather_table_reader,character) -S3method(feather_table_reader,default) -S3method(feather_table_reader,fs_path) -S3method(feather_table_writer,"arrow::io::OutputStream") S3method(length,"arrow::Array") S3method(names,"arrow::RecordBatch") S3method(print,"arrow-enum") From 7770ec54c44c9438f799b9729c38b954d6957e54 Mon Sep 17 00:00:00 2001 From: Romain Francois Date: Wed, 2 Jan 2019 16:30:42 +0100 Subject: [PATCH 8/9] not using readr:: at this point --- r/tests/testthat/test-arrow-csv-.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/tests/testthat/test-arrow-csv-.R b/r/tests/testthat/test-arrow-csv-.R index ba210985afb..e0a20cc90ec 100644 --- a/r/tests/testthat/test-arrow-csv-.R +++ 
b/r/tests/testthat/test-arrow-csv-.R @@ -19,7 +19,7 @@ context("arrow::csv::TableReader") test_that("Can read csv file", { tf <- local_tempfile() - readr::write_csv(iris, tf) + write.csv(iris, tf, row.names = FALSE, quote = FALSE) tab1 <- csv_read(tf) tab2 <- csv_read(mmap_open(tf)) From 951e9f58ba8cea2ab86097a7970206ae32137f84 Mon Sep 17 00:00:00 2001 From: Romain Francois Date: Wed, 2 Jan 2019 17:05:55 +0100 Subject: [PATCH 9/9] s/csv_read/read_csv_arrow/ --- r/NAMESPACE | 2 +- r/R/csv.R | 2 +- r/man/{csv_read.Rd => read_csv_arrow.Rd} | 6 +++--- r/tests/testthat/test-arrow-csv-.R | 6 +++--- 4 files changed, 8 insertions(+), 8 deletions(-) rename r/man/{csv_read.Rd => read_csv_arrow.Rd} (82%) diff --git a/r/NAMESPACE b/r/NAMESPACE index 01a4a3fba72..8846defbd8e 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -99,7 +99,6 @@ export(cast_options) export(chunked_array) export(csv_convert_options) export(csv_parse_options) -export(csv_read) export(csv_read_options) export(csv_table_reader) export(date32) @@ -121,6 +120,7 @@ export(mmap_open) export(null) export(print.integer64) export(read_arrow) +export(read_csv_arrow) export(read_feather) export(read_message) export(read_record_batch) diff --git a/r/R/csv.R b/r/R/csv.R index 87343a6070a..bad87559c05 100644 --- a/r/R/csv.R +++ b/r/R/csv.R @@ -176,7 +176,7 @@ csv_table_reader.default <- function(file, #' #' @param ... Used to construct an arrow::csv::TableReader #' @export -csv_read <- function(...) { +read_csv_arrow <- function(...) 
{ csv_table_reader(...)$Read() } diff --git a/r/man/csv_read.Rd b/r/man/read_csv_arrow.Rd similarity index 82% rename from r/man/csv_read.Rd rename to r/man/read_csv_arrow.Rd index 8ef2fc4b57c..4cdca91246b 100644 --- a/r/man/csv_read.Rd +++ b/r/man/read_csv_arrow.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/csv.R -\name{csv_read} -\alias{csv_read} +\name{read_csv_arrow} +\alias{read_csv_arrow} \title{Read csv file into an arrow::Table} \usage{ -csv_read(...) +read_csv_arrow(...) } \arguments{ \item{...}{Used to construct an arrow::csv::TableReader} diff --git a/r/tests/testthat/test-arrow-csv-.R b/r/tests/testthat/test-arrow-csv-.R index e0a20cc90ec..2afd0622821 100644 --- a/r/tests/testthat/test-arrow-csv-.R +++ b/r/tests/testthat/test-arrow-csv-.R @@ -21,9 +21,9 @@ test_that("Can read csv file", { tf <- local_tempfile() write.csv(iris, tf, row.names = FALSE, quote = FALSE) - tab1 <- csv_read(tf) - tab2 <- csv_read(mmap_open(tf)) - tab3 <- csv_read(ReadableFile(tf)) + tab1 <- read_csv_arrow(tf) + tab2 <- read_csv_arrow(mmap_open(tf)) + tab3 <- read_csv_arrow(ReadableFile(tf)) iris$Species <- as.character(iris$Species) tab0 <- table(iris)