From eeedde1f7a179c8f8175abe0908186e1177650e7 Mon Sep 17 00:00:00 2001
From: Neal Richardson
Date: Tue, 20 Aug 2019 11:30:20 -0700
Subject: [PATCH] Read parquet and feather from raw vector

Add `raw` S3 methods for FeatherTableReader() and parquet_file_reader().
Each wraps the raw vector in a BufferReader() and re-dispatches on the
result, so read_feather() and read_parquet() can ingest in-memory bytes
(ARROW-6278).

Notes for reviewers:

* parquet_file_reader.raw forwards the caller's `props` argument rather
  than re-creating the default properties, so custom reader properties
  are honoured for raw input.
* FeatherTableReader.default (which only called stop("unsupported")) is
  removed together with its NAMESPACE registration; unsupported inputs
  are now rejected by S3 dispatch itself.
* test-feather.R writes one shared `feather_file` fixture up front and
  removes it at the end of the file instead of re-writing a temp file in
  every test.
* The raw-vector tests size their readBin() call with file.size() so the
  whole fixture is read regardless of how large it is.

---
 r/NAMESPACE                     |  3 +-
 r/NEWS.md                       |  1 +
 r/R/feather.R                   | 10 ++---
 r/R/parquet.R                   |  5 +++
 r/tests/testthat/test-feather.R | 65 +++++++++++----------------------
 r/tests/testthat/test-parquet.R |  6 +++
 6 files changed, 40 insertions(+), 50 deletions(-)

diff --git a/r/NAMESPACE b/r/NAMESPACE
index 0c1ae8c35d5..3a413c0e802 100644
--- a/r/NAMESPACE
+++ b/r/NAMESPACE
@@ -18,8 +18,8 @@ S3method(CompressedOutputStream,fs_path)
 S3method(FeatherTableReader,"arrow::io::RandomAccessFile")
 S3method(FeatherTableReader,"arrow::ipc::feather::TableReader")
 S3method(FeatherTableReader,character)
-S3method(FeatherTableReader,default)
 S3method(FeatherTableReader,fs_path)
+S3method(FeatherTableReader,raw)
 S3method(FeatherTableWriter,"arrow::io::OutputStream")
 S3method(FixedSizeBufferWriter,"arrow::Buffer")
 S3method(FixedSizeBufferWriter,default)
@@ -65,6 +65,7 @@ S3method(names,"arrow::RecordBatch")
 S3method(parquet_file_reader,"arrow::io::RandomAccessFile")
 S3method(parquet_file_reader,character)
 S3method(parquet_file_reader,fs_path)
+S3method(parquet_file_reader,raw)
 S3method(print,"arrow-enum")
 S3method(read_message,"arrow::io::InputStream")
 S3method(read_message,"arrow::ipc::MessageReader")
diff --git a/r/NEWS.md b/r/NEWS.md
index 6259d09ba9e..5ab4e18794b 100644
--- a/r/NEWS.md
+++ b/r/NEWS.md
@@ -20,6 +20,7 @@
 # arrow 0.14.1.9000
 
 * `read_csv_arrow()` supports more parsing options, including `col_names` and `skip`
+* `read_parquet()` and `read_feather()` can ingest data from a `raw` vector ([ARROW-6278](https://issues.apache.org/jira/browse/ARROW-6278))
 
 # arrow 0.14.1
 
diff --git a/r/R/feather.R b/r/R/feather.R
index 9de91553163..e06dff08cb3 100644
--- a/r/R/feather.R
+++ b/r/R/feather.R
@@ -137,11 +137,6 @@ FeatherTableReader <- function(file, mmap = TRUE, ...){
   UseMethod("FeatherTableReader")
 }
 
-#' @export
-FeatherTableReader.default <- function(file, mmap = TRUE, ...) {
-  stop("unsupported")
-}
-
 #' @export
 FeatherTableReader.character <- function(file, mmap = TRUE, ...) {
   FeatherTableReader(fs::path_abs(file), mmap = mmap, ...)
@@ -157,6 +152,11 @@ FeatherTableReader.fs_path <- function(file, mmap = TRUE, ...) {
   FeatherTableReader(stream)
 }
 
+#' @export
+FeatherTableReader.raw <- function(file, mmap = TRUE, ...) {
+  FeatherTableReader(BufferReader(file), mmap = mmap, ...)
+}
+
 #' @export
 `FeatherTableReader.arrow::io::RandomAccessFile` <- function(file, mmap = TRUE, ...){
   unique_ptr(`arrow::ipc::feather::TableReader`, ipc___feather___TableReader__Open(file))
diff --git a/r/R/parquet.R b/r/R/parquet.R
index 1163a2df828..4fcff6b7b1b 100644
--- a/r/R/parquet.R
+++ b/r/R/parquet.R
@@ -98,6 +98,11 @@ parquet_file_reader.character <- function(file, props = parquet_arrow_reader_pro
   parquet_file_reader(fs::path_abs(file), props = parquet_arrow_reader_properties(), memory_map = memory_map, ...)
 }
 
+#' @export
+parquet_file_reader.raw <- function(file, props = parquet_arrow_reader_properties(), memory_map = TRUE, ...) {
+  parquet_file_reader(BufferReader(file), props = props, memory_map = memory_map, ...)
+}
+
 #' Read a Parquet file
 #'
 #' '[Parquet](https://parquet.apache.org/)' is a columnar storage file format.
diff --git a/r/tests/testthat/test-feather.R b/r/tests/testthat/test-feather.R
index adf81519193..4fe058eadc6 100644
--- a/r/tests/testthat/test-feather.R
+++ b/r/tests/testthat/test-feather.R
@@ -17,13 +17,15 @@
 
 context("Feather")
 
-test_that("feather read/write round trip", {
-  tib <- tibble::tibble(x = 1:10, y = rnorm(10), z = letters[1:10])
+feather_file <- tempfile()
+tib <- tibble::tibble(x = 1:10, y = rnorm(10), z = letters[1:10])
 
-  tf1 <- tempfile()
-  write_feather(tib, tf1)
-  expect_true(fs::file_exists(tf1))
+test_that("Write a feather file", {
+  write_feather(tib, feather_file)
+  expect_true(fs::file_exists(feather_file))
+})
 
+test_that("feather read/write round trip", {
   tf2 <- fs::path_abs(tempfile())
   write_feather(tib, tf2)
   expect_true(fs::file_exists(tf2))
@@ -34,7 +36,7 @@ test_that("feather read/write round trip", {
   stream$close()
   expect_true(fs::file_exists(tf3))
 
-  tab1 <- read_feather(tf1)
+  tab1 <- read_feather(feather_file)
   expect_is(tab1, "data.frame")
 
   tab2 <- read_feather(tf2)
@@ -57,76 +59,51 @@ test_that("feather read/write round trip", {
   expect_equal(tib, tab4)
   expect_equal(tib, tab5)
 
-  unlink(tf1)
   unlink(tf2)
   unlink(tf3)
 })
 
 test_that("feather handles col_select = ", {
-  tib <- tibble::tibble(x = 1:10, y = rnorm(10), z = letters[1:10])
-
-  tf1 <- tempfile()
-  write_feather(tib, tf1)
-  expect_true(fs::file_exists(tf1))
-
-  tab1 <- read_feather(tf1, col_select = c("x", "y"))
+  tab1 <- read_feather(feather_file, col_select = c("x", "y"))
   expect_is(tab1, "data.frame")
 
   expect_equal(tib$x, tab1$x)
   expect_equal(tib$y, tab1$y)
-
-  unlink(tf1)
 })
 
 test_that("feather handles col_select = ", {
-  tib <- tibble::tibble(x = 1:10, y = rnorm(10), z = letters[1:10])
-
-  tf1 <- tempfile()
-  write_feather(tib, tf1)
-  expect_true(fs::file_exists(tf1))
-
-  tab1 <- read_feather(tf1, col_select = 1:2)
+  tab1 <- read_feather(feather_file, col_select = 1:2)
   expect_is(tab1, "data.frame")
 
   expect_equal(tib$x, tab1$x)
   expect_equal(tib$y, tab1$y)
-  unlink(tf1)
 })
 
 test_that("feather handles col_select = ", {
-  tib <- tibble::tibble(x = 1:10, y = rnorm(10), z = letters[1:10])
-
-  tf1 <- tempfile()
-  write_feather(tib, tf1)
-  expect_true(fs::file_exists(tf1))
-
-  tab1 <- read_feather(tf1, col_select = everything())
+  tab1 <- read_feather(feather_file, col_select = everything())
   expect_identical(tib, tab1)
 
-  tab2 <- read_feather(tf1, col_select = starts_with("x"))
+  tab2 <- read_feather(feather_file, col_select = starts_with("x"))
   expect_identical(tab2, tib[, "x", drop = FALSE])
 
-  tab3 <- read_feather(tf1, col_select = c(starts_with("x"), contains("y")))
+  tab3 <- read_feather(feather_file, col_select = c(starts_with("x"), contains("y")))
   expect_identical(tab3, tib[, c("x", "y"), drop = FALSE])
 
-  tab4 <- read_feather(tf1, col_select = -z)
+  tab4 <- read_feather(feather_file, col_select = -z)
   expect_identical(tab4, tib[, c("x", "y"), drop = FALSE])
-
-  unlink(tf1)
 })
 
 test_that("feather read/write round trip", {
-  tib <- tibble::tibble(x = 1:10, y = rnorm(10), z = letters[1:10])
-
-  tf1 <- tempfile()
-  write_feather(tib, tf1)
-  expect_true(fs::file_exists(tf1))
-
-  tab1 <- read_feather(tf1, as_tibble = FALSE)
+  tab1 <- read_feather(feather_file, as_tibble = FALSE)
   expect_is(tab1, "arrow::Table")
 
   expect_equal(tib, as.data.frame(tab1))
-  unlink(tf1)
 })
 
+test_that("Read feather from raw vector", {
+  test_raw <- readBin(feather_file, what = "raw", n = file.size(feather_file))
+  df <- read_feather(test_raw)
+  expect_is(df, "data.frame")
+})
 
+unlink(feather_file)
diff --git a/r/tests/testthat/test-parquet.R b/r/tests/testthat/test-parquet.R
index f23c37fe6c0..fd6f40fcd56 100644
--- a/r/tests/testthat/test-parquet.R
+++ b/r/tests/testthat/test-parquet.R
@@ -43,3 +43,9 @@ test_that("read_parquet() supports col_select", {
   df <- read_parquet(pq_file, col_select = starts_with("c"))
   expect_equal(names(df), c("carat", "cut", "color", "clarity"))
 })
+
+test_that("read_parquet() with raw data", {
+  test_raw <- readBin(pq_file, what = "raw", n = file.size(pq_file))
+  df <- read_parquet(test_raw)
+  expect_identical(dim(df), c(10L, 11L))
+})