From d7e465f99c8aaa1c6bae3b1f149580a47dad1f6a Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Mon, 15 Jun 2020 13:38:52 -0700 Subject: [PATCH 1/2] Add documentation on translation of Arrow <--> R types --- r/src/array_to_vector.cpp | 3 ++ r/tests/testthat/test-Array.R | 2 +- r/vignettes/arrow.Rmd | 78 ++++++++++++++++++++++++++++++++--- 3 files changed, 77 insertions(+), 6 deletions(-) diff --git a/r/src/array_to_vector.cpp b/r/src/array_to_vector.cpp index 223c3fee970..0adcc4a8cde 100644 --- a/r/src/array_to_vector.cpp +++ b/r/src/array_to_vector.cpp @@ -418,6 +418,7 @@ class Converter_Struct : public Converter { std::vector> converters; }; +// Shouldn't this cast before dividing? Otherwise we're doing integer division double ms_to_seconds(int64_t ms) { return static_cast(ms / 1000); } class Converter_Date64 : public Converter { @@ -479,6 +480,7 @@ class Converter_Time : public Converter { SEXP Allocate(R_xlen_t n) const { Rcpp::NumericVector data(no_init(n)); data.attr("class") = Rcpp::CharacterVector::create("hms", "difftime"); + // hms difftime is always stored as "seconds" data.attr("units") = Rcpp::CharacterVector::create("secs"); return data; } @@ -499,6 +501,7 @@ class Converter_Time : public Converter { private: int TimeUnit_multiplier(const std::shared_ptr& array) const { + // hms difftime is always "seconds", so multiply based on the Array's TimeUnit switch (static_cast(array->type().get())->unit()) { case TimeUnit::SECOND: return 1; diff --git a/r/tests/testthat/test-Array.R b/r/tests/testthat/test-Array.R index 6d61d9ac5fc..b105d2b158d 100644 --- a/r/tests/testthat/test-Array.R +++ b/r/tests/testthat/test-Array.R @@ -215,7 +215,7 @@ test_that("array supports POSIXct (ARROW-3340)", { expect_array_roundtrip(times2, timestamp("us", "US/Eastern")) }) -test_that("array supports POSIXlt and without timezone", { +test_that("array supports POSIXct without timezone", { # Make sure timezone is not set tz <- Sys.getenv("TZ") Sys.setenv(TZ = "") diff --git 
a/r/vignettes/arrow.Rmd b/r/vignettes/arrow.Rmd index d1e9c406562..828b9659734 100644 --- a/r/vignettes/arrow.Rmd +++ b/r/vignettes/arrow.Rmd @@ -10,12 +10,14 @@ vignette: > The Apache Arrow C++ library provides rich, powerful features for working with columnar data. The `arrow` R package provides both a low-level interface to the C++ library and some higher-level, R-flavored tools for working with it. This vignette provides an overview of how the pieces fit together, and it describes the conventions that the classes and methods follow in R. -# Multi-file datasets +# Features + +## Multi-file datasets The `arrow` package lets you work efficiently with large, multi-file datasets using `dplyr` methods. See `vignette("dataset", package = "arrow")` for an overview. -# Reading and writing files +## Reading and writing files `arrow` provides some simple functions for using the Arrow C++ library to read and write files. These functions are designed to drop into your normal R workflow @@ -70,14 +72,14 @@ memory layout of the Arrow columnar format and are not intended as a direct replacement for existing R CSV readers (`base::read.csv`, `readr::read_csv`, `data.table::fread`) that return an R `data.frame`. -# Working with Arrow data in Python +## Working with Arrow data in Python Using [`reticulate`](https://rstudio.github.io/reticulate/), `arrow` lets you share data between R and Python (`pyarrow`) efficiently, enabling you to take advantage of the vibrant ecosystem of Python packages that build on top of Apache Arrow. See `vignette("python", package = "arrow")` for details. -# Access to Arrow messages, buffers, and streams +## Access to Arrow messages, buffers, and streams The `arrow` package also provides many lower-level bindings to the C++ library, which enable you to access and manipulate Arrow objects. You can use these to build connectors @@ -86,7 +88,73 @@ to other applications and services that use Arrow. 
One example is Spark: the [`sparklyr`](https://spark.rstudio.com/) package has support for using Arrow to move data to and from Spark, yielding [significant performance gains](http://arrow.apache.org/blog/2019/01/25/r-spark-improvements/). -# Class structure and package conventions +# Internals + +## Mapping of R <--> Arrow types + +Arrow has a rich data type system that includes direct parallels with R's data types and much more. + +In the tables, entries with a `-` are not currently implemented. + +### R to Arrow + +| R type | Arrow type | +|--------------------------|------------| +| logical | boolean | +| integer | int32 | +| double ("numeric") | float64 | +| character | utf8 | +| factor | dictionary | +| raw | uint8 | +| Date | date32 | +| POSIXct | timestamp | +| POSIXlt | - | +| data.frame | struct | +| list^+^ | list | +| bit64::integer64 | int64 | +| difftime | time32 | +| vctrs::vctrs_unspecified | null | + +^+^: Only lists where all elements are the same type are able to be translated to Arrow list type (which is a "list of" some type). + +### Arrow to R + +| Arrow type | R type | +|-------------------|--------------------------| +| boolean | logical | +| int8 | integer | +| int16 | integer | +| int32 | integer | +| int64 | bit64::integer64 | +| uint8 | integer | +| uint16 | integer | +| uint32 | double | +| uint64 | - | +| float16 | - | +| float32 | double | +| float64 | double | +| utf8 | character | +| binary | - | +| fixed_size_binary | - | +| date32 | Date | +| date64 | POSIXct | +| time32 | hms::difftime | +| time64 | hms::difftime | +| timestamp | POSIXct | +| duration | - | +| decimal | double | +| dictionary | factor | +| list | list | +| fixed_size_list | - | +| struct | data.frame | +| null | vctrs::vctrs_unspecified | +| map | - | +| union | - | +| large_utf8 | - | +| large_binary | - | +| large_list | - | + +## Class structure and package conventions C++ is an object-oriented language, so the core logic of the Arrow library is encapsulated in classes and methods. 
In the R package, these classes are implemented as `R6` reference classes, most of which are exported from the namespace. From 7e072f6cb33ece09ab7962bad9a2fc21b247f610 Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Tue, 16 Jun 2020 12:01:37 -0700 Subject: [PATCH 2/2] Incorporate feedback --- r/src/array_to_vector.cpp | 3 +-- r/vignettes/arrow.Rmd | 6 ++++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/r/src/array_to_vector.cpp b/r/src/array_to_vector.cpp index 0adcc4a8cde..d92eaaea0a2 100644 --- a/r/src/array_to_vector.cpp +++ b/r/src/array_to_vector.cpp @@ -418,8 +418,7 @@ class Converter_Struct : public Converter { std::vector> converters; }; -// Shouldn't this cast before dividing? Otherwise we're doing integer division -double ms_to_seconds(int64_t ms) { return static_cast(ms / 1000); } +double ms_to_seconds(int64_t ms) { return static_cast(ms) / 1000; } class Converter_Date64 : public Converter { public: diff --git a/r/vignettes/arrow.Rmd b/r/vignettes/arrow.Rmd index 828b9659734..c9ae799bcf9 100644 --- a/r/vignettes/arrow.Rmd +++ b/r/vignettes/arrow.Rmd @@ -130,7 +130,7 @@ In the tables, entries with a `-` are not currently implemented. | uint16 | integer | | uint32 | double | | uint64 | - | -| float16 | - | +| float16 | - | | float32 | double | | float64 | double | | utf8 | character | @@ -143,7 +143,7 @@ In the tables, entries with a `-` are not currently implemented. | timestamp | POSIXct | | duration | - | | decimal | double | -| dictionary | factor | +| dictionary | factor^++^ | | list | list | | fixed_size_list | - | | struct | data.frame | @@ -154,6 +154,8 @@ In the tables, entries with a `-` are not currently implemented. | large_binary | - | | large_list | - | +^++^: Due to the limitation of R `factor`s, Arrow `dictionary` values are coerced to string when translated to R if they are not already strings. 
+ ## Class structure and package conventions C++ is an object-oriented language, so the core logic of the Arrow library is encapsulated in classes and methods. In the R package, these classes are implemented as `R6` reference classes, most of which are exported from the namespace.