diff --git a/ci/scripts/PKGBUILD b/ci/scripts/PKGBUILD index 4306f644082..1d9e41bba7a 100644 --- a/ci/scripts/PKGBUILD +++ b/ci/scripts/PKGBUILD @@ -25,6 +25,8 @@ arch=("any") url="https://arrow.apache.org/" license=("Apache-2.0") depends=("${MINGW_PACKAGE_PREFIX}-aws-sdk-cpp" + "${MINGW_PACKAGE_PREFIX}-libutf8proc" + "${MINGW_PACKAGE_PREFIX}-re2" "${MINGW_PACKAGE_PREFIX}-thrift" "${MINGW_PACKAGE_PREFIX}-snappy" "${MINGW_PACKAGE_PREFIX}-zlib" @@ -103,9 +105,7 @@ build() { -DARROW_SNAPPY_USE_SHARED=OFF \ -DARROW_USE_GLOG=OFF \ -DARROW_WITH_LZ4=ON \ - -DARROW_WITH_RE2=OFF \ -DARROW_WITH_SNAPPY=ON \ - -DARROW_WITH_UTF8PROC=OFF \ -DARROW_WITH_ZLIB=ON \ -DARROW_WITH_ZSTD=ON \ -DARROW_ZSTD_USE_SHARED=OFF \ diff --git a/ci/scripts/r_windows_build.sh b/ci/scripts/r_windows_build.sh index cb33e676a7d..be03b75f5ad 100755 --- a/ci/scripts/r_windows_build.sh +++ b/ci/scripts/r_windows_build.sh @@ -96,8 +96,8 @@ cp $MSYS_LIB_DIR/mingw64/lib/lib{thrift,snappy}.a $DST_DIR/${RWINLIB_LIB_DIR}/x6 cp $MSYS_LIB_DIR/mingw32/lib/lib{thrift,snappy}.a $DST_DIR/${RWINLIB_LIB_DIR}/i386 # These are from https://dl.bintray.com/rtools/mingw{32,64}/ -cp $MSYS_LIB_DIR/mingw64/lib/lib{zstd,lz4,crypto,aws*}.a $DST_DIR/lib/x64 -cp $MSYS_LIB_DIR/mingw32/lib/lib{zstd,lz4,crypto,aws*}.a $DST_DIR/lib/i386 +cp $MSYS_LIB_DIR/mingw64/lib/lib{zstd,lz4,crypto,utf8proc,re2,aws*}.a $DST_DIR/lib/x64 +cp $MSYS_LIB_DIR/mingw32/lib/lib{zstd,lz4,crypto,utf8proc,re2,aws*}.a $DST_DIR/lib/i386 # Create build artifact zip -r ${DST_DIR}.zip $DST_DIR diff --git a/dev/tasks/homebrew-formulae/autobrew/apache-arrow.rb b/dev/tasks/homebrew-formulae/autobrew/apache-arrow.rb index 8779ad19570..351d7764603 100644 --- a/dev/tasks/homebrew-formulae/autobrew/apache-arrow.rb +++ b/dev/tasks/homebrew-formulae/autobrew/apache-arrow.rb @@ -57,9 +57,7 @@ def install -DARROW_USE_GLOG=OFF -DARROW_VERBOSE_THIRDPARTY_BUILD=ON -DARROW_WITH_LZ4=ON - -DARROW_WITH_RE2=OFF -DARROW_WITH_SNAPPY=ON - -DARROW_WITH_UTF8PROC=OFF -DARROW_WITH_ZLIB=ON -DARROW_WITH_ZSTD=ON -DCMAKE_UNITY_BUILD=OFF diff --git a/r/DESCRIPTION b/r/DESCRIPTION index 145a5aeef9b..f37e6a4e84f 100644 --- a/r/DESCRIPTION +++ b/r/DESCRIPTION @@ -46,6 +46,7 @@ Suggests: pkgload, reticulate, rmarkdown, + stringr, testthat, tibble LinkingTo: cpp11 (>= 0.2.0) diff --git a/r/NAMESPACE b/r/NAMESPACE index fbc71e9edf0..54061128ac7 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -284,6 +284,7 @@ importFrom(rlang,is_false) importFrom(rlang,is_integerish) importFrom(rlang,list2) importFrom(rlang,new_data_mask) +importFrom(rlang,new_environment) importFrom(rlang,quo_is_null) importFrom(rlang,quos) importFrom(rlang,set_names) diff --git a/r/R/arrow-package.R b/r/R/arrow-package.R index fd3f8b47856..66694a97867 100644 --- a/r/R/arrow-package.R +++ b/r/R/arrow-package.R @@ -18,7 +18,7 @@ #' @importFrom R6 R6Class #' @importFrom purrr as_mapper map map2 map_chr map_dfr map_int map_lgl keep #' @importFrom assertthat assert_that is.string -#' @importFrom rlang list2 %||% is_false abort dots_n warn enquo quo_is_null enquos is_integerish quos eval_tidy new_data_mask syms env env_bind as_label set_names exec is_bare_character +#' @importFrom rlang list2 %||% is_false abort dots_n warn enquo quo_is_null enquos is_integerish quos eval_tidy new_data_mask syms env new_environment env_bind as_label set_names exec is_bare_character #' @importFrom tidyselect vars_select #' @useDynLib arrow, .registration = TRUE #' @keywords internal diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R index ec0aae94f30..3d0f31ce8f3 100644 --- a/r/R/arrowExports.R +++ b/r/R/arrowExports.R @@ -296,6 +296,10 @@ compute__CallFunction <- function(func_name, args, options){ .Call(`_arrow_compute__CallFunction`, func_name, args, options) } +list_compute_functions <- function(){ + .Call(`_arrow_list_compute_functions`) +} + csv___ReadOptions__initialize <- function(options){ .Call(`_arrow_csv___ReadOptions__initialize`, options) } diff --git a/r/R/dplyr.R b/r/R/dplyr.R index 8bc64ce089d..f9df04d5172 100644 --- a/r/R/dplyr.R +++ b/r/R/dplyr.R @@ -238,31 +238,50 @@ filter.arrow_dplyr_query <- function(.data, ..., .preserve = FALSE) { } filter.Dataset <- filter.ArrowTabular <- filter.arrow_dplyr_query +# Helper to assemble the functions that go in the NSE data mask +# The only difference between the Dataset and the Table/RecordBatch versions +# is that they use a different wrapping function (FUN) to hold the unevaluated +# expression. +build_function_list <- function(FUN) { + wrapper <- function(operator) { + force(operator) + function(e1, e2) FUN(operator, e1, e2) + } + + c( + lapply(set_names(names(.array_function_map)), wrapper), + # TODO: lapply also for the arrow spellings? + # See list_compute_functions() + # (would want to do these first, and then modifyList with the R ones + # in case of name collision) + # Would need to generalize FUN to accept ... args + str_trim = function(string, side = c("both", "left", "right")) { + side <- match.arg(side) + switch( + side, + left = FUN("utf8_ltrim_whitespace", string), + right = FUN("utf8_rtrim_whitespace", string), + both = FUN("utf8_trim_whitespace", string) + ) + } + ) +} + +# Create these once, at package build time +dataset_function_list <- build_function_list(build_dataset_expression) +array_function_list <- build_function_list(build_array_expression) + # Create a data mask for evaluating a filter expression filter_mask <- function(.data) { - f_env <- env() - - # Insert functions/operators and field references - # TODO: define functions in env once, outside of this function - # filter_env <- env(parent = if (data_is_dataset) function_env1 else function_env2) if (query_on_dataset(.data)) { - comp_func <- function(operator) { - force(operator) - function(e1, e2) build_dataset_expression(operator, e1, e2) - } + f_env <- new_environment(dataset_function_list) var_binder <- function(x) Expression$field_ref(x) } else { - comp_func <- function(operator) { - force(operator) - function(e1, e2) build_array_expression(operator, e1, e2) - } + f_env <- new_environment(array_function_list) var_binder <- function(x) .data$.data[[x]] } - # First add the functions - func_names <- set_names(names(.array_function_map)) - env_bind(f_env, !!!lapply(func_names, comp_func)) - # Then add the column references + # Add the column references # Renaming is handled automatically by the named list data_pronoun <- lapply(.data$selected_columns, var_binder) env_bind(f_env, !!!data_pronoun) diff --git a/r/R/expression.R b/r/R/expression.R index 5475f7a44bc..878b800c652 100644 --- a/r/R/expression.R +++ b/r/R/expression.R @@ -51,8 +51,8 @@ Ops.array_expression <- function(e1, e2) { } build_array_expression <- function(.Generic, e1, e2, ...) { - if (.Generic %in% names(.unary_function_map)) { - expr <- array_expression(.unary_function_map[[.Generic]], e1) + if (.Generic %in% names(.unary_function_map) || nargs() == 2L) { + expr <- array_expression(.unary_function_map[[.Generic]] %||% .Generic, e1) } else { e1 <- .wrap_arrow(e1, .Generic) e2 <- .wrap_arrow(e2, .Generic) @@ -79,7 +79,7 @@ build_array_expression <- function(.Generic, e1, e2, ...) { return(build_array_expression("-", e1, base)) } - expr <- array_expression(.binary_function_map[[.Generic]], e1, e2, ...) + expr <- array_expression(.binary_function_map[[.Generic]] %||% .Generic, e1, e2, ...) } expr } @@ -110,7 +110,14 @@ cast_array_expression <- function(x, to_type, safe = TRUE, ...) { .unary_function_map <- list( "!" = "invert", "is.na" = "is_null", - "is.nan" = "is_nan" + "is.nan" = "is_nan", + "nchar" = "binary_length", + "tolower" = "utf8_lower", + "toupper" = "utf8_upper", + # stringr spellings of those + "str_length" = "binary_length", + "str_to_lower" = "utf8_lower", + "str_to_upper" = "utf8_upper" ) .binary_function_map <- list( @@ -228,8 +235,8 @@ Expression$scalar <- function(x) { } build_dataset_expression <- function(.Generic, e1, e2, ...) { - if (.Generic %in% names(.unary_function_map)) { - expr <- Expression$create(.unary_function_map[[.Generic]], e1) + if (.Generic %in% names(.unary_function_map) || nargs() == 2L) { + expr <- Expression$create(.unary_function_map[[.Generic]] %||% .Generic, e1) } else if (.Generic == "%in%") { # Special-case %in%, which is different from the Array function name expr <- Expression$create("is_in", e1, @@ -260,7 +267,7 @@ build_dataset_expression <- function(.Generic, e1, e2, ...) { return(e1 - e2 * ( e1 %/% e2 )) } - expr <- Expression$create(.binary_function_map[[.Generic]], e1, e2, ...) + expr <- Expression$create(.binary_function_map[[.Generic]] %||% .Generic, e1, e2, ...) } expr } diff --git a/r/configure.win b/r/configure.win index 32d90ce8303..80529e702ac 100644 --- a/r/configure.win +++ b/r/configure.win @@ -50,7 +50,7 @@ AWS_LIBS="-laws-cpp-sdk-config -laws-cpp-sdk-transfer -laws-cpp-sdk-identity-man # NOTE: If you make changes to the libraries below, you should also change # ci/scripts/r_windows_build.sh and ci/scripts/PKGBUILD PKG_CFLAGS="-I${RWINLIB}/include -DARROW_STATIC -DPARQUET_STATIC -DARROW_DS_STATIC -DARROW_R_WITH_ARROW" -PKG_LIBS="-L${RWINLIB}/lib"'$(subst gcc,,$(COMPILED_BY))$(R_ARCH) '"-L${RWINLIB}/lib"'$(R_ARCH) '"-lparquet -larrow_dataset -larrow -larrow_bundled_dependencies -lthrift -lsnappy -lz -lzstd -llz4 ${MIMALLOC_LIBS} ${OPENSSL_LIBS}" +PKG_LIBS="-L${RWINLIB}/lib"'$(subst gcc,,$(COMPILED_BY))$(R_ARCH) '"-L${RWINLIB}/lib"'$(R_ARCH) '"-lparquet -larrow_dataset -larrow -larrow_bundled_dependencies -lutf8proc -lre2 -lthrift -lsnappy -lz -lzstd -llz4 ${MIMALLOC_LIBS} ${OPENSSL_LIBS}" # S3 support only for Rtools40 (i.e. R >= 4.0) "${R_HOME}/bin${R_ARCH_BIN}/Rscript.exe" -e 'R.version$major >= 4' | grep TRUE >/dev/null 2>&1 diff --git a/r/inst/build_arrow_static.sh b/r/inst/build_arrow_static.sh index 3000f5826f3..61dd5930de0 100755 --- a/r/inst/build_arrow_static.sh +++ b/r/inst/build_arrow_static.sh @@ -61,9 +61,7 @@ ${CMAKE} -DARROW_BOOST_USE_SHARED=OFF \ -DARROW_WITH_BROTLI=${ARROW_WITH_BROTLI:-$ARROW_DEFAULT_PARAM} \ -DARROW_WITH_BZ2=${ARROW_WITH_BZ2:-$ARROW_DEFAULT_PARAM} \ -DARROW_WITH_LZ4=${ARROW_WITH_LZ4:-$ARROW_DEFAULT_PARAM} \ - -DARROW_WITH_RE2=OFF \ -DARROW_WITH_SNAPPY=${ARROW_WITH_SNAPPY:-$ARROW_DEFAULT_PARAM} \ - -DARROW_WITH_UTF8PROC=OFF \ -DARROW_WITH_ZLIB=${ARROW_WITH_ZLIB:-$ARROW_DEFAULT_PARAM} \ -DARROW_WITH_ZSTD=${ARROW_WITH_ZSTD:-$ARROW_DEFAULT_PARAM} \ -DCMAKE_BUILD_TYPE=Release \ diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp index 2fbfecacfa1..839c9d6c173 100644 --- a/r/src/arrowExports.cpp +++ b/r/src/arrowExports.cpp @@ -642,6 +642,13 @@ BEGIN_CPP11 return cpp11::as_sexp(compute__CallFunction(func_name, args, options)); END_CPP11 } +// compute.cpp +std::vector list_compute_functions(); +extern "C" SEXP _arrow_list_compute_functions(){ +BEGIN_CPP11 + return cpp11::as_sexp(list_compute_functions()); +END_CPP11 +} // csv.cpp std::shared_ptr csv___ReadOptions__initialize(cpp11::list options); extern "C" SEXP _arrow_csv___ReadOptions__initialize(SEXP options_sexp){ @@ -3583,6 +3590,7 @@ static const R_CallMethodDef CallEntries[] = { { "_arrow_RecordBatch__cast", (DL_FUNC) &_arrow_RecordBatch__cast, 3}, { "_arrow_Table__cast", (DL_FUNC) &_arrow_Table__cast, 3}, { "_arrow_compute__CallFunction", (DL_FUNC) &_arrow_compute__CallFunction, 3}, + { "_arrow_list_compute_functions", (DL_FUNC) &_arrow_list_compute_functions, 0}, { "_arrow_csv___ReadOptions__initialize", (DL_FUNC) &_arrow_csv___ReadOptions__initialize, 1}, { "_arrow_csv___ParseOptions__initialize", (DL_FUNC) &_arrow_csv___ParseOptions__initialize, 1}, { "_arrow_csv___ReadOptions__column_names", (DL_FUNC) &_arrow_csv___ReadOptions__column_names, 1}, diff --git a/r/src/compute.cpp b/r/src/compute.cpp index 2d69d8029c6..7bcded78f0d 100644 --- a/r/src/compute.cpp +++ b/r/src/compute.cpp @@ -199,4 +199,9 @@ SEXP compute__CallFunction(std::string func_name, cpp11::list args, cpp11::list return from_datum(std::move(out)); } +// [[arrow::export]] +std::vector list_compute_functions() { + return arrow::compute::GetFunctionRegistry()->GetFunctionNames(); +} + #endif diff --git a/r/tests/testthat/helper-data.R b/r/tests/testthat/helper-data.R index 15ea0fca31f..5fac5481f26 100644 --- a/r/tests/testthat/helper-data.R +++ b/r/tests/testthat/helper-data.R @@ -63,6 +63,49 @@ example_with_times <- tibble::tibble( posixlt_tz = as.POSIXlt(lubridate::ymd_hms("2018-10-07 19:04:05", tz = "US/Eastern") + 1:10) ) +verses <- list( + # Since we tend to test with dataframes with 10 rows, here are verses from + # "Milonga del moro judío", by Jorge Drexler. They are décimas, 10-line + # poems with a particular meter and rhyme scheme. + # (They also have non-ASCII characters, which is nice for testing) + c( + "Por cada muro, un lamento", + "En Jerusalén la dorada", + "Y mil vidas malgastadas", + "Por cada mandamiento", + "Yo soy polvo de tu viento", + "Y aunque sangro de tu herida", + "Y cada piedra querida", + "Guarda mi amor más profundo", + "No hay una piedra en el mundo", + "Que valga lo que una vida" + ), + c( + "No hay muerto que no me duela", + "No hay un bando ganador", + "No hay nada más que dolor", + "Y otra vida que se vuela", + "La guerra es muy mala escuela", + "No importa el disfraz que viste", + "Perdonen que no me aliste", + "Bajo ninguna bandera", + "Vale más cualquier quimera", + "Que un trozo de tela triste" + ), + c( + "Y a nadie le di permiso", + "Para matar en mi nombre", + "Un hombre no es más que un hombre", + "Y si hay Dios, así lo quiso", + "El mismo suelo que piso", + "Seguirá, yo me habré ido", + "Rumbo también del olvido", + "No hay doctrina que no vaya", + "Y no hay pueblo que no se haya", + "Creído el pueblo elegido" + ) +) + make_big_string <- function() { # This creates a character vector that would exceed the capacity of BinaryArray rep(purrr::map_chr(2047:2050, ~paste(sample(letters, ., replace = TRUE), collapse = "")), 2^18) diff --git a/r/tests/testthat/test-dataset.R b/r/tests/testthat/test-dataset.R index 990f024212e..e84eb12b08a 100644 --- a/r/tests/testthat/test-dataset.R +++ b/r/tests/testthat/test-dataset.R @@ -492,7 +492,7 @@ test_that("filter() with %in%", { tibble(int = df1$int[c(3, 4, 6)], part = 1) ) -# ARROW-9606: bug in %in% filter on partition column with >1 partition columns + # ARROW-9606: bug in %in% filter on partition column with >1 partition columns ds <- open_dataset(hive_dir) expect_equivalent( ds %>% @@ -503,6 +503,25 @@ test_that("filter() with %in%", { ) }) +test_that("filter() with strings", { + ds <- open_dataset(dataset_dir, partitioning = schema(part = uint8())) + expect_equivalent( + ds %>% + select(chr, part) %>% + filter(chr == "b", part == 1) %>% + collect(), + tibble(chr = "b", part = 1) + ) + + expect_equivalent( + ds %>% + select(chr, part) %>% + filter(toupper(chr) == "B", part == 1) %>% + collect(), + tibble(chr = "b", part = 1) + ) +}) + test_that("filter() with .data", { ds <- open_dataset(dataset_dir, partitioning = schema(part = uint8())) expect_equivalent( diff --git a/r/tests/testthat/test-dplyr.R b/r/tests/testthat/test-dplyr.R index a80e17c6f3e..3082019ed98 100644 --- a/r/tests/testthat/test-dplyr.R +++ b/r/tests/testthat/test-dplyr.R @@ -18,6 +18,7 @@ context("dplyr verbs") library(dplyr) +library(stringr) expect_dplyr_equal <- function(expr, # A dplyr pipeline with `input` as its start tbl, # A tbl/df as reference, will make RB/Table with @@ -83,6 +84,11 @@ expect_dplyr_error <- function(expr, # A dplyr pipeline with `input` as its star } tbl <- example_data +# Add some better string data +tbl$verses <- verses[[1]] +# c(" a ", " b ", " c ", ...) increasing padding +# nchar = 3 5 7 9 11 13 15 17 19 21 +tbl$padded_strings <- stringr::str_pad(letters[1:10], width = 2*(1:10)+1, side = "both") test_that("basic select/filter/collect", { batch <- record_batch(tbl) @@ -256,6 +262,50 @@ test_that("filter() with %in%", { ) }) +test_that("filter() with string ops", { + # Extra instrumentation to ensure that we're calling Arrow compute here + # because many base R string functions implicitly call as.character, + # which means they still work on Arrays but actually force data into R + # 1) wrapper that raises a warning if as.character is called. Can't wrap + # the whole test because as.character apparently gets called in other + # (presumably legitimate) places + # 2) Wrap the test in expect_warning(expr, NA) to catch the warning + + with_no_as_character <- function(expr) { + trace( + "as.character", + tracer = quote(warning("as.character was called")), + print = FALSE, + where = toupper + ) + on.exit(untrace("as.character", where = toupper)) + force(expr) + } + + expect_warning( + expect_dplyr_equal( + input %>% + filter(dbl > 2, with_no_as_character(toupper(chr)) %in% c("D", "F")) %>% + collect(), + tbl + ), + NA) + + expect_dplyr_equal( + input %>% + filter(dbl > 2, str_length(verses) > 25) %>% + collect(), + tbl + ) + + expect_dplyr_equal( + input %>% + filter(dbl > 2, str_length(str_trim(padded_strings, "left")) > 5) %>% + collect(), + tbl + ) +}) + test_that("filter environment scope", { # "object 'b_var' not found" expect_dplyr_error(input %>% filter(batch, chr == b_var)) diff --git a/r/tests/testthat/test-python.R b/r/tests/testthat/test-python.R index a073b73479f..821e14a493b 100644 --- a/r/tests/testthat/test-python.R +++ b/r/tests/testthat/test-python.R @@ -20,6 +20,8 @@ context("To/from Python") test_that("install_pyarrow", { skip_on_cran() skip_if_not_dev_mode() + # Python problems on Apple M1 still + skip_if(grepl("arm-apple", R.Version()$platform)) skip_if_not_installed("reticulate") venv <- try(reticulate::virtualenv_create("arrow-test")) # Bail out if virtualenv isn't available