From cebcc5ca32406c5b73050e4b717b5ad1c47d7374 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Mon, 17 Oct 2022 10:51:46 -0300 Subject: [PATCH 1/5] nix warning when re-regisitering a func --- r/R/compute.R | 8 +++++++- r/R/dplyr-funcs.R | 2 +- r/src/compute.cpp | 10 +++++++++- 3 files changed, 17 insertions(+), 3 deletions(-) diff --git a/r/R/compute.R b/r/R/compute.R index a144e7d678a..2e263c02163 100644 --- a/r/R/compute.R +++ b/r/R/compute.R @@ -379,9 +379,15 @@ register_scalar_function <- function(name, fun, in_type, out_type, RegisterScalarUDF(name, scalar_function) # register with dplyr binding (enables its use in mutate(), filter(), etc.) + # extra step to avoid saving this execution environment in the binding, + # which eliminates a warning when the same binding is registered twice + binding_fun <- function(...) build_expr(name, ...) + body(binding_fun)[[2]] <- name + environment(binding_fun) <- asNamespace("arrow") + register_binding( name, - function(...) build_expr(name, ...), + binding_fun, update_cache = TRUE ) diff --git a/r/R/dplyr-funcs.R b/r/R/dplyr-funcs.R index e5f76570616..ee64a09918d 100644 --- a/r/R/dplyr-funcs.R +++ b/r/R/dplyr-funcs.R @@ -75,7 +75,7 @@ register_binding <- function(fun_name, previous_fun <- registry[[unqualified_name]] # if the unqualified name exists in the registry, warn - if (!is.null(previous_fun)) { + if (!is.null(previous_fun) && !identical(fun, previous_fun)) { warn( paste0( "A \"", diff --git a/r/src/compute.cpp b/r/src/compute.cpp index 1ed949e7295..bd961a3ff8c 100644 --- a/r/src/compute.cpp +++ b/r/src/compute.cpp @@ -609,10 +609,18 @@ std::vector compute__GetFunctionNames() { class RScalarUDFKernelState : public arrow::compute::KernelState { public: RScalarUDFKernelState(cpp11::sexp exec_func, cpp11::sexp resolver) - : exec_func_(exec_func), resolver_(resolver) {} + : exec_func_(exec_func), + resolver_(resolver), + exec_fun_shelter_(exec_func), + resolver_shelter_(resolver) {} cpp11::function exec_func_; cpp11::function resolver_; + // cpp11::function does not protect its argument from garbage collection, + // so we need a C++ object that does to make sure the functions still exist + // when called. + cpp11::sexp exec_fun_shelter_; + cpp11::sexp resolver_shelter_; }; arrow::Result ResolveScalarUDFOutputType( From 8aad44220e1ccf3430d47ea68a793fa866364c4a Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Mon, 17 Oct 2022 10:53:22 -0300 Subject: [PATCH 2/5] add test --- r/tests/testthat/test-dplyr-funcs.R | 3 +++ 1 file changed, 3 insertions(+) diff --git a/r/tests/testthat/test-dplyr-funcs.R b/r/tests/testthat/test-dplyr-funcs.R index 86f984dd32c..48b74c9af43 100644 --- a/r/tests/testthat/test-dplyr-funcs.R +++ b/r/tests/testthat/test-dplyr-funcs.R @@ -35,6 +35,9 @@ test_that("register_binding()/unregister_binding() works", { register_binding("some.pkg2::some_fun", fun2, fake_registry), "A \"some_fun\" binding already exists in the registry and will be overwritten." ) + + # No warning when an identical function is re-registered + expect_silent(register_binding("some.pkg2::some_fun", fun2, fake_registry)) }) test_that("register_binding_agg() works", { From 82976103b00cfeca58f69b1b96226bf0db85adf8 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Mon, 17 Oct 2022 11:16:11 -0300 Subject: [PATCH 3/5] fix another place where we used cpp11::function --- r/src/recordbatchreader.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/r/src/recordbatchreader.cpp b/r/src/recordbatchreader.cpp index d0c52acc416..3caed0d0d48 100644 --- a/r/src/recordbatchreader.cpp +++ b/r/src/recordbatchreader.cpp @@ -64,7 +64,7 @@ class RFunctionRecordBatchReader : public arrow::RecordBatchReader { public: RFunctionRecordBatchReader(cpp11::sexp fun, const std::shared_ptr& schema) - : fun_(fun), schema_(schema) {} + : fun_(fun), fun_shelter_(fun), schema_(schema) {} std::shared_ptr schema() const { return schema_; } @@ -95,6 +95,8 @@ class RFunctionRecordBatchReader : public arrow::RecordBatchReader { private: cpp11::function fun_; + // Because cpp11::function does not protect its argument from garbage collection + cpp11::sexp fun_shelter_; std::shared_ptr schema_; }; From 8af2022fba246789656a756a88d3cb9593c138ab Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Mon, 17 Oct 2022 11:52:21 -0300 Subject: [PATCH 4/5] more readable inlining of name into the function body --- r/R/compute.R | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/r/R/compute.R b/r/R/compute.R index 2e263c02163..1386728ac90 100644 --- a/r/R/compute.R +++ b/r/R/compute.R @@ -379,10 +379,12 @@ register_scalar_function <- function(name, fun, in_type, out_type, RegisterScalarUDF(name, scalar_function) # register with dplyr binding (enables its use in mutate(), filter(), etc.) - # extra step to avoid saving this execution environment in the binding, - # which eliminates a warning when the same binding is registered twice binding_fun <- function(...) build_expr(name, ...) - body(binding_fun)[[2]] <- name + + # inject the value of `name` into the expression to avoid saving this + # execution environment in the binding, which eliminates a warning when the + # same binding is registered twice + body(binding_fun) <- expr_substitute(body(binding_fun), sym("name"), name) environment(binding_fun) <- asNamespace("arrow") register_binding( From 0facb9ba28af0d10039c168d077cf564c924aaa2 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Mon, 17 Oct 2022 16:34:32 -0300 Subject: [PATCH 5/5] less verbose change for function stuff --- r/src/compute.cpp | 22 ++++++++-------------- r/src/recordbatchreader.cpp | 8 +++----- 2 files changed, 11 insertions(+), 19 deletions(-) diff --git a/r/src/compute.cpp b/r/src/compute.cpp index bd961a3ff8c..0bfc5172852 100644 --- a/r/src/compute.cpp +++ b/r/src/compute.cpp @@ -609,18 +609,10 @@ std::vector compute__GetFunctionNames() { class RScalarUDFKernelState : public arrow::compute::KernelState { public: RScalarUDFKernelState(cpp11::sexp exec_func, cpp11::sexp resolver) - : exec_func_(exec_func), - resolver_(resolver), - exec_fun_shelter_(exec_func), - resolver_shelter_(resolver) {} - - cpp11::function exec_func_; - cpp11::function resolver_; - // cpp11::function does not protect its argument from garbage collection, - // so we need a C++ object that does to make sure the functions still exist - // when called. - cpp11::sexp exec_fun_shelter_; - cpp11::sexp resolver_shelter_; + : exec_func_(exec_func), resolver_(resolver) {} + + cpp11::sexp exec_func_; + cpp11::sexp resolver_; }; arrow::Result ResolveScalarUDFOutputType( @@ -638,7 +630,8 @@ arrow::Result ResolveScalarUDFOutputType( cpp11::to_r6(input_types[i].GetSharedPtr()); } - cpp11::sexp output_type_sexp = state->resolver_(input_types_sexp); + cpp11::sexp output_type_sexp = + cpp11::function(state->resolver_)(input_types_sexp); if (!Rf_inherits(output_type_sexp, "DataType")) { cpp11::stop( "Function specified as arrow_scalar_function() out_type argument must " @@ -682,7 +675,8 @@ arrow::Status CallRScalarUDF(arrow::compute::KernelContext* context, cpp11::writable::list udf_context = {batch_length_sexp, output_type_sexp}; udf_context.names() = {"batch_length", "output_type"}; - cpp11::sexp func_result_sexp = state->exec_func_(udf_context, args_sexp); + cpp11::sexp func_result_sexp = + cpp11::function(state->exec_func_)(udf_context, args_sexp); if (Rf_inherits(func_result_sexp, "Array")) { auto array = cpp11::as_cpp>(func_result_sexp); diff --git a/r/src/recordbatchreader.cpp b/r/src/recordbatchreader.cpp index 3caed0d0d48..8e9df121748 100644 --- a/r/src/recordbatchreader.cpp +++ b/r/src/recordbatchreader.cpp @@ -64,13 +64,13 @@ class RFunctionRecordBatchReader : public arrow::RecordBatchReader { public: RFunctionRecordBatchReader(cpp11::sexp fun, const std::shared_ptr& schema) - : fun_(fun), fun_shelter_(fun), schema_(schema) {} + : fun_(fun), schema_(schema) {} std::shared_ptr schema() const { return schema_; } arrow::Status ReadNext(std::shared_ptr* batch_out) { auto batch = SafeCallIntoR>([&]() { - cpp11::sexp result_sexp = fun_(); + cpp11::sexp result_sexp = cpp11::function(fun_)(); if (result_sexp == R_NilValue) { return std::shared_ptr(nullptr); } else if (!Rf_inherits(result_sexp, "RecordBatch")) { @@ -94,9 +94,7 @@ class RFunctionRecordBatchReader : public arrow::RecordBatchReader { } private: - cpp11::function fun_; - // Because cpp11::function does not protect its argument from garbage collection - cpp11::sexp fun_shelter_; + cpp11::sexp fun_; std::shared_ptr schema_; };