From ecd240af5d4e7ad9d544d65f2a07cb768be8b747 Mon Sep 17 00:00:00 2001
From: Vibhatha Abeykoon
Date: Fri, 4 Mar 2022 18:57:55 +0530
Subject: [PATCH 01/74] updating submodule

---
 testing | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/testing b/testing
index 53b49804710..634739c6644 160000
--- a/testing
+++ b/testing
@@ -1 +1 @@
-Subproject commit 53b498047109d9940fcfab388bd9d6aeb8c57425
+Subproject commit 634739c664433cec366b4b9a81d1e1044a8c5eda

From fb5b46fce2cd2a39b3676dade1db7620aacf4a76 Mon Sep 17 00:00:00 2001
From: Vibhatha Abeykoon
Date: Sat, 19 Mar 2022 19:14:39 +0530
Subject: [PATCH 02/74] temp commit to remove files in submodule

---
 .gitmodules | 3 ---
 testing     | 1 -
 2 files changed, 4 deletions(-)
 delete mode 160000 testing

diff --git a/.gitmodules b/.gitmodules
index 6efc4871542..71722b21777 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,6 +1,3 @@
 [submodule "cpp/submodules/parquet-testing"]
 	path = cpp/submodules/parquet-testing
 	url = https://github.com/apache/parquet-testing.git
-[submodule "testing"]
-	path = testing
-	url = https://github.com/apache/arrow-testing
diff --git a/testing b/testing
deleted file mode 160000
index 634739c6644..00000000000
--- a/testing
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 634739c664433cec366b4b9a81d1e1044a8c5eda

From a57040a04c0a5ce2677b648bee42a1001adedfb9 Mon Sep 17 00:00:00 2001
From: Vibhatha Abeykoon
Date: Sat, 19 Mar 2022 19:15:36 +0530
Subject: [PATCH 03/74] adding submodule

---
 .gitmodules | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.gitmodules b/.gitmodules
index 71722b21777..6efc4871542 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,3 +1,6 @@
 [submodule "cpp/submodules/parquet-testing"]
 	path = cpp/submodules/parquet-testing
 	url = https://github.com/apache/parquet-testing.git
+[submodule "testing"]
+	path = testing
+	url = https://github.com/apache/arrow-testing

From 61ed8ef67d8f33da512c08dab132f6889ff32886 Mon Sep 17 00:00:00 2001
From: Vibhatha Abeykoon
Date: Sun, 20 Mar 2022 23:09:21 +0530
Subject: [PATCH 04/74] updating testing submodule

---
 testing | 1 +
 1 file changed, 1 insertion(+)
 create mode 160000 testing

diff --git a/testing b/testing
new file mode 160000
index 00000000000..d315f798520
--- /dev/null
+++ b/testing
@@ -0,0 +1 @@
+Subproject commit d315f7985207d2d67fc2c8e41053e9d97d573f4b

From 3f85bf077e581501495d3c69a4745c146caee95f Mon Sep 17 00:00:00 2001
From: Vibhatha Abeykoon
Date: Sun, 20 Mar 2022 23:11:24 +0530
Subject: [PATCH 05/74] revert to upstream version

---
 testing | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/testing b/testing
index d315f798520..53b49804710 160000
--- a/testing
+++ b/testing
@@ -1 +1 @@
-Subproject commit d315f7985207d2d67fc2c8e41053e9d97d573f4b
+Subproject commit 53b498047109d9940fcfab388bd9d6aeb8c57425

From 92bc822d8f642fc5e4aa38e50ebf05112c8bc714 Mon Sep 17 00:00:00 2001
From: Vibhatha Abeykoon
Date: Fri, 18 Mar 2022 08:26:40 +0530
Subject: [PATCH 06/74] rebase

---
 cpp/examples/arrow/CMakeLists.txt             |   1 +
 .../arrow/engine_substrait_example.cc         | 183 ++++++++++++++++++
 2 files changed, 184 insertions(+)
 create mode 100644 cpp/examples/arrow/engine_substrait_example.cc

diff --git a/cpp/examples/arrow/CMakeLists.txt b/cpp/examples/arrow/CMakeLists.txt
index 229373665d5..96461999f3a 100644
--- a/cpp/examples/arrow/CMakeLists.txt
+++ b/cpp/examples/arrow/CMakeLists.txt
@@ -23,6 +23,7 @@ endif()
 
 if(ARROW_SUBSTRAIT)
   add_arrow_example(engine_substrait_consumption EXTRA_LINK_LIBS arrow_substrait_shared)
+
add_arrow_example(engine_substrait_example EXTRA_LINK_LIBS arrow_substrait_shared) endif() if(ARROW_COMPUTE AND ARROW_CSV) diff --git a/cpp/examples/arrow/engine_substrait_example.cc b/cpp/examples/arrow/engine_substrait_example.cc new file mode 100644 index 00000000000..dfac15eaad2 --- /dev/null +++ b/cpp/examples/arrow/engine_substrait_example.cc @@ -0,0 +1,183 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include + +#include +#include +#include +#include + +namespace eng = arrow::engine; +namespace cp = arrow::compute; + +#define ABORT_ON_FAILURE(expr) \ + do { \ + arrow::Status status_ = (expr); \ + if (!status_.ok()) { \ + std::cerr << status_.message() << std::endl; \ + abort(); \ + } \ + } while (0); + +class IgnoringConsumer : public cp::SinkNodeConsumer { + public: + explicit IgnoringConsumer(size_t tag) : tag_{tag} {} + + arrow::Status Consume(cp::ExecBatch batch) override { + // Consume a batch of data + // (just print its row count to stdout) + std::cout << "-" << tag_ << " consumed " << batch.length << " rows" << std::endl; + std::cout << "Array Data" << std::endl; + for(int64_t idx=0; idx < batch.length; idx++) { + std::cout << "Batch Id : " << idx << std::endl; + if (batch[idx].is_array()) { + auto arr = batch[idx].make_array(); + std::cout << arr->ToString() << std::endl; + } else if(batch[idx].is_scalar()) { + auto scl = batch[idx].scalar(); + std::cout << scl->ToString() << std::endl; + } + } + return arrow::Status::OK(); + } + + arrow::Future<> Finish() override { + // Signal to the consumer that the last batch has been delivered + // (we don't do any real work in this consumer so mark it finished immediately) + // + // The returned future should only finish when all outstanding tasks have completed + // (after this method is called Consume is guaranteed not to be called again) + std::cout << "-" << tag_ << " finished" << std::endl; + return arrow::Future<>::MakeFinished(); + } + + private: + // A unique label for instances to help distinguish logging output if a plan has + // multiple sinks + // + // In this example, this is set to the zero-based index of the relation tree in the plan + size_t tag_; +}; + +arrow::Future> GetSubstraitFromServer( + const std::string& filename) { + // Emulate server interaction by parsing hard coded JSON + std::string substrait_json = R"({ + "relations": [ + {"rel": { + "read": { + "base_schema": { + "struct": { + "types": [ {"i64": {}}, {"bool": {}} ] + }, + "names": ["i", "b"] + }, + "filter": { + "selection": { + "directReference": { + "structField": { + "field": 1 + } + } + } + }, + "local_files": { + "items": [ + { + "uri_file": "file://FILENAME_PLACEHOLDER", + "format": "FILE_FORMAT_PARQUET" + } + ] + } + } + }} + ] + })"; + std::string filename_placeholder 
= "FILENAME_PLACEHOLDER"; + substrait_json.replace(substrait_json.find(filename_placeholder), + filename_placeholder.size(), filename); + return eng::internal::SubstraitFromJSON("Plan", substrait_json); +} + +int main(int argc, char** argv) { + if (argc < 2) { + std::cout << "Please specify a parquet file to scan" << std::endl; + // Fake pass for CI + return EXIT_SUCCESS; + } + + // Plans arrive at the consumer serialized in a Buffer, using the binary protobuf + // serialization of a substrait Plan + auto maybe_serialized_plan = GetSubstraitFromServer(argv[1]).result(); + ABORT_ON_FAILURE(maybe_serialized_plan.status()); + std::shared_ptr serialized_plan = + std::move(maybe_serialized_plan).ValueOrDie(); + + // Print the received plan to stdout as JSON + arrow::Result maybe_plan_json = + eng::internal::SubstraitToJSON("Plan", *serialized_plan); + ABORT_ON_FAILURE(maybe_plan_json.status()); + std::cout << std::string(50, '#') << " received substrait::Plan:" << std::endl; + std::cout << maybe_plan_json.ValueOrDie() << std::endl; + + // The data sink(s) for plans is/are implicit in substrait plans, but explicit in + // Arrow. Therefore, deserializing a plan requires a factory for consumers: each + // time the root of a substrait relation tree is deserialized, an Arrow consumer is + // constructed into which its batches will be piped. + std::vector> consumers; + std::function()> consumer_factory = [&] { + // All batches produced by the plan will be fed into IgnoringConsumers: + auto tag = consumers.size(); + consumers.emplace_back(new IgnoringConsumer{tag}); + return consumers.back(); + }; + + // Deserialize each relation tree in the substrait plan to an Arrow compute Declaration + arrow::Result> maybe_decls = + eng::DeserializePlan(*serialized_plan, consumer_factory); + ABORT_ON_FAILURE(maybe_decls.status()); + std::vector decls = std::move(maybe_decls).ValueOrDie(); + + // It's safe to drop the serialized plan; we don't leave references to its memory + serialized_plan.reset(); + + // Construct an empty plan (note: configure Function registry and ThreadPool here) + arrow::Result> maybe_plan = cp::ExecPlan::Make(); + ABORT_ON_FAILURE(maybe_plan.status()); + std::shared_ptr plan = std::move(maybe_plan).ValueOrDie(); + + // Add decls to plan (note: configure ExecNode registry before this point) + for (const cp::Declaration& decl : decls) { + ABORT_ON_FAILURE(decl.AddToPlan(plan.get()).status()); + } + + // Validate the plan and print it to stdout + ABORT_ON_FAILURE(plan->Validate()); + std::cout << std::string(50, '#') << " produced arrow::ExecPlan:" << std::endl; + std::cout << plan->ToString() << std::endl; + + // Start the plan... + std::cout << std::string(50, '#') << " consuming batches:" << std::endl; + ABORT_ON_FAILURE(plan->StartProducing()); + + // ... 
and wait for it to finish + ABORT_ON_FAILURE(plan->finished().status()); + return EXIT_SUCCESS; +} From c319d2a87f286c882ae7cbb26bb471fb5d177f34 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Sat, 19 Mar 2022 18:53:19 +0530 Subject: [PATCH 07/74] updating the engine substrait exammple --- .../arrow/engine_substrait_example.cc | 71 ++++++++++++------- 1 file changed, 46 insertions(+), 25 deletions(-) diff --git a/cpp/examples/arrow/engine_substrait_example.cc b/cpp/examples/arrow/engine_substrait_example.cc index dfac15eaad2..f24f789371b 100644 --- a/cpp/examples/arrow/engine_substrait_example.cc +++ b/cpp/examples/arrow/engine_substrait_example.cc @@ -18,6 +18,8 @@ #include #include #include +#include "arrow/util/async_generator.h" +#include "arrow/util/iterator.h" #include #include @@ -36,28 +38,32 @@ namespace cp = arrow::compute; } \ } while (0); -class IgnoringConsumer : public cp::SinkNodeConsumer { +class SubstraitSinkConsumer : public cp::SinkNodeConsumer { public: - explicit IgnoringConsumer(size_t tag) : tag_{tag} {} + explicit SubstraitSinkConsumer( + arrow::AsyncGenerator>* generator, size_t tag, + std::shared_ptr schema, + arrow::util::BackpressureOptions backpressure = {}) + : producer_(MakeProducer(generator, std::move(backpressure))), + tag_{tag}, + schema_(schema) {} arrow::Status Consume(cp::ExecBatch batch) override { // Consume a batch of data - // (just print its row count to stdout) - std::cout << "-" << tag_ << " consumed " << batch.length << " rows" << std::endl; - std::cout << "Array Data" << std::endl; - for(int64_t idx=0; idx < batch.length; idx++) { - std::cout << "Batch Id : " << idx << std::endl; - if (batch[idx].is_array()) { - auto arr = batch[idx].make_array(); - std::cout << arr->ToString() << std::endl; - } else if(batch[idx].is_scalar()) { - auto scl = batch[idx].scalar(); - std::cout << scl->ToString() << std::endl; - } - } + producer_.Push(batch); return arrow::Status::OK(); } + static arrow::PushGenerator>::Producer + MakeProducer(arrow::AsyncGenerator>* out_gen, + arrow::util::BackpressureOptions backpressure) { + arrow::PushGenerator> push_gen( + std::move(backpressure)); + auto out = push_gen.producer(); + *out_gen = std::move(push_gen); + return out; + } + arrow::Future<> Finish() override { // Signal to the consumer that the last batch has been delivered // (we don't do any real work in this consumer so mark it finished immediately) @@ -65,6 +71,9 @@ class IgnoringConsumer : public cp::SinkNodeConsumer { // The returned future should only finish when all outstanding tasks have completed // (after this method is called Consume is guaranteed not to be called again) std::cout << "-" << tag_ << " finished" << std::endl; + producer_.Push(arrow::IterationEnd>()); + producer_.Close(); + is_finished_ = true; return arrow::Future<>::MakeFinished(); } @@ -73,7 +82,10 @@ class IgnoringConsumer : public cp::SinkNodeConsumer { // multiple sinks // // In this example, this is set to the zero-based index of the relation tree in the plan + bool is_finished_ = false; + arrow::PushGenerator>::Producer producer_; size_t tag_; + std::shared_ptr schema_; }; arrow::Future> GetSubstraitFromServer( @@ -89,15 +101,6 @@ arrow::Future> GetSubstraitFromServer( }, "names": ["i", "b"] }, - "filter": { - "selection": { - "directReference": { - "structField": { - "field": 1 - } - } - } - }, "local_files": { "items": [ { @@ -141,11 +144,14 @@ int main(int argc, char** argv) { // Arrow. 
Therefore, deserializing a plan requires a factory for consumers: each // time the root of a substrait relation tree is deserialized, an Arrow consumer is // constructed into which its batches will be piped. + auto schema = arrow::schema( + {arrow::field("i", arrow::int64()), arrow::field("b", arrow::boolean())}); std::vector> consumers; + arrow::AsyncGenerator> sink_gen; std::function()> consumer_factory = [&] { // All batches produced by the plan will be fed into IgnoringConsumers: auto tag = consumers.size(); - consumers.emplace_back(new IgnoringConsumer{tag}); + consumers.emplace_back(new SubstraitSinkConsumer{&sink_gen, tag, schema}); return consumers.back(); }; @@ -177,6 +183,21 @@ int main(int argc, char** argv) { std::cout << std::string(50, '#') << " consuming batches:" << std::endl; ABORT_ON_FAILURE(plan->StartProducing()); + // Extract output + + cp::ExecContext exec_context(arrow::default_memory_pool(), + ::arrow::internal::GetCpuThreadPool()); + std::shared_ptr sink_reader = + cp::MakeGeneratorReader(schema, std::move(sink_gen), exec_context.memory_pool()); + + std::shared_ptr response_table; + + auto maybe_table = arrow::Table::FromRecordBatchReader(sink_reader.get()); + ABORT_ON_FAILURE(maybe_table.status()); + response_table = maybe_table.ValueOrDie(); + + std::cout << "Results : " << response_table->ToString() << std::endl; + // ... and wait for it to finish ABORT_ON_FAILURE(plan->finished().status()); return EXIT_SUCCESS; From dae4c0c6643e44688ec96730e5d1888b899aea2f Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Tue, 22 Mar 2022 06:47:31 +0530 Subject: [PATCH 08/74] rebase --- .../arrow/engine_substrait_example.cc | 32 +++--- cpp/src/arrow/python/CMakeLists.txt | 2 +- cpp/src/arrow/python/engine.cc | 98 +++++++++++++++++++ cpp/src/arrow/python/engine.h | 75 ++++++++++++++ cpp/src/arrow/python/python_test.cc | 7 ++ 5 files changed, 193 insertions(+), 21 deletions(-) create mode 100644 cpp/src/arrow/python/engine.cc create mode 100644 cpp/src/arrow/python/engine.h diff --git a/cpp/examples/arrow/engine_substrait_example.cc b/cpp/examples/arrow/engine_substrait_example.cc index f24f789371b..1db55af9bb9 100644 --- a/cpp/examples/arrow/engine_substrait_example.cc +++ b/cpp/examples/arrow/engine_substrait_example.cc @@ -42,15 +42,13 @@ class SubstraitSinkConsumer : public cp::SinkNodeConsumer { public: explicit SubstraitSinkConsumer( arrow::AsyncGenerator>* generator, size_t tag, - std::shared_ptr schema, arrow::util::BackpressureOptions backpressure = {}) - : producer_(MakeProducer(generator, std::move(backpressure))), - tag_{tag}, - schema_(schema) {} + : producer_(MakeProducer(generator, std::move(backpressure))), tag_{tag} {} arrow::Status Consume(cp::ExecBatch batch) override { // Consume a batch of data - producer_.Push(batch); + bool did_push = producer_.Push(batch); + if (!did_push) return arrow::Status::ExecutionError("Producer closed already"); return arrow::Status::OK(); } @@ -65,27 +63,18 @@ class SubstraitSinkConsumer : public cp::SinkNodeConsumer { } arrow::Future<> Finish() override { - // Signal to the consumer that the last batch has been delivered - // (we don't do any real work in this consumer so mark it finished immediately) - // - // The returned future should only finish when all outstanding tasks have completed - // (after this method is called Consume is guaranteed not to be called again) std::cout << "-" << tag_ << " finished" << std::endl; producer_.Push(arrow::IterationEnd>()); - producer_.Close(); - is_finished_ = true; - return 
arrow::Future<>::MakeFinished(); + if (producer_.Close()) { + return arrow::Future<>::MakeFinished(); + } + return arrow::Future<>::MakeFinished( + arrow::Status::ExecutionError("Error occurred in closing the batch producer")); } private: - // A unique label for instances to help distinguish logging output if a plan has - // multiple sinks - // - // In this example, this is set to the zero-based index of the relation tree in the plan - bool is_finished_ = false; arrow::PushGenerator>::Producer producer_; size_t tag_; - std::shared_ptr schema_; }; arrow::Future> GetSubstraitFromServer( @@ -151,7 +140,7 @@ int main(int argc, char** argv) { std::function()> consumer_factory = [&] { // All batches produced by the plan will be fed into IgnoringConsumers: auto tag = consumers.size(); - consumers.emplace_back(new SubstraitSinkConsumer{&sink_gen, tag, schema}); + consumers.emplace_back(new SubstraitSinkConsumer{&sink_gen, tag}); return consumers.back(); }; @@ -183,6 +172,9 @@ int main(int argc, char** argv) { std::cout << std::string(50, '#') << " consuming batches:" << std::endl; ABORT_ON_FAILURE(plan->StartProducing()); + auto sinks = plan->sinks(); + std::cout << "String ::" << std::endl; + std::cout << sinks.size() << std::endl; // Extract output cp::ExecContext exec_context(arrow::default_memory_pool(), diff --git a/cpp/src/arrow/python/CMakeLists.txt b/cpp/src/arrow/python/CMakeLists.txt index c37240a426c..626be1d67db 100644 --- a/cpp/src/arrow/python/CMakeLists.txt +++ b/cpp/src/arrow/python/CMakeLists.txt @@ -45,7 +45,7 @@ set(ARROW_PYTHON_SRCS python_to_arrow.cc pyarrow.cc serialize.cc - udf.cc) + engine.cc) set_source_files_properties(init.cc PROPERTIES SKIP_PRECOMPILE_HEADERS ON SKIP_UNITY_BUILD_INCLUSION ON) diff --git a/cpp/src/arrow/python/engine.cc b/cpp/src/arrow/python/engine.cc new file mode 100644 index 00000000000..8b65112bf82 --- /dev/null +++ b/cpp/src/arrow/python/engine.cc @@ -0,0 +1,98 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include + +namespace arrow { + +namespace py { + +namespace engine { + +Status SubstraitSinkConsumer::Consume(cp::ExecBatch batch) { + // Consume a batch of data + bool did_push = producer_.Push(batch); + if (!did_push) return Status::ExecutionError("Producer closed already"); + return Status::OK(); +} + +Future<> SubstraitSinkConsumer::Finish() { + producer_.Push(IterationEnd>()); + if (producer_.Close()) { + return Future<>::MakeFinished(); + } + return Future<>::MakeFinished( + Status::ExecutionError("Error occurred in closing the batch producer")); +} + +Result> SubstraitHandler::BuildPlan( + std::string substrait_json) { + auto maybe_serialized_plan = eng::internal::SubstraitFromJSON("Plan", substrait_json); + RETURN_NOT_OK(maybe_serialized_plan.status()); + std::shared_ptr serialized_plan = + std::move(maybe_serialized_plan).ValueOrDie(); + + auto maybe_plan_json = eng::internal::SubstraitToJSON("Plan", *serialized_plan); + RETURN_NOT_OK(maybe_plan_json.status()); + + std::vector> consumers; + std::function()> consumer_factory = [&] { + // All batches produced by the plan will be fed into IgnoringConsumers: + consumers.emplace_back(new SubstraitSinkConsumer{generator_}); + return consumers.back(); + }; + + // Deserialize each relation tree in the substrait plan to an Arrow compute Declaration + Result> maybe_decls = + eng::DeserializePlan(*serialized_plan, consumer_factory); + RETURN_NOT_OK(maybe_decls.status()); + std::vector decls = std::move(maybe_decls).ValueOrDie(); + + // It's safe to drop the serialized plan; we don't leave references to its memory + serialized_plan.reset(); + + // Construct an empty plan (note: configure Function registry and ThreadPool here) + return decls; +} + +Result> SubstraitHandler::ExecutePlan( + cp::ExecContext exec_context, std::shared_ptr schema, + std::vector declarations) { + Result> maybe_plan = cp::ExecPlan::Make(); + RETURN_NOT_OK(maybe_plan.status()); + std::shared_ptr plan = std::move(maybe_plan).ValueOrDie(); + + for (const cp::Declaration& decl : declarations) { + RETURN_NOT_OK(decl.AddToPlan(plan.get()).status()); + } + + RETURN_NOT_OK(plan->Validate()); + + RETURN_NOT_OK(plan->StartProducing()); + + std::shared_ptr sink_reader = + cp::MakeGeneratorReader(schema, std::move(*generator_), exec_context.memory_pool()); + + return sink_reader; +} + + +} // namespace engine + +} // namespace py + +} // namespace arrow \ No newline at end of file diff --git a/cpp/src/arrow/python/engine.h b/cpp/src/arrow/python/engine.h new file mode 100644 index 00000000000..b72d7156f3d --- /dev/null +++ b/cpp/src/arrow/python/engine.h @@ -0,0 +1,75 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include +#include +#include "arrow/util/async_generator.h" +#include "arrow/util/iterator.h" + +namespace arrow { + +namespace eng = arrow::engine; +namespace cp = arrow::compute; + +namespace py { + +namespace engine { + +class SubstraitSinkConsumer : public cp::SinkNodeConsumer { + public: + explicit SubstraitSinkConsumer( + arrow::AsyncGenerator>* generator, + arrow::util::BackpressureOptions backpressure = {}) + : producer_(MakeProducer(generator, std::move(backpressure))) {} + + arrow::Status Consume(cp::ExecBatch batch) override; + + static arrow::PushGenerator>::Producer + MakeProducer(arrow::AsyncGenerator>* out_gen, + arrow::util::BackpressureOptions backpressure) { + arrow::PushGenerator> push_gen( + std::move(backpressure)); + auto out = push_gen.producer(); + *out_gen = std::move(push_gen); + return out; + } + + arrow::Future<> Finish() override; + + private: + arrow::PushGenerator>::Producer producer_; +}; + +class SubstraitHandler { + public: + explicit SubstraitHandler(AsyncGenerator>* generator) + : generator_(generator) {} + + Result> BuildPlan(std::string substrait_json); + + Result> ExecutePlan( + cp::ExecContext exec_context, std::shared_ptr schema, + std::vector declarations); + + private: + arrow::AsyncGenerator>* generator_; +}; + +} // namespace engine +} // namespace py +} // namespace arrow diff --git a/cpp/src/arrow/python/python_test.cc b/cpp/src/arrow/python/python_test.cc index c465fabc680..3c58e743e7c 100644 --- a/cpp/src/arrow/python/python_test.cc +++ b/cpp/src/arrow/python/python_test.cc @@ -32,6 +32,7 @@ #include "arrow/python/arrow_to_pandas.h" #include "arrow/python/decimal.h" +#include "arrow/python/engine.h" #include "arrow/python/helpers.h" #include "arrow/python/numpy_convert.h" #include "arrow/python/numpy_interop.h" @@ -593,6 +594,12 @@ TEST_F(DecimalTest, UpdateWithNaN) { ASSERT_OK(metadata.Update(nan_value.obj())); ASSERT_EQ(std::numeric_limits::min(), metadata.precision()); ASSERT_EQ(std::numeric_limits::min(), metadata.scale()); +} + +TEST(SubstraitEngineTest, TestPlanFromJSON){ + + + } } // namespace py From 9a532856af06c7d9aac811584bd74780a5e807f7 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Tue, 22 Mar 2022 09:54:30 +0530 Subject: [PATCH 09/74] updating cmakelist --- cpp/src/arrow/python/CMakeLists.txt | 22 ++++++++++++++++++++-- cpp/src/arrow/python/engine.cc | 6 +++--- cpp/src/arrow/python/engine.h | 20 ++++++++++---------- 3 files changed, 33 insertions(+), 15 deletions(-) diff --git a/cpp/src/arrow/python/CMakeLists.txt b/cpp/src/arrow/python/CMakeLists.txt index 626be1d67db..a437b2aceef 100644 --- a/cpp/src/arrow/python/CMakeLists.txt +++ b/cpp/src/arrow/python/CMakeLists.txt @@ -44,8 +44,7 @@ set(ARROW_PYTHON_SRCS numpy_to_arrow.cc python_to_arrow.cc pyarrow.cc - serialize.cc - engine.cc) + serialize.cc) set_source_files_properties(init.cc PROPERTIES SKIP_PRECOMPILE_HEADERS ON SKIP_UNITY_BUILD_INCLUSION ON) @@ -161,6 +160,25 @@ if(ARROW_FLIGHT AND ARROW_BUILD_SHARED) endif() endif() +if(ARROW_ENGINE AND ARROW_BUILD_SHARED) + add_arrow_lib(arrow_python_engine + CMAKE_PACKAGE_NAME + PKG_CONFIG_NAME + SOURCES + engine.cc + OUTPUTS + DEPENDENCIES + SHARED_LINK_FLAGS + ${ARROW_VERSION_SCRIPT_FLAGS} # Defined in cpp/arrow/CMakeLists.txt + SHARED_LINK_LIBS + arrow_python_shared + arrow_engine_shared + STATIC_LINK_LIBS + ${PYTHON_OTHER_LIBS} + EXTRA_INCLUDES + "${ARROW_PYTHON_INCLUDES}") +endif() + if(CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang") # Clang, be quiet. 
Python C API has lots of macros set_property(SOURCE ${ARROW_PYTHON_SRCS} diff --git a/cpp/src/arrow/python/engine.cc b/cpp/src/arrow/python/engine.cc index 8b65112bf82..c1321019c83 100644 --- a/cpp/src/arrow/python/engine.cc +++ b/cpp/src/arrow/python/engine.cc @@ -43,11 +43,11 @@ Result> SubstraitHandler::BuildPlan( std::string substrait_json) { auto maybe_serialized_plan = eng::internal::SubstraitFromJSON("Plan", substrait_json); RETURN_NOT_OK(maybe_serialized_plan.status()); - std::shared_ptr serialized_plan = + std::shared_ptr serialized_plan = std::move(maybe_serialized_plan).ValueOrDie(); auto maybe_plan_json = eng::internal::SubstraitToJSON("Plan", *serialized_plan); - RETURN_NOT_OK(maybe_plan_json.status()); + RETURN_NOT_OK(maybe_plan_json.status()); std::vector> consumers; std::function()> consumer_factory = [&] { @@ -84,7 +84,7 @@ Result> SubstraitHandler::ExecutePlan( RETURN_NOT_OK(plan->StartProducing()); - std::shared_ptr sink_reader = + std::shared_ptr sink_reader = cp::MakeGeneratorReader(schema, std::move(*generator_), exec_context.memory_pool()); return sink_reader; diff --git a/cpp/src/arrow/python/engine.h b/cpp/src/arrow/python/engine.h index b72d7156f3d..36ac2109ccc 100644 --- a/cpp/src/arrow/python/engine.h +++ b/cpp/src/arrow/python/engine.h @@ -33,26 +33,26 @@ namespace engine { class SubstraitSinkConsumer : public cp::SinkNodeConsumer { public: explicit SubstraitSinkConsumer( - arrow::AsyncGenerator>* generator, - arrow::util::BackpressureOptions backpressure = {}) + AsyncGenerator>* generator, + util::BackpressureOptions backpressure = {}) : producer_(MakeProducer(generator, std::move(backpressure))) {} - arrow::Status Consume(cp::ExecBatch batch) override; + Status Consume(cp::ExecBatch batch) override; - static arrow::PushGenerator>::Producer - MakeProducer(arrow::AsyncGenerator>* out_gen, - arrow::util::BackpressureOptions backpressure) { - arrow::PushGenerator> push_gen( + static arrow::PushGenerator>::Producer + MakeProducer(AsyncGenerator>* out_gen, + util::BackpressureOptions backpressure) { + arrow::PushGenerator> push_gen( std::move(backpressure)); auto out = push_gen.producer(); *out_gen = std::move(push_gen); return out; } - arrow::Future<> Finish() override; + Future<> Finish() override; private: - arrow::PushGenerator>::Producer producer_; + PushGenerator>::Producer producer_; }; class SubstraitHandler { @@ -67,7 +67,7 @@ class SubstraitHandler { std::vector declarations); private: - arrow::AsyncGenerator>* generator_; + AsyncGenerator>* generator_; }; } // namespace engine From 91929f7f9c37cb963931717e8844d7849e70c0d9 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Tue, 22 Mar 2022 12:06:25 +0530 Subject: [PATCH 10/74] removing parquet dep --- cpp/src/arrow/python/python_test.cc | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/python/python_test.cc b/cpp/src/arrow/python/python_test.cc index 3c58e743e7c..871baca2702 100644 --- a/cpp/src/arrow/python/python_test.cc +++ b/cpp/src/arrow/python/python_test.cc @@ -27,6 +27,7 @@ #include "arrow/array/builder_binary.h" #include "arrow/table.h" #include "arrow/testing/gtest_util.h" +#include "arrow/testing/util.h" #include "arrow/util/decimal.h" #include "arrow/util/optional.h" @@ -38,7 +39,6 @@ #include "arrow/python/numpy_interop.h" #include "arrow/python/python_to_arrow.h" #include "arrow/util/checked_cast.h" -#include "arrow/util/logging.h" namespace arrow { @@ -596,10 +596,9 @@ TEST_F(DecimalTest, UpdateWithNaN) { ASSERT_EQ(std::numeric_limits::min(), 
metadata.scale()); } -TEST(SubstraitEngineTest, TestPlanFromJSON){ +TEST(SubstraitEngineTest, TestPlanFromJSON){ - } } // namespace py From a45b25c1f6545dc2c5b2ca70187fb987b5731564 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Wed, 23 Mar 2022 10:17:42 +0530 Subject: [PATCH 11/74] adding substrait test cases and refactor --- .../arrow/engine_substrait_example.cc | 10 ++- cpp/src/arrow/engine/CMakeLists.txt | 3 +- cpp/src/arrow/engine/substrait/util.cc | 81 ++++++++++++++++++ cpp/src/arrow/engine/substrait/util.h | 84 +++++++++++++++++++ cpp/src/arrow/python/CMakeLists.txt | 19 ----- cpp/src/arrow/python/python_test.cc | 8 +- 6 files changed, 176 insertions(+), 29 deletions(-) create mode 100644 cpp/src/arrow/engine/substrait/util.cc create mode 100644 cpp/src/arrow/engine/substrait/util.h diff --git a/cpp/examples/arrow/engine_substrait_example.cc b/cpp/examples/arrow/engine_substrait_example.cc index 1db55af9bb9..fff1bed40c5 100644 --- a/cpp/examples/arrow/engine_substrait_example.cc +++ b/cpp/examples/arrow/engine_substrait_example.cc @@ -86,9 +86,15 @@ arrow::Future> GetSubstraitFromServer( "read": { "base_schema": { "struct": { - "types": [ {"i64": {}}, {"bool": {}} ] + "types": [ + {"i64": {}}, + {"bool": {}} + ] }, - "names": ["i", "b"] + "names": [ + "i", + "b" + ] }, "local_files": { "items": [ diff --git a/cpp/src/arrow/engine/CMakeLists.txt b/cpp/src/arrow/engine/CMakeLists.txt index d09b8819fb1..ea9797ea1d7 100644 --- a/cpp/src/arrow/engine/CMakeLists.txt +++ b/cpp/src/arrow/engine/CMakeLists.txt @@ -26,7 +26,8 @@ set(ARROW_SUBSTRAIT_SRCS substrait/serde.cc substrait/plan_internal.cc substrait/relation_internal.cc - substrait/type_internal.cc) + substrait/type_internal.cc + substrait/util.cc) add_arrow_lib(arrow_substrait CMAKE_PACKAGE_NAME diff --git a/cpp/src/arrow/engine/substrait/util.cc b/cpp/src/arrow/engine/substrait/util.cc new file mode 100644 index 00000000000..684f5cf870e --- /dev/null +++ b/cpp/src/arrow/engine/substrait/util.cc @@ -0,0 +1,81 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "arrow/engine/substrait/util.h" + +namespace arrow { + +namespace engine { + + +Status SubstraitSinkConsumer::Consume(cp::ExecBatch batch) { + // Consume a batch of data + bool did_push = producer_.Push(batch); + if (!did_push) return Status::ExecutionError("Producer closed already"); + return Status::OK(); +} + +Future<> SubstraitSinkConsumer::Finish() { + producer_.Push(IterationEnd>()); + if (producer_.Close()) { + return Future<>::MakeFinished(); + } + return Future<>::MakeFinished( + Status::ExecutionError("Error occurred in closing the batch producer")); +} + +Status SubstraitExecutor::MakePlan() { + ARROW_ASSIGN_OR_RAISE(auto serialized_plan, engine::internal::SubstraitFromJSON("Plan", substrait_json_)); + + auto maybe_plan_json = engine::internal::SubstraitToJSON("Plan", *serialized_plan); + RETURN_NOT_OK(maybe_plan_json.status()); + + std::vector> consumers; + std::function()> consumer_factory = [&] { + // All batches produced by the plan will be fed into IgnoringConsumers: + consumers.emplace_back(new SubstraitSinkConsumer{generator_}); + return consumers.back(); + }; + + // Deserialize each relation tree in the substrait plan to an Arrow compute Declaration + ARROW_ASSIGN_OR_RAISE(declerations_, engine::DeserializePlan(*serialized_plan, consumer_factory)); + + // It's safe to drop the serialized plan; we don't leave references to its memory + serialized_plan.reset(); + + // Construct an empty plan (note: configure Function registry and ThreadPool here) + return Status::OK(); +} + +Result> SubstraitExecutor::Execute() { + for (const cp::Declaration& decl : declerations_) { + RETURN_NOT_OK(decl.AddToPlan(plan_.get()).status()); + } + + ARROW_RETURN_NOT_OK(plan_->Validate()); + + ARROW_RETURN_NOT_OK(plan_->StartProducing()); + + std::shared_ptr sink_reader = + cp::MakeGeneratorReader(schema_, std::move(*generator_), exec_context_.memory_pool()); + ARROW_RETURN_NOT_OK(plan_->finished().status()); + return sink_reader; +} + +} // namespace engine + +} // namespace arrow \ No newline at end of file diff --git a/cpp/src/arrow/engine/substrait/util.h b/cpp/src/arrow/engine/substrait/util.h new file mode 100644 index 00000000000..c4e3127c055 --- /dev/null +++ b/cpp/src/arrow/engine/substrait/util.h @@ -0,0 +1,84 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include "arrow/api.h" +#include "arrow/compute/api.h" +#include "arrow/engine/api.h" +#include "arrow/util/async_generator.h" +#include "arrow/util/iterator.h" + +namespace arrow { + +namespace cp = arrow::compute; + +namespace engine { + +class SubstraitSinkConsumer : public cp::SinkNodeConsumer { + public: + explicit SubstraitSinkConsumer( + AsyncGenerator>* generator, + arrow::util::BackpressureOptions backpressure = {}) + : producer_(MakeProducer(generator, std::move(backpressure))) {} + + Status Consume(cp::ExecBatch batch) override; + + static arrow::PushGenerator>::Producer + MakeProducer(AsyncGenerator>* out_gen, + arrow::util::BackpressureOptions backpressure) { + arrow::PushGenerator> push_gen( + std::move(backpressure)); + auto out = push_gen.producer(); + *out_gen = std::move(push_gen); + return out; + } + + Future<> Finish() override; + + private: + PushGenerator>::Producer producer_; +}; + +class SubstraitExecutor { + public: + explicit SubstraitExecutor( + std::string substrait_json, + AsyncGenerator>* generator, + std::shared_ptr plan, std::shared_ptr schema, + cp::ExecContext exec_context) + : substrait_json_(substrait_json), generator_(generator), + plan_(std::move(plan)), schema_(schema), exec_context_(exec_context) {} + + Status MakePlan(); + + Result> Execute(); + + const std::shared_ptr plan() const { return plan_;} + + private: + std::string substrait_json_; + AsyncGenerator>* generator_; + std::vector declerations_; + std::shared_ptr plan_; + std::shared_ptr schema_; + cp::ExecContext exec_context_; +}; + +} // namespace engine + +} // namespace arrow \ No newline at end of file diff --git a/cpp/src/arrow/python/CMakeLists.txt b/cpp/src/arrow/python/CMakeLists.txt index a437b2aceef..5ba83b5a490 100644 --- a/cpp/src/arrow/python/CMakeLists.txt +++ b/cpp/src/arrow/python/CMakeLists.txt @@ -160,25 +160,6 @@ if(ARROW_FLIGHT AND ARROW_BUILD_SHARED) endif() endif() -if(ARROW_ENGINE AND ARROW_BUILD_SHARED) - add_arrow_lib(arrow_python_engine - CMAKE_PACKAGE_NAME - PKG_CONFIG_NAME - SOURCES - engine.cc - OUTPUTS - DEPENDENCIES - SHARED_LINK_FLAGS - ${ARROW_VERSION_SCRIPT_FLAGS} # Defined in cpp/arrow/CMakeLists.txt - SHARED_LINK_LIBS - arrow_python_shared - arrow_engine_shared - STATIC_LINK_LIBS - ${PYTHON_OTHER_LIBS} - EXTRA_INCLUDES - "${ARROW_PYTHON_INCLUDES}") -endif() - if(CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang") # Clang, be quiet. 
Python C API has lots of macros set_property(SOURCE ${ARROW_PYTHON_SRCS} diff --git a/cpp/src/arrow/python/python_test.cc b/cpp/src/arrow/python/python_test.cc index 871baca2702..c465fabc680 100644 --- a/cpp/src/arrow/python/python_test.cc +++ b/cpp/src/arrow/python/python_test.cc @@ -27,18 +27,17 @@ #include "arrow/array/builder_binary.h" #include "arrow/table.h" #include "arrow/testing/gtest_util.h" -#include "arrow/testing/util.h" #include "arrow/util/decimal.h" #include "arrow/util/optional.h" #include "arrow/python/arrow_to_pandas.h" #include "arrow/python/decimal.h" -#include "arrow/python/engine.h" #include "arrow/python/helpers.h" #include "arrow/python/numpy_convert.h" #include "arrow/python/numpy_interop.h" #include "arrow/python/python_to_arrow.h" #include "arrow/util/checked_cast.h" +#include "arrow/util/logging.h" namespace arrow { @@ -596,10 +595,5 @@ TEST_F(DecimalTest, UpdateWithNaN) { ASSERT_EQ(std::numeric_limits::min(), metadata.scale()); } - -TEST(SubstraitEngineTest, TestPlanFromJSON){ - -} - } // namespace py } // namespace arrow From b3edec2ac704df47e6587de3c5de486c1d03d2ec Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Wed, 23 Mar 2022 10:18:22 +0530 Subject: [PATCH 12/74] rebase --- cpp/src/arrow/engine/substrait/serde_test.cc | 123 +++++++++++++++++++ cpp/src/arrow/python/engine.cc | 98 --------------- cpp/src/arrow/python/engine.h | 75 ----------- 3 files changed, 123 insertions(+), 173 deletions(-) delete mode 100644 cpp/src/arrow/python/engine.cc delete mode 100644 cpp/src/arrow/python/engine.h diff --git a/cpp/src/arrow/engine/substrait/serde_test.cc b/cpp/src/arrow/engine/substrait/serde_test.cc index 775e2520e0b..396102187ac 100644 --- a/cpp/src/arrow/engine/substrait/serde_test.cc +++ b/cpp/src/arrow/engine/substrait/serde_test.cc @@ -16,6 +16,7 @@ // under the License. 
#include "arrow/engine/substrait/serde.h" +#include "arrow/engine/substrait/util.h" #include #include @@ -30,6 +31,8 @@ #include "arrow/testing/matchers.h" #include "arrow/util/key_value_metadata.h" +#include "arrow/util/async_generator.h" + using testing::ElementsAre; using testing::Eq; using testing::HasSubstr; @@ -38,6 +41,7 @@ using testing::UnorderedElementsAre; namespace arrow { using internal::checked_cast; +namespace cp = arrow::compute; namespace engine { @@ -752,5 +756,124 @@ TEST(Substrait, ExtensionSetFromPlanMissingFunc) { &ext_set)); } +TEST(Substrait, GetRecordBatchIterator) { + const auto parquet_root = std::getenv("PARQUET_TEST_DATA"); + std::string dir_string(parquet_root); + std::stringstream ss; + ss << dir_string << "/alltypes_plain.parquet"; + auto file_path = ss.str(); + file_path = "/Users/vibhatha/sandbox/parquet/example.parquet"; + std::cout << "File Path : " << file_path << std::endl; + // std::string substrait_json = R"({ + // "relations": [ + // {"rel": { + // "read": { + // "base_schema": { + // "struct": { + // "types": [ {"i32": {}}, + // {"bool": {}}, + // {"i32": {}}, + // {"i32": {}}, + // {"i32": {}}, + // {"i64": {}}, + // {"f32": {}}, + // {"f64": {}}, + // {"binary": {}}, + // {"binary": {}}, + // {"ts_ns": {}}, + // ] + // }, + // "names": [ + // "id", + // "bool_col", + // "tinyint_col", + // "smallint_col", + // "int_col", + // "bigint_col", + // "float_col", + // "double_col", + // "date_string_col", + // "string_col", + // "timestamp_col" + // ] + // }, + // "local_files": { + // "items": [ + // { + // "uri_file": "file://FILENAME_PLACEHOLDER", + // "format": "FILE_FORMAT_PARQUET" + // } + // ] + // } + // } + // }} + // ] + // })"; + + std::string substrait_json = R"({ + "relations": [ + {"rel": { + "read": { + "base_schema": { + "struct": { + "types": [ + {"i64": {}}, + {"bool": {}} + ] + }, + "names": [ + "i", + "b" + ] + }, + "local_files": { + "items": [ + { + "uri_file": "file://FILENAME_PLACEHOLDER", + "format": "FILE_FORMAT_PARQUET" + } + ] + } + } + }} + ] + })"; + + std::string filename_placeholder = "FILENAME_PLACEHOLDER"; + substrait_json.replace(substrait_json.find(filename_placeholder), + filename_placeholder.size(), file_path); + + // auto in_schema = schema({ + // field("id", int32()), + // field("bool_col", boolean()), + // field("tinyint_col", int32()), + // field("smallint_col", int32()), + // field("int_col", int32()), + // field("bigint_col", int64()), + // field("float_col", float32()), + // field("double_col", float64()), + // field("date_string_col", binary()), + // field("string_col", binary()), + // field("timestamp_col", timestamp(TimeUnit::NANO)), + // }); + auto in_schema = schema({ + field("i", int64()), + field("b", boolean()), + }); + AsyncGenerator> sink_gen; + + cp::ExecContext exec_context(default_memory_pool(), + arrow::internal::GetCpuThreadPool()); + ASSERT_OK_AND_ASSIGN(auto plan, cp::ExecPlan::Make()); + engine::SubstraitExecutor executor(substrait_json, &sink_gen, plan, in_schema, exec_context); + auto status = executor.MakePlan(); + ASSERT_OK(status); + ASSERT_OK_AND_ASSIGN(auto reader, executor.Execute()); + + ASSERT_OK_AND_ASSIGN(auto table, Table::FromRecordBatchReader(reader.get())); + + std::cout << "Results : " << table->ToString() << std::endl; +} + } // namespace engine } // namespace arrow diff --git a/cpp/src/arrow/python/engine.cc b/cpp/src/arrow/python/engine.cc deleted file mode 100644 index c1321019c83..00000000000 --- a/cpp/src/arrow/python/engine.cc +++ /dev/null @@ -1,98 +0,0 @@ -// 
Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include - -namespace arrow { - -namespace py { - -namespace engine { - -Status SubstraitSinkConsumer::Consume(cp::ExecBatch batch) { - // Consume a batch of data - bool did_push = producer_.Push(batch); - if (!did_push) return Status::ExecutionError("Producer closed already"); - return Status::OK(); -} - -Future<> SubstraitSinkConsumer::Finish() { - producer_.Push(IterationEnd>()); - if (producer_.Close()) { - return Future<>::MakeFinished(); - } - return Future<>::MakeFinished( - Status::ExecutionError("Error occurred in closing the batch producer")); -} - -Result> SubstraitHandler::BuildPlan( - std::string substrait_json) { - auto maybe_serialized_plan = eng::internal::SubstraitFromJSON("Plan", substrait_json); - RETURN_NOT_OK(maybe_serialized_plan.status()); - std::shared_ptr serialized_plan = - std::move(maybe_serialized_plan).ValueOrDie(); - - auto maybe_plan_json = eng::internal::SubstraitToJSON("Plan", *serialized_plan); - RETURN_NOT_OK(maybe_plan_json.status()); - - std::vector> consumers; - std::function()> consumer_factory = [&] { - // All batches produced by the plan will be fed into IgnoringConsumers: - consumers.emplace_back(new SubstraitSinkConsumer{generator_}); - return consumers.back(); - }; - - // Deserialize each relation tree in the substrait plan to an Arrow compute Declaration - Result> maybe_decls = - eng::DeserializePlan(*serialized_plan, consumer_factory); - RETURN_NOT_OK(maybe_decls.status()); - std::vector decls = std::move(maybe_decls).ValueOrDie(); - - // It's safe to drop the serialized plan; we don't leave references to its memory - serialized_plan.reset(); - - // Construct an empty plan (note: configure Function registry and ThreadPool here) - return decls; -} - -Result> SubstraitHandler::ExecutePlan( - cp::ExecContext exec_context, std::shared_ptr schema, - std::vector declarations) { - Result> maybe_plan = cp::ExecPlan::Make(); - RETURN_NOT_OK(maybe_plan.status()); - std::shared_ptr plan = std::move(maybe_plan).ValueOrDie(); - - for (const cp::Declaration& decl : declarations) { - RETURN_NOT_OK(decl.AddToPlan(plan.get()).status()); - } - - RETURN_NOT_OK(plan->Validate()); - - RETURN_NOT_OK(plan->StartProducing()); - - std::shared_ptr sink_reader = - cp::MakeGeneratorReader(schema, std::move(*generator_), exec_context.memory_pool()); - - return sink_reader; -} - - -} // namespace engine - -} // namespace py - -} // namespace arrow \ No newline at end of file diff --git a/cpp/src/arrow/python/engine.h b/cpp/src/arrow/python/engine.h deleted file mode 100644 index 36ac2109ccc..00000000000 --- a/cpp/src/arrow/python/engine.h +++ /dev/null @@ -1,75 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. 
See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include -#include -#include -#include "arrow/util/async_generator.h" -#include "arrow/util/iterator.h" - -namespace arrow { - -namespace eng = arrow::engine; -namespace cp = arrow::compute; - -namespace py { - -namespace engine { - -class SubstraitSinkConsumer : public cp::SinkNodeConsumer { - public: - explicit SubstraitSinkConsumer( - AsyncGenerator>* generator, - util::BackpressureOptions backpressure = {}) - : producer_(MakeProducer(generator, std::move(backpressure))) {} - - Status Consume(cp::ExecBatch batch) override; - - static arrow::PushGenerator>::Producer - MakeProducer(AsyncGenerator>* out_gen, - util::BackpressureOptions backpressure) { - arrow::PushGenerator> push_gen( - std::move(backpressure)); - auto out = push_gen.producer(); - *out_gen = std::move(push_gen); - return out; - } - - Future<> Finish() override; - - private: - PushGenerator>::Producer producer_; -}; - -class SubstraitHandler { - public: - explicit SubstraitHandler(AsyncGenerator>* generator) - : generator_(generator) {} - - Result> BuildPlan(std::string substrait_json); - - Result> ExecutePlan( - cp::ExecContext exec_context, std::shared_ptr schema, - std::vector declarations); - - private: - AsyncGenerator>* generator_; -}; - -} // namespace engine -} // namespace py -} // namespace arrow From cfe821c7a3d38cdba3c592d36b0a79af4a348804 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Wed, 23 Mar 2022 11:14:12 +0530 Subject: [PATCH 13/74] rebase continue --- cpp/src/arrow/engine/substrait/serde_test.cc | 83 ++------------------ 1 file changed, 7 insertions(+), 76 deletions(-) diff --git a/cpp/src/arrow/engine/substrait/serde_test.cc b/cpp/src/arrow/engine/substrait/serde_test.cc index 396102187ac..2eafbc05d86 100644 --- a/cpp/src/arrow/engine/substrait/serde_test.cc +++ b/cpp/src/arrow/engine/substrait/serde_test.cc @@ -760,56 +760,9 @@ TEST(Substrait, GetRecordBatchIterator) { const auto parquet_root = std::getenv("PARQUET_TEST_DATA"); std::string dir_string(parquet_root); std::stringstream ss; - ss << dir_string << "/alltypes_plain.parquet"; + ss << dir_string << "/binary.parquet"; auto file_path = ss.str(); - file_path = "/Users/vibhatha/sandbox/parquet/example.parquet"; - std::cout << "File Path : " << file_path << std::endl; - // std::string substrait_json = R"({ - // "relations": [ - // {"rel": { - // "read": { - // "base_schema": { - // "struct": { - // "types": [ {"i32": {}}, - // {"bool": {}}, - // {"i32": {}}, - // {"i32": {}}, - // {"i32": {}}, - // {"i64": {}}, - // {"f32": {}}, - // {"f64": {}}, - // {"binary": {}}, - // {"binary": {}}, - // {"ts_ns": {}}, - // ] - // }, - // "names": [ - // "id", - // "bool_col", - // "tinyint_col", - // "smallint_col", - // "int_col", - // "bigint_col", - // "float_col", - // "double_col", - // "date_string_col", - // "string_col", - // "timestamp_col" - // ] - 
// }, - // "local_files": { - // "items": [ - // { - // "uri_file": "file://FILENAME_PLACEHOLDER", - // "format": "FILE_FORMAT_PARQUET" - // } - // ] - // } - // } - // }} - // ] - // })"; - + std::string substrait_json = R"({ "relations": [ {"rel": { @@ -817,14 +770,12 @@ TEST(Substrait, GetRecordBatchIterator) { "base_schema": { "struct": { "types": [ - {"i64": {}}, - {"bool": {}} + {"binary": {}} ] }, "names": [ - "i", - "b" - ] + "foo" + ] }, "local_files": { "items": [ @@ -842,26 +793,8 @@ TEST(Substrait, GetRecordBatchIterator) { std::string filename_placeholder = "FILENAME_PLACEHOLDER"; substrait_json.replace(substrait_json.find(filename_placeholder), filename_placeholder.size(), file_path); - - // auto in_schema = schema({ - // field("id", int32()), - // field("bool_col", boolean()), - // field("tinyint_col", int32()), - // field("smallint_col", int32()), - // field("int_col", int32()), - // field("bigint_col", int64()), - // field("float_col", float32()), - // field("double_col", float64()), - // field("date_string_col", binary()), - // field("string_col", binary()), - // field("timestamp_col", timestamp(TimeUnit::NANO)), - // }); - auto in_schema = schema({ - field("i", int64()), - field("b", boolean()), - }); + auto in_schema = schema({ field("foo", binary())}); AsyncGenerator> sink_gen; - cp::ExecContext exec_context(default_memory_pool(), arrow::internal::GetCpuThreadPool()); ASSERT_OK_AND_ASSIGN(auto plan, cp::ExecPlan::Make()); @@ -869,10 +802,8 @@ TEST(Substrait, GetRecordBatchIterator) { auto status = executor.MakePlan(); ASSERT_OK(status); ASSERT_OK_AND_ASSIGN(auto reader, executor.Execute()); - ASSERT_OK_AND_ASSIGN(auto table, Table::FromRecordBatchReader(reader.get())); - - std::cout << "Results : " << table->ToString() << std::endl; + EXPECT_TRUE(table->num_rows() > 0); } } // namespace engine From f16ee61c487cb5e084bcf80328fa5d45d46dc853 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Wed, 23 Mar 2022 12:01:31 +0530 Subject: [PATCH 14/74] updating example --- .../arrow/engine_substrait_example.cc | 136 ++++-------------- cpp/src/arrow/engine/substrait/serde_test.cc | 4 +- cpp/src/arrow/engine/substrait/util.cc | 6 +- cpp/src/arrow/engine/substrait/util.h | 2 +- 4 files changed, 40 insertions(+), 108 deletions(-) diff --git a/cpp/examples/arrow/engine_substrait_example.cc b/cpp/examples/arrow/engine_substrait_example.cc index fff1bed40c5..21ca7178a64 100644 --- a/cpp/examples/arrow/engine_substrait_example.cc +++ b/cpp/examples/arrow/engine_substrait_example.cc @@ -18,6 +18,7 @@ #include #include #include +#include #include "arrow/util/async_generator.h" #include "arrow/util/iterator.h" @@ -38,46 +39,7 @@ namespace cp = arrow::compute; } \ } while (0); -class SubstraitSinkConsumer : public cp::SinkNodeConsumer { - public: - explicit SubstraitSinkConsumer( - arrow::AsyncGenerator>* generator, size_t tag, - arrow::util::BackpressureOptions backpressure = {}) - : producer_(MakeProducer(generator, std::move(backpressure))), tag_{tag} {} - - arrow::Status Consume(cp::ExecBatch batch) override { - // Consume a batch of data - bool did_push = producer_.Push(batch); - if (!did_push) return arrow::Status::ExecutionError("Producer closed already"); - return arrow::Status::OK(); - } - - static arrow::PushGenerator>::Producer - MakeProducer(arrow::AsyncGenerator>* out_gen, - arrow::util::BackpressureOptions backpressure) { - arrow::PushGenerator> push_gen( - std::move(backpressure)); - auto out = push_gen.producer(); - *out_gen = std::move(push_gen); - return out; - } - - 
arrow::Future<> Finish() override { - std::cout << "-" << tag_ << " finished" << std::endl; - producer_.Push(arrow::IterationEnd>()); - if (producer_.Close()) { - return arrow::Future<>::MakeFinished(); - } - return arrow::Future<>::MakeFinished( - arrow::Status::ExecutionError("Error occurred in closing the batch producer")); - } - - private: - arrow::PushGenerator>::Producer producer_; - size_t tag_; -}; - -arrow::Future> GetSubstraitFromServer( +std::string GetSubstraitPlanFromServer( const std::string& filename) { // Emulate server interaction by parsing hard coded JSON std::string substrait_json = R"({ @@ -111,7 +73,7 @@ arrow::Future> GetSubstraitFromServer( std::string filename_placeholder = "FILENAME_PLACEHOLDER"; substrait_json.replace(substrait_json.find(filename_placeholder), filename_placeholder.size(), filename); - return eng::internal::SubstraitFromJSON("Plan", substrait_json); + return substrait_json; } int main(int argc, char** argv) { @@ -120,73 +82,33 @@ int main(int argc, char** argv) { // Fake pass for CI return EXIT_SUCCESS; } - - // Plans arrive at the consumer serialized in a Buffer, using the binary protobuf - // serialization of a substrait Plan - auto maybe_serialized_plan = GetSubstraitFromServer(argv[1]).result(); - ABORT_ON_FAILURE(maybe_serialized_plan.status()); - std::shared_ptr serialized_plan = - std::move(maybe_serialized_plan).ValueOrDie(); - - // Print the received plan to stdout as JSON - arrow::Result maybe_plan_json = - eng::internal::SubstraitToJSON("Plan", *serialized_plan); - ABORT_ON_FAILURE(maybe_plan_json.status()); - std::cout << std::string(50, '#') << " received substrait::Plan:" << std::endl; - std::cout << maybe_plan_json.ValueOrDie() << std::endl; - - // The data sink(s) for plans is/are implicit in substrait plans, but explicit in - // Arrow. Therefore, deserializing a plan requires a factory for consumers: each - // time the root of a substrait relation tree is deserialized, an Arrow consumer is - // constructed into which its batches will be piped. 
+ auto substrait_json = GetSubstraitPlanFromServer(argv[1]); + auto schema = arrow::schema( {arrow::field("i", arrow::int64()), arrow::field("b", arrow::boolean())}); - std::vector> consumers; - arrow::AsyncGenerator> sink_gen; - std::function()> consumer_factory = [&] { - // All batches produced by the plan will be fed into IgnoringConsumers: - auto tag = consumers.size(); - consumers.emplace_back(new SubstraitSinkConsumer{&sink_gen, tag}); - return consumers.back(); - }; - - // Deserialize each relation tree in the substrait plan to an Arrow compute Declaration - arrow::Result> maybe_decls = - eng::DeserializePlan(*serialized_plan, consumer_factory); - ABORT_ON_FAILURE(maybe_decls.status()); - std::vector decls = std::move(maybe_decls).ValueOrDie(); - - // It's safe to drop the serialized plan; we don't leave references to its memory - serialized_plan.reset(); - - // Construct an empty plan (note: configure Function registry and ThreadPool here) - arrow::Result> maybe_plan = cp::ExecPlan::Make(); - ABORT_ON_FAILURE(maybe_plan.status()); - std::shared_ptr plan = std::move(maybe_plan).ValueOrDie(); - - // Add decls to plan (note: configure ExecNode registry before this point) - for (const cp::Declaration& decl : decls) { - ABORT_ON_FAILURE(decl.AddToPlan(plan.get()).status()); - } - - // Validate the plan and print it to stdout - ABORT_ON_FAILURE(plan->Validate()); - std::cout << std::string(50, '#') << " produced arrow::ExecPlan:" << std::endl; - std::cout << plan->ToString() << std::endl; - - // Start the plan... - std::cout << std::string(50, '#') << " consuming batches:" << std::endl; - ABORT_ON_FAILURE(plan->StartProducing()); - - auto sinks = plan->sinks(); - std::cout << "String ::" << std::endl; - std::cout << sinks.size() << std::endl; - // Extract output cp::ExecContext exec_context(arrow::default_memory_pool(), ::arrow::internal::GetCpuThreadPool()); - std::shared_ptr sink_reader = - cp::MakeGeneratorReader(schema, std::move(sink_gen), exec_context.memory_pool()); + + arrow::AsyncGenerator> sink_gen; + + auto maybe_plan = cp::ExecPlan::Make(); + if(!maybe_plan.status().ok()) { + return EXIT_FAILURE; + } + auto plan = maybe_plan.ValueOrDie(); + arrow::engine::SubstraitExecutor executor(substrait_json, &sink_gen, plan, schema, exec_context); + auto status = executor.MakePlan(); + if(!status.ok()) { + return EXIT_FAILURE; + } + auto maybe_reader = executor.Execute(); + + if(!maybe_reader.status().ok()) { + return EXIT_FAILURE; + } + + auto sink_reader = maybe_reader.ValueOrDie(); std::shared_ptr response_table; @@ -196,7 +118,11 @@ int main(int argc, char** argv) { std::cout << "Results : " << response_table->ToString() << std::endl; - // ... 
and wait for it to finish - ABORT_ON_FAILURE(plan->finished().status()); + auto finish = executor.Finalize(); + + if(!finish.ok()) { + return EXIT_FAILURE; + } + return EXIT_SUCCESS; } diff --git a/cpp/src/arrow/engine/substrait/serde_test.cc b/cpp/src/arrow/engine/substrait/serde_test.cc index 2eafbc05d86..530efff3d2e 100644 --- a/cpp/src/arrow/engine/substrait/serde_test.cc +++ b/cpp/src/arrow/engine/substrait/serde_test.cc @@ -762,7 +762,7 @@ TEST(Substrait, GetRecordBatchIterator) { std::stringstream ss; ss << dir_string << "/binary.parquet"; auto file_path = ss.str(); - + std::string substrait_json = R"({ "relations": [ {"rel": { @@ -802,6 +802,8 @@ TEST(Substrait, GetRecordBatchIterator) { auto status = executor.MakePlan(); ASSERT_OK(status); ASSERT_OK_AND_ASSIGN(auto reader, executor.Execute()); + auto finish = executor.Finalize(); + ASSERT_OK(finish); ASSERT_OK_AND_ASSIGN(auto table, Table::FromRecordBatchReader(reader.get())); EXPECT_TRUE(table->num_rows() > 0); } diff --git a/cpp/src/arrow/engine/substrait/util.cc b/cpp/src/arrow/engine/substrait/util.cc index 684f5cf870e..7309aafe6d6 100644 --- a/cpp/src/arrow/engine/substrait/util.cc +++ b/cpp/src/arrow/engine/substrait/util.cc @@ -72,10 +72,14 @@ Result> SubstraitExecutor::Execute() { std::shared_ptr sink_reader = cp::MakeGeneratorReader(schema_, std::move(*generator_), exec_context_.memory_pool()); - ARROW_RETURN_NOT_OK(plan_->finished().status()); return sink_reader; } +Status SubstraitExecutor::Finalize() { + ARROW_RETURN_NOT_OK(plan_->finished().status()); + return Status::OK(); +} + } // namespace engine } // namespace arrow \ No newline at end of file diff --git a/cpp/src/arrow/engine/substrait/util.h b/cpp/src/arrow/engine/substrait/util.h index c4e3127c055..46c00ee1ced 100644 --- a/cpp/src/arrow/engine/substrait/util.h +++ b/cpp/src/arrow/engine/substrait/util.h @@ -68,7 +68,7 @@ class SubstraitExecutor { Result> Execute(); - const std::shared_ptr plan() const { return plan_;} + Status Finalize(); private: std::string substrait_json_; From 517014e00e4eed5c53b6275973db1f5ec731cc43 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Wed, 23 Mar 2022 15:23:02 +0530 Subject: [PATCH 15/74] rebase --- .../arrow/engine_substrait_example.cc | 28 +++++----- cpp/src/arrow/engine/substrait/serde_test.cc | 52 +++++++++++++++++-- cpp/src/arrow/engine/substrait/util.cc | 41 +++++++++++---- cpp/src/arrow/engine/substrait/util.h | 12 +++-- .../substrait/query_execution_example.py | 33 ++++++++++++ 5 files changed, 136 insertions(+), 30 deletions(-) create mode 100644 python/examples/substrait/query_execution_example.py diff --git a/cpp/examples/arrow/engine_substrait_example.cc b/cpp/examples/arrow/engine_substrait_example.cc index 21ca7178a64..926de469b47 100644 --- a/cpp/examples/arrow/engine_substrait_example.cc +++ b/cpp/examples/arrow/engine_substrait_example.cc @@ -39,8 +39,7 @@ namespace cp = arrow::compute; } \ } while (0); -std::string GetSubstraitPlanFromServer( - const std::string& filename) { +std::string GetSubstraitPlanFromServer(const std::string& filename) { // Emulate server interaction by parsing hard coded JSON std::string substrait_json = R"({ "relations": [ @@ -83,7 +82,7 @@ int main(int argc, char** argv) { return EXIT_SUCCESS; } auto substrait_json = GetSubstraitPlanFromServer(argv[1]); - + auto schema = arrow::schema( {arrow::field("i", arrow::int64()), arrow::field("b", arrow::boolean())}); @@ -91,23 +90,24 @@ int main(int argc, char** argv) { ::arrow::internal::GetCpuThreadPool()); arrow::AsyncGenerator> 
sink_gen; - - auto maybe_plan = cp::ExecPlan::Make(); - if(!maybe_plan.status().ok()) { + + auto maybe_plan = cp::ExecPlan::Make(); + if (!maybe_plan.status().ok()) { return EXIT_FAILURE; } auto plan = maybe_plan.ValueOrDie(); - arrow::engine::SubstraitExecutor executor(substrait_json, &sink_gen, plan, schema, exec_context); + arrow::engine::SubstraitExecutor executor(substrait_json, &sink_gen, plan, schema, + exec_context); auto status = executor.MakePlan(); - if(!status.ok()) { + if (!status.ok()) { return EXIT_FAILURE; } - auto maybe_reader = executor.Execute(); - - if(!maybe_reader.status().ok()) { + auto maybe_reader = executor.Execute(); + + if (!maybe_reader.status().ok()) { return EXIT_FAILURE; } - + auto sink_reader = maybe_reader.ValueOrDie(); std::shared_ptr response_table; @@ -120,9 +120,9 @@ int main(int argc, char** argv) { auto finish = executor.Finalize(); - if(!finish.ok()) { + if (!finish.ok()) { return EXIT_FAILURE; } - + return EXIT_SUCCESS; } diff --git a/cpp/src/arrow/engine/substrait/serde_test.cc b/cpp/src/arrow/engine/substrait/serde_test.cc index 530efff3d2e..20204b986ef 100644 --- a/cpp/src/arrow/engine/substrait/serde_test.cc +++ b/cpp/src/arrow/engine/substrait/serde_test.cc @@ -793,19 +793,65 @@ TEST(Substrait, GetRecordBatchIterator) { std::string filename_placeholder = "FILENAME_PLACEHOLDER"; substrait_json.replace(substrait_json.find(filename_placeholder), filename_placeholder.size(), file_path); - auto in_schema = schema({ field("foo", binary())}); + auto in_schema = schema({field("foo", binary())}); AsyncGenerator> sink_gen; cp::ExecContext exec_context(default_memory_pool(), arrow::internal::GetCpuThreadPool()); ASSERT_OK_AND_ASSIGN(auto plan, cp::ExecPlan::Make()); - engine::SubstraitExecutor executor(substrait_json, &sink_gen, plan, in_schema, exec_context); + engine::SubstraitExecutor executor(substrait_json, &sink_gen, plan, in_schema, + exec_context); auto status = executor.MakePlan(); ASSERT_OK(status); ASSERT_OK_AND_ASSIGN(auto reader, executor.Execute()); auto finish = executor.Finalize(); ASSERT_OK(finish); ASSERT_OK_AND_ASSIGN(auto table, Table::FromRecordBatchReader(reader.get())); - EXPECT_TRUE(table->num_rows() > 0); + EXPECT_GT(table->num_rows(), 0); +} + +TEST(Substrait, GetRecordBatchIteratorUtil) { + const auto parquet_root = std::getenv("PARQUET_TEST_DATA"); + std::string dir_string(parquet_root); + std::stringstream ss; + ss << dir_string << "/binary.parquet"; + auto file_path = ss.str(); + + std::string substrait_json = R"({ + "relations": [ + {"rel": { + "read": { + "base_schema": { + "struct": { + "types": [ + {"binary": {}} + ] + }, + "names": [ + "foo" + ] + }, + "local_files": { + "items": [ + { + "uri_file": "file://FILENAME_PLACEHOLDER", + "format": "FILE_FORMAT_PARQUET" + } + ] + } + } + }} + ] + })"; + + std::string filename_placeholder = "FILENAME_PLACEHOLDER"; + substrait_json.replace(substrait_json.find(filename_placeholder), + filename_placeholder.size(), file_path); + auto in_schema = schema({field("foo", binary())}); + + ASSERT_OK_AND_ASSIGN(auto reader, engine::SubstraitExecutor::GetRecordBatchReader( + substrait_json, in_schema)); + ASSERT_OK_AND_ASSIGN(auto table, Table::FromRecordBatchReader(reader.get())); + EXPECT_GT(table->num_rows(), 0); } } // namespace engine diff --git a/cpp/src/arrow/engine/substrait/util.cc b/cpp/src/arrow/engine/substrait/util.cc index 7309aafe6d6..05f89de7df6 100644 --- a/cpp/src/arrow/engine/substrait/util.cc +++ b/cpp/src/arrow/engine/substrait/util.cc @@ -21,7 +21,6 @@ namespace arrow 
{ namespace engine { - Status SubstraitSinkConsumer::Consume(cp::ExecBatch batch) { // Consume a batch of data bool did_push = producer_.Push(batch); @@ -39,8 +38,9 @@ Future<> SubstraitSinkConsumer::Finish() { } Status SubstraitExecutor::MakePlan() { - ARROW_ASSIGN_OR_RAISE(auto serialized_plan, engine::internal::SubstraitFromJSON("Plan", substrait_json_)); - + ARROW_ASSIGN_OR_RAISE(auto serialized_plan, + engine::internal::SubstraitFromJSON("Plan", substrait_json_)); + auto maybe_plan_json = engine::internal::SubstraitToJSON("Plan", *serialized_plan); RETURN_NOT_OK(maybe_plan_json.status()); @@ -52,8 +52,9 @@ Status SubstraitExecutor::MakePlan() { }; // Deserialize each relation tree in the substrait plan to an Arrow compute Declaration - ARROW_ASSIGN_OR_RAISE(declerations_, engine::DeserializePlan(*serialized_plan, consumer_factory)); - + ARROW_ASSIGN_OR_RAISE(declerations_, + engine::DeserializePlan(*serialized_plan, consumer_factory)); + // It's safe to drop the serialized plan; we don't leave references to its memory serialized_plan.reset(); @@ -70,16 +71,36 @@ Result> SubstraitExecutor::Execute() { ARROW_RETURN_NOT_OK(plan_->StartProducing()); - std::shared_ptr sink_reader = - cp::MakeGeneratorReader(schema_, std::move(*generator_), exec_context_.memory_pool()); + std::shared_ptr sink_reader = cp::MakeGeneratorReader( + schema_, std::move(*generator_), exec_context_.memory_pool()); return sink_reader; } Status SubstraitExecutor::Finalize() { - ARROW_RETURN_NOT_OK(plan_->finished().status()); - return Status::OK(); + ARROW_RETURN_NOT_OK(plan_->finished().status()); + return Status::OK(); +} + +Result> SubstraitExecutor::GetRecordBatchReader( + std::string& substrait_json, std::shared_ptr schema) { + cp::ExecContext exec_context(arrow::default_memory_pool(), + ::arrow::internal::GetCpuThreadPool()); + + arrow::AsyncGenerator> sink_gen; + + ARROW_ASSIGN_OR_RAISE(auto plan, cp::ExecPlan::Make()); + + arrow::engine::SubstraitExecutor executor(substrait_json, &sink_gen, plan, schema, + exec_context); + RETURN_NOT_OK(executor.MakePlan()); + + ARROW_ASSIGN_OR_RAISE(auto sink_reader, executor.Execute()); + + RETURN_NOT_OK(executor.Finalize()); + + return sink_reader; } } // namespace engine -} // namespace arrow \ No newline at end of file +} // namespace arrow diff --git a/cpp/src/arrow/engine/substrait/util.h b/cpp/src/arrow/engine/substrait/util.h index 46c00ee1ced..8915a3d0d0e 100644 --- a/cpp/src/arrow/engine/substrait/util.h +++ b/cpp/src/arrow/engine/substrait/util.h @@ -61,8 +61,11 @@ class SubstraitExecutor { AsyncGenerator>* generator, std::shared_ptr plan, std::shared_ptr schema, cp::ExecContext exec_context) - : substrait_json_(substrait_json), generator_(generator), - plan_(std::move(plan)), schema_(schema), exec_context_(exec_context) {} + : substrait_json_(substrait_json), + generator_(generator), + plan_(std::move(plan)), + schema_(schema), + exec_context_(exec_context) {} Status MakePlan(); @@ -70,6 +73,9 @@ class SubstraitExecutor { Status Finalize(); + static Result> GetRecordBatchReader( + std::string& substrait_json, std::shared_ptr schema); + private: std::string substrait_json_; AsyncGenerator>* generator_; @@ -81,4 +87,4 @@ class SubstraitExecutor { } // namespace engine -} // namespace arrow \ No newline at end of file +} // namespace arrow diff --git a/python/examples/substrait/query_execution_example.py b/python/examples/substrait/query_execution_example.py new file mode 100644 index 00000000000..832345d83f9 --- /dev/null +++ 
b/python/examples/substrait/query_execution_example.py @@ -0,0 +1,33 @@ +import pyarrow as pa +import pyarrow.compute as pc + +query = """ +({ + "relations": [ + {"rel": { + "read": { + "base_schema": { + "struct": { + "types": [ + {"i64": {}}, + {"bool": {}} + ] + }, + "names": [ + "i", + "b" + ] + }, + "local_files": { + "items": [ + { + "uri_file": "file://FILENAME_PLACEHOLDER", + "format": "FILE_FORMAT_PARQUET" + } + ] + } + } + }} + ] + }) +""" \ No newline at end of file From 005823ba7728229b339c8f9780dfd86ddaec4511 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Thu, 24 Mar 2022 14:14:48 +0530 Subject: [PATCH 16/74] rebase --- python/CMakeLists.txt | 24 +++++++++++++++++++ .../substrait/query_execution_example.py | 10 +++++++- python/setup.py | 8 +++++++ 3 files changed, 41 insertions(+), 1 deletion(-) diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index d17c76e2888..4f103631438 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -69,6 +69,7 @@ endif() if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}") option(PYARROW_BUILD_CUDA "Build the PyArrow CUDA support" OFF) option(PYARROW_BUILD_FLIGHT "Build the PyArrow Flight integration" OFF) + option(PYARROW_BUILD_ENGINE "Build the PyArrow Substrait integration" ON) option(PYARROW_BUILD_DATASET "Build the PyArrow Dataset integration" OFF) option(PYARROW_BUILD_GANDIVA "Build the PyArrow Gandiva integration" OFF) option(PYARROW_BUILD_PARQUET "Build the PyArrow Parquet integration" OFF) @@ -227,6 +228,10 @@ if(PYARROW_BUILD_FLIGHT) set(ARROW_FLIGHT TRUE) endif() +if(PYARROW_BUILD_ENGINE) + set(ARROW_ENGINE TRUE) +endif() + # Arrow find_package(ArrowPython REQUIRED) include_directories(SYSTEM ${ARROW_INCLUDE_DIR}) @@ -398,6 +403,7 @@ endif() set(CYTHON_EXTENSIONS lib _compute + _engine _csv _exec_plan _feather @@ -535,6 +541,20 @@ if(PYARROW_BUILD_FLIGHT) set(CYTHON_EXTENSIONS ${CYTHON_EXTENSIONS} _flight) endif() +# Engine + +if(PYARROW_BUILD_ENGINE) +find_package(ArrowEngine REQUIRED) + if(PYARROW_BUNDLE_ARROW_CPP) + message("ARROW_ENGINE_SHARED_LIB") + message(""${ARROW_ENGINE_SHARED_LIB}) + bundle_arrow_lib(ARROW_ENGINE_SHARED_LIB SO_VERSION ${ARROW_SO_VERSION}) + endif() + + set(ENGINE_LINK_LIBS arrow_engine_shared) + set(CYTHON_EXTENSIONS ${CYTHON_EXTENSIONS} _engine) +endif() + # Gandiva if(PYARROW_BUILD_GANDIVA) find_package(Gandiva REQUIRED) @@ -625,6 +645,10 @@ if(PYARROW_BUILD_FLIGHT) target_link_libraries(_flight PRIVATE ${FLIGHT_LINK_LIBS}) endif() +if(PYARROW_BUILD_ENGINE) + target_link_libraries(_engine PRIVATE ${ENGINE_LINK_LIBS}) +endif() + if(PYARROW_BUILD_DATASET) target_link_libraries(_dataset PRIVATE ${DATASET_LINK_LIBS}) target_link_libraries(_exec_plan PRIVATE ${DATASET_LINK_LIBS}) diff --git a/python/examples/substrait/query_execution_example.py b/python/examples/substrait/query_execution_example.py index 832345d83f9..a8725bf3011 100644 --- a/python/examples/substrait/query_execution_example.py +++ b/python/examples/substrait/query_execution_example.py @@ -1,6 +1,8 @@ import pyarrow as pa import pyarrow.compute as pc +from pyarrow.engine import run_query + query = """ ({ "relations": [ @@ -30,4 +32,10 @@ }} ] }) -""" \ No newline at end of file +""" + +query = query.replace("FILENAME_PLACEHOLDER", "/Users/vibhatha/sandbox/parquet/example.parquet") + +schema = pa.schema({"i": pa.int64(), "b": pa.bool_()}) + + diff --git a/python/setup.py b/python/setup.py index 1189357b234..f0abbb952d6 100755 --- a/python/setup.py +++ b/python/setup.py @@ -160,6 +160,8 @@ def 
initialize_options(self): os.environ.get('PYARROW_WITH_HDFS', '0')) self.with_cuda = strtobool( os.environ.get('PYARROW_WITH_CUDA', '0')) + self.with_engine = strtobool( + os.environ.get('PYARROW_WITH_ENGINE', '0')) self.with_flight = strtobool( os.environ.get('PYARROW_WITH_FLIGHT', '0')) self.with_dataset = strtobool( @@ -203,6 +205,7 @@ def initialize_options(self): '_json', '_compute', '_cuda', + '_engine', '_flight', '_dataset', '_dataset_orc', @@ -268,6 +271,7 @@ def append_cmake_bool(value, varname): cmake_options += ['-G', self.cmake_generator] append_cmake_bool(self.with_cuda, 'PYARROW_BUILD_CUDA') + append_cmake_bool(self.with_engine, 'PYARROW_BUILD_ENGINE') append_cmake_bool(self.with_flight, 'PYARROW_BUILD_FLIGHT') append_cmake_bool(self.with_gandiva, 'PYARROW_BUILD_GANDIVA') append_cmake_bool(self.with_dataset, 'PYARROW_BUILD_DATASET') @@ -393,6 +397,8 @@ def _bundle_arrow_cpp(self, build_prefix, build_lib): move_shared_libs(build_prefix, build_lib, "arrow_python") if self.with_cuda: move_shared_libs(build_prefix, build_lib, "arrow_cuda") + if self.with_engine: + move_shared_libs(build_prefix, build_lib, "arrow_engine") if self.with_flight: move_shared_libs(build_prefix, build_lib, "arrow_flight") move_shared_libs(build_prefix, build_lib, @@ -438,6 +444,8 @@ def _failure_permitted(self, name): return True if name == '_flight' and not self.with_flight: return True + if name == '_engine' and not self.with_engine: + return True if name == '_s3fs' and not self.with_s3: return True if name == '_hdfs' and not self.with_hdfs: From 8f4d91366dd8af84791693bb69057999ffc05abc Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Thu, 24 Mar 2022 14:16:00 +0530 Subject: [PATCH 17/74] adding engine --- python/pyarrow/_engine.pyx | 48 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) create mode 100644 python/pyarrow/_engine.pyx diff --git a/python/pyarrow/_engine.pyx b/python/pyarrow/_engine.pyx new file mode 100644 index 00000000000..4983b63adc9 --- /dev/null +++ b/python/pyarrow/_engine.pyx @@ -0,0 +1,48 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
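A build that actually includes this module is produced by turning on the flag wired up above, e.g. running setup.py with PYARROW_WITH_ENGINE=1 in the environment (sketch only; the exact build invocation depends on the local setup). Whether the resulting pyarrow carries the extension can then be checked from Python:

import importlib.util

# True only if pyarrow was built with the engine extension enabled
# (PYARROW_WITH_ENGINE=1, which forwards PYARROW_BUILD_ENGINE to CMake).
print(importlib.util.find_spec("pyarrow._engine") is not None)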
+ +# cython: language_level = 3 + +import sys + +from cython.operator cimport dereference as deref + +from pyarrow.lib cimport * +from pyarrow.includes.libarrow cimport * +import pyarrow.lib as lib + +import numpy as np + +def run_query(plan, output_schema): + + cdef: + CResult[shared_ptr[CRecordBatchReader]] c_res_reader + shared_ptr[CRecordBatchReader] c_reader + shared_ptr[CSchema] c_schema + c_string c_plan + RecordBatchReader reader + + c_plan = plan.encode() + + c_schema = pyarrow_unwrap_schema(output_schema) + + c_res_reader = GetRecordBatchReader(c_plan, c_schema) + c_reader = GetResultValue(c_res_reader) + + reader = RecordBatchReader.__new__(RecordBatchReader) + reader.reader = c_reader + return reader \ No newline at end of file From 07cb5d2111f125a383f1f8cfa25f75c960b8f367 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Thu, 24 Mar 2022 14:52:33 +0530 Subject: [PATCH 18/74] functional init version of python substrait --- python/CMakeLists.txt | 1 - .../substrait/query_execution_example.py | 8 ++++++-- python/pyarrow/engine.py | 20 +++++++++++++++++++ python/setup.py | 1 + 4 files changed, 27 insertions(+), 3 deletions(-) create mode 100644 python/pyarrow/engine.py diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 4f103631438..affe8470d6c 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -403,7 +403,6 @@ endif() set(CYTHON_EXTENSIONS lib _compute - _engine _csv _exec_plan _feather diff --git a/python/examples/substrait/query_execution_example.py b/python/examples/substrait/query_execution_example.py index a8725bf3011..71c9647e97d 100644 --- a/python/examples/substrait/query_execution_example.py +++ b/python/examples/substrait/query_execution_example.py @@ -4,7 +4,7 @@ from pyarrow.engine import run_query query = """ -({ +{ "relations": [ {"rel": { "read": { @@ -31,7 +31,7 @@ } }} ] - }) + } """ query = query.replace("FILENAME_PLACEHOLDER", "/Users/vibhatha/sandbox/parquet/example.parquet") @@ -39,3 +39,7 @@ schema = pa.schema({"i": pa.int64(), "b": pa.bool_()}) +reader = run_query(query, schema) + +print(reader.read_all()) + diff --git a/python/pyarrow/engine.py b/python/pyarrow/engine.py new file mode 100644 index 00000000000..88871da0243 --- /dev/null +++ b/python/pyarrow/engine.py @@ -0,0 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +from pyarrow._engine import ( # noqa +run_query +) \ No newline at end of file diff --git a/python/setup.py b/python/setup.py index f0abbb952d6..a60cba7fe24 100755 --- a/python/setup.py +++ b/python/setup.py @@ -108,6 +108,7 @@ def run(self): 'namespace of boost (default: boost)'), ('with-cuda', None, 'build the Cuda extension'), ('with-flight', None, 'build the Flight extension'), + ('with-engine', None, 'build the Engine extension'), ('with-dataset', None, 'build the Dataset extension'), ('with-parquet', None, 'build the Parquet extension'), ('with-parquet-encryption', None, From c359a1d91743d8307d2da41007f1456c476d383f Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Thu, 24 Mar 2022 15:24:12 +0530 Subject: [PATCH 19/74] rebase --- .../substrait/query_execution_example.py | 48 ++++++++++++++++--- python/pyarrow/_engine.pyx | 5 +- python/pyarrow/engine.py | 4 +- 3 files changed, 47 insertions(+), 10 deletions(-) diff --git a/python/examples/substrait/query_execution_example.py b/python/examples/substrait/query_execution_example.py index 71c9647e97d..133f1342e7e 100644 --- a/python/examples/substrait/query_execution_example.py +++ b/python/examples/substrait/query_execution_example.py @@ -1,8 +1,45 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
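The example below reads a small Parquet file passed via --filename; a throwaway file matching the data sample shown in its docstring can be written with a few lines of pyarrow (illustration only; the output path is an assumption, any writable location works):

import pyarrow as pa
import pyarrow.parquet as pq

# Three rows matching the "Data Sample" table documented in the example below.
table = pa.table({"i": pa.array([10, 20, 30], type=pa.int64()),
                  "b": pa.array([False, True, False])})
pq.write_table(table, "example.parquet")  # pass this path to --filename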
+ +import argparse import pyarrow as pa -import pyarrow.compute as pc from pyarrow.engine import run_query +""" +Data Sample +----------- + + i b +0 10 False +1 20 True +2 30 False + +Run Example +----------- + +python3 query_execution_example.py --filename +""" + +parser = argparse.ArgumentParser() +parser.add_argument("--filename", help="display a square of a given number", + type=str) +args = parser.parse_args() + query = """ { "relations": [ @@ -10,9 +47,9 @@ "read": { "base_schema": { "struct": { - "types": [ - {"i64": {}}, - {"bool": {}} + "types": [ + {"i64": {}}, + {"bool": {}} ] }, "names": [ @@ -34,7 +71,7 @@ } """ -query = query.replace("FILENAME_PLACEHOLDER", "/Users/vibhatha/sandbox/parquet/example.parquet") +query = query.replace("FILENAME_PLACEHOLDER", args.filename) schema = pa.schema({"i": pa.int64(), "b": pa.bool_()}) @@ -42,4 +79,3 @@ reader = run_query(query, schema) print(reader.read_all()) - diff --git a/python/pyarrow/_engine.pyx b/python/pyarrow/_engine.pyx index 4983b63adc9..dc077495f07 100644 --- a/python/pyarrow/_engine.pyx +++ b/python/pyarrow/_engine.pyx @@ -27,6 +27,7 @@ import pyarrow.lib as lib import numpy as np + def run_query(plan, output_schema): cdef: @@ -39,10 +40,10 @@ def run_query(plan, output_schema): c_plan = plan.encode() c_schema = pyarrow_unwrap_schema(output_schema) - + c_res_reader = GetRecordBatchReader(c_plan, c_schema) c_reader = GetResultValue(c_res_reader) reader = RecordBatchReader.__new__(RecordBatchReader) reader.reader = c_reader - return reader \ No newline at end of file + return reader diff --git a/python/pyarrow/engine.py b/python/pyarrow/engine.py index 88871da0243..dcc952f43f1 100644 --- a/python/pyarrow/engine.py +++ b/python/pyarrow/engine.py @@ -16,5 +16,5 @@ # under the License. from pyarrow._engine import ( # noqa -run_query -) \ No newline at end of file + run_query +) From 0be4d6f8a5965271a33f90e0f634e6a337340242 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Thu, 24 Mar 2022 15:39:28 +0530 Subject: [PATCH 20/74] minor modification to cmakelist --- python/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index affe8470d6c..ca2493e48de 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -69,7 +69,7 @@ endif() if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}") option(PYARROW_BUILD_CUDA "Build the PyArrow CUDA support" OFF) option(PYARROW_BUILD_FLIGHT "Build the PyArrow Flight integration" OFF) - option(PYARROW_BUILD_ENGINE "Build the PyArrow Substrait integration" ON) + option(PYARROW_BUILD_ENGINE "Build the PyArrow Substrait integration" OFF) option(PYARROW_BUILD_DATASET "Build the PyArrow Dataset integration" OFF) option(PYARROW_BUILD_GANDIVA "Build the PyArrow Gandiva integration" OFF) option(PYARROW_BUILD_PARQUET "Build the PyArrow Parquet integration" OFF) From ef3f5d061276058e1e6e7c914a0d7990be97513c Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Thu, 24 Mar 2022 16:39:41 +0530 Subject: [PATCH 21/74] adding class import --- cpp/src/arrow/engine/substrait/util.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/engine/substrait/util.h b/cpp/src/arrow/engine/substrait/util.h index 8915a3d0d0e..6d9d44905a9 100644 --- a/cpp/src/arrow/engine/substrait/util.h +++ b/cpp/src/arrow/engine/substrait/util.h @@ -29,7 +29,7 @@ namespace cp = arrow::compute; namespace engine { -class SubstraitSinkConsumer : public cp::SinkNodeConsumer { +class ARROW_ENGINE_EXPORT 
SubstraitSinkConsumer : public cp::SinkNodeConsumer { public: explicit SubstraitSinkConsumer( AsyncGenerator>* generator, @@ -54,7 +54,7 @@ class SubstraitSinkConsumer : public cp::SinkNodeConsumer { PushGenerator>::Producer producer_; }; -class SubstraitExecutor { +class ARROW_ENGINE_EXPORT SubstraitExecutor { public: explicit SubstraitExecutor( std::string substrait_json, From a2ae427db5ed6d95e2ae89d627df079ee67d94b7 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Fri, 25 Mar 2022 13:10:26 +0530 Subject: [PATCH 22/74] addressing python review comments --- python/CMakeLists.txt | 5 +- .../substrait/query_execution_example.py | 6 +- python/pyarrow/_engine.pyx | 15 +-- python/pyarrow/engine.py | 2 +- python/pyarrow/tests/test_substrait.py | 91 +++++++++++++++++++ 5 files changed, 105 insertions(+), 14 deletions(-) create mode 100644 python/pyarrow/tests/test_substrait.py diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index ca2493e48de..7a1b23fb28b 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -541,12 +541,9 @@ if(PYARROW_BUILD_FLIGHT) endif() # Engine - if(PYARROW_BUILD_ENGINE) -find_package(ArrowEngine REQUIRED) + find_package(ArrowEngine REQUIRED) if(PYARROW_BUNDLE_ARROW_CPP) - message("ARROW_ENGINE_SHARED_LIB") - message(""${ARROW_ENGINE_SHARED_LIB}) bundle_arrow_lib(ARROW_ENGINE_SHARED_LIB SO_VERSION ${ARROW_SO_VERSION}) endif() diff --git a/python/examples/substrait/query_execution_example.py b/python/examples/substrait/query_execution_example.py index 133f1342e7e..7970ec11a50 100644 --- a/python/examples/substrait/query_execution_example.py +++ b/python/examples/substrait/query_execution_example.py @@ -17,6 +17,7 @@ import argparse import pyarrow as pa +from pyarrow.lib import tobytes from pyarrow.engine import run_query @@ -37,7 +38,7 @@ parser = argparse.ArgumentParser() parser.add_argument("--filename", help="display a square of a given number", - type=str) + type=str, required=True) args = parser.parse_args() query = """ @@ -71,11 +72,10 @@ } """ -query = query.replace("FILENAME_PLACEHOLDER", args.filename) +query = tobytes(query.replace("FILENAME_PLACEHOLDER", args.filename)) schema = pa.schema({"i": pa.int64(), "b": pa.bool_()}) - reader = run_query(query, schema) print(reader.read_all()) diff --git a/python/pyarrow/_engine.pyx b/python/pyarrow/_engine.pyx index dc077495f07..866741e957f 100644 --- a/python/pyarrow/_engine.pyx +++ b/python/pyarrow/_engine.pyx @@ -19,16 +19,19 @@ import sys -from cython.operator cimport dereference as deref - from pyarrow.lib cimport * from pyarrow.includes.libarrow cimport * -import pyarrow.lib as lib - -import numpy as np def run_query(plan, output_schema): + """ + executes a substrait plan and returns a RecordBatchReader + + Paramters + --------- + plan : bytes + output_schema: expected output schema + """ cdef: CResult[shared_ptr[CRecordBatchReader]] c_res_reader @@ -37,7 +40,7 @@ def run_query(plan, output_schema): c_string c_plan RecordBatchReader reader - c_plan = plan.encode() + c_plan = plan c_schema = pyarrow_unwrap_schema(output_schema) diff --git a/python/pyarrow/engine.py b/python/pyarrow/engine.py index dcc952f43f1..e975f4b616f 100644 --- a/python/pyarrow/engine.py +++ b/python/pyarrow/engine.py @@ -16,5 +16,5 @@ # under the License. 
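With this revision run_query expects the serialized plan as bytes rather than text, so callers convert the JSON plan first, as the updated example above and the new test below do with pyarrow.lib.tobytes. A minimal sketch of that conversion (the plan text here is a placeholder, not a complete Substrait plan):

from pyarrow.lib import tobytes

query_json = '{"relations": []}'  # placeholder; a real plan looks like the JSON above
plan_bytes = tobytes(query_json)  # run_query(plan_bytes, output_schema) now takes bytes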
from pyarrow._engine import ( # noqa - run_query + run_query, ) diff --git a/python/pyarrow/tests/test_substrait.py b/python/pyarrow/tests/test_substrait.py new file mode 100644 index 00000000000..a47d7428882 --- /dev/null +++ b/python/pyarrow/tests/test_substrait.py @@ -0,0 +1,91 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import os +import pathlib +import pyarrow as pa +from pyarrow.lib import tobytes +import pyarrow.parquet as pq + +try: + from pyarrow import engine + from pyarrow.engine import ( + run_query, + ) +except ImportError: + engine = None + + +def test_import(): + # So we see the ImportError somewhere + import pyarrow.engine # noqa + + +def resource_root(): + """Get the path to the test resources directory.""" + if not os.environ.get("PARQUET_TEST_DATA"): + raise RuntimeError("Test resources not found; set " + "PARQUET_TEST_DATA to " + "/cpp/submodules/parquet-testing/data") + return pathlib.Path(os.environ["PARQUET_TEST_DATA"]) + + +def test_run_query(): + filename = str(resource_root() / "binary.parquet") + + query = """ + { + "relations": [ + {"rel": { + "read": { + "base_schema": { + "struct": { + "types": [ + {"binary": {}} + ] + }, + "names": [ + "foo" + ] + }, + "local_files": { + "items": [ + { + "uri_file": "file://FILENAME_PLACEHOLDER", + "format": "FILE_FORMAT_PARQUET" + } + ] + } + } + }} + ] + } + """ + + query = tobytes(query.replace("FILENAME_PLACEHOLDER", filename)) + + schema = pa.schema({"foo": pa.binary()}) + + reader = run_query(query, schema) + + res = reader.read_all() + + assert res.schema == schema + assert res.num_rows > 0 + + assert pq.read_table(filename).equals(res) + \ No newline at end of file From d15b7b456ebe9c1a5f861925314a0051ffd668d7 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Fri, 25 Mar 2022 15:11:47 +0530 Subject: [PATCH 23/74] rebase --- .../arrow/engine_substrait_example.cc | 58 ++++++++----------- cpp/src/arrow/engine/substrait/serde_test.cc | 12 ++-- cpp/src/arrow/engine/substrait/util.cc | 41 ++++++------- cpp/src/arrow/engine/substrait/util.h | 27 ++++----- 4 files changed, 62 insertions(+), 76 deletions(-) diff --git a/cpp/examples/arrow/engine_substrait_example.cc b/cpp/examples/arrow/engine_substrait_example.cc index 926de469b47..2892f810c6e 100644 --- a/cpp/examples/arrow/engine_substrait_example.cc +++ b/cpp/examples/arrow/engine_substrait_example.cc @@ -19,8 +19,8 @@ #include #include #include -#include "arrow/util/async_generator.h" -#include "arrow/util/iterator.h" +#include +#include #include #include @@ -75,54 +75,44 @@ std::string GetSubstraitPlanFromServer(const std::string& filename) { return substrait_json; } -int main(int argc, char** argv) { - if (argc < 2) { - std::cout << "Please specify a parquet file to scan" << std::endl; - // Fake pass for CI - return 
EXIT_SUCCESS; - } - auto substrait_json = GetSubstraitPlanFromServer(argv[1]); - +arrow::Status Main(std::string substrait_json) { auto schema = arrow::schema( {arrow::field("i", arrow::int64()), arrow::field("b", arrow::boolean())}); - cp::ExecContext exec_context(arrow::default_memory_pool(), - ::arrow::internal::GetCpuThreadPool()); + cp::ExecContext exec_context; arrow::AsyncGenerator> sink_gen; - auto maybe_plan = cp::ExecPlan::Make(); - if (!maybe_plan.status().ok()) { - return EXIT_FAILURE; - } - auto plan = maybe_plan.ValueOrDie(); + ARROW_ASSIGN_OR_RAISE(auto plan, cp::ExecPlan::Make()); + arrow::engine::SubstraitExecutor executor(substrait_json, &sink_gen, plan, schema, exec_context); - auto status = executor.MakePlan(); - if (!status.ok()) { - return EXIT_FAILURE; - } - auto maybe_reader = executor.Execute(); - if (!maybe_reader.status().ok()) { - return EXIT_FAILURE; - } + ARROW_ASSIGN_OR_RAISE(auto sink_reader, executor.Execute()); - auto sink_reader = maybe_reader.ValueOrDie(); + ARROW_ASSIGN_OR_RAISE(auto table, + arrow::Table::FromRecordBatchReader(sink_reader.get())); - std::shared_ptr response_table; + std::cout << "Results : " << table->ToString() << std::endl; - auto maybe_table = arrow::Table::FromRecordBatchReader(sink_reader.get()); - ABORT_ON_FAILURE(maybe_table.status()); - response_table = maybe_table.ValueOrDie(); + RETURN_NOT_OK(executor.Close()); - std::cout << "Results : " << response_table->ToString() << std::endl; + return arrow::Status::OK(); +} - auto finish = executor.Finalize(); +int main(int argc, char** argv) { + if (argc < 2) { + std::cout << "Please specify a parquet file to scan" << std::endl; + // Fake pass for CI + return EXIT_SUCCESS; + } + auto substrait_json = GetSubstraitPlanFromServer(argv[1]); - if (!finish.ok()) { + auto status = Main(substrait_json); + + if (!status.ok()) { + std::cout << "Error ocurred: " << status.message() << std::endl; return EXIT_FAILURE; } - return EXIT_SUCCESS; } diff --git a/cpp/src/arrow/engine/substrait/serde_test.cc b/cpp/src/arrow/engine/substrait/serde_test.cc index 20204b986ef..09d5321a192 100644 --- a/cpp/src/arrow/engine/substrait/serde_test.cc +++ b/cpp/src/arrow/engine/substrait/serde_test.cc @@ -730,6 +730,7 @@ TEST(Substrait, ExtensionSetFromPlan) { EXPECT_EQ(decoded_add_func.name, "add"); } +<<<<<<< HEAD TEST(Substrait, ExtensionSetFromPlanMissingFunc) { ASSERT_OK_AND_ASSIGN(auto buf, internal::SubstraitFromJSON("Plan", R"({ "relations": [], @@ -757,6 +758,9 @@ TEST(Substrait, ExtensionSetFromPlanMissingFunc) { } TEST(Substrait, GetRecordBatchIterator) { +======= +TEST(Substrait, GetRecordBatchReader) { +>>>>>>> 7ed254e6a (addressing review comments) const auto parquet_root = std::getenv("PARQUET_TEST_DATA"); std::string dir_string(parquet_root); std::stringstream ss; @@ -800,16 +804,14 @@ TEST(Substrait, GetRecordBatchIterator) { ASSERT_OK_AND_ASSIGN(auto plan, cp::ExecPlan::Make()); engine::SubstraitExecutor executor(substrait_json, &sink_gen, plan, in_schema, exec_context); - auto status = executor.MakePlan(); - ASSERT_OK(status); ASSERT_OK_AND_ASSIGN(auto reader, executor.Execute()); - auto finish = executor.Finalize(); - ASSERT_OK(finish); + ASSERT_OK(executor.Close()); + ASSERT_OK_AND_ASSIGN(auto table, Table::FromRecordBatchReader(reader.get())); EXPECT_GT(table->num_rows(), 0); } -TEST(Substrait, GetRecordBatchIteratorUtil) { +TEST(Substrait, GetRecordBatchReaderUtil) { const auto parquet_root = std::getenv("PARQUET_TEST_DATA"); std::string dir_string(parquet_root); std::stringstream ss; diff 
--git a/cpp/src/arrow/engine/substrait/util.cc b/cpp/src/arrow/engine/substrait/util.cc index 05f89de7df6..6b541ff6448 100644 --- a/cpp/src/arrow/engine/substrait/util.cc +++ b/cpp/src/arrow/engine/substrait/util.cc @@ -40,43 +40,41 @@ Future<> SubstraitSinkConsumer::Finish() { Status SubstraitExecutor::MakePlan() { ARROW_ASSIGN_OR_RAISE(auto serialized_plan, engine::internal::SubstraitFromJSON("Plan", substrait_json_)); - - auto maybe_plan_json = engine::internal::SubstraitToJSON("Plan", *serialized_plan); - RETURN_NOT_OK(maybe_plan_json.status()); - + RETURN_NOT_OK(engine::internal::SubstraitToJSON("Plan", *serialized_plan)); std::vector> consumers; std::function()> consumer_factory = [&] { - // All batches produced by the plan will be fed into IgnoringConsumers: consumers.emplace_back(new SubstraitSinkConsumer{generator_}); return consumers.back(); }; - - // Deserialize each relation tree in the substrait plan to an Arrow compute Declaration - ARROW_ASSIGN_OR_RAISE(declerations_, + ARROW_ASSIGN_OR_RAISE(declarations_, engine::DeserializePlan(*serialized_plan, consumer_factory)); - - // It's safe to drop the serialized plan; we don't leave references to its memory - serialized_plan.reset(); - - // Construct an empty plan (note: configure Function registry and ThreadPool here) return Status::OK(); } +arrow::PushGenerator>::Producer +SubstraitSinkConsumer::MakeProducer( + AsyncGenerator>* out_gen, + arrow::util::BackpressureOptions backpressure) { + arrow::PushGenerator> push_gen( + std::move(backpressure)); + auto out = push_gen.producer(); + *out_gen = std::move(push_gen); + return out; +} + Result> SubstraitExecutor::Execute() { - for (const cp::Declaration& decl : declerations_) { + RETURN_NOT_OK(SubstraitExecutor::MakePlan()); + for (const cp::Declaration& decl : declarations_) { RETURN_NOT_OK(decl.AddToPlan(plan_.get()).status()); } - ARROW_RETURN_NOT_OK(plan_->Validate()); - ARROW_RETURN_NOT_OK(plan_->StartProducing()); - std::shared_ptr sink_reader = cp::MakeGeneratorReader( schema_, std::move(*generator_), exec_context_.memory_pool()); return sink_reader; } -Status SubstraitExecutor::Finalize() { +Status SubstraitExecutor::Close() { ARROW_RETURN_NOT_OK(plan_->finished().status()); return Status::OK(); } @@ -87,17 +85,12 @@ Result> SubstraitExecutor::GetRecordBatchRead ::arrow::internal::GetCpuThreadPool()); arrow::AsyncGenerator> sink_gen; - ARROW_ASSIGN_OR_RAISE(auto plan, cp::ExecPlan::Make()); - arrow::engine::SubstraitExecutor executor(substrait_json, &sink_gen, plan, schema, exec_context); RETURN_NOT_OK(executor.MakePlan()); - ARROW_ASSIGN_OR_RAISE(auto sink_reader, executor.Execute()); - - RETURN_NOT_OK(executor.Finalize()); - + RETURN_NOT_OK(executor.Close()); return sink_reader; } diff --git a/cpp/src/arrow/engine/substrait/util.h b/cpp/src/arrow/engine/substrait/util.h index 6d9d44905a9..edc2f14d649 100644 --- a/cpp/src/arrow/engine/substrait/util.h +++ b/cpp/src/arrow/engine/substrait/util.h @@ -17,11 +17,14 @@ #pragma once -#include "arrow/api.h" -#include "arrow/compute/api.h" +#include +#include +#include +#include "arrow/compute/type_fwd.h" #include "arrow/engine/api.h" #include "arrow/util/async_generator.h" #include "arrow/util/iterator.h" +#include "arrow/util/optional.h" namespace arrow { @@ -29,6 +32,7 @@ namespace cp = arrow::compute; namespace engine { +/// \brief A SinkNodeConsumer specialized to output ExecBatches via PushGenerator class ARROW_ENGINE_EXPORT SubstraitSinkConsumer : public cp::SinkNodeConsumer { public: explicit SubstraitSinkConsumer( @@ 
-40,13 +44,7 @@ class ARROW_ENGINE_EXPORT SubstraitSinkConsumer : public cp::SinkNodeConsumer { static arrow::PushGenerator>::Producer MakeProducer(AsyncGenerator>* out_gen, - arrow::util::BackpressureOptions backpressure) { - arrow::PushGenerator> push_gen( - std::move(backpressure)); - auto out = push_gen.producer(); - *out_gen = std::move(push_gen); - return out; - } + arrow::util::BackpressureOptions backpressure); Future<> Finish() override; @@ -54,6 +52,9 @@ class ARROW_ENGINE_EXPORT SubstraitSinkConsumer : public cp::SinkNodeConsumer { PushGenerator>::Producer producer_; }; +/// \brief An executor to run a Substrait Query +/// This interface is provided as a utility when creating language +/// bindings for consuming a Substrait plan. class ARROW_ENGINE_EXPORT SubstraitExecutor { public: explicit SubstraitExecutor( @@ -67,11 +68,9 @@ class ARROW_ENGINE_EXPORT SubstraitExecutor { schema_(schema), exec_context_(exec_context) {} - Status MakePlan(); - Result> Execute(); - Status Finalize(); + Status Close(); static Result> GetRecordBatchReader( std::string& substrait_json, std::shared_ptr schema); @@ -79,10 +78,12 @@ class ARROW_ENGINE_EXPORT SubstraitExecutor { private: std::string substrait_json_; AsyncGenerator>* generator_; - std::vector declerations_; + std::vector declarations_; std::shared_ptr plan_; std::shared_ptr schema_; cp::ExecContext exec_context_; + + Status MakePlan(); }; } // namespace engine From 2ab7231737864d7820372fef01321a5cc5673a63 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Thu, 7 Apr 2022 11:25:50 +0530 Subject: [PATCH 24/74] addressing reviews --- cpp/examples/arrow/engine_substrait_example.cc | 5 +---- cpp/src/arrow/engine/substrait/serde_test.cc | 6 ++---- cpp/src/arrow/engine/substrait/util.cc | 7 ++++--- cpp/src/arrow/engine/substrait/util.h | 6 ++---- python/CMakeLists.txt | 2 +- python/pyarrow/_engine.pyx | 7 +------ python/pyarrow/tests/test_substrait.py | 3 +-- 7 files changed, 12 insertions(+), 24 deletions(-) diff --git a/cpp/examples/arrow/engine_substrait_example.cc b/cpp/examples/arrow/engine_substrait_example.cc index 2892f810c6e..e96ad640d67 100644 --- a/cpp/examples/arrow/engine_substrait_example.cc +++ b/cpp/examples/arrow/engine_substrait_example.cc @@ -76,16 +76,13 @@ std::string GetSubstraitPlanFromServer(const std::string& filename) { } arrow::Status Main(std::string substrait_json) { - auto schema = arrow::schema( - {arrow::field("i", arrow::int64()), arrow::field("b", arrow::boolean())}); - cp::ExecContext exec_context; arrow::AsyncGenerator> sink_gen; ARROW_ASSIGN_OR_RAISE(auto plan, cp::ExecPlan::Make()); - arrow::engine::SubstraitExecutor executor(substrait_json, &sink_gen, plan, schema, + arrow::engine::SubstraitExecutor executor(substrait_json, &sink_gen, plan, exec_context); ARROW_ASSIGN_OR_RAISE(auto sink_reader, executor.Execute()); diff --git a/cpp/src/arrow/engine/substrait/serde_test.cc b/cpp/src/arrow/engine/substrait/serde_test.cc index 09d5321a192..8633b5ba53a 100644 --- a/cpp/src/arrow/engine/substrait/serde_test.cc +++ b/cpp/src/arrow/engine/substrait/serde_test.cc @@ -797,12 +797,11 @@ TEST(Substrait, GetRecordBatchReader) { std::string filename_placeholder = "FILENAME_PLACEHOLDER"; substrait_json.replace(substrait_json.find(filename_placeholder), filename_placeholder.size(), file_path); - auto in_schema = schema({field("foo", binary())}); AsyncGenerator> sink_gen; cp::ExecContext exec_context(default_memory_pool(), arrow::internal::GetCpuThreadPool()); ASSERT_OK_AND_ASSIGN(auto plan, cp::ExecPlan::Make()); - 
engine::SubstraitExecutor executor(substrait_json, &sink_gen, plan, in_schema, + engine::SubstraitExecutor executor(substrait_json, &sink_gen, plan, exec_context); ASSERT_OK_AND_ASSIGN(auto reader, executor.Execute()); ASSERT_OK(executor.Close()); @@ -848,10 +847,9 @@ TEST(Substrait, GetRecordBatchReaderUtil) { std::string filename_placeholder = "FILENAME_PLACEHOLDER"; substrait_json.replace(substrait_json.find(filename_placeholder), filename_placeholder.size(), file_path); - auto in_schema = schema({field("foo", binary())}); ASSERT_OK_AND_ASSIGN(auto reader, engine::SubstraitExecutor::GetRecordBatchReader( - substrait_json, in_schema)); + substrait_json)); ASSERT_OK_AND_ASSIGN(auto table, Table::FromRecordBatchReader(reader.get())); EXPECT_GT(table->num_rows(), 0); } diff --git a/cpp/src/arrow/engine/substrait/util.cc b/cpp/src/arrow/engine/substrait/util.cc index 6b541ff6448..6aa75756d9f 100644 --- a/cpp/src/arrow/engine/substrait/util.cc +++ b/cpp/src/arrow/engine/substrait/util.cc @@ -69,8 +69,9 @@ Result> SubstraitExecutor::Execute() { } ARROW_RETURN_NOT_OK(plan_->Validate()); ARROW_RETURN_NOT_OK(plan_->StartProducing()); + auto schema = plan_->sinks()[0]->output_schema(); std::shared_ptr sink_reader = cp::MakeGeneratorReader( - schema_, std::move(*generator_), exec_context_.memory_pool()); + schema, std::move(*generator_), exec_context_.memory_pool()); return sink_reader; } @@ -80,13 +81,13 @@ Status SubstraitExecutor::Close() { } Result> SubstraitExecutor::GetRecordBatchReader( - std::string& substrait_json, std::shared_ptr schema) { + std::string& substrait_json) { cp::ExecContext exec_context(arrow::default_memory_pool(), ::arrow::internal::GetCpuThreadPool()); arrow::AsyncGenerator> sink_gen; ARROW_ASSIGN_OR_RAISE(auto plan, cp::ExecPlan::Make()); - arrow::engine::SubstraitExecutor executor(substrait_json, &sink_gen, plan, schema, + arrow::engine::SubstraitExecutor executor(substrait_json, &sink_gen, plan, exec_context); RETURN_NOT_OK(executor.MakePlan()); ARROW_ASSIGN_OR_RAISE(auto sink_reader, executor.Execute()); diff --git a/cpp/src/arrow/engine/substrait/util.h b/cpp/src/arrow/engine/substrait/util.h index edc2f14d649..2447df74c70 100644 --- a/cpp/src/arrow/engine/substrait/util.h +++ b/cpp/src/arrow/engine/substrait/util.h @@ -60,12 +60,11 @@ class ARROW_ENGINE_EXPORT SubstraitExecutor { explicit SubstraitExecutor( std::string substrait_json, AsyncGenerator>* generator, - std::shared_ptr plan, std::shared_ptr schema, + std::shared_ptr plan, cp::ExecContext exec_context) : substrait_json_(substrait_json), generator_(generator), plan_(std::move(plan)), - schema_(schema), exec_context_(exec_context) {} Result> Execute(); @@ -73,14 +72,13 @@ class ARROW_ENGINE_EXPORT SubstraitExecutor { Status Close(); static Result> GetRecordBatchReader( - std::string& substrait_json, std::shared_ptr schema); + std::string& substrait_json); private: std::string substrait_json_; AsyncGenerator>* generator_; std::vector declarations_; std::shared_ptr plan_; - std::shared_ptr schema_; cp::ExecContext exec_context_; Status MakePlan(); diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 7a1b23fb28b..584899b71e0 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -69,7 +69,7 @@ endif() if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}") option(PYARROW_BUILD_CUDA "Build the PyArrow CUDA support" OFF) option(PYARROW_BUILD_FLIGHT "Build the PyArrow Flight integration" OFF) - option(PYARROW_BUILD_ENGINE "Build the PyArrow Substrait integration" OFF) + 
option(PYARROW_BUILD_ENGINE "Build the PyArrow Engine integration" OFF) option(PYARROW_BUILD_DATASET "Build the PyArrow Dataset integration" OFF) option(PYARROW_BUILD_GANDIVA "Build the PyArrow Gandiva integration" OFF) option(PYARROW_BUILD_PARQUET "Build the PyArrow Parquet integration" OFF) diff --git a/python/pyarrow/_engine.pyx b/python/pyarrow/_engine.pyx index 866741e957f..51d1e5bd74c 100644 --- a/python/pyarrow/_engine.pyx +++ b/python/pyarrow/_engine.pyx @@ -17,8 +17,6 @@ # cython: language_level = 3 -import sys - from pyarrow.lib cimport * from pyarrow.includes.libarrow cimport * @@ -37,14 +35,11 @@ def run_query(plan, output_schema): CResult[shared_ptr[CRecordBatchReader]] c_res_reader shared_ptr[CRecordBatchReader] c_reader shared_ptr[CSchema] c_schema - c_string c_plan RecordBatchReader reader - c_plan = plan - c_schema = pyarrow_unwrap_schema(output_schema) - c_res_reader = GetRecordBatchReader(c_plan, c_schema) + c_res_reader = GetRecordBatchReader(plan, c_schema) c_reader = GetResultValue(c_res_reader) reader = RecordBatchReader.__new__(RecordBatchReader) diff --git a/python/pyarrow/tests/test_substrait.py b/python/pyarrow/tests/test_substrait.py index a47d7428882..6d0cd2d6fd7 100644 --- a/python/pyarrow/tests/test_substrait.py +++ b/python/pyarrow/tests/test_substrait.py @@ -86,6 +86,5 @@ def test_run_query(): assert res.schema == schema assert res.num_rows > 0 - + assert pq.read_table(filename).equals(res) - \ No newline at end of file From 373e817d11c71de89342b1e40337e9a10695f5e5 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Fri, 8 Apr 2022 14:26:47 +0530 Subject: [PATCH 25/74] rebase --- .../arrow/engine_substrait_example.cc | 14 +-- cpp/src/arrow/engine/substrait/serde_test.cc | 73 +++------------- cpp/src/arrow/engine/substrait/util.cc | 85 ++++++++++++------- cpp/src/arrow/engine/substrait/util.h | 35 +------- python/pyarrow/_engine.pyx | 15 ++-- python/pyarrow/tests/conftest.py | 6 ++ python/pyarrow/tests/test_substrait.py | 19 +---- 7 files changed, 88 insertions(+), 159 deletions(-) diff --git a/cpp/examples/arrow/engine_substrait_example.cc b/cpp/examples/arrow/engine_substrait_example.cc index e96ad640d67..905952f750f 100644 --- a/cpp/examples/arrow/engine_substrait_example.cc +++ b/cpp/examples/arrow/engine_substrait_example.cc @@ -76,24 +76,14 @@ std::string GetSubstraitPlanFromServer(const std::string& filename) { } arrow::Status Main(std::string substrait_json) { - cp::ExecContext exec_context; - - arrow::AsyncGenerator> sink_gen; - - ARROW_ASSIGN_OR_RAISE(auto plan, cp::ExecPlan::Make()); - - arrow::engine::SubstraitExecutor executor(substrait_json, &sink_gen, plan, - exec_context); - - ARROW_ASSIGN_OR_RAISE(auto sink_reader, executor.Execute()); + ARROW_ASSIGN_OR_RAISE(auto sink_reader, + arrow::engine::GetRecordBatchReader(substrait_json)); ARROW_ASSIGN_OR_RAISE(auto table, arrow::Table::FromRecordBatchReader(sink_reader.get())); std::cout << "Results : " << table->ToString() << std::endl; - RETURN_NOT_OK(executor.Close()); - return arrow::Status::OK(); } diff --git a/cpp/src/arrow/engine/substrait/serde_test.cc b/cpp/src/arrow/engine/substrait/serde_test.cc index 8633b5ba53a..0fcec4041cb 100644 --- a/cpp/src/arrow/engine/substrait/serde_test.cc +++ b/cpp/src/arrow/engine/substrait/serde_test.cc @@ -29,6 +29,7 @@ #include "arrow/engine/substrait/extension_types.h" #include "arrow/testing/gtest_util.h" #include "arrow/testing/matchers.h" +#include "arrow/util/io_util.h" #include "arrow/util/key_value_metadata.h" #include 
"arrow/util/async_generator.h" @@ -730,6 +731,7 @@ TEST(Substrait, ExtensionSetFromPlan) { EXPECT_EQ(decoded_add_func.name, "add"); } +<<<<<<< HEAD <<<<<<< HEAD TEST(Substrait, ExtensionSetFromPlanMissingFunc) { ASSERT_OK_AND_ASSIGN(auto buf, internal::SubstraitFromJSON("Plan", R"({ @@ -757,16 +759,12 @@ TEST(Substrait, ExtensionSetFromPlanMissingFunc) { &ext_set)); } -TEST(Substrait, GetRecordBatchIterator) { -======= -TEST(Substrait, GetRecordBatchReader) { ->>>>>>> 7ed254e6a (addressing review comments) - const auto parquet_root = std::getenv("PARQUET_TEST_DATA"); - std::string dir_string(parquet_root); - std::stringstream ss; - ss << dir_string << "/binary.parquet"; - auto file_path = ss.str(); - +Result GetSubstraitJSON() { + ARROW_ASSIGN_OR_RAISE(std::string dir_string, + arrow::internal::GetEnvVar("PARQUET_TEST_DATA")); + auto file_name = + arrow::internal::PlatformFilename::FromString(dir_string)->Join("binary.parquet"); + auto file_path = file_name->ToString(); std::string substrait_json = R"({ "relations": [ {"rel": { @@ -797,59 +795,12 @@ TEST(Substrait, GetRecordBatchReader) { std::string filename_placeholder = "FILENAME_PLACEHOLDER"; substrait_json.replace(substrait_json.find(filename_placeholder), filename_placeholder.size(), file_path); - AsyncGenerator> sink_gen; - cp::ExecContext exec_context(default_memory_pool(), - arrow::internal::GetCpuThreadPool()); - ASSERT_OK_AND_ASSIGN(auto plan, cp::ExecPlan::Make()); - engine::SubstraitExecutor executor(substrait_json, &sink_gen, plan, - exec_context); - ASSERT_OK_AND_ASSIGN(auto reader, executor.Execute()); - ASSERT_OK(executor.Close()); - - ASSERT_OK_AND_ASSIGN(auto table, Table::FromRecordBatchReader(reader.get())); - EXPECT_GT(table->num_rows(), 0); + return substrait_json; } -TEST(Substrait, GetRecordBatchReaderUtil) { - const auto parquet_root = std::getenv("PARQUET_TEST_DATA"); - std::string dir_string(parquet_root); - std::stringstream ss; - ss << dir_string << "/binary.parquet"; - auto file_path = ss.str(); - - std::string substrait_json = R"({ - "relations": [ - {"rel": { - "read": { - "base_schema": { - "struct": { - "types": [ - {"binary": {}} - ] - }, - "names": [ - "foo" - ] - }, - "local_files": { - "items": [ - { - "uri_file": "file://FILENAME_PLACEHOLDER", - "format": "FILE_FORMAT_PARQUET" - } - ] - } - } - }} - ] - })"; - - std::string filename_placeholder = "FILENAME_PLACEHOLDER"; - substrait_json.replace(substrait_json.find(filename_placeholder), - filename_placeholder.size(), file_path); - - ASSERT_OK_AND_ASSIGN(auto reader, engine::SubstraitExecutor::GetRecordBatchReader( - substrait_json)); +TEST(Substrait, GetRecordBatchReader) { + ASSERT_OK_AND_ASSIGN(std::string substrait_json, GetSubstraitJSON()); + ASSERT_OK_AND_ASSIGN(auto reader, engine::GetRecordBatchReader(substrait_json)); ASSERT_OK_AND_ASSIGN(auto table, Table::FromRecordBatchReader(reader.get())); EXPECT_GT(table->num_rows(), 0); } diff --git a/cpp/src/arrow/engine/substrait/util.cc b/cpp/src/arrow/engine/substrait/util.cc index 6aa75756d9f..f88f6543e93 100644 --- a/cpp/src/arrow/engine/substrait/util.cc +++ b/cpp/src/arrow/engine/substrait/util.cc @@ -37,20 +37,61 @@ Future<> SubstraitSinkConsumer::Finish() { Status::ExecutionError("Error occurred in closing the batch producer")); } -Status SubstraitExecutor::MakePlan() { - ARROW_ASSIGN_OR_RAISE(auto serialized_plan, - engine::internal::SubstraitFromJSON("Plan", substrait_json_)); - RETURN_NOT_OK(engine::internal::SubstraitToJSON("Plan", *serialized_plan)); - std::vector> consumers; - 
std::function()> consumer_factory = [&] { - consumers.emplace_back(new SubstraitSinkConsumer{generator_}); - return consumers.back(); - }; - ARROW_ASSIGN_OR_RAISE(declarations_, - engine::DeserializePlan(*serialized_plan, consumer_factory)); +Status SubstraitSinkConsumer::Init(const std::shared_ptr& schema) { return Status::OK(); } +/// \brief An executor to run a Substrait Query +/// This interface is provided as a utility when creating language +/// bindings for consuming a Substrait plan. +class ARROW_ENGINE_EXPORT SubstraitExecutor { + public: + explicit SubstraitExecutor( + std::string substrait_json, + AsyncGenerator>* generator, + std::shared_ptr plan, cp::ExecContext exec_context) + : substrait_json_(substrait_json), + generator_(generator), + plan_(std::move(plan)), + exec_context_(exec_context) {} + + Result> Execute() { + RETURN_NOT_OK(SubstraitExecutor::Init()); + for (const cp::Declaration& decl : declarations_) { + RETURN_NOT_OK(decl.AddToPlan(plan_.get()).status()); + } + ARROW_RETURN_NOT_OK(plan_->Validate()); + ARROW_RETURN_NOT_OK(plan_->StartProducing()); + // schema of the output can be obtained by the output_schema + // of the input to the sink node. + auto schema = plan_->sinks()[0]->inputs()[0]->output_schema(); + std::shared_ptr sink_reader = cp::MakeGeneratorReader( + schema, std::move(*generator_), exec_context_.memory_pool()); + return sink_reader; + } + + Status Close() { return plan_->finished().status(); } + + Status Init() { + ARROW_ASSIGN_OR_RAISE(auto serialized_plan, + engine::internal::SubstraitFromJSON("Plan", substrait_json_)); + RETURN_NOT_OK(engine::internal::SubstraitToJSON("Plan", *serialized_plan)); + std::function()> consumer_factory = [&] { + return std::make_shared(generator_); + }; + ARROW_ASSIGN_OR_RAISE(declarations_, + engine::DeserializePlan(*serialized_plan, consumer_factory)); + return Status::OK(); + } + + private: + std::string substrait_json_; + AsyncGenerator>* generator_; + std::vector declarations_; + std::shared_ptr plan_; + cp::ExecContext exec_context_; +}; + arrow::PushGenerator>::Producer SubstraitSinkConsumer::MakeProducer( AsyncGenerator>* out_gen, @@ -62,25 +103,7 @@ SubstraitSinkConsumer::MakeProducer( return out; } -Result> SubstraitExecutor::Execute() { - RETURN_NOT_OK(SubstraitExecutor::MakePlan()); - for (const cp::Declaration& decl : declarations_) { - RETURN_NOT_OK(decl.AddToPlan(plan_.get()).status()); - } - ARROW_RETURN_NOT_OK(plan_->Validate()); - ARROW_RETURN_NOT_OK(plan_->StartProducing()); - auto schema = plan_->sinks()[0]->output_schema(); - std::shared_ptr sink_reader = cp::MakeGeneratorReader( - schema, std::move(*generator_), exec_context_.memory_pool()); - return sink_reader; -} - -Status SubstraitExecutor::Close() { - ARROW_RETURN_NOT_OK(plan_->finished().status()); - return Status::OK(); -} - -Result> SubstraitExecutor::GetRecordBatchReader( +Result> GetRecordBatchReader( std::string& substrait_json) { cp::ExecContext exec_context(arrow::default_memory_pool(), ::arrow::internal::GetCpuThreadPool()); @@ -89,7 +112,7 @@ Result> SubstraitExecutor::GetRecordBatchRead ARROW_ASSIGN_OR_RAISE(auto plan, cp::ExecPlan::Make()); arrow::engine::SubstraitExecutor executor(substrait_json, &sink_gen, plan, exec_context); - RETURN_NOT_OK(executor.MakePlan()); + RETURN_NOT_OK(executor.Init()); ARROW_ASSIGN_OR_RAISE(auto sink_reader, executor.Execute()); RETURN_NOT_OK(executor.Close()); return sink_reader; diff --git a/cpp/src/arrow/engine/substrait/util.h b/cpp/src/arrow/engine/substrait/util.h index 
2447df74c70..65333a74610 100644 --- a/cpp/src/arrow/engine/substrait/util.h +++ b/cpp/src/arrow/engine/substrait/util.h @@ -42,6 +42,8 @@ class ARROW_ENGINE_EXPORT SubstraitSinkConsumer : public cp::SinkNodeConsumer { Status Consume(cp::ExecBatch batch) override; + Status Init(const std::shared_ptr& schema) override; + static arrow::PushGenerator>::Producer MakeProducer(AsyncGenerator>* out_gen, arrow::util::BackpressureOptions backpressure); @@ -52,37 +54,8 @@ class ARROW_ENGINE_EXPORT SubstraitSinkConsumer : public cp::SinkNodeConsumer { PushGenerator>::Producer producer_; }; -/// \brief An executor to run a Substrait Query -/// This interface is provided as a utility when creating language -/// bindings for consuming a Substrait plan. -class ARROW_ENGINE_EXPORT SubstraitExecutor { - public: - explicit SubstraitExecutor( - std::string substrait_json, - AsyncGenerator>* generator, - std::shared_ptr plan, - cp::ExecContext exec_context) - : substrait_json_(substrait_json), - generator_(generator), - plan_(std::move(plan)), - exec_context_(exec_context) {} - - Result> Execute(); - - Status Close(); - - static Result> GetRecordBatchReader( - std::string& substrait_json); - - private: - std::string substrait_json_; - AsyncGenerator>* generator_; - std::vector declarations_; - std::shared_ptr plan_; - cp::ExecContext exec_context_; - - Status MakePlan(); -}; +Result> GetRecordBatchReader( + std::string& substrait_json); } // namespace engine diff --git a/python/pyarrow/_engine.pyx b/python/pyarrow/_engine.pyx index 51d1e5bd74c..29b9d323832 100644 --- a/python/pyarrow/_engine.pyx +++ b/python/pyarrow/_engine.pyx @@ -21,25 +21,22 @@ from pyarrow.lib cimport * from pyarrow.includes.libarrow cimport * -def run_query(plan, output_schema): +def run_query(plan): """ - executes a substrait plan and returns a RecordBatchReader + Executes a substrait plan and returns a RecordBatchReader. - Paramters - --------- + Parameters + ---------- plan : bytes - output_schema: expected output schema + Substrait Plan as a byte stream. 
""" cdef: CResult[shared_ptr[CRecordBatchReader]] c_res_reader shared_ptr[CRecordBatchReader] c_reader - shared_ptr[CSchema] c_schema RecordBatchReader reader - c_schema = pyarrow_unwrap_schema(output_schema) - - c_res_reader = GetRecordBatchReader(plan, c_schema) + c_res_reader = GetRecordBatchReader(plan) c_reader = GetResultValue(c_res_reader) reader = RecordBatchReader.__new__(RecordBatchReader) diff --git a/python/pyarrow/tests/conftest.py b/python/pyarrow/tests/conftest.py index 466b1647fdd..a26bf077e0f 100644 --- a/python/pyarrow/tests/conftest.py +++ b/python/pyarrow/tests/conftest.py @@ -181,6 +181,12 @@ except ImportError: pass +try: + import pyarrow.engine # noqa + defaults['engine'] = True +except ImportError: + pass + def pytest_addoption(parser): # Create options to selectively enable test groups diff --git a/python/pyarrow/tests/test_substrait.py b/python/pyarrow/tests/test_substrait.py index 6d0cd2d6fd7..292e8144097 100644 --- a/python/pyarrow/tests/test_substrait.py +++ b/python/pyarrow/tests/test_substrait.py @@ -17,7 +17,6 @@ import os import pathlib -import pyarrow as pa from pyarrow.lib import tobytes import pyarrow.parquet as pq @@ -30,11 +29,6 @@ engine = None -def test_import(): - # So we see the ImportError somewhere - import pyarrow.engine # noqa - - def resource_root(): """Get the path to the test resources directory.""" if not os.environ.get("PARQUET_TEST_DATA"): @@ -77,14 +71,9 @@ def test_run_query(): """ query = tobytes(query.replace("FILENAME_PLACEHOLDER", filename)) + reader = run_query(query) + res_tb = reader.read_all() - schema = pa.schema({"foo": pa.binary()}) - - reader = run_query(query, schema) - - res = reader.read_all() - - assert res.schema == schema - assert res.num_rows > 0 + expected_tb = pq.read_table(filename) - assert pq.read_table(filename).equals(res) + assert expected_tb.num_rows == res_tb.num_rows From ddf568d4edf30c51acb11cc0e0d0ea9d21d41a81 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Fri, 8 Apr 2022 14:28:20 +0530 Subject: [PATCH 26/74] removing examples --- .../arrow/engine_substrait_example.cc | 105 ------------------ .../substrait/query_execution_example.py | 81 -------------- 2 files changed, 186 deletions(-) delete mode 100644 cpp/examples/arrow/engine_substrait_example.cc delete mode 100644 python/examples/substrait/query_execution_example.py diff --git a/cpp/examples/arrow/engine_substrait_example.cc b/cpp/examples/arrow/engine_substrait_example.cc deleted file mode 100644 index 905952f750f..00000000000 --- a/cpp/examples/arrow/engine_substrait_example.cc +++ /dev/null @@ -1,105 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -namespace eng = arrow::engine; -namespace cp = arrow::compute; - -#define ABORT_ON_FAILURE(expr) \ - do { \ - arrow::Status status_ = (expr); \ - if (!status_.ok()) { \ - std::cerr << status_.message() << std::endl; \ - abort(); \ - } \ - } while (0); - -std::string GetSubstraitPlanFromServer(const std::string& filename) { - // Emulate server interaction by parsing hard coded JSON - std::string substrait_json = R"({ - "relations": [ - {"rel": { - "read": { - "base_schema": { - "struct": { - "types": [ - {"i64": {}}, - {"bool": {}} - ] - }, - "names": [ - "i", - "b" - ] - }, - "local_files": { - "items": [ - { - "uri_file": "file://FILENAME_PLACEHOLDER", - "format": "FILE_FORMAT_PARQUET" - } - ] - } - } - }} - ] - })"; - std::string filename_placeholder = "FILENAME_PLACEHOLDER"; - substrait_json.replace(substrait_json.find(filename_placeholder), - filename_placeholder.size(), filename); - return substrait_json; -} - -arrow::Status Main(std::string substrait_json) { - ARROW_ASSIGN_OR_RAISE(auto sink_reader, - arrow::engine::GetRecordBatchReader(substrait_json)); - - ARROW_ASSIGN_OR_RAISE(auto table, - arrow::Table::FromRecordBatchReader(sink_reader.get())); - - std::cout << "Results : " << table->ToString() << std::endl; - - return arrow::Status::OK(); -} - -int main(int argc, char** argv) { - if (argc < 2) { - std::cout << "Please specify a parquet file to scan" << std::endl; - // Fake pass for CI - return EXIT_SUCCESS; - } - auto substrait_json = GetSubstraitPlanFromServer(argv[1]); - - auto status = Main(substrait_json); - - if (!status.ok()) { - std::cout << "Error ocurred: " << status.message() << std::endl; - return EXIT_FAILURE; - } - return EXIT_SUCCESS; -} diff --git a/python/examples/substrait/query_execution_example.py b/python/examples/substrait/query_execution_example.py deleted file mode 100644 index 7970ec11a50..00000000000 --- a/python/examples/substrait/query_execution_example.py +++ /dev/null @@ -1,81 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -import argparse -import pyarrow as pa -from pyarrow.lib import tobytes - -from pyarrow.engine import run_query - -""" -Data Sample ------------ - - i b -0 10 False -1 20 True -2 30 False - -Run Example ------------ - -python3 query_execution_example.py --filename -""" - -parser = argparse.ArgumentParser() -parser.add_argument("--filename", help="display a square of a given number", - type=str, required=True) -args = parser.parse_args() - -query = """ -{ - "relations": [ - {"rel": { - "read": { - "base_schema": { - "struct": { - "types": [ - {"i64": {}}, - {"bool": {}} - ] - }, - "names": [ - "i", - "b" - ] - }, - "local_files": { - "items": [ - { - "uri_file": "file://FILENAME_PLACEHOLDER", - "format": "FILE_FORMAT_PARQUET" - } - ] - } - } - }} - ] - } -""" - -query = tobytes(query.replace("FILENAME_PLACEHOLDER", args.filename)) - -schema = pa.schema({"i": pa.int64(), "b": pa.bool_()}) - -reader = run_query(query, schema) - -print(reader.read_all()) From a6f5683f4422643ee7e01a46eda44e906659cabd Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Fri, 8 Apr 2022 19:18:19 +0530 Subject: [PATCH 27/74] rebase --- cpp/examples/arrow/CMakeLists.txt | 5 ++- cpp/src/arrow/engine/substrait/util.cc | 34 +++++++++++++------ cpp/src/arrow/engine/substrait/util.h | 5 +++ python/pyarrow/_engine.pyx | 43 ++++++++++++++++++++++-- python/pyarrow/engine.py | 1 + python/pyarrow/tests/test_substrait.py | 46 ++++++++++++++++++-------- 6 files changed, 105 insertions(+), 29 deletions(-) diff --git a/cpp/examples/arrow/CMakeLists.txt b/cpp/examples/arrow/CMakeLists.txt index 96461999f3a..1f14b0adc0d 100644 --- a/cpp/examples/arrow/CMakeLists.txt +++ b/cpp/examples/arrow/CMakeLists.txt @@ -21,9 +21,8 @@ if(ARROW_COMPUTE) add_arrow_example(compute_register_example) endif() -if(ARROW_SUBSTRAIT) - add_arrow_example(engine_substrait_consumption EXTRA_LINK_LIBS arrow_substrait_shared) - add_arrow_example(engine_substrait_example EXTRA_LINK_LIBS arrow_substrait_shared) +if(ARROW_ENGINE) + add_arrow_example(engine_substrait_consumption EXTRA_LINK_LIBS arrow_engine_shared) endif() if(ARROW_COMPUTE AND ARROW_CSV) diff --git a/cpp/src/arrow/engine/substrait/util.cc b/cpp/src/arrow/engine/substrait/util.cc index f88f6543e93..ce54aa29ca9 100644 --- a/cpp/src/arrow/engine/substrait/util.cc +++ b/cpp/src/arrow/engine/substrait/util.cc @@ -47,10 +47,10 @@ Status SubstraitSinkConsumer::Init(const std::shared_ptr& schema) { class ARROW_ENGINE_EXPORT SubstraitExecutor { public: explicit SubstraitExecutor( - std::string substrait_json, + std::shared_ptr substrait_buffer, AsyncGenerator>* generator, std::shared_ptr plan, cp::ExecContext exec_context) - : substrait_json_(substrait_json), + : substrait_buffer_(substrait_buffer), generator_(generator), plan_(std::move(plan)), exec_context_(exec_context) {} @@ -73,19 +73,19 @@ class ARROW_ENGINE_EXPORT SubstraitExecutor { Status Close() { return plan_->finished().status(); } Status Init() { - ARROW_ASSIGN_OR_RAISE(auto serialized_plan, - engine::internal::SubstraitFromJSON("Plan", substrait_json_)); - RETURN_NOT_OK(engine::internal::SubstraitToJSON("Plan", *serialized_plan)); + if (substrait_buffer_ == NULLPTR) { + return Status::Invalid("Buffer containing Substrait plan is null."); + } std::function()> consumer_factory = [&] { return std::make_shared(generator_); }; ARROW_ASSIGN_OR_RAISE(declarations_, - engine::DeserializePlan(*serialized_plan, consumer_factory)); + engine::DeserializePlan(*substrait_buffer_, consumer_factory)); return Status::OK(); } private: - std::string 
substrait_json_; + std::shared_ptr substrait_buffer_; AsyncGenerator>* generator_; std::vector declarations_; std::shared_ptr plan_; @@ -103,14 +103,22 @@ SubstraitSinkConsumer::MakeProducer( return out; } +/// \brief Retrieve a RecordBatchReader from a Substrait plan in JSON. Result> GetRecordBatchReader( std::string& substrait_json) { - cp::ExecContext exec_context(arrow::default_memory_pool(), - ::arrow::internal::GetCpuThreadPool()); + ARROW_ASSIGN_OR_RAISE(auto substrait_buffer, + GetSubstraitBufferFromJSON(substrait_json)); + return GetRecordBatchReader(substrait_buffer); +} +/// \brief Retrieve a RecordBatchReader from a Substrait plan in Buffer. +Result> GetRecordBatchReader( + std::shared_ptr substrait_buffer) { arrow::AsyncGenerator> sink_gen; ARROW_ASSIGN_OR_RAISE(auto plan, cp::ExecPlan::Make()); - arrow::engine::SubstraitExecutor executor(substrait_json, &sink_gen, plan, + cp::ExecContext exec_context(arrow::default_memory_pool(), + ::arrow::internal::GetCpuThreadPool()); + arrow::engine::SubstraitExecutor executor(substrait_buffer, &sink_gen, plan, exec_context); RETURN_NOT_OK(executor.Init()); ARROW_ASSIGN_OR_RAISE(auto sink_reader, executor.Execute()); @@ -118,6 +126,12 @@ Result> GetRecordBatchReader( return sink_reader; } +/// \brief Get Substrait Buffer from a Substrait JSON plan. +/// This is a helper method for Python tests. +Result> GetSubstraitBufferFromJSON(std::string& substrait_json) { + return engine::internal::SubstraitFromJSON("Plan", substrait_json); +} + } // namespace engine } // namespace arrow diff --git a/cpp/src/arrow/engine/substrait/util.h b/cpp/src/arrow/engine/substrait/util.h index 65333a74610..2e13b7e80e6 100644 --- a/cpp/src/arrow/engine/substrait/util.h +++ b/cpp/src/arrow/engine/substrait/util.h @@ -57,6 +57,11 @@ class ARROW_ENGINE_EXPORT SubstraitSinkConsumer : public cp::SinkNodeConsumer { Result> GetRecordBatchReader( std::string& substrait_json); +Result> GetRecordBatchReader( + std::shared_ptr substrait_buffer); + +Result> GetSubstraitBufferFromJSON(std::string& substrait_json); + } // namespace engine } // namespace arrow diff --git a/python/pyarrow/_engine.pyx b/python/pyarrow/_engine.pyx index 29b9d323832..d7f529fa613 100644 --- a/python/pyarrow/_engine.pyx +++ b/python/pyarrow/_engine.pyx @@ -17,6 +17,7 @@ # cython: language_level = 3 +from pyarrow import Buffer from pyarrow.lib cimport * from pyarrow.includes.libarrow cimport * @@ -27,18 +28,54 @@ def run_query(plan): Parameters ---------- - plan : bytes - Substrait Plan as a byte stream. + plan : bytes or Buffer + Substrait Plan can be fed as a encoded string in utf-8 + as a JSON string or as an Arrow Buffer. """ cdef: CResult[shared_ptr[CRecordBatchReader]] c_res_reader shared_ptr[CRecordBatchReader] c_reader RecordBatchReader reader + c_string c_str_plan + shared_ptr[CBuffer] c_buf_plan + + if isinstance(plan, bytes): + c_str_plan = plan + c_res_reader = GetRecordBatchReader(c_str_plan) + elif isinstance(plan, Buffer): + c_buf_plan = pyarrow_unwrap_buffer(plan) + c_res_reader = GetRecordBatchReader(c_buf_plan) + else: + raise ValueError("Expected bytes or pyarrow.Buffer") - c_res_reader = GetRecordBatchReader(plan) c_reader = GetResultValue(c_res_reader) reader = RecordBatchReader.__new__(RecordBatchReader) reader.reader = c_reader return reader + + +def get_buffer_from_json(plan): + """ + Returns Buffer object by converting substrait plan in + JSON. + + Parameter + --------- + plan: byte + Substrait plan as a bytes. 
+ """ + + cdef: + CResult[shared_ptr[CBuffer]] c_res_buffer + c_string c_str_plan + shared_ptr[CBuffer] c_buf_plan + + if isinstance(plan, bytes): + c_str_plan = plan + c_res_buffer = GetSubstraitBufferFromJSON(c_str_plan) + c_buf_plan = GetResultValue(c_res_buffer) + else: + raise ValueError("Expected plan in bytes.") + return pyarrow_wrap_buffer(c_buf_plan) diff --git a/python/pyarrow/engine.py b/python/pyarrow/engine.py index e975f4b616f..7c87d3ff121 100644 --- a/python/pyarrow/engine.py +++ b/python/pyarrow/engine.py @@ -17,4 +17,5 @@ from pyarrow._engine import ( # noqa run_query, + get_buffer_from_json, ) diff --git a/python/pyarrow/tests/test_substrait.py b/python/pyarrow/tests/test_substrait.py index 292e8144097..2c154ac9ed6 100644 --- a/python/pyarrow/tests/test_substrait.py +++ b/python/pyarrow/tests/test_substrait.py @@ -24,24 +24,13 @@ from pyarrow import engine from pyarrow.engine import ( run_query, + get_buffer_from_json, ) except ImportError: engine = None -def resource_root(): - """Get the path to the test resources directory.""" - if not os.environ.get("PARQUET_TEST_DATA"): - raise RuntimeError("Test resources not found; set " - "PARQUET_TEST_DATA to " - "/cpp/submodules/parquet-testing/data") - return pathlib.Path(os.environ["PARQUET_TEST_DATA"]) - - -def test_run_query(): - filename = str(resource_root() / "binary.parquet") - - query = """ +_substrait_query = """ { "relations": [ {"rel": { @@ -70,6 +59,21 @@ def test_run_query(): } """ + +def resource_root(): + """Get the path to the test resources directory.""" + if not os.environ.get("PARQUET_TEST_DATA"): + raise RuntimeError("Test resources not found; set " + "PARQUET_TEST_DATA to " + "/cpp/submodules/parquet-testing/data") + return pathlib.Path(os.environ["PARQUET_TEST_DATA"]) + + +def test_run_query(): + filename = str(resource_root() / "binary.parquet") + + query = _substrait_query + query = tobytes(query.replace("FILENAME_PLACEHOLDER", filename)) reader = run_query(query) res_tb = reader.read_all() @@ -77,3 +81,19 @@ def test_run_query(): expected_tb = pq.read_table(filename) assert expected_tb.num_rows == res_tb.num_rows + + +def test_run_query_in_bytes(): + filename = str(resource_root() / "binary.parquet") + + query = _substrait_query + query = tobytes(query.replace("FILENAME_PLACEHOLDER", filename)) + + buf = get_buffer_from_json(query) + + reader = run_query(buf) + res_tb = reader.read_all() + + expected_tb = pq.read_table(filename) + + assert expected_tb.num_rows == res_tb.num_rows From 52fa43761a7ba102eba3a98419f586cd978ad09f Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Fri, 8 Apr 2022 19:44:18 +0530 Subject: [PATCH 28/74] adding pymarktest --- python/pyarrow/tests/test_substrait.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/python/pyarrow/tests/test_substrait.py b/python/pyarrow/tests/test_substrait.py index 2c154ac9ed6..f9e8474c3fe 100644 --- a/python/pyarrow/tests/test_substrait.py +++ b/python/pyarrow/tests/test_substrait.py @@ -29,6 +29,10 @@ except ImportError: engine = None +# Marks all of the tests in this module +# Ignore these with pytest ... 
-m 'not engine' +pytestmark = pytest.mark.engine + _substrait_query = """ { From 7c47d08e417d07a4a7c0c5bb0f400e38ead26662 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Sun, 10 Apr 2022 15:23:25 +0530 Subject: [PATCH 29/74] rebase --- cpp/src/arrow/engine/substrait/serde_test.cc | 2 -- 1 file changed, 2 deletions(-) diff --git a/cpp/src/arrow/engine/substrait/serde_test.cc b/cpp/src/arrow/engine/substrait/serde_test.cc index 0fcec4041cb..2ae2a57f289 100644 --- a/cpp/src/arrow/engine/substrait/serde_test.cc +++ b/cpp/src/arrow/engine/substrait/serde_test.cc @@ -731,8 +731,6 @@ TEST(Substrait, ExtensionSetFromPlan) { EXPECT_EQ(decoded_add_func.name, "add"); } -<<<<<<< HEAD -<<<<<<< HEAD TEST(Substrait, ExtensionSetFromPlanMissingFunc) { ASSERT_OK_AND_ASSIGN(auto buf, internal::SubstraitFromJSON("Plan", R"({ "relations": [], From b80ff396c68136ca715a680e8a10ad485a8f9e51 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Sun, 10 Apr 2022 15:39:56 +0530 Subject: [PATCH 30/74] rebase --- cpp/src/arrow/engine/substrait/serde_test.cc | 5 ----- 1 file changed, 5 deletions(-) diff --git a/cpp/src/arrow/engine/substrait/serde_test.cc b/cpp/src/arrow/engine/substrait/serde_test.cc index 2ae2a57f289..a3db4b7c5f7 100644 --- a/cpp/src/arrow/engine/substrait/serde_test.cc +++ b/cpp/src/arrow/engine/substrait/serde_test.cc @@ -16,7 +16,6 @@ // under the License. #include "arrow/engine/substrait/serde.h" -#include "arrow/engine/substrait/util.h" #include #include @@ -29,11 +28,8 @@ #include "arrow/engine/substrait/extension_types.h" #include "arrow/testing/gtest_util.h" #include "arrow/testing/matchers.h" -#include "arrow/util/io_util.h" #include "arrow/util/key_value_metadata.h" -#include "arrow/util/async_generator.h" - using testing::ElementsAre; using testing::Eq; using testing::HasSubstr; @@ -42,7 +38,6 @@ using testing::UnorderedElementsAre; namespace arrow { using internal::checked_cast; -namespace cp = arrow::compute; namespace engine { From fb552b4b6b903fac6f645c23adbd9721840d7330 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Sun, 10 Apr 2022 16:03:41 +0530 Subject: [PATCH 31/74] update test cases --- cpp/src/arrow/engine/substrait/serde_test.cc | 1 + cpp/src/arrow/engine/substrait/util.h | 6 +++--- python/pyarrow/tests/test_substrait.py | 1 + 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/engine/substrait/serde_test.cc b/cpp/src/arrow/engine/substrait/serde_test.cc index a3db4b7c5f7..0cc11e3e4cd 100644 --- a/cpp/src/arrow/engine/substrait/serde_test.cc +++ b/cpp/src/arrow/engine/substrait/serde_test.cc @@ -16,6 +16,7 @@ // under the License. 
#include "arrow/engine/substrait/serde.h" +#include "arrow/engine/substrait/util.h" #include #include diff --git a/cpp/src/arrow/engine/substrait/util.h b/cpp/src/arrow/engine/substrait/util.h index 2e13b7e80e6..4febdb96fe9 100644 --- a/cpp/src/arrow/engine/substrait/util.h +++ b/cpp/src/arrow/engine/substrait/util.h @@ -54,13 +54,13 @@ class ARROW_ENGINE_EXPORT SubstraitSinkConsumer : public cp::SinkNodeConsumer { PushGenerator>::Producer producer_; }; -Result> GetRecordBatchReader( +ARROW_ENGINE_EXPORT Result> GetRecordBatchReader( std::string& substrait_json); -Result> GetRecordBatchReader( +ARROW_ENGINE_EXPORT Result> GetRecordBatchReader( std::shared_ptr substrait_buffer); -Result> GetSubstraitBufferFromJSON(std::string& substrait_json); +ARROW_ENGINE_EXPORT Result> GetSubstraitBufferFromJSON(std::string& substrait_json); } // namespace engine diff --git a/python/pyarrow/tests/test_substrait.py b/python/pyarrow/tests/test_substrait.py index f9e8474c3fe..41cddbac91c 100644 --- a/python/pyarrow/tests/test_substrait.py +++ b/python/pyarrow/tests/test_substrait.py @@ -19,6 +19,7 @@ import pathlib from pyarrow.lib import tobytes import pyarrow.parquet as pq +import pytest try: from pyarrow import engine From 376a40d9187d8ad07263fcfe127ce42eb6266c05 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Sun, 10 Apr 2022 16:24:31 +0530 Subject: [PATCH 32/74] addressing reviews --- cpp/src/arrow/engine/substrait/util.cc | 63 ++++++++++++++++---------- cpp/src/arrow/engine/substrait/util.h | 29 ++---------- 2 files changed, 42 insertions(+), 50 deletions(-) diff --git a/cpp/src/arrow/engine/substrait/util.cc b/cpp/src/arrow/engine/substrait/util.cc index ce54aa29ca9..2411564ca5b 100644 --- a/cpp/src/arrow/engine/substrait/util.cc +++ b/cpp/src/arrow/engine/substrait/util.cc @@ -16,20 +16,37 @@ // under the License. 
#include "arrow/engine/substrait/util.h" +#include "arrow/util/async_generator.h" namespace arrow { namespace engine { -Status SubstraitSinkConsumer::Consume(cp::ExecBatch batch) { +/// \brief A SinkNodeConsumer specialized to output ExecBatches via PushGenerator +class ARROW_ENGINE_EXPORT SubstraitSinkConsumer : public compute::SinkNodeConsumer { + public: + explicit SubstraitSinkConsumer( + AsyncGenerator>* generator, + arrow::util::BackpressureOptions backpressure = {}) + : producer_(MakeProducer(generator, std::move(backpressure))) {} + + Status Consume(compute::ExecBatch batch) override { // Consume a batch of data bool did_push = producer_.Push(batch); if (!did_push) return Status::ExecutionError("Producer closed already"); return Status::OK(); } -Future<> SubstraitSinkConsumer::Finish() { - producer_.Push(IterationEnd>()); + Status Init(const std::shared_ptr& schema) override { + return Status::OK(); +} + + static arrow::PushGenerator>::Producer + MakeProducer(AsyncGenerator>* out_gen, + arrow::util::BackpressureOptions backpressure); + + Future<> Finish() override{ + producer_.Push(IterationEnd>()); if (producer_.Close()) { return Future<>::MakeFinished(); } @@ -37,9 +54,9 @@ Future<> SubstraitSinkConsumer::Finish() { Status::ExecutionError("Error occurred in closing the batch producer")); } -Status SubstraitSinkConsumer::Init(const std::shared_ptr& schema) { - return Status::OK(); -} + private: + PushGenerator>::Producer producer_; +}; /// \brief An executor to run a Substrait Query /// This interface is provided as a utility when creating language @@ -48,8 +65,8 @@ class ARROW_ENGINE_EXPORT SubstraitExecutor { public: explicit SubstraitExecutor( std::shared_ptr substrait_buffer, - AsyncGenerator>* generator, - std::shared_ptr plan, cp::ExecContext exec_context) + AsyncGenerator>* generator, + std::shared_ptr plan, compute::ExecContext exec_context) : substrait_buffer_(substrait_buffer), generator_(generator), plan_(std::move(plan)), @@ -57,7 +74,7 @@ class ARROW_ENGINE_EXPORT SubstraitExecutor { Result> Execute() { RETURN_NOT_OK(SubstraitExecutor::Init()); - for (const cp::Declaration& decl : declarations_) { + for (const compute::Declaration& decl : declarations_) { RETURN_NOT_OK(decl.AddToPlan(plan_.get()).status()); } ARROW_RETURN_NOT_OK(plan_->Validate()); @@ -65,7 +82,7 @@ class ARROW_ENGINE_EXPORT SubstraitExecutor { // schema of the output can be obtained by the output_schema // of the input to the sink node. 
auto schema = plan_->sinks()[0]->inputs()[0]->output_schema(); - std::shared_ptr sink_reader = cp::MakeGeneratorReader( + std::shared_ptr sink_reader = compute::MakeGeneratorReader( schema, std::move(*generator_), exec_context_.memory_pool()); return sink_reader; } @@ -76,7 +93,7 @@ class ARROW_ENGINE_EXPORT SubstraitExecutor { if (substrait_buffer_ == NULLPTR) { return Status::Invalid("Buffer containing Substrait plan is null."); } - std::function()> consumer_factory = [&] { + std::function()> consumer_factory = [&] { return std::make_shared(generator_); }; ARROW_ASSIGN_OR_RAISE(declarations_, @@ -86,24 +103,23 @@ class ARROW_ENGINE_EXPORT SubstraitExecutor { private: std::shared_ptr substrait_buffer_; - AsyncGenerator>* generator_; - std::vector declarations_; - std::shared_ptr plan_; - cp::ExecContext exec_context_; + AsyncGenerator>* generator_; + std::vector declarations_; + std::shared_ptr plan_; + compute::ExecContext exec_context_; }; -arrow::PushGenerator>::Producer +arrow::PushGenerator>::Producer SubstraitSinkConsumer::MakeProducer( - AsyncGenerator>* out_gen, + AsyncGenerator>* out_gen, arrow::util::BackpressureOptions backpressure) { - arrow::PushGenerator> push_gen( + arrow::PushGenerator> push_gen( std::move(backpressure)); auto out = push_gen.producer(); *out_gen = std::move(push_gen); return out; } -/// \brief Retrieve a RecordBatchReader from a Substrait plan in JSON. Result> GetRecordBatchReader( std::string& substrait_json) { ARROW_ASSIGN_OR_RAISE(auto substrait_buffer, @@ -111,12 +127,11 @@ Result> GetRecordBatchReader( return GetRecordBatchReader(substrait_buffer); } -/// \brief Retrieve a RecordBatchReader from a Substrait plan in Buffer. Result> GetRecordBatchReader( std::shared_ptr substrait_buffer) { - arrow::AsyncGenerator> sink_gen; - ARROW_ASSIGN_OR_RAISE(auto plan, cp::ExecPlan::Make()); - cp::ExecContext exec_context(arrow::default_memory_pool(), + arrow::AsyncGenerator> sink_gen; + ARROW_ASSIGN_OR_RAISE(auto plan, compute::ExecPlan::Make()); + compute::ExecContext exec_context(arrow::default_memory_pool(), ::arrow::internal::GetCpuThreadPool()); arrow::engine::SubstraitExecutor executor(substrait_buffer, &sink_gen, plan, exec_context); @@ -126,8 +141,6 @@ Result> GetRecordBatchReader( return sink_reader; } -/// \brief Get Substrait Buffer from a Substrait JSON plan. -/// This is a helper method for Python tests. 
Result> GetSubstraitBufferFromJSON(std::string& substrait_json) { return engine::internal::SubstraitFromJSON("Plan", substrait_json); } diff --git a/cpp/src/arrow/engine/substrait/util.h b/cpp/src/arrow/engine/substrait/util.h index 4febdb96fe9..351b20a786f 100644 --- a/cpp/src/arrow/engine/substrait/util.h +++ b/cpp/src/arrow/engine/substrait/util.h @@ -22,44 +22,23 @@ #include #include "arrow/compute/type_fwd.h" #include "arrow/engine/api.h" -#include "arrow/util/async_generator.h" #include "arrow/util/iterator.h" #include "arrow/util/optional.h" namespace arrow { -namespace cp = arrow::compute; - namespace engine { -/// \brief A SinkNodeConsumer specialized to output ExecBatches via PushGenerator -class ARROW_ENGINE_EXPORT SubstraitSinkConsumer : public cp::SinkNodeConsumer { - public: - explicit SubstraitSinkConsumer( - AsyncGenerator>* generator, - arrow::util::BackpressureOptions backpressure = {}) - : producer_(MakeProducer(generator, std::move(backpressure))) {} - - Status Consume(cp::ExecBatch batch) override; - - Status Init(const std::shared_ptr& schema) override; - - static arrow::PushGenerator>::Producer - MakeProducer(AsyncGenerator>* out_gen, - arrow::util::BackpressureOptions backpressure); - - Future<> Finish() override; - - private: - PushGenerator>::Producer producer_; -}; - +/// \brief Retrieve a RecordBatchReader from a Substrait plan in JSON. ARROW_ENGINE_EXPORT Result> GetRecordBatchReader( std::string& substrait_json); +/// \brief Retrieve a RecordBatchReader from a Substrait plan in Buffer. ARROW_ENGINE_EXPORT Result> GetRecordBatchReader( std::shared_ptr substrait_buffer); +/// \brief Get Substrait Buffer from a Substrait JSON plan. +/// This is a helper method for Python tests. ARROW_ENGINE_EXPORT Result> GetSubstraitBufferFromJSON(std::string& substrait_json); } // namespace engine From b02ff3203eb1bd714b515ae1933fd8f11adbb754 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Mon, 11 Apr 2022 18:35:29 +0530 Subject: [PATCH 33/74] adding test cases for invalid plan --- cpp/src/arrow/engine/substrait/serde_test.cc | 8 ++++++ cpp/src/arrow/engine/substrait/util.cc | 30 +++++++++----------- cpp/src/arrow/engine/substrait/util.h | 3 +- python/pyarrow/tests/test_substrait.py | 13 +++++++++ 4 files changed, 37 insertions(+), 17 deletions(-) diff --git a/cpp/src/arrow/engine/substrait/serde_test.cc b/cpp/src/arrow/engine/substrait/serde_test.cc index 0cc11e3e4cd..8c8f156df05 100644 --- a/cpp/src/arrow/engine/substrait/serde_test.cc +++ b/cpp/src/arrow/engine/substrait/serde_test.cc @@ -799,5 +799,13 @@ TEST(Substrait, GetRecordBatchReader) { EXPECT_GT(table->num_rows(), 0); } +TEST(Substrait, InvalidPlan) { + std::string substrait_json = R"({ + "relations": [ + ] + })"; + ASSERT_RAISES(Invalid, engine::GetRecordBatchReader(substrait_json)); +} + } // namespace engine } // namespace arrow diff --git a/cpp/src/arrow/engine/substrait/util.cc b/cpp/src/arrow/engine/substrait/util.cc index 2411564ca5b..1c549c14fae 100644 --- a/cpp/src/arrow/engine/substrait/util.cc +++ b/cpp/src/arrow/engine/substrait/util.cc @@ -31,28 +31,26 @@ class ARROW_ENGINE_EXPORT SubstraitSinkConsumer : public compute::SinkNodeConsum : producer_(MakeProducer(generator, std::move(backpressure))) {} Status Consume(compute::ExecBatch batch) override { - // Consume a batch of data - bool did_push = producer_.Push(batch); - if (!did_push) return Status::ExecutionError("Producer closed already"); - return Status::OK(); -} + // Consume a batch of data + bool did_push = producer_.Push(batch); + if 
(!did_push) return Status::ExecutionError("Producer closed already"); + return Status::OK(); + } - Status Init(const std::shared_ptr& schema) override { - return Status::OK(); -} + Status Init(const std::shared_ptr& schema) override { return Status::OK(); } static arrow::PushGenerator>::Producer MakeProducer(AsyncGenerator>* out_gen, arrow::util::BackpressureOptions backpressure); - Future<> Finish() override{ - producer_.Push(IterationEnd>()); - if (producer_.Close()) { - return Future<>::MakeFinished(); + Future<> Finish() override { + producer_.Push(IterationEnd>()); + if (producer_.Close()) { + return Future<>::MakeFinished(); + } + return Future<>::MakeFinished( + Status::ExecutionError("Error occurred in closing the batch producer")); } - return Future<>::MakeFinished( - Status::ExecutionError("Error occurred in closing the batch producer")); -} private: PushGenerator>::Producer producer_; @@ -132,7 +130,7 @@ Result> GetRecordBatchReader( arrow::AsyncGenerator> sink_gen; ARROW_ASSIGN_OR_RAISE(auto plan, compute::ExecPlan::Make()); compute::ExecContext exec_context(arrow::default_memory_pool(), - ::arrow::internal::GetCpuThreadPool()); + ::arrow::internal::GetCpuThreadPool()); arrow::engine::SubstraitExecutor executor(substrait_buffer, &sink_gen, plan, exec_context); RETURN_NOT_OK(executor.Init()); diff --git a/cpp/src/arrow/engine/substrait/util.h b/cpp/src/arrow/engine/substrait/util.h index 351b20a786f..b4fd71bcda8 100644 --- a/cpp/src/arrow/engine/substrait/util.h +++ b/cpp/src/arrow/engine/substrait/util.h @@ -39,7 +39,8 @@ ARROW_ENGINE_EXPORT Result> GetRecordBatchRea /// \brief Get Substrait Buffer from a Substrait JSON plan. /// This is a helper method for Python tests. -ARROW_ENGINE_EXPORT Result> GetSubstraitBufferFromJSON(std::string& substrait_json); +ARROW_ENGINE_EXPORT Result> GetSubstraitBufferFromJSON( + std::string& substrait_json); } // namespace engine diff --git a/python/pyarrow/tests/test_substrait.py b/python/pyarrow/tests/test_substrait.py index 41cddbac91c..43512eedb60 100644 --- a/python/pyarrow/tests/test_substrait.py +++ b/python/pyarrow/tests/test_substrait.py @@ -18,6 +18,7 @@ import os import pathlib from pyarrow.lib import tobytes +from pyarrow.lib import ArrowInvalid import pyarrow.parquet as pq import pytest @@ -102,3 +103,15 @@ def test_run_query_in_bytes(): expected_tb = pq.read_table(filename) assert expected_tb.num_rows == res_tb.num_rows + + +def test_invalid_plan(): + query = """ + { + "relations": [ + ] + } + """ + exec_message = "ExecPlan has no node" + with pytest.raises(ArrowInvalid, match=exec_message): + run_query(tobytes(query)) From 0239ecc613b1551bc5d6c740c6c22400c8dcf59e Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Mon, 18 Apr 2022 10:34:27 +0530 Subject: [PATCH 34/74] temp commit to test AppVeyor --- cpp/src/arrow/engine/substrait/serde_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/engine/substrait/serde_test.cc b/cpp/src/arrow/engine/substrait/serde_test.cc index 8c8f156df05..8527da18e0b 100644 --- a/cpp/src/arrow/engine/substrait/serde_test.cc +++ b/cpp/src/arrow/engine/substrait/serde_test.cc @@ -785,7 +785,7 @@ Result GetSubstraitJSON() { }} ] })"; - + std::cout << "File Path : >>>>" << file_path << std::endl; std::string filename_placeholder = "FILENAME_PLACEHOLDER"; substrait_json.replace(substrait_json.find(filename_placeholder), filename_placeholder.size(), file_path); From 3814334db648413e593f90235a96223db51afb40 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Mon, 18 
Apr 2022 11:05:06 +0530 Subject: [PATCH 35/74] excluding redundant test --- python/pyarrow/tests/test_substrait.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/python/pyarrow/tests/test_substrait.py b/python/pyarrow/tests/test_substrait.py index 43512eedb60..fa3724c0e4e 100644 --- a/python/pyarrow/tests/test_substrait.py +++ b/python/pyarrow/tests/test_substrait.py @@ -22,15 +22,10 @@ import pyarrow.parquet as pq import pytest -try: - from pyarrow import engine - from pyarrow.engine import ( - run_query, - get_buffer_from_json, - ) -except ImportError: - engine = None - +from pyarrow.engine import ( + run_query, + get_buffer_from_json, +) # Marks all of the tests in this module # Ignore these with pytest ... -m 'not engine' pytestmark = pytest.mark.engine From f4c25818419c5e3c66923ddb07059531aaa161a2 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Mon, 18 Apr 2022 14:54:20 +0530 Subject: [PATCH 36/74] updating test case --- cpp/src/arrow/engine/substrait/serde_test.cc | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/engine/substrait/serde_test.cc b/cpp/src/arrow/engine/substrait/serde_test.cc index 8527da18e0b..82f6211671a 100644 --- a/cpp/src/arrow/engine/substrait/serde_test.cc +++ b/cpp/src/arrow/engine/substrait/serde_test.cc @@ -776,7 +776,7 @@ Result GetSubstraitJSON() { "local_files": { "items": [ { - "uri_file": "file://FILENAME_PLACEHOLDER", + "uri_file": "FILENAME_PLACEHOLDER", "format": "FILE_FORMAT_PARQUET" } ] @@ -785,6 +785,11 @@ Result GetSubstraitJSON() { }} ] })"; +#ifdef _WIN32 + file_path = "file:///" + file_path; +#else + file_path = "file://" + file_path; +#endif std::cout << "File Path : >>>>" << file_path << std::endl; std::string filename_placeholder = "FILENAME_PLACEHOLDER"; substrait_json.replace(substrait_json.find(filename_placeholder), From e4b6daa70f11bd34069144e4b5823c320ca1ec81 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Mon, 18 Apr 2022 15:17:28 +0530 Subject: [PATCH 37/74] test fpath --- cpp/src/arrow/engine/substrait/serde_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/engine/substrait/serde_test.cc b/cpp/src/arrow/engine/substrait/serde_test.cc index 82f6211671a..42ab1d09977 100644 --- a/cpp/src/arrow/engine/substrait/serde_test.cc +++ b/cpp/src/arrow/engine/substrait/serde_test.cc @@ -786,7 +786,7 @@ Result GetSubstraitJSON() { ] })"; #ifdef _WIN32 - file_path = "file:///" + file_path; + file_path = "file:/" + file_path; #else file_path = "file://" + file_path; #endif From 0cea68d503a12510ff226887989cfe6715873dba Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Mon, 18 Apr 2022 15:34:07 +0530 Subject: [PATCH 38/74] updating workflow --- .github/workflows/python.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index 5ababa32953..fa2e7121840 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -120,6 +120,7 @@ jobs: env: ARROW_HOME: /usr/local ARROW_DATASET: ON + ARROW_ENGINE: ON ARROW_FLIGHT: ON ARROW_GANDIVA: ON ARROW_HDFS: ON From 06f7bad47a7ba0f43460d0e280a3ee59bb7f7bfa Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Mon, 18 Apr 2022 18:15:11 +0530 Subject: [PATCH 39/74] adding engine flag for python build script --- ci/scripts/python_build.sh | 1 + cpp/src/arrow/engine/substrait/serde_test.cc | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/ci/scripts/python_build.sh b/ci/scripts/python_build.sh 
index e87117ce877..f5b4aa460d1 100755 --- a/ci/scripts/python_build.sh +++ b/ci/scripts/python_build.sh @@ -56,6 +56,7 @@ export PYARROW_BUILD_TYPE=${CMAKE_BUILD_TYPE:-debug} export PYARROW_WITH_CUDA=${ARROW_CUDA:-OFF} export PYARROW_WITH_DATASET=${ARROW_DATASET:-ON} +export PYARROW_WITH_ENGINE=${ARROW_ENGINE:-OFF} export PYARROW_WITH_FLIGHT=${ARROW_FLIGHT:-OFF} export PYARROW_WITH_GANDIVA=${ARROW_GANDIVA:-OFF} export PYARROW_WITH_HDFS=${ARROW_HDFS:-ON} diff --git a/cpp/src/arrow/engine/substrait/serde_test.cc b/cpp/src/arrow/engine/substrait/serde_test.cc index 42ab1d09977..3d54a0b079e 100644 --- a/cpp/src/arrow/engine/substrait/serde_test.cc +++ b/cpp/src/arrow/engine/substrait/serde_test.cc @@ -786,8 +786,10 @@ Result GetSubstraitJSON() { ] })"; #ifdef _WIN32 - file_path = "file:/" + file_path; + // Path is supposed to start with "X:/..." + file_path = "file:///" + file_path; #else + // Path is supposed to start with "/..." file_path = "file://" + file_path; #endif std::cout << "File Path : >>>>" << file_path << std::endl; From 80c1bb281af4f9fe7a1038f50fb1e474762d3e76 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Thu, 21 Apr 2022 18:31:29 +0530 Subject: [PATCH 40/74] rebase --- cpp/src/arrow/engine/substrait/serde_test.cc | 4 ++-- cpp/src/arrow/engine/substrait/util.cc | 24 +++++++++----------- cpp/src/arrow/engine/substrait/util.h | 14 ++++++------ python/pyarrow/_engine.pyx | 23 +++++++++++-------- python/pyarrow/engine.py | 2 +- python/pyarrow/tests/test_substrait.py | 24 +++++++++----------- 6 files changed, 46 insertions(+), 45 deletions(-) diff --git a/cpp/src/arrow/engine/substrait/serde_test.cc b/cpp/src/arrow/engine/substrait/serde_test.cc index 3d54a0b079e..7df5cd0529d 100644 --- a/cpp/src/arrow/engine/substrait/serde_test.cc +++ b/cpp/src/arrow/engine/substrait/serde_test.cc @@ -801,7 +801,7 @@ Result GetSubstraitJSON() { TEST(Substrait, GetRecordBatchReader) { ASSERT_OK_AND_ASSIGN(std::string substrait_json, GetSubstraitJSON()); - ASSERT_OK_AND_ASSIGN(auto reader, engine::GetRecordBatchReader(substrait_json)); + ASSERT_OK_AND_ASSIGN(auto reader, engine::ExecuteJsonPlan(substrait_json)); ASSERT_OK_AND_ASSIGN(auto table, Table::FromRecordBatchReader(reader.get())); EXPECT_GT(table->num_rows(), 0); } @@ -811,7 +811,7 @@ TEST(Substrait, InvalidPlan) { "relations": [ ] })"; - ASSERT_RAISES(Invalid, engine::GetRecordBatchReader(substrait_json)); + ASSERT_RAISES(Invalid, engine::ExecuteJsonPlan(substrait_json)); } } // namespace engine diff --git a/cpp/src/arrow/engine/substrait/util.cc b/cpp/src/arrow/engine/substrait/util.cc index 1c549c14fae..d4c93584d6f 100644 --- a/cpp/src/arrow/engine/substrait/util.cc +++ b/cpp/src/arrow/engine/substrait/util.cc @@ -65,18 +65,17 @@ class ARROW_ENGINE_EXPORT SubstraitExecutor { std::shared_ptr substrait_buffer, AsyncGenerator>* generator, std::shared_ptr plan, compute::ExecContext exec_context) - : substrait_buffer_(substrait_buffer), + : substrait_buffer_(std::move(substrait_buffer)), generator_(generator), plan_(std::move(plan)), exec_context_(exec_context) {} Result> Execute() { - RETURN_NOT_OK(SubstraitExecutor::Init()); for (const compute::Declaration& decl : declarations_) { RETURN_NOT_OK(decl.AddToPlan(plan_.get()).status()); } - ARROW_RETURN_NOT_OK(plan_->Validate()); - ARROW_RETURN_NOT_OK(plan_->StartProducing()); + RETURN_NOT_OK(plan_->Validate()); + RETURN_NOT_OK(plan_->StartProducing()); // schema of the output can be obtained by the output_schema // of the input to the sink node. 
auto schema = plan_->sinks()[0]->inputs()[0]->output_schema(); @@ -118,28 +117,27 @@ SubstraitSinkConsumer::MakeProducer( return out; } -Result> GetRecordBatchReader( - std::string& substrait_json) { - ARROW_ASSIGN_OR_RAISE(auto substrait_buffer, - GetSubstraitBufferFromJSON(substrait_json)); - return GetRecordBatchReader(substrait_buffer); +Result> ExecuteJsonPlan( + const std::string& substrait_json) { + ARROW_ASSIGN_OR_RAISE(auto substrait_buffer, ParseJsonPlan(substrait_json)); + return ExecuteSerializedPlan(substrait_buffer); } -Result> GetRecordBatchReader( +Result> ExecuteSerializedPlan( std::shared_ptr substrait_buffer) { arrow::AsyncGenerator> sink_gen; ARROW_ASSIGN_OR_RAISE(auto plan, compute::ExecPlan::Make()); compute::ExecContext exec_context(arrow::default_memory_pool(), ::arrow::internal::GetCpuThreadPool()); - arrow::engine::SubstraitExecutor executor(substrait_buffer, &sink_gen, plan, - exec_context); + arrow::engine::SubstraitExecutor executor(std::move(substrait_buffer), &sink_gen, + std::move(plan), exec_context); RETURN_NOT_OK(executor.Init()); ARROW_ASSIGN_OR_RAISE(auto sink_reader, executor.Execute()); RETURN_NOT_OK(executor.Close()); return sink_reader; } -Result> GetSubstraitBufferFromJSON(std::string& substrait_json) { +Result> ParseJsonPlan(const std::string& substrait_json) { return engine::internal::SubstraitFromJSON("Plan", substrait_json); } diff --git a/cpp/src/arrow/engine/substrait/util.h b/cpp/src/arrow/engine/substrait/util.h index b4fd71bcda8..fbbe42bef0d 100644 --- a/cpp/src/arrow/engine/substrait/util.h +++ b/cpp/src/arrow/engine/substrait/util.h @@ -30,17 +30,17 @@ namespace arrow { namespace engine { /// \brief Retrieve a RecordBatchReader from a Substrait plan in JSON. -ARROW_ENGINE_EXPORT Result> GetRecordBatchReader( - std::string& substrait_json); +ARROW_ENGINE_EXPORT Result> ExecuteJsonPlan( + const std::string& substrait_json); -/// \brief Retrieve a RecordBatchReader from a Substrait plan in Buffer. -ARROW_ENGINE_EXPORT Result> GetRecordBatchReader( +/// \brief Retrieve a RecordBatchReader from a Substrait plan. +ARROW_ENGINE_EXPORT Result> ExecuteSerializedPlan( std::shared_ptr substrait_buffer); -/// \brief Get Substrait Buffer from a Substrait JSON plan. +/// \brief Get a Buffer from a Substrait JSON plan. /// This is a helper method for Python tests. -ARROW_ENGINE_EXPORT Result> GetSubstraitBufferFromJSON( - std::string& substrait_json); +ARROW_ENGINE_EXPORT Result> ParseJsonPlan( + const std::string& substrait_json); } // namespace engine diff --git a/python/pyarrow/_engine.pyx b/python/pyarrow/_engine.pyx index d7f529fa613..9a466c83bcc 100644 --- a/python/pyarrow/_engine.pyx +++ b/python/pyarrow/_engine.pyx @@ -42,10 +42,10 @@ def run_query(plan): if isinstance(plan, bytes): c_str_plan = plan - c_res_reader = GetRecordBatchReader(c_str_plan) + c_res_reader = ExecuteJsonPlan(c_str_plan) elif isinstance(plan, Buffer): c_buf_plan = pyarrow_unwrap_buffer(plan) - c_res_reader = GetRecordBatchReader(c_buf_plan) + c_res_reader = ExecuteSerializedPlan(c_buf_plan) else: raise ValueError("Expected bytes or pyarrow.Buffer") @@ -56,15 +56,20 @@ def run_query(plan): return reader -def get_buffer_from_json(plan): +def _parse_json_plan(plan): """ - Returns Buffer object by converting substrait plan in - JSON. + Parse a JSON plan into equivalent serialized Protobuf. - Parameter - --------- + Parameters + ---------- plan: byte - Substrait plan as a bytes. + Substrait plan as bytes. This is obtained by encoding in utf-8 + a Substrait plan in JSON format. 
+ + Returns + ------- + Buffer + pyarrow.Buffer object is returned. """ cdef: @@ -74,7 +79,7 @@ def get_buffer_from_json(plan): if isinstance(plan, bytes): c_str_plan = plan - c_res_buffer = GetSubstraitBufferFromJSON(c_str_plan) + c_res_buffer = ParseJsonPlan(c_str_plan) c_buf_plan = GetResultValue(c_res_buffer) else: raise ValueError("Expected plan in bytes.") diff --git a/python/pyarrow/engine.py b/python/pyarrow/engine.py index 7c87d3ff121..bda87f793a8 100644 --- a/python/pyarrow/engine.py +++ b/python/pyarrow/engine.py @@ -17,5 +17,5 @@ from pyarrow._engine import ( # noqa run_query, - get_buffer_from_json, + _parse_json_plan, ) diff --git a/python/pyarrow/tests/test_substrait.py b/python/pyarrow/tests/test_substrait.py index fa3724c0e4e..5ff07e9277e 100644 --- a/python/pyarrow/tests/test_substrait.py +++ b/python/pyarrow/tests/test_substrait.py @@ -22,10 +22,11 @@ import pyarrow.parquet as pq import pytest -from pyarrow.engine import ( - run_query, - get_buffer_from_json, -) +try: + import pyarrow.engine as engine +except ImportError: + engine = None + # Marks all of the tests in this module # Ignore these with pytest ... -m 'not engine' pytestmark = pytest.mark.engine @@ -73,10 +74,8 @@ def resource_root(): def test_run_query(): filename = str(resource_root() / "binary.parquet") - query = _substrait_query - - query = tobytes(query.replace("FILENAME_PLACEHOLDER", filename)) - reader = run_query(query) + query = tobytes(_substrait_query.replace("FILENAME_PLACEHOLDER", filename)) + reader = engine.run_query(query) res_tb = reader.read_all() expected_tb = pq.read_table(filename) @@ -87,12 +86,11 @@ def test_run_query(): def test_run_query_in_bytes(): filename = str(resource_root() / "binary.parquet") - query = _substrait_query - query = tobytes(query.replace("FILENAME_PLACEHOLDER", filename)) + query = tobytes(_substrait_query.replace("FILENAME_PLACEHOLDER", filename)) - buf = get_buffer_from_json(query) + buf = engine._parse_json_plan(query) - reader = run_query(buf) + reader = engine.run_query(buf) res_tb = reader.read_all() expected_tb = pq.read_table(filename) @@ -109,4 +107,4 @@ def test_invalid_plan(): """ exec_message = "ExecPlan has no node" with pytest.raises(ArrowInvalid, match=exec_message): - run_query(tobytes(query)) + engine.run_query(tobytes(query)) From 39c3f3e316611ca5f9bb612e0783c313f34a4b74 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Tue, 26 Apr 2022 11:50:12 +0530 Subject: [PATCH 41/74] updating conftest for engine addition --- python/pyarrow/tests/conftest.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/pyarrow/tests/conftest.py b/python/pyarrow/tests/conftest.py index a26bf077e0f..967fcac5954 100644 --- a/python/pyarrow/tests/conftest.py +++ b/python/pyarrow/tests/conftest.py @@ -49,6 +49,7 @@ 'bz2', 'cython', 'dataset', + 'engine', 'hypothesis', 'fastparquet', 'gandiva', @@ -78,6 +79,7 @@ 'bz2': Codec.is_available('bz2'), 'cython': False, 'dataset': False, + 'engine': False, 'fastparquet': False, 'flight': False, 'gandiva': False, From 48051ce48c6ae653227874f1bb0d27f4da602ac4 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Tue, 26 Apr 2022 12:15:23 +0530 Subject: [PATCH 42/74] updating test cases and imports --- cpp/src/arrow/engine/substrait/util.h | 5 +---- python/pyarrow/_engine.pyx | 7 +++---- python/pyarrow/engine.py | 1 - python/pyarrow/tests/test_substrait.py | 3 ++- 4 files changed, 6 insertions(+), 10 deletions(-) diff --git a/cpp/src/arrow/engine/substrait/util.h b/cpp/src/arrow/engine/substrait/util.h index 
fbbe42bef0d..65bea2a9a50 100644 --- a/cpp/src/arrow/engine/substrait/util.h +++ b/cpp/src/arrow/engine/substrait/util.h @@ -18,9 +18,6 @@ #pragma once #include -#include -#include -#include "arrow/compute/type_fwd.h" #include "arrow/engine/api.h" #include "arrow/util/iterator.h" #include "arrow/util/optional.h" @@ -37,7 +34,7 @@ ARROW_ENGINE_EXPORT Result> ExecuteJsonPlan( ARROW_ENGINE_EXPORT Result> ExecuteSerializedPlan( std::shared_ptr substrait_buffer); -/// \brief Get a Buffer from a Substrait JSON plan. +/// \brief Get a Serialized Plan from a Substrait JSON plan. /// This is a helper method for Python tests. ARROW_ENGINE_EXPORT Result> ParseJsonPlan( const std::string& substrait_json); diff --git a/python/pyarrow/_engine.pyx b/python/pyarrow/_engine.pyx index 9a466c83bcc..921915aa664 100644 --- a/python/pyarrow/_engine.pyx +++ b/python/pyarrow/_engine.pyx @@ -29,8 +29,8 @@ def run_query(plan): Parameters ---------- plan : bytes or Buffer - Substrait Plan can be fed as a encoded string in utf-8 - as a JSON string or as an Arrow Buffer. + Substrait plan can be fed as a serialized plan (Buffer) + or a JSON plan. """ cdef: @@ -63,8 +63,7 @@ def _parse_json_plan(plan): Parameters ---------- plan: byte - Substrait plan as bytes. This is obtained by encoding in utf-8 - a Substrait plan in JSON format. + Parse a Substrait plan in JSON to a serialized plan. Returns ------- diff --git a/python/pyarrow/engine.py b/python/pyarrow/engine.py index bda87f793a8..e975f4b616f 100644 --- a/python/pyarrow/engine.py +++ b/python/pyarrow/engine.py @@ -17,5 +17,4 @@ from pyarrow._engine import ( # noqa run_query, - _parse_json_plan, ) diff --git a/python/pyarrow/tests/test_substrait.py b/python/pyarrow/tests/test_substrait.py index 5ff07e9277e..0a45a66864b 100644 --- a/python/pyarrow/tests/test_substrait.py +++ b/python/pyarrow/tests/test_substrait.py @@ -17,6 +17,7 @@ import os import pathlib +import pyarrow as pa from pyarrow.lib import tobytes from pyarrow.lib import ArrowInvalid import pyarrow.parquet as pq @@ -88,7 +89,7 @@ def test_run_query_in_bytes(): query = tobytes(_substrait_query.replace("FILENAME_PLACEHOLDER", filename)) - buf = engine._parse_json_plan(query) + buf = pa._engine._parse_json_plan(query) reader = engine.run_query(buf) res_tb = reader.read_all() From c2e90920d4abb3711ce322ad20ee9279ebc22735 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Tue, 26 Apr 2022 12:18:58 +0530 Subject: [PATCH 43/74] adding missing imports --- cpp/src/arrow/engine/substrait/util.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/src/arrow/engine/substrait/util.cc b/cpp/src/arrow/engine/substrait/util.cc index d4c93584d6f..3f5b8c384ce 100644 --- a/cpp/src/arrow/engine/substrait/util.cc +++ b/cpp/src/arrow/engine/substrait/util.cc @@ -17,6 +17,7 @@ #include "arrow/engine/substrait/util.h" #include "arrow/util/async_generator.h" +#include "arrow/util/async_util.h" namespace arrow { From e643c3ef260a82c47199997460c31f01a18261b4 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Tue, 26 Apr 2022 15:42:28 +0530 Subject: [PATCH 44/74] refactor the usage --- cpp/src/arrow/engine/substrait/util.cc | 28 +++++++++++++------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/cpp/src/arrow/engine/substrait/util.cc b/cpp/src/arrow/engine/substrait/util.cc index 3f5b8c384ce..fd2cabb9a2a 100644 --- a/cpp/src/arrow/engine/substrait/util.cc +++ b/cpp/src/arrow/engine/substrait/util.cc @@ -27,8 +27,8 @@ namespace engine { class ARROW_ENGINE_EXPORT SubstraitSinkConsumer : public 
compute::SinkNodeConsumer { public: explicit SubstraitSinkConsumer( - AsyncGenerator>* generator, - arrow::util::BackpressureOptions backpressure = {}) + AsyncGenerator>* generator, + util::BackpressureOptions backpressure = {}) : producer_(MakeProducer(generator, std::move(backpressure))) {} Status Consume(compute::ExecBatch batch) override { @@ -40,12 +40,12 @@ class ARROW_ENGINE_EXPORT SubstraitSinkConsumer : public compute::SinkNodeConsum Status Init(const std::shared_ptr& schema) override { return Status::OK(); } - static arrow::PushGenerator>::Producer - MakeProducer(AsyncGenerator>* out_gen, - arrow::util::BackpressureOptions backpressure); + static arrow::PushGenerator>::Producer MakeProducer( + AsyncGenerator>* out_gen, + util::BackpressureOptions backpressure); Future<> Finish() override { - producer_.Push(IterationEnd>()); + producer_.Push(IterationEnd>()); if (producer_.Close()) { return Future<>::MakeFinished(); } @@ -54,7 +54,7 @@ class ARROW_ENGINE_EXPORT SubstraitSinkConsumer : public compute::SinkNodeConsum } private: - PushGenerator>::Producer producer_; + PushGenerator>::Producer producer_; }; /// \brief An executor to run a Substrait Query @@ -64,7 +64,7 @@ class ARROW_ENGINE_EXPORT SubstraitExecutor { public: explicit SubstraitExecutor( std::shared_ptr substrait_buffer, - AsyncGenerator>* generator, + AsyncGenerator>* generator, std::shared_ptr plan, compute::ExecContext exec_context) : substrait_buffer_(std::move(substrait_buffer)), generator_(generator), @@ -101,17 +101,17 @@ class ARROW_ENGINE_EXPORT SubstraitExecutor { private: std::shared_ptr substrait_buffer_; - AsyncGenerator>* generator_; + AsyncGenerator>* generator_; std::vector declarations_; std::shared_ptr plan_; compute::ExecContext exec_context_; }; -arrow::PushGenerator>::Producer +arrow::PushGenerator>::Producer SubstraitSinkConsumer::MakeProducer( - AsyncGenerator>* out_gen, - arrow::util::BackpressureOptions backpressure) { - arrow::PushGenerator> push_gen( + AsyncGenerator>* out_gen, + util::BackpressureOptions backpressure) { + arrow::PushGenerator> push_gen( std::move(backpressure)); auto out = push_gen.producer(); *out_gen = std::move(push_gen); @@ -126,7 +126,7 @@ Result> ExecuteJsonPlan( Result> ExecuteSerializedPlan( std::shared_ptr substrait_buffer) { - arrow::AsyncGenerator> sink_gen; + arrow::AsyncGenerator> sink_gen; ARROW_ASSIGN_OR_RAISE(auto plan, compute::ExecPlan::Make()); compute::ExecContext exec_context(arrow::default_memory_pool(), ::arrow::internal::GetCpuThreadPool()); From b3a80c62730490cd55017239d85ec60d47250aa2 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Tue, 26 Apr 2022 19:10:09 +0530 Subject: [PATCH 45/74] refactor from engine to substrait and rebase --- .github/workflows/python.yml | 2 +- ci/scripts/python_build.sh | 2 +- cpp/examples/arrow/CMakeLists.txt | 4 ++-- cpp/src/arrow/engine/substrait/util.cc | 19 +++++++++--------- python/CMakeLists.txt | 20 +++++++++---------- .../pyarrow/{_engine.pyx => _substrait.pyx} | 2 +- python/pyarrow/{engine.py => substrait.py} | 2 +- python/pyarrow/tests/conftest.py | 8 ++++---- python/pyarrow/tests/test_substrait.py | 14 ++++++------- python/setup.py | 16 +++++++-------- 10 files changed, 44 insertions(+), 45 deletions(-) rename python/pyarrow/{_engine.pyx => _substrait.pyx} (99%) rename python/pyarrow/{engine.py => substrait.py} (95%) diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index fa2e7121840..438def6e680 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -120,7 
+120,7 @@ jobs: env: ARROW_HOME: /usr/local ARROW_DATASET: ON - ARROW_ENGINE: ON + ARROW_SUBSTRAIT: ON ARROW_FLIGHT: ON ARROW_GANDIVA: ON ARROW_HDFS: ON diff --git a/ci/scripts/python_build.sh b/ci/scripts/python_build.sh index f5b4aa460d1..0fdea7033a6 100755 --- a/ci/scripts/python_build.sh +++ b/ci/scripts/python_build.sh @@ -56,7 +56,7 @@ export PYARROW_BUILD_TYPE=${CMAKE_BUILD_TYPE:-debug} export PYARROW_WITH_CUDA=${ARROW_CUDA:-OFF} export PYARROW_WITH_DATASET=${ARROW_DATASET:-ON} -export PYARROW_WITH_ENGINE=${ARROW_ENGINE:-OFF} +export PYARROW_WITH_SUBSTRAIT=${ARROW_SUBSTRAIT:-OFF} export PYARROW_WITH_FLIGHT=${ARROW_FLIGHT:-OFF} export PYARROW_WITH_GANDIVA=${ARROW_GANDIVA:-OFF} export PYARROW_WITH_HDFS=${ARROW_HDFS:-ON} diff --git a/cpp/examples/arrow/CMakeLists.txt b/cpp/examples/arrow/CMakeLists.txt index 1f14b0adc0d..229373665d5 100644 --- a/cpp/examples/arrow/CMakeLists.txt +++ b/cpp/examples/arrow/CMakeLists.txt @@ -21,8 +21,8 @@ if(ARROW_COMPUTE) add_arrow_example(compute_register_example) endif() -if(ARROW_ENGINE) - add_arrow_example(engine_substrait_consumption EXTRA_LINK_LIBS arrow_engine_shared) +if(ARROW_SUBSTRAIT) + add_arrow_example(engine_substrait_consumption EXTRA_LINK_LIBS arrow_substrait_shared) endif() if(ARROW_COMPUTE AND ARROW_CSV) diff --git a/cpp/src/arrow/engine/substrait/util.cc b/cpp/src/arrow/engine/substrait/util.cc index fd2cabb9a2a..4b6ae40d213 100644 --- a/cpp/src/arrow/engine/substrait/util.cc +++ b/cpp/src/arrow/engine/substrait/util.cc @@ -27,9 +27,8 @@ namespace engine { class ARROW_ENGINE_EXPORT SubstraitSinkConsumer : public compute::SinkNodeConsumer { public: explicit SubstraitSinkConsumer( - AsyncGenerator>* generator, - util::BackpressureOptions backpressure = {}) - : producer_(MakeProducer(generator, std::move(backpressure))) {} + AsyncGenerator>* generator) + : producer_(MakeProducer(generator)) {} Status Consume(compute::ExecBatch batch) override { // Consume a batch of data @@ -38,11 +37,13 @@ class ARROW_ENGINE_EXPORT SubstraitSinkConsumer : public compute::SinkNodeConsum return Status::OK(); } - Status Init(const std::shared_ptr& schema) override { return Status::OK(); } + Status Init(const std::shared_ptr& schema, + compute::BackpressureControl* backpressure_control) override { + return Status::OK(); + } static arrow::PushGenerator>::Producer MakeProducer( - AsyncGenerator>* out_gen, - util::BackpressureOptions backpressure); + AsyncGenerator>* out_gen); Future<> Finish() override { producer_.Push(IterationEnd>()); @@ -109,10 +110,8 @@ class ARROW_ENGINE_EXPORT SubstraitExecutor { arrow::PushGenerator>::Producer SubstraitSinkConsumer::MakeProducer( - AsyncGenerator>* out_gen, - util::BackpressureOptions backpressure) { - arrow::PushGenerator> push_gen( - std::move(backpressure)); + AsyncGenerator>* out_gen) { + arrow::PushGenerator> push_gen; auto out = push_gen.producer(); *out_gen = std::move(push_gen); return out; diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 584899b71e0..3e08253f329 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -69,7 +69,7 @@ endif() if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}") option(PYARROW_BUILD_CUDA "Build the PyArrow CUDA support" OFF) option(PYARROW_BUILD_FLIGHT "Build the PyArrow Flight integration" OFF) - option(PYARROW_BUILD_ENGINE "Build the PyArrow Engine integration" OFF) + option(PYARROW_BUILD_SUBSTRAIT "Build the PyArrow Substrait integration" OFF) option(PYARROW_BUILD_DATASET "Build the PyArrow Dataset integration" OFF) 
option(PYARROW_BUILD_GANDIVA "Build the PyArrow Gandiva integration" OFF) option(PYARROW_BUILD_PARQUET "Build the PyArrow Parquet integration" OFF) @@ -228,8 +228,8 @@ if(PYARROW_BUILD_FLIGHT) set(ARROW_FLIGHT TRUE) endif() -if(PYARROW_BUILD_ENGINE) - set(ARROW_ENGINE TRUE) +if(PYARROW_BUILD_SUBSTRAIT) + set(ARROW_SUBSTRAIT TRUE) endif() # Arrow @@ -541,14 +541,14 @@ if(PYARROW_BUILD_FLIGHT) endif() # Engine -if(PYARROW_BUILD_ENGINE) - find_package(ArrowEngine REQUIRED) +if(PYARROW_BUILD_SUBSTRAIT) + find_package(ArrowSubstrait REQUIRED) if(PYARROW_BUNDLE_ARROW_CPP) - bundle_arrow_lib(ARROW_ENGINE_SHARED_LIB SO_VERSION ${ARROW_SO_VERSION}) + bundle_arrow_lib(ARROW_SUBSTRAIT_SHARED_LIB SO_VERSION ${ARROW_SO_VERSION}) endif() - set(ENGINE_LINK_LIBS arrow_engine_shared) - set(CYTHON_EXTENSIONS ${CYTHON_EXTENSIONS} _engine) + set(SUBSTRAIT_LINK_LIBS arrow_substrait_shared) + set(CYTHON_EXTENSIONS ${CYTHON_EXTENSIONS} _substrait) endif() # Gandiva @@ -641,8 +641,8 @@ if(PYARROW_BUILD_FLIGHT) target_link_libraries(_flight PRIVATE ${FLIGHT_LINK_LIBS}) endif() -if(PYARROW_BUILD_ENGINE) - target_link_libraries(_engine PRIVATE ${ENGINE_LINK_LIBS}) +if(PYARROW_BUILD_SUBSTRAIT) + target_link_libraries(_substrait PRIVATE ${SUBSTRAIT_LINK_LIBS}) endif() if(PYARROW_BUILD_DATASET) diff --git a/python/pyarrow/_engine.pyx b/python/pyarrow/_substrait.pyx similarity index 99% rename from python/pyarrow/_engine.pyx rename to python/pyarrow/_substrait.pyx index 921915aa664..4a67f111fe8 100644 --- a/python/pyarrow/_engine.pyx +++ b/python/pyarrow/_substrait.pyx @@ -67,7 +67,7 @@ def _parse_json_plan(plan): Returns ------- - Buffer + Buffer pyarrow.Buffer object is returned. """ diff --git a/python/pyarrow/engine.py b/python/pyarrow/substrait.py similarity index 95% rename from python/pyarrow/engine.py rename to python/pyarrow/substrait.py index e975f4b616f..e3ff28f4eba 100644 --- a/python/pyarrow/engine.py +++ b/python/pyarrow/substrait.py @@ -15,6 +15,6 @@ # specific language governing permissions and limitations # under the License. -from pyarrow._engine import ( # noqa +from pyarrow._substrait import ( # noqa run_query, ) diff --git a/python/pyarrow/tests/conftest.py b/python/pyarrow/tests/conftest.py index 967fcac5954..a5aae6f634e 100644 --- a/python/pyarrow/tests/conftest.py +++ b/python/pyarrow/tests/conftest.py @@ -49,7 +49,6 @@ 'bz2', 'cython', 'dataset', - 'engine', 'hypothesis', 'fastparquet', 'gandiva', @@ -67,6 +66,7 @@ 'plasma', 's3', 'snappy', + 'substrait', 'tensorflow', 'flight', 'slow', @@ -79,7 +79,6 @@ 'bz2': Codec.is_available('bz2'), 'cython': False, 'dataset': False, - 'engine': False, 'fastparquet': False, 'flight': False, 'gandiva': False, @@ -100,6 +99,7 @@ 's3': False, 'slow': False, 'snappy': Codec.is_available('snappy'), + 'substrait': False, 'tensorflow': False, 'zstd': Codec.is_available('zstd'), } @@ -184,8 +184,8 @@ pass try: - import pyarrow.engine # noqa - defaults['engine'] = True + import pyarrow.substrait # noqa + defaults['substrait'] = True except ImportError: pass diff --git a/python/pyarrow/tests/test_substrait.py b/python/pyarrow/tests/test_substrait.py index 0a45a66864b..716cff85f74 100644 --- a/python/pyarrow/tests/test_substrait.py +++ b/python/pyarrow/tests/test_substrait.py @@ -24,13 +24,13 @@ import pytest try: - import pyarrow.engine as engine + import pyarrow.substrait as substrait except ImportError: - engine = None + substrait = None # Marks all of the tests in this module # Ignore these with pytest ... 
-m 'not engine' -pytestmark = pytest.mark.engine +pytestmark = pytest.mark.substrait _substrait_query = """ @@ -76,7 +76,7 @@ def test_run_query(): filename = str(resource_root() / "binary.parquet") query = tobytes(_substrait_query.replace("FILENAME_PLACEHOLDER", filename)) - reader = engine.run_query(query) + reader = substrait.run_query(query) res_tb = reader.read_all() expected_tb = pq.read_table(filename) @@ -89,9 +89,9 @@ def test_run_query_in_bytes(): query = tobytes(_substrait_query.replace("FILENAME_PLACEHOLDER", filename)) - buf = pa._engine._parse_json_plan(query) + buf = pa._substrait._parse_json_plan(query) - reader = engine.run_query(buf) + reader = substrait.run_query(buf) res_tb = reader.read_all() expected_tb = pq.read_table(filename) @@ -108,4 +108,4 @@ def test_invalid_plan(): """ exec_message = "ExecPlan has no node" with pytest.raises(ArrowInvalid, match=exec_message): - engine.run_query(tobytes(query)) + substrait.run_query(tobytes(query)) diff --git a/python/setup.py b/python/setup.py index a60cba7fe24..79ec3c8447e 100755 --- a/python/setup.py +++ b/python/setup.py @@ -108,7 +108,7 @@ def run(self): 'namespace of boost (default: boost)'), ('with-cuda', None, 'build the Cuda extension'), ('with-flight', None, 'build the Flight extension'), - ('with-engine', None, 'build the Engine extension'), + ('with-substrait', None, 'build the Substrait extension'), ('with-dataset', None, 'build the Dataset extension'), ('with-parquet', None, 'build the Parquet extension'), ('with-parquet-encryption', None, @@ -161,8 +161,8 @@ def initialize_options(self): os.environ.get('PYARROW_WITH_HDFS', '0')) self.with_cuda = strtobool( os.environ.get('PYARROW_WITH_CUDA', '0')) - self.with_engine = strtobool( - os.environ.get('PYARROW_WITH_ENGINE', '0')) + self.with_substrait = strtobool( + os.environ.get('PYARROW_WITH_SUBSTRAIT', '0')) self.with_flight = strtobool( os.environ.get('PYARROW_WITH_FLIGHT', '0')) self.with_dataset = strtobool( @@ -206,7 +206,6 @@ def initialize_options(self): '_json', '_compute', '_cuda', - '_engine', '_flight', '_dataset', '_dataset_orc', @@ -218,6 +217,7 @@ def initialize_options(self): '_orc', '_plasma', '_s3fs', + '_substrait', '_hdfs', '_hdfsio', 'gandiva'] @@ -272,7 +272,7 @@ def append_cmake_bool(value, varname): cmake_options += ['-G', self.cmake_generator] append_cmake_bool(self.with_cuda, 'PYARROW_BUILD_CUDA') - append_cmake_bool(self.with_engine, 'PYARROW_BUILD_ENGINE') + append_cmake_bool(self.with_substrait, 'PYARROW_BUILD_SUBSTRAIT') append_cmake_bool(self.with_flight, 'PYARROW_BUILD_FLIGHT') append_cmake_bool(self.with_gandiva, 'PYARROW_BUILD_GANDIVA') append_cmake_bool(self.with_dataset, 'PYARROW_BUILD_DATASET') @@ -398,8 +398,8 @@ def _bundle_arrow_cpp(self, build_prefix, build_lib): move_shared_libs(build_prefix, build_lib, "arrow_python") if self.with_cuda: move_shared_libs(build_prefix, build_lib, "arrow_cuda") - if self.with_engine: - move_shared_libs(build_prefix, build_lib, "arrow_engine") + if self.with_substrait: + move_shared_libs(build_prefix, build_lib, "arrow_substrait") if self.with_flight: move_shared_libs(build_prefix, build_lib, "arrow_flight") move_shared_libs(build_prefix, build_lib, @@ -445,7 +445,7 @@ def _failure_permitted(self, name): return True if name == '_flight' and not self.with_flight: return True - if name == '_engine' and not self.with_engine: + if name == '_substrait' and not self.with_substrait: return True if name == '_s3fs' and not self.with_s3: return True From 222cacbc66fa197d614831af93e0a9cd8726b073 Mon Sep 17 
00:00:00 2001 From: Vibhatha Abeykoon Date: Tue, 26 Apr 2022 19:49:39 +0530 Subject: [PATCH 46/74] fix import --- cpp/src/arrow/engine/substrait/util.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/engine/substrait/util.h b/cpp/src/arrow/engine/substrait/util.h index 65bea2a9a50..50684880a52 100644 --- a/cpp/src/arrow/engine/substrait/util.h +++ b/cpp/src/arrow/engine/substrait/util.h @@ -18,7 +18,7 @@ #pragma once #include -#include "arrow/engine/api.h" +#include "arrow/engine/substrait/api.h" #include "arrow/util/iterator.h" #include "arrow/util/optional.h" From 7c861cbc9deb3a2f6ae8d842ea1f8b6d47d4ce9d Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Thu, 28 Apr 2022 05:32:49 +0530 Subject: [PATCH 47/74] minor changes --- python/pyarrow/_substrait.pyx | 14 ++++++-------- python/pyarrow/tests/test_substrait.py | 11 ++++++++--- 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/python/pyarrow/_substrait.pyx b/python/pyarrow/_substrait.pyx index 4a67f111fe8..974f1f821f1 100644 --- a/python/pyarrow/_substrait.pyx +++ b/python/pyarrow/_substrait.pyx @@ -62,13 +62,13 @@ def _parse_json_plan(plan): Parameters ---------- - plan: byte + plan: bytes Parse a Substrait plan in JSON to a serialized plan. Returns ------- Buffer - pyarrow.Buffer object is returned. + A buffer containing the serialized Protobuf plan. """ cdef: @@ -76,10 +76,8 @@ def _parse_json_plan(plan): c_string c_str_plan shared_ptr[CBuffer] c_buf_plan - if isinstance(plan, bytes): - c_str_plan = plan - c_res_buffer = ParseJsonPlan(c_str_plan) - c_buf_plan = GetResultValue(c_res_buffer) - else: - raise ValueError("Expected plan in bytes.") + c_str_plan = plan + c_res_buffer = ParseJsonPlan(c_str_plan) + c_buf_plan = GetResultValue(c_res_buffer) + return pyarrow_wrap_buffer(c_buf_plan) diff --git a/python/pyarrow/tests/test_substrait.py b/python/pyarrow/tests/test_substrait.py index 716cff85f74..82c2e56ac96 100644 --- a/python/pyarrow/tests/test_substrait.py +++ b/python/pyarrow/tests/test_substrait.py @@ -20,14 +20,19 @@ import pyarrow as pa from pyarrow.lib import tobytes from pyarrow.lib import ArrowInvalid -import pyarrow.parquet as pq -import pytest + +try: + import pyarrow.parquet as pq +except ImportError: + pq = None try: import pyarrow.substrait as substrait except ImportError: substrait = None +import pytest + # Marks all of the tests in this module # Ignore these with pytest ... 
-m 'not engine' pytestmark = pytest.mark.substrait @@ -84,7 +89,7 @@ def test_run_query(): assert expected_tb.num_rows == res_tb.num_rows -def test_run_query_in_bytes(): +def test_run_serialized_query(): filename = str(resource_root() / "binary.parquet") query = tobytes(_substrait_query.replace("FILENAME_PLACEHOLDER", filename)) From ee73c4080790c9aa8a8ae62213fa033db0a69521 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Thu, 28 Apr 2022 07:36:20 +0530 Subject: [PATCH 48/74] rebase --- cpp/src/arrow/engine/substrait/serde_test.cc | 6 ++- cpp/src/arrow/engine/substrait/util.cc | 13 +------ cpp/src/arrow/engine/substrait/util.h | 4 -- python/pyarrow/_substrait.pyx | 15 ++------ python/pyarrow/tests/test_substrait.py | 40 +++++++------------- 5 files changed, 24 insertions(+), 54 deletions(-) diff --git a/cpp/src/arrow/engine/substrait/serde_test.cc b/cpp/src/arrow/engine/substrait/serde_test.cc index 7df5cd0529d..d74dba5ce0e 100644 --- a/cpp/src/arrow/engine/substrait/serde_test.cc +++ b/cpp/src/arrow/engine/substrait/serde_test.cc @@ -801,7 +801,8 @@ Result GetSubstraitJSON() { TEST(Substrait, GetRecordBatchReader) { ASSERT_OK_AND_ASSIGN(std::string substrait_json, GetSubstraitJSON()); - ASSERT_OK_AND_ASSIGN(auto reader, engine::ExecuteJsonPlan(substrait_json)); + ASSERT_OK_AND_ASSIGN(auto buf, engine::ParseJsonPlan(substrait_json)); + ASSERT_OK_AND_ASSIGN(auto reader, engine::ExecuteSerializedPlan(buf)); ASSERT_OK_AND_ASSIGN(auto table, Table::FromRecordBatchReader(reader.get())); EXPECT_GT(table->num_rows(), 0); } @@ -811,7 +812,8 @@ TEST(Substrait, InvalidPlan) { "relations": [ ] })"; - ASSERT_RAISES(Invalid, engine::ExecuteJsonPlan(substrait_json)); + ASSERT_OK_AND_ASSIGN(auto buf, engine::ParseJsonPlan(substrait_json)); + ASSERT_RAISES(Invalid, engine::ExecuteSerializedPlan(buf)); } } // namespace engine diff --git a/cpp/src/arrow/engine/substrait/util.cc b/cpp/src/arrow/engine/substrait/util.cc index 4b6ae40d213..e493afc1b8f 100644 --- a/cpp/src/arrow/engine/substrait/util.cc +++ b/cpp/src/arrow/engine/substrait/util.cc @@ -47,11 +47,8 @@ class ARROW_ENGINE_EXPORT SubstraitSinkConsumer : public compute::SinkNodeConsum Future<> Finish() override { producer_.Push(IterationEnd>()); - if (producer_.Close()) { - return Future<>::MakeFinished(); - } - return Future<>::MakeFinished( - Status::ExecutionError("Error occurred in closing the batch producer")); + ARROW_UNUSED(producer_.Close()); + return Future<>::MakeFinished(); } private: @@ -117,12 +114,6 @@ SubstraitSinkConsumer::MakeProducer( return out; } -Result> ExecuteJsonPlan( - const std::string& substrait_json) { - ARROW_ASSIGN_OR_RAISE(auto substrait_buffer, ParseJsonPlan(substrait_json)); - return ExecuteSerializedPlan(substrait_buffer); -} - Result> ExecuteSerializedPlan( std::shared_ptr substrait_buffer) { arrow::AsyncGenerator> sink_gen; diff --git a/cpp/src/arrow/engine/substrait/util.h b/cpp/src/arrow/engine/substrait/util.h index 50684880a52..00ce43863c8 100644 --- a/cpp/src/arrow/engine/substrait/util.h +++ b/cpp/src/arrow/engine/substrait/util.h @@ -26,10 +26,6 @@ namespace arrow { namespace engine { -/// \brief Retrieve a RecordBatchReader from a Substrait plan in JSON. -ARROW_ENGINE_EXPORT Result> ExecuteJsonPlan( - const std::string& substrait_json); - /// \brief Retrieve a RecordBatchReader from a Substrait plan. 
ARROW_ENGINE_EXPORT Result> ExecuteSerializedPlan( std::shared_ptr substrait_buffer); diff --git a/python/pyarrow/_substrait.pyx b/python/pyarrow/_substrait.pyx index 974f1f821f1..03358e4a84a 100644 --- a/python/pyarrow/_substrait.pyx +++ b/python/pyarrow/_substrait.pyx @@ -28,9 +28,8 @@ def run_query(plan): Parameters ---------- - plan : bytes or Buffer - Substrait plan can be fed as a serialized plan (Buffer) - or a JSON plan. + plan : Buffer + Substrait plan can be fed as a serialized plan (Buffer). """ cdef: @@ -40,14 +39,8 @@ def run_query(plan): c_string c_str_plan shared_ptr[CBuffer] c_buf_plan - if isinstance(plan, bytes): - c_str_plan = plan - c_res_reader = ExecuteJsonPlan(c_str_plan) - elif isinstance(plan, Buffer): - c_buf_plan = pyarrow_unwrap_buffer(plan) - c_res_reader = ExecuteSerializedPlan(c_buf_plan) - else: - raise ValueError("Expected bytes or pyarrow.Buffer") + c_buf_plan = pyarrow_unwrap_buffer(plan) + c_res_reader = ExecuteSerializedPlan(c_buf_plan) c_reader = GetResultValue(c_res_reader) diff --git a/python/pyarrow/tests/test_substrait.py b/python/pyarrow/tests/test_substrait.py index 82c2e56ac96..3f837ec0df4 100644 --- a/python/pyarrow/tests/test_substrait.py +++ b/python/pyarrow/tests/test_substrait.py @@ -38,7 +38,17 @@ pytestmark = pytest.mark.substrait -_substrait_query = """ +def resource_root(): + """Get the path to the test resources directory.""" + if not os.environ.get("PARQUET_TEST_DATA"): + raise RuntimeError("Test resources not found; set " + "PARQUET_TEST_DATA to " + "/cpp/submodules/parquet-testing/data") + return pathlib.Path(os.environ["PARQUET_TEST_DATA"]) + + +def test_run_serialized_query(): + substrait_query = """ { "relations": [ {"rel": { @@ -67,32 +77,9 @@ } """ - -def resource_root(): - """Get the path to the test resources directory.""" - if not os.environ.get("PARQUET_TEST_DATA"): - raise RuntimeError("Test resources not found; set " - "PARQUET_TEST_DATA to " - "/cpp/submodules/parquet-testing/data") - return pathlib.Path(os.environ["PARQUET_TEST_DATA"]) - - -def test_run_query(): - filename = str(resource_root() / "binary.parquet") - - query = tobytes(_substrait_query.replace("FILENAME_PLACEHOLDER", filename)) - reader = substrait.run_query(query) - res_tb = reader.read_all() - - expected_tb = pq.read_table(filename) - - assert expected_tb.num_rows == res_tb.num_rows - - -def test_run_serialized_query(): filename = str(resource_root() / "binary.parquet") - query = tobytes(_substrait_query.replace("FILENAME_PLACEHOLDER", filename)) + query = tobytes(substrait_query.replace("FILENAME_PLACEHOLDER", filename)) buf = pa._substrait._parse_json_plan(query) @@ -111,6 +98,7 @@ def test_invalid_plan(): ] } """ + buf = pa._substrait._parse_json_plan(tobytes(query)) exec_message = "ExecPlan has no node" with pytest.raises(ArrowInvalid, match=exec_message): - substrait.run_query(tobytes(query)) + substrait.run_query(buf) From 256f51439fbe9297762836058a9b157ef8e3ee77 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Thu, 28 Apr 2022 08:04:47 +0530 Subject: [PATCH 49/74] testing windows ci issue --- cpp/src/arrow/engine/substrait/serde_test.cc | 34 +++++++++++++------- 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/cpp/src/arrow/engine/substrait/serde_test.cc b/cpp/src/arrow/engine/substrait/serde_test.cc index d74dba5ce0e..fafe03f5883 100644 --- a/cpp/src/arrow/engine/substrait/serde_test.cc +++ b/cpp/src/arrow/engine/substrait/serde_test.cc @@ -753,12 +753,22 @@ TEST(Substrait, ExtensionSetFromPlanMissingFunc) { &ext_set)); } 
-Result GetSubstraitJSON() { +Result GetDataPath(const std::string& file_name) { + std::stringstream ss; ARROW_ASSIGN_OR_RAISE(std::string dir_string, arrow::internal::GetEnvVar("PARQUET_TEST_DATA")); - auto file_name = - arrow::internal::PlatformFilename::FromString(dir_string)->Join("binary.parquet"); - auto file_path = file_name->ToString(); + ss << dir_string; + ss << "/" << file_name; + return ss.str(); +} + +Result GetSubstraitJSON() { + // ARROW_ASSIGN_OR_RAISE(std::string dir_string, + // arrow::internal::GetEnvVar("PARQUET_TEST_DATA")); + // auto file_name = + // arrow::internal::PlatformFilename::FromString(dir_string)->Join("binary.parquet"); + ARROW_ASSIGN_OR_RAISE(auto file_path, GetDataPath("binary.parquet")); + // auto file_path = file_name->ToString(); std::string substrait_json = R"({ "relations": [ {"rel": { @@ -776,7 +786,7 @@ Result GetSubstraitJSON() { "local_files": { "items": [ { - "uri_file": "FILENAME_PLACEHOLDER", + "uri_file": "file://FILENAME_PLACEHOLDER", "format": "FILE_FORMAT_PARQUET" } ] @@ -785,13 +795,13 @@ Result GetSubstraitJSON() { }} ] })"; -#ifdef _WIN32 - // Path is supposed to start with "X:/..." - file_path = "file:///" + file_path; -#else - // Path is supposed to start with "/..." - file_path = "file://" + file_path; -#endif + // #ifdef _WIN32 + // // Path is supposed to start with "X:/..." + // file_path = "file:///" + file_path; + // #else + // // Path is supposed to start with "/..." + // file_path = "file://" + file_path; + // #endif std::cout << "File Path : >>>>" << file_path << std::endl; std::string filename_placeholder = "FILENAME_PLACEHOLDER"; substrait_json.replace(substrait_json.find(filename_placeholder), From e3ed91ddc5b4ea3488931e14576aa197114007f7 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Thu, 28 Apr 2022 08:34:01 +0530 Subject: [PATCH 50/74] testing path --- cpp/src/arrow/engine/substrait/serde_test.cc | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/cpp/src/arrow/engine/substrait/serde_test.cc b/cpp/src/arrow/engine/substrait/serde_test.cc index fafe03f5883..5b07a69aad8 100644 --- a/cpp/src/arrow/engine/substrait/serde_test.cc +++ b/cpp/src/arrow/engine/substrait/serde_test.cc @@ -763,6 +763,8 @@ Result GetDataPath(const std::string& file_name) { } Result GetSubstraitJSON() { + // TODO: testing the Windows build issue + // following comments will be removed accordingly // ARROW_ASSIGN_OR_RAISE(std::string dir_string, // arrow::internal::GetEnvVar("PARQUET_TEST_DATA")); // auto file_name = @@ -786,7 +788,7 @@ Result GetSubstraitJSON() { "local_files": { "items": [ { - "uri_file": "file://FILENAME_PLACEHOLDER", + "uri_file": "FILENAME_PLACEHOLDER", "format": "FILE_FORMAT_PARQUET" } ] @@ -795,13 +797,14 @@ Result GetSubstraitJSON() { }} ] })"; - // #ifdef _WIN32 - // // Path is supposed to start with "X:/..." - // file_path = "file:///" + file_path; - // #else - // // Path is supposed to start with "/..." - // file_path = "file://" + file_path; - // #endif +// fixing path for OS +#ifdef _WIN32 + // Path is supposed to start with "X:/..." + file_path = "file:///" + file_path; +#else + // Path is supposed to start with "/..." 
+ file_path = "file://" + file_path; +#endif std::cout << "File Path : >>>>" << file_path << std::endl; std::string filename_placeholder = "FILENAME_PLACEHOLDER"; substrait_json.replace(substrait_json.find(filename_placeholder), From ffcf56780adca2e0753b4ece066dbfa192ae1fdc Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Thu, 28 Apr 2022 12:27:56 +0530 Subject: [PATCH 51/74] updating the test case skipping windows --- cpp/src/arrow/engine/substrait/serde_test.cc | 37 +++++++------------- 1 file changed, 12 insertions(+), 25 deletions(-) diff --git a/cpp/src/arrow/engine/substrait/serde_test.cc b/cpp/src/arrow/engine/substrait/serde_test.cc index 5b07a69aad8..de7adc50713 100644 --- a/cpp/src/arrow/engine/substrait/serde_test.cc +++ b/cpp/src/arrow/engine/substrait/serde_test.cc @@ -31,6 +31,10 @@ #include "arrow/testing/matchers.h" #include "arrow/util/key_value_metadata.h" +#include +#include +#include + using testing::ElementsAre; using testing::Eq; using testing::HasSubstr; @@ -753,24 +757,12 @@ TEST(Substrait, ExtensionSetFromPlanMissingFunc) { &ext_set)); } -Result GetDataPath(const std::string& file_name) { - std::stringstream ss; +Result GetSubstraitJSON() { ARROW_ASSIGN_OR_RAISE(std::string dir_string, arrow::internal::GetEnvVar("PARQUET_TEST_DATA")); - ss << dir_string; - ss << "/" << file_name; - return ss.str(); -} - -Result GetSubstraitJSON() { - // TODO: testing the Windows build issue - // following comments will be removed accordingly - // ARROW_ASSIGN_OR_RAISE(std::string dir_string, - // arrow::internal::GetEnvVar("PARQUET_TEST_DATA")); - // auto file_name = - // arrow::internal::PlatformFilename::FromString(dir_string)->Join("binary.parquet"); - ARROW_ASSIGN_OR_RAISE(auto file_path, GetDataPath("binary.parquet")); - // auto file_path = file_name->ToString(); + auto file_name = + arrow::internal::PlatformFilename::FromString(dir_string)->Join("binary.parquet"); + auto file_path = file_name->ToString(); std::string substrait_json = R"({ "relations": [ {"rel": { @@ -797,15 +789,6 @@ Result GetSubstraitJSON() { }} ] })"; -// fixing path for OS -#ifdef _WIN32 - // Path is supposed to start with "X:/..." - file_path = "file:///" + file_path; -#else - // Path is supposed to start with "/..." 
- file_path = "file://" + file_path; -#endif - std::cout << "File Path : >>>>" << file_path << std::endl; std::string filename_placeholder = "FILENAME_PLACEHOLDER"; substrait_json.replace(substrait_json.find(filename_placeholder), filename_placeholder.size(), file_path); @@ -813,11 +796,15 @@ Result GetSubstraitJSON() { } TEST(Substrait, GetRecordBatchReader) { +#ifdef _WIN32 + GTEST_SKIP() << "Substrait File URI not supported for Windows"; +#else ASSERT_OK_AND_ASSIGN(std::string substrait_json, GetSubstraitJSON()); ASSERT_OK_AND_ASSIGN(auto buf, engine::ParseJsonPlan(substrait_json)); ASSERT_OK_AND_ASSIGN(auto reader, engine::ExecuteSerializedPlan(buf)); ASSERT_OK_AND_ASSIGN(auto table, Table::FromRecordBatchReader(reader.get())); EXPECT_GT(table->num_rows(), 0); +#endif } TEST(Substrait, InvalidPlan) { From db5dba1018d0a270c0db9741ce77c00b06820325 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Thu, 28 Apr 2022 12:55:35 +0530 Subject: [PATCH 52/74] remove unnecessary imports and adding skip for windows --- cpp/src/arrow/engine/substrait/serde_test.cc | 4 ---- python/pyarrow/tests/test_substrait.py | 3 +++ 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/engine/substrait/serde_test.cc b/cpp/src/arrow/engine/substrait/serde_test.cc index de7adc50713..3c41d26f85b 100644 --- a/cpp/src/arrow/engine/substrait/serde_test.cc +++ b/cpp/src/arrow/engine/substrait/serde_test.cc @@ -31,10 +31,6 @@ #include "arrow/testing/matchers.h" #include "arrow/util/key_value_metadata.h" -#include -#include -#include - using testing::ElementsAre; using testing::Eq; using testing::HasSubstr; diff --git a/python/pyarrow/tests/test_substrait.py b/python/pyarrow/tests/test_substrait.py index 3f837ec0df4..b7486366d88 100644 --- a/python/pyarrow/tests/test_substrait.py +++ b/python/pyarrow/tests/test_substrait.py @@ -32,6 +32,7 @@ substrait = None import pytest +import sys # Marks all of the tests in this module # Ignore these with pytest ... 
-m 'not engine' @@ -47,6 +48,8 @@ def resource_root(): return pathlib.Path(os.environ["PARQUET_TEST_DATA"]) +@pytest.mark.skipif(sys.platform == 'win32', + reason="file based URI is not fully supported for Windows") def test_run_serialized_query(): substrait_query = """ { From 717de203ad3c4b7a227e5e834b6991feb9fbe6d4 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Thu, 28 Apr 2022 19:27:17 +0530 Subject: [PATCH 53/74] addressing feedbacks --- cpp/src/arrow/engine/substrait/serde_test.cc | 4 ++-- cpp/src/arrow/engine/substrait/util.cc | 2 ++ python/pyarrow/_substrait.pyx | 4 ++-- python/pyarrow/tests/test_substrait.py | 6 +++--- 4 files changed, 9 insertions(+), 7 deletions(-) diff --git a/cpp/src/arrow/engine/substrait/serde_test.cc b/cpp/src/arrow/engine/substrait/serde_test.cc index 3c41d26f85b..14668f1b303 100644 --- a/cpp/src/arrow/engine/substrait/serde_test.cc +++ b/cpp/src/arrow/engine/substrait/serde_test.cc @@ -776,7 +776,7 @@ Result GetSubstraitJSON() { "local_files": { "items": [ { - "uri_file": "FILENAME_PLACEHOLDER", + "uri_file": "file://FILENAME_PLACEHOLDER", "format": "FILE_FORMAT_PARQUET" } ] @@ -793,7 +793,7 @@ Result GetSubstraitJSON() { TEST(Substrait, GetRecordBatchReader) { #ifdef _WIN32 - GTEST_SKIP() << "Substrait File URI not supported for Windows"; + GTEST_SKIP() << "ARROW-16392: Substrait File URI not supported for Windows"; #else ASSERT_OK_AND_ASSIGN(std::string substrait_json, GetSubstraitJSON()); ASSERT_OK_AND_ASSIGN(auto buf, engine::ParseJsonPlan(substrait_json)); diff --git a/cpp/src/arrow/engine/substrait/util.cc b/cpp/src/arrow/engine/substrait/util.cc index e493afc1b8f..296e7948dfa 100644 --- a/cpp/src/arrow/engine/substrait/util.cc +++ b/cpp/src/arrow/engine/substrait/util.cc @@ -23,6 +23,7 @@ namespace arrow { namespace engine { +namespace { /// \brief A SinkNodeConsumer specialized to output ExecBatches via PushGenerator class ARROW_ENGINE_EXPORT SubstraitSinkConsumer : public compute::SinkNodeConsumer { public: @@ -113,6 +114,7 @@ SubstraitSinkConsumer::MakeProducer( *out_gen = std::move(push_gen); return out; } +} // namespace Result> ExecuteSerializedPlan( std::shared_ptr substrait_buffer) { diff --git a/python/pyarrow/_substrait.pyx b/python/pyarrow/_substrait.pyx index 03358e4a84a..487663bc3b4 100644 --- a/python/pyarrow/_substrait.pyx +++ b/python/pyarrow/_substrait.pyx @@ -24,12 +24,12 @@ from pyarrow.includes.libarrow cimport * def run_query(plan): """ - Executes a substrait plan and returns a RecordBatchReader. + Execute a Substrait plan and read the results as a RecordBatchReader. Parameters ---------- plan : Buffer - Substrait plan can be fed as a serialized plan (Buffer). + The serialized Substrait plan to execute. """ cdef: diff --git a/python/pyarrow/tests/test_substrait.py b/python/pyarrow/tests/test_substrait.py index b7486366d88..ca8628c4ad1 100644 --- a/python/pyarrow/tests/test_substrait.py +++ b/python/pyarrow/tests/test_substrait.py @@ -35,8 +35,8 @@ import sys # Marks all of the tests in this module -# Ignore these with pytest ... -m 'not engine' -pytestmark = pytest.mark.substrait +# Ignore these with pytest ... 
-m 'not substrait' +pytestmark = [pytest.mark.parquet, pytest.mark.substrait] def resource_root(): @@ -49,7 +49,7 @@ def resource_root(): @pytest.mark.skipif(sys.platform == 'win32', - reason="file based URI is not fully supported for Windows") + reason="ARROW-16392: file based URI is not fully supported for Windows") def test_run_serialized_query(): substrait_query = """ { From b3a36ada51c01c51f4fe120ffb949f2bbf7af8a1 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Thu, 28 Apr 2022 19:32:11 +0530 Subject: [PATCH 54/74] fix the commit order and formatting in pytest --- python/pyarrow/tests/test_substrait.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/python/pyarrow/tests/test_substrait.py b/python/pyarrow/tests/test_substrait.py index ca8628c4ad1..dd475b3a41b 100644 --- a/python/pyarrow/tests/test_substrait.py +++ b/python/pyarrow/tests/test_substrait.py @@ -16,7 +16,10 @@ # under the License. import os +import sys import pathlib +import pytest + import pyarrow as pa from pyarrow.lib import tobytes from pyarrow.lib import ArrowInvalid @@ -31,9 +34,6 @@ except ImportError: substrait = None -import pytest -import sys - # Marks all of the tests in this module # Ignore these with pytest ... -m 'not substrait' pytestmark = [pytest.mark.parquet, pytest.mark.substrait] @@ -49,7 +49,8 @@ def resource_root(): @pytest.mark.skipif(sys.platform == 'win32', - reason="ARROW-16392: file based URI is not fully supported for Windows") + reason="ARROW-16392: file based URI is" + + " not fully supported for Windows") def test_run_serialized_query(): substrait_query = """ { From 18099329c082025ed4c880312ffa6e05d99edbd2 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Thu, 28 Apr 2022 20:25:31 +0530 Subject: [PATCH 55/74] removed the export keyword --- cpp/src/arrow/engine/substrait/util.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/engine/substrait/util.cc b/cpp/src/arrow/engine/substrait/util.cc index 296e7948dfa..0cc11260b15 100644 --- a/cpp/src/arrow/engine/substrait/util.cc +++ b/cpp/src/arrow/engine/substrait/util.cc @@ -25,7 +25,7 @@ namespace engine { namespace { /// \brief A SinkNodeConsumer specialized to output ExecBatches via PushGenerator -class ARROW_ENGINE_EXPORT SubstraitSinkConsumer : public compute::SinkNodeConsumer { +class SubstraitSinkConsumer : public compute::SinkNodeConsumer { public: explicit SubstraitSinkConsumer( AsyncGenerator>* generator) @@ -59,7 +59,7 @@ class ARROW_ENGINE_EXPORT SubstraitSinkConsumer : public compute::SinkNodeConsum /// \brief An executor to run a Substrait Query /// This interface is provided as a utility when creating language /// bindings for consuming a Substrait plan. 
-class ARROW_ENGINE_EXPORT SubstraitExecutor { +class SubstraitExecutor { public: explicit SubstraitExecutor( std::shared_ptr substrait_buffer, From f40c2cf3a3f03274c4a6e92e78922fec57cf2040 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Sat, 30 Apr 2022 08:06:53 +0530 Subject: [PATCH 56/74] updating test condition --- cpp/src/arrow/engine/substrait/serde_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/engine/substrait/serde_test.cc b/cpp/src/arrow/engine/substrait/serde_test.cc index 14668f1b303..0fe2662590e 100644 --- a/cpp/src/arrow/engine/substrait/serde_test.cc +++ b/cpp/src/arrow/engine/substrait/serde_test.cc @@ -799,7 +799,7 @@ TEST(Substrait, GetRecordBatchReader) { ASSERT_OK_AND_ASSIGN(auto buf, engine::ParseJsonPlan(substrait_json)); ASSERT_OK_AND_ASSIGN(auto reader, engine::ExecuteSerializedPlan(buf)); ASSERT_OK_AND_ASSIGN(auto table, Table::FromRecordBatchReader(reader.get())); - EXPECT_GT(table->num_rows(), 0); + EXPECT_EQ(table->num_rows(), 12); #endif } From 749be1b192d2be1306ec6925ccbc76a216a29045 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Sat, 30 Apr 2022 08:08:24 +0530 Subject: [PATCH 57/74] adding comment for test --- cpp/src/arrow/engine/substrait/serde_test.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cpp/src/arrow/engine/substrait/serde_test.cc b/cpp/src/arrow/engine/substrait/serde_test.cc index 0fe2662590e..97b04552174 100644 --- a/cpp/src/arrow/engine/substrait/serde_test.cc +++ b/cpp/src/arrow/engine/substrait/serde_test.cc @@ -799,6 +799,8 @@ TEST(Substrait, GetRecordBatchReader) { ASSERT_OK_AND_ASSIGN(auto buf, engine::ParseJsonPlan(substrait_json)); ASSERT_OK_AND_ASSIGN(auto reader, engine::ExecuteSerializedPlan(buf)); ASSERT_OK_AND_ASSIGN(auto table, Table::FromRecordBatchReader(reader.get())); + // Note: assuming the binary.parquet file contains fixed amount of records + // in case of a test failure, re-evalaute the content in the file EXPECT_EQ(table->num_rows(), 12); #endif } From 3d808cfcc8c468ef1c8569452f2824781d8d11ac Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Tue, 3 May 2022 17:06:32 +0530 Subject: [PATCH 58/74] update libarrow.pxd --- python/pyarrow/includes/libarrow.pxd | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 2e51864b860..ffbe973f382 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -2696,3 +2696,9 @@ cdef extern from "arrow/python/udf.h" namespace "arrow::py": CStatus RegisterScalarFunction(PyObject* function, function[CallbackUdf] wrapper, const CScalarUdfOptions& options) + +cdef extern from "arrow/engine/substrait/util.h" namespace "arrow::engine" nogil: + CResult[shared_ptr[CRecordBatchReader]] ExecuteSerializedPlan(shared_ptr[CBuffer] substrait_buffer) + CResult[shared_ptr[CBuffer]] ParseJsonPlan(const c_string& substrait_json) + + From 3b0da1fc7b829b4586edad20f4fc0383be8b51bd Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Tue, 3 May 2022 17:25:04 +0530 Subject: [PATCH 59/74] rebase --- cpp/src/arrow/python/CMakeLists.txt | 3 ++- python/pyarrow/includes/libarrow.pxd | 6 ++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/cpp/src/arrow/python/CMakeLists.txt b/cpp/src/arrow/python/CMakeLists.txt index 5ba83b5a490..c37240a426c 100644 --- a/cpp/src/arrow/python/CMakeLists.txt +++ b/cpp/src/arrow/python/CMakeLists.txt @@ -44,7 +44,8 @@ set(ARROW_PYTHON_SRCS numpy_to_arrow.cc python_to_arrow.cc pyarrow.cc 
- serialize.cc) + serialize.cc + udf.cc) set_source_files_properties(init.cc PROPERTIES SKIP_PRECOMPILE_HEADERS ON SKIP_UNITY_BUILD_INCLUSION ON) diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index ffbe973f382..2007cc8a516 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -2698,7 +2698,5 @@ cdef extern from "arrow/python/udf.h" namespace "arrow::py": function[CallbackUdf] wrapper, const CScalarUdfOptions& options) cdef extern from "arrow/engine/substrait/util.h" namespace "arrow::engine" nogil: - CResult[shared_ptr[CRecordBatchReader]] ExecuteSerializedPlan(shared_ptr[CBuffer] substrait_buffer) - CResult[shared_ptr[CBuffer]] ParseJsonPlan(const c_string& substrait_json) - - + CResult[shared_ptr[CRecordBatchReader]] ExecuteSerializedPlan(shared_ptr[CBuffer] substrait_buffer) + CResult[shared_ptr[CBuffer]] ParseJsonPlan(const c_string& substrait_json) From d66096029ec3f92c3965c701c61e6d26ab2c9190 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Fri, 6 May 2022 18:25:58 +0530 Subject: [PATCH 60/74] changing the buffer usage in the substrait consumer --- cpp/src/arrow/engine/substrait/serde_test.cc | 4 ++-- cpp/src/arrow/engine/substrait/util.cc | 23 +++++++------------- cpp/src/arrow/engine/substrait/util.h | 2 +- python/pyarrow/_substrait.pyx | 3 ++- python/pyarrow/includes/libarrow.pxd | 2 +- python/pyarrow/tests/test_substrait.py | 4 ++-- 6 files changed, 16 insertions(+), 22 deletions(-) diff --git a/cpp/src/arrow/engine/substrait/serde_test.cc b/cpp/src/arrow/engine/substrait/serde_test.cc index 97b04552174..13fef3b277d 100644 --- a/cpp/src/arrow/engine/substrait/serde_test.cc +++ b/cpp/src/arrow/engine/substrait/serde_test.cc @@ -797,7 +797,7 @@ TEST(Substrait, GetRecordBatchReader) { #else ASSERT_OK_AND_ASSIGN(std::string substrait_json, GetSubstraitJSON()); ASSERT_OK_AND_ASSIGN(auto buf, engine::ParseJsonPlan(substrait_json)); - ASSERT_OK_AND_ASSIGN(auto reader, engine::ExecuteSerializedPlan(buf)); + ASSERT_OK_AND_ASSIGN(auto reader, engine::ExecuteSerializedPlan(*buf)); ASSERT_OK_AND_ASSIGN(auto table, Table::FromRecordBatchReader(reader.get())); // Note: assuming the binary.parquet file contains fixed amount of records // in case of a test failure, re-evalaute the content in the file @@ -811,7 +811,7 @@ TEST(Substrait, InvalidPlan) { ] })"; ASSERT_OK_AND_ASSIGN(auto buf, engine::ParseJsonPlan(substrait_json)); - ASSERT_RAISES(Invalid, engine::ExecuteSerializedPlan(buf)); + ASSERT_RAISES(Invalid, engine::ExecuteSerializedPlan(*buf)); } } // namespace engine diff --git a/cpp/src/arrow/engine/substrait/util.cc b/cpp/src/arrow/engine/substrait/util.cc index 0cc11260b15..880b8c2d54a 100644 --- a/cpp/src/arrow/engine/substrait/util.cc +++ b/cpp/src/arrow/engine/substrait/util.cc @@ -47,7 +47,6 @@ class SubstraitSinkConsumer : public compute::SinkNodeConsumer { AsyncGenerator>* out_gen); Future<> Finish() override { - producer_.Push(IterationEnd>()); ARROW_UNUSED(producer_.Close()); return Future<>::MakeFinished(); } @@ -62,13 +61,9 @@ class SubstraitSinkConsumer : public compute::SinkNodeConsumer { class SubstraitExecutor { public: explicit SubstraitExecutor( - std::shared_ptr substrait_buffer, AsyncGenerator>* generator, std::shared_ptr plan, compute::ExecContext exec_context) - : substrait_buffer_(std::move(substrait_buffer)), - generator_(generator), - plan_(std::move(plan)), - exec_context_(exec_context) {} + : generator_(generator), plan_(std::move(plan)), exec_context_(exec_context) {} 
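// Rough call sequence for this executor after this change, mirroring how
// ExecuteSerializedPlan (further down in this same diff) drives it:
//   SubstraitExecutor executor(&sink_gen, std::move(plan), exec_context);
//   RETURN_NOT_OK(executor.Init(substrait_buffer));          // deserialize declarations
//   ARROW_ASSIGN_OR_RAISE(auto reader, executor.Execute());  // start the plan, get a reader
//   RETURN_NOT_OK(executor.Close());                         // wait for the plan to finish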
Result> Execute() { for (const compute::Declaration& decl : declarations_) { @@ -86,20 +81,19 @@ class SubstraitExecutor { Status Close() { return plan_->finished().status(); } - Status Init() { - if (substrait_buffer_ == NULLPTR) { - return Status::Invalid("Buffer containing Substrait plan is null."); + Status Init(const Buffer& substrait_buffer) { + if (substrait_buffer.size() == 0) { + return Status::Invalid("Empty substrait plan is passed."); } std::function()> consumer_factory = [&] { return std::make_shared(generator_); }; ARROW_ASSIGN_OR_RAISE(declarations_, - engine::DeserializePlan(*substrait_buffer_, consumer_factory)); + engine::DeserializePlan(substrait_buffer, consumer_factory)); return Status::OK(); } private: - std::shared_ptr substrait_buffer_; AsyncGenerator>* generator_; std::vector declarations_; std::shared_ptr plan_; @@ -117,14 +111,13 @@ SubstraitSinkConsumer::MakeProducer( } // namespace Result> ExecuteSerializedPlan( - std::shared_ptr substrait_buffer) { + const Buffer& substrait_buffer) { arrow::AsyncGenerator> sink_gen; ARROW_ASSIGN_OR_RAISE(auto plan, compute::ExecPlan::Make()); compute::ExecContext exec_context(arrow::default_memory_pool(), ::arrow::internal::GetCpuThreadPool()); - arrow::engine::SubstraitExecutor executor(std::move(substrait_buffer), &sink_gen, - std::move(plan), exec_context); - RETURN_NOT_OK(executor.Init()); + arrow::engine::SubstraitExecutor executor(&sink_gen, std::move(plan), exec_context); + RETURN_NOT_OK(executor.Init(substrait_buffer)); ARROW_ASSIGN_OR_RAISE(auto sink_reader, executor.Execute()); RETURN_NOT_OK(executor.Close()); return sink_reader; diff --git a/cpp/src/arrow/engine/substrait/util.h b/cpp/src/arrow/engine/substrait/util.h index 00ce43863c8..9e93de0d898 100644 --- a/cpp/src/arrow/engine/substrait/util.h +++ b/cpp/src/arrow/engine/substrait/util.h @@ -28,7 +28,7 @@ namespace engine { /// \brief Retrieve a RecordBatchReader from a Substrait plan. ARROW_ENGINE_EXPORT Result> ExecuteSerializedPlan( - std::shared_ptr substrait_buffer); + const Buffer& substrait_buffer); /// \brief Get a Serialized Plan from a Substrait JSON plan. /// This is a helper method for Python tests. diff --git a/python/pyarrow/_substrait.pyx b/python/pyarrow/_substrait.pyx index 487663bc3b4..75b562ebfee 100644 --- a/python/pyarrow/_substrait.pyx +++ b/python/pyarrow/_substrait.pyx @@ -16,6 +16,7 @@ # under the License. 
# cython: language_level = 3 +from cython.operator cimport dereference as deref from pyarrow import Buffer from pyarrow.lib cimport * @@ -40,7 +41,7 @@ def run_query(plan): shared_ptr[CBuffer] c_buf_plan c_buf_plan = pyarrow_unwrap_buffer(plan) - c_res_reader = ExecuteSerializedPlan(c_buf_plan) + c_res_reader = ExecuteSerializedPlan(deref(c_buf_plan)) c_reader = GetResultValue(c_res_reader) diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 2007cc8a516..e0d215fbc60 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -2698,5 +2698,5 @@ cdef extern from "arrow/python/udf.h" namespace "arrow::py": function[CallbackUdf] wrapper, const CScalarUdfOptions& options) cdef extern from "arrow/engine/substrait/util.h" namespace "arrow::engine" nogil: - CResult[shared_ptr[CRecordBatchReader]] ExecuteSerializedPlan(shared_ptr[CBuffer] substrait_buffer) + CResult[shared_ptr[CRecordBatchReader]] ExecuteSerializedPlan(const CBuffer& substrait_buffer) CResult[shared_ptr[CBuffer]] ParseJsonPlan(const c_string& substrait_json) diff --git a/python/pyarrow/tests/test_substrait.py b/python/pyarrow/tests/test_substrait.py index dd475b3a41b..4c7f9878d5c 100644 --- a/python/pyarrow/tests/test_substrait.py +++ b/python/pyarrow/tests/test_substrait.py @@ -36,7 +36,7 @@ # Marks all of the tests in this module # Ignore these with pytest ... -m 'not substrait' -pytestmark = [pytest.mark.parquet, pytest.mark.substrait] +pytestmark = [pytest.mark.dataset, pytest.mark.parquet, pytest.mark.substrait] def resource_root(): @@ -103,6 +103,6 @@ def test_invalid_plan(): } """ buf = pa._substrait._parse_json_plan(tobytes(query)) - exec_message = "ExecPlan has no node" + exec_message = "Empty substrait plan is passed." 
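# For context, a minimal sketch of the Python surface exercised by these tests.
# Assumptions: pyarrow built with PYARROW_WITH_SUBSTRAIT enabled, and
# `substrait_json` holding a valid JSON plan such as the one used in
# test_run_serialized_query; `run_json_plan` is a hypothetical helper name,
# while `_parse_json_plan` and `run_query` are the entry points added here.
import pyarrow as pa
import pyarrow.substrait as substrait


def run_json_plan(substrait_json: bytes) -> pa.Table:
    # Serialize the JSON plan into a pyarrow Buffer holding the Protobuf plan.
    buf = pa._substrait._parse_json_plan(substrait_json)
    # Execute the serialized plan; run_query returns a RecordBatchReader.
    reader = substrait.run_query(buf)
    return reader.read_all()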
with pytest.raises(ArrowInvalid, match=exec_message): substrait.run_query(buf) From 7739b08d1a7f8ed724d635bb75c270481e925d58 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Fri, 6 May 2022 18:44:02 +0530 Subject: [PATCH 61/74] adding a comment on context usage --- cpp/src/arrow/engine/substrait/util.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/src/arrow/engine/substrait/util.cc b/cpp/src/arrow/engine/substrait/util.cc index 880b8c2d54a..73dc2d7cf50 100644 --- a/cpp/src/arrow/engine/substrait/util.cc +++ b/cpp/src/arrow/engine/substrait/util.cc @@ -114,6 +114,7 @@ Result> ExecuteSerializedPlan( const Buffer& substrait_buffer) { arrow::AsyncGenerator> sink_gen; ARROW_ASSIGN_OR_RAISE(auto plan, compute::ExecPlan::Make()); + // TODO(ARROW-15732) compute::ExecContext exec_context(arrow::default_memory_pool(), ::arrow::internal::GetCpuThreadPool()); arrow::engine::SubstraitExecutor executor(&sink_gen, std::move(plan), exec_context); From 9e19335f7e03de111e080108542353b7e4e373fc Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Tue, 10 May 2022 18:19:47 +0530 Subject: [PATCH 62/74] addressing reviews p1 --- cpp/src/arrow/engine/substrait/serde_test.cc | 8 +-- cpp/src/arrow/engine/substrait/util.cc | 53 +++++++++----------- cpp/src/arrow/engine/substrait/util.h | 6 +-- python/pyarrow/_substrait.pyx | 9 ++-- python/pyarrow/includes/libarrow.pxd | 4 -- python/pyarrow/libarrow_substrait.pxd | 26 ++++++++++ 6 files changed, 62 insertions(+), 44 deletions(-) create mode 100644 python/pyarrow/libarrow_substrait.pxd diff --git a/cpp/src/arrow/engine/substrait/serde_test.cc b/cpp/src/arrow/engine/substrait/serde_test.cc index 13fef3b277d..deee2d14456 100644 --- a/cpp/src/arrow/engine/substrait/serde_test.cc +++ b/cpp/src/arrow/engine/substrait/serde_test.cc @@ -796,8 +796,8 @@ TEST(Substrait, GetRecordBatchReader) { GTEST_SKIP() << "ARROW-16392: Substrait File URI not supported for Windows"; #else ASSERT_OK_AND_ASSIGN(std::string substrait_json, GetSubstraitJSON()); - ASSERT_OK_AND_ASSIGN(auto buf, engine::ParseJsonPlan(substrait_json)); - ASSERT_OK_AND_ASSIGN(auto reader, engine::ExecuteSerializedPlan(*buf)); + ASSERT_OK_AND_ASSIGN(auto buf, substrait::SerializeJsonPlan(substrait_json)); + ASSERT_OK_AND_ASSIGN(auto reader, substrait::ExecuteSerializedPlan(*buf)); ASSERT_OK_AND_ASSIGN(auto table, Table::FromRecordBatchReader(reader.get())); // Note: assuming the binary.parquet file contains fixed amount of records // in case of a test failure, re-evalaute the content in the file @@ -810,8 +810,8 @@ TEST(Substrait, InvalidPlan) { "relations": [ ] })"; - ASSERT_OK_AND_ASSIGN(auto buf, engine::ParseJsonPlan(substrait_json)); - ASSERT_RAISES(Invalid, engine::ExecuteSerializedPlan(*buf)); + ASSERT_OK_AND_ASSIGN(auto buf, substrait::SerializeJsonPlan(substrait_json)); + ASSERT_RAISES(Invalid, substrait::ExecuteSerializedPlan(*buf)); } } // namespace engine diff --git a/cpp/src/arrow/engine/substrait/util.cc b/cpp/src/arrow/engine/substrait/util.cc index 73dc2d7cf50..1df9f310542 100644 --- a/cpp/src/arrow/engine/substrait/util.cc +++ b/cpp/src/arrow/engine/substrait/util.cc @@ -21,38 +21,39 @@ namespace arrow { -namespace engine { +namespace substrait { namespace { /// \brief A SinkNodeConsumer specialized to output ExecBatches via PushGenerator class SubstraitSinkConsumer : public compute::SinkNodeConsumer { public: explicit SubstraitSinkConsumer( - AsyncGenerator>* generator) - : producer_(MakeProducer(generator)) {} + arrow::PushGenerator>::Producer producer) + : 
producer_(std::move(producer)) {} Status Consume(compute::ExecBatch batch) override { // Consume a batch of data bool did_push = producer_.Push(batch); - if (!did_push) return Status::ExecutionError("Producer closed already"); + if (!did_push) return Status::Invalid("Producer closed already"); return Status::OK(); } Status Init(const std::shared_ptr& schema, compute::BackpressureControl* backpressure_control) override { + schema_ = schema; return Status::OK(); } - static arrow::PushGenerator>::Producer MakeProducer( - AsyncGenerator>* out_gen); - Future<> Finish() override { ARROW_UNUSED(producer_.Close()); return Future<>::MakeFinished(); } + std::shared_ptr schema() { return schema_; } + private: - PushGenerator>::Producer producer_; + arrow::PushGenerator>::Producer producer_; + std::shared_ptr schema_; }; /// \brief An executor to run a Substrait Query @@ -60,10 +61,11 @@ class SubstraitSinkConsumer : public compute::SinkNodeConsumer { /// bindings for consuming a Substrait plan. class SubstraitExecutor { public: - explicit SubstraitExecutor( - AsyncGenerator>* generator, - std::shared_ptr plan, compute::ExecContext exec_context) - : generator_(generator), plan_(std::move(plan)), exec_context_(exec_context) {} + explicit SubstraitExecutor(std::shared_ptr plan, + compute::ExecContext exec_context) + : plan_(std::move(plan)), exec_context_(exec_context) {} + + ~SubstraitExecutor() { ARROW_CHECK_OK(this->Close()); } Result> Execute() { for (const compute::Declaration& decl : declarations_) { @@ -73,9 +75,9 @@ class SubstraitExecutor { RETURN_NOT_OK(plan_->StartProducing()); // schema of the output can be obtained by the output_schema // of the input to the sink node. - auto schema = plan_->sinks()[0]->inputs()[0]->output_schema(); + auto schema = sink_consumer_->schema(); std::shared_ptr sink_reader = compute::MakeGeneratorReader( - schema, std::move(*generator_), exec_context_.memory_pool()); + std::move(schema), std::move(generator_), exec_context_.memory_pool()); return sink_reader; } @@ -85,49 +87,42 @@ class SubstraitExecutor { if (substrait_buffer.size() == 0) { return Status::Invalid("Empty substrait plan is passed."); } + sink_consumer_ = std::make_shared(generator_.producer()); std::function()> consumer_factory = [&] { - return std::make_shared(generator_); + return sink_consumer_; }; ARROW_ASSIGN_OR_RAISE(declarations_, engine::DeserializePlan(substrait_buffer, consumer_factory)); + return Status::OK(); } private: - AsyncGenerator>* generator_; + arrow::PushGenerator> generator_; std::vector declarations_; std::shared_ptr plan_; compute::ExecContext exec_context_; + std::shared_ptr sink_consumer_; }; -arrow::PushGenerator>::Producer -SubstraitSinkConsumer::MakeProducer( - AsyncGenerator>* out_gen) { - arrow::PushGenerator> push_gen; - auto out = push_gen.producer(); - *out_gen = std::move(push_gen); - return out; -} } // namespace Result> ExecuteSerializedPlan( const Buffer& substrait_buffer) { - arrow::AsyncGenerator> sink_gen; ARROW_ASSIGN_OR_RAISE(auto plan, compute::ExecPlan::Make()); // TODO(ARROW-15732) compute::ExecContext exec_context(arrow::default_memory_pool(), ::arrow::internal::GetCpuThreadPool()); - arrow::engine::SubstraitExecutor executor(&sink_gen, std::move(plan), exec_context); + SubstraitExecutor executor(std::move(plan), exec_context); RETURN_NOT_OK(executor.Init(substrait_buffer)); ARROW_ASSIGN_OR_RAISE(auto sink_reader, executor.Execute()); - RETURN_NOT_OK(executor.Close()); return sink_reader; } -Result> ParseJsonPlan(const std::string& substrait_json) { 
+Result> SerializeJsonPlan(const std::string& substrait_json) { return engine::internal::SubstraitFromJSON("Plan", substrait_json); } -} // namespace engine +} // namespace substrait } // namespace arrow diff --git a/cpp/src/arrow/engine/substrait/util.h b/cpp/src/arrow/engine/substrait/util.h index 9e93de0d898..2c90f99af03 100644 --- a/cpp/src/arrow/engine/substrait/util.h +++ b/cpp/src/arrow/engine/substrait/util.h @@ -24,7 +24,7 @@ namespace arrow { -namespace engine { +namespace substrait { /// \brief Retrieve a RecordBatchReader from a Substrait plan. ARROW_ENGINE_EXPORT Result> ExecuteSerializedPlan( @@ -32,9 +32,9 @@ ARROW_ENGINE_EXPORT Result> ExecuteSerialized /// \brief Get a Serialized Plan from a Substrait JSON plan. /// This is a helper method for Python tests. -ARROW_ENGINE_EXPORT Result> ParseJsonPlan( +ARROW_ENGINE_EXPORT Result> SerializeJsonPlan( const std::string& substrait_json); -} // namespace engine +} // namespace substrait } // namespace arrow diff --git a/python/pyarrow/_substrait.pyx b/python/pyarrow/_substrait.pyx index 75b562ebfee..ba37fa5d0ca 100644 --- a/python/pyarrow/_substrait.pyx +++ b/python/pyarrow/_substrait.pyx @@ -21,6 +21,7 @@ from cython.operator cimport dereference as deref from pyarrow import Buffer from pyarrow.lib cimport * from pyarrow.includes.libarrow cimport * +from pyarrow.libarrow_substrait cimport * def run_query(plan): @@ -57,7 +58,7 @@ def _parse_json_plan(plan): Parameters ---------- plan: bytes - Parse a Substrait plan in JSON to a serialized plan. + Substrait plan in JSON. Returns ------- @@ -71,7 +72,7 @@ def _parse_json_plan(plan): shared_ptr[CBuffer] c_buf_plan c_str_plan = plan - c_res_buffer = ParseJsonPlan(c_str_plan) - c_buf_plan = GetResultValue(c_res_buffer) - + c_res_buffer = SerializeJsonPlan(c_str_plan) + with nogil: + c_buf_plan = GetResultValue(c_res_buffer) return pyarrow_wrap_buffer(c_buf_plan) diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index e0d215fbc60..2e51864b860 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -2696,7 +2696,3 @@ cdef extern from "arrow/python/udf.h" namespace "arrow::py": CStatus RegisterScalarFunction(PyObject* function, function[CallbackUdf] wrapper, const CScalarUdfOptions& options) - -cdef extern from "arrow/engine/substrait/util.h" namespace "arrow::engine" nogil: - CResult[shared_ptr[CRecordBatchReader]] ExecuteSerializedPlan(const CBuffer& substrait_buffer) - CResult[shared_ptr[CBuffer]] ParseJsonPlan(const c_string& substrait_json) diff --git a/python/pyarrow/libarrow_substrait.pxd b/python/pyarrow/libarrow_substrait.pxd new file mode 100644 index 00000000000..e7cb95d1009 --- /dev/null +++ b/python/pyarrow/libarrow_substrait.pxd @@ -0,0 +1,26 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. + +# distutils: language = c++ + +from pyarrow.includes.common cimport * +from pyarrow.includes.libarrow cimport * + + +cdef extern from "arrow/engine/substrait/util.h" namespace "arrow::substrait" nogil: + CResult[shared_ptr[CRecordBatchReader]] ExecuteSerializedPlan(const CBuffer& substrait_buffer) + CResult[shared_ptr[CBuffer]] SerializeJsonPlan(const c_string& substrait_json) From ffc87ea4d3569f5682560008bc8012854297d7db Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Tue, 10 May 2022 18:23:13 +0530 Subject: [PATCH 63/74] change order --- .github/workflows/python.yml | 2 +- ci/scripts/python_build.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index 438def6e680..b14559d12a1 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -120,7 +120,6 @@ jobs: env: ARROW_HOME: /usr/local ARROW_DATASET: ON - ARROW_SUBSTRAIT: ON ARROW_FLIGHT: ON ARROW_GANDIVA: ON ARROW_HDFS: ON @@ -131,6 +130,7 @@ jobs: ARROW_PLASMA: ON ARROW_PYTHON: ON ARROW_S3: ON + ARROW_SUBSTRAIT: ON ARROW_WITH_ZLIB: ON ARROW_WITH_LZ4: ON ARROW_WITH_BZ2: ON diff --git a/ci/scripts/python_build.sh b/ci/scripts/python_build.sh index 0fdea7033a6..b90321643c7 100755 --- a/ci/scripts/python_build.sh +++ b/ci/scripts/python_build.sh @@ -56,7 +56,6 @@ export PYARROW_BUILD_TYPE=${CMAKE_BUILD_TYPE:-debug} export PYARROW_WITH_CUDA=${ARROW_CUDA:-OFF} export PYARROW_WITH_DATASET=${ARROW_DATASET:-ON} -export PYARROW_WITH_SUBSTRAIT=${ARROW_SUBSTRAIT:-OFF} export PYARROW_WITH_FLIGHT=${ARROW_FLIGHT:-OFF} export PYARROW_WITH_GANDIVA=${ARROW_GANDIVA:-OFF} export PYARROW_WITH_HDFS=${ARROW_HDFS:-ON} @@ -65,6 +64,7 @@ export PYARROW_WITH_PLASMA=${ARROW_PLASMA:-OFF} export PYARROW_WITH_PARQUET=${ARROW_PARQUET:-OFF} export PYARROW_WITH_PARQUET_ENCRYPTION=${PARQUET_REQUIRE_ENCRYPTION:-ON} export PYARROW_WITH_S3=${ARROW_S3:-OFF} +export PYARROW_WITH_SUBSTRAIT=${ARROW_SUBSTRAIT:-OFF} export PYARROW_PARALLEL=${n_jobs} From 784c07bdf34269895fb761725210d5fa8f5708b9 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Tue, 10 May 2022 19:17:45 +0530 Subject: [PATCH 64/74] replace parquest tests with arrow ipc --- python/pyarrow/tests/test_substrait.py | 36 ++++++++++++-------------- 1 file changed, 16 insertions(+), 20 deletions(-) diff --git a/python/pyarrow/tests/test_substrait.py b/python/pyarrow/tests/test_substrait.py index 4c7f9878d5c..f4da373a70e 100644 --- a/python/pyarrow/tests/test_substrait.py +++ b/python/pyarrow/tests/test_substrait.py @@ -17,7 +17,6 @@ import os import sys -import pathlib import pytest import pyarrow as pa @@ -36,22 +35,13 @@ # Marks all of the tests in this module # Ignore these with pytest ... 
-m 'not substrait' -pytestmark = [pytest.mark.dataset, pytest.mark.parquet, pytest.mark.substrait] - - -def resource_root(): - """Get the path to the test resources directory.""" - if not os.environ.get("PARQUET_TEST_DATA"): - raise RuntimeError("Test resources not found; set " - "PARQUET_TEST_DATA to " - "/cpp/submodules/parquet-testing/data") - return pathlib.Path(os.environ["PARQUET_TEST_DATA"]) +pytestmark = [pytest.mark.dataset, pytest.mark.substrait] @pytest.mark.skipif(sys.platform == 'win32', reason="ARROW-16392: file based URI is" + " not fully supported for Windows") -def test_run_serialized_query(): +def test_run_serialized_query(tmpdir): substrait_query = """ { "relations": [ @@ -60,7 +50,7 @@ def test_run_serialized_query(): "base_schema": { "struct": { "types": [ - {"binary": {}} + {"i64": {}} ] }, "names": [ @@ -70,8 +60,7 @@ def test_run_serialized_query(): "local_files": { "items": [ { - "uri_file": "file://FILENAME_PLACEHOLDER", - "format": "FILE_FORMAT_PARQUET" + "uri_file": "file://FILENAME_PLACEHOLDER" } ] } @@ -81,18 +70,25 @@ def test_run_serialized_query(): } """ - filename = str(resource_root() / "binary.parquet") + schema = pa.schema([pa.field('foo', pa.int64())]) + path = os.path.join(str(tmpdir), 'substrait_data.arrow') + with pa.OSFile(path, 'wb') as sink: + with pa.ipc.new_file(sink, schema) as writer: + batch = pa.record_batch( + [pa.array(range(5), type=pa.int64())], schema) + writer.write(batch) - query = tobytes(substrait_query.replace("FILENAME_PLACEHOLDER", filename)) + with pa.OSFile(path, 'rb') as source: + expected_table = pa.ipc.open_file(source).read_all() + + query = tobytes(substrait_query.replace("FILENAME_PLACEHOLDER", path)) buf = pa._substrait._parse_json_plan(query) reader = substrait.run_query(buf) res_tb = reader.read_all() - expected_tb = pq.read_table(filename) - - assert expected_tb.num_rows == res_tb.num_rows + assert expected_table.num_rows == res_tb.num_rows def test_invalid_plan(): From 83b66023906cf0abe2f359d90f49bec8030d93d2 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Thu, 12 May 2022 20:43:20 +0530 Subject: [PATCH 65/74] updating namespace --- cpp/src/arrow/engine/substrait/util.cc | 4 ++++ cpp/src/arrow/engine/substrait/util.h | 4 ++++ python/pyarrow/libarrow_substrait.pxd | 2 +- 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/engine/substrait/util.cc b/cpp/src/arrow/engine/substrait/util.cc index 1df9f310542..7789bbae775 100644 --- a/cpp/src/arrow/engine/substrait/util.cc +++ b/cpp/src/arrow/engine/substrait/util.cc @@ -21,6 +21,8 @@ namespace arrow { +namespace engine { + namespace substrait { namespace { @@ -125,4 +127,6 @@ Result> SerializeJsonPlan(const std::string& substrait_j } // namespace substrait +} // namespace engine + } // namespace arrow diff --git a/cpp/src/arrow/engine/substrait/util.h b/cpp/src/arrow/engine/substrait/util.h index 2c90f99af03..860a459da2f 100644 --- a/cpp/src/arrow/engine/substrait/util.h +++ b/cpp/src/arrow/engine/substrait/util.h @@ -24,6 +24,8 @@ namespace arrow { +namespace engine { + namespace substrait { /// \brief Retrieve a RecordBatchReader from a Substrait plan. 
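// For context, a minimal consumer sketch (an illustrative example, not a
// declaration from this patch): it uses only the two helpers declared in this
// header, now under arrow::engine::substrait, and assumes an Arrow build with
// ARROW_SUBSTRAIT enabled; `RunJsonPlan` is a hypothetical wrapper name.
#include <memory>
#include <string>

#include "arrow/engine/substrait/util.h"
#include "arrow/result.h"
#include "arrow/table.h"

arrow::Result<std::shared_ptr<arrow::Table>> RunJsonPlan(
    const std::string& substrait_json) {
  // JSON plan -> serialized Substrait plan held in an arrow::Buffer.
  ARROW_ASSIGN_OR_RAISE(auto buf,
                        arrow::engine::substrait::SerializeJsonPlan(substrait_json));
  // Execute the serialized plan and collect the result into a Table.
  ARROW_ASSIGN_OR_RAISE(auto reader,
                        arrow::engine::substrait::ExecuteSerializedPlan(*buf));
  return arrow::Table::FromRecordBatchReader(reader.get());
}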
@@ -37,4 +39,6 @@ ARROW_ENGINE_EXPORT Result> SerializeJsonPlan( } // namespace substrait +} // namespace engine + } // namespace arrow diff --git a/python/pyarrow/libarrow_substrait.pxd b/python/pyarrow/libarrow_substrait.pxd index e7cb95d1009..2e1a17b06bd 100644 --- a/python/pyarrow/libarrow_substrait.pxd +++ b/python/pyarrow/libarrow_substrait.pxd @@ -21,6 +21,6 @@ from pyarrow.includes.common cimport * from pyarrow.includes.libarrow cimport * -cdef extern from "arrow/engine/substrait/util.h" namespace "arrow::substrait" nogil: +cdef extern from "arrow/engine/substrait/util.h" namespace "arrow::engine::substrait" nogil: CResult[shared_ptr[CRecordBatchReader]] ExecuteSerializedPlan(const CBuffer& substrait_buffer) CResult[shared_ptr[CBuffer]] SerializeJsonPlan(const c_string& substrait_json) From 0a377ea50ec46ba3bf87866f323f73049b1d9216 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Fri, 13 May 2022 08:08:50 +0530 Subject: [PATCH 66/74] type_internal: substrait namespace modification (fix amd ci failure) --- .../arrow/engine/substrait/type_internal.cc | 128 +++++++++--------- 1 file changed, 64 insertions(+), 64 deletions(-) diff --git a/cpp/src/arrow/engine/substrait/type_internal.cc b/cpp/src/arrow/engine/substrait/type_internal.cc index c1dac97b682..6dfbf781af4 100644 --- a/cpp/src/arrow/engine/substrait/type_internal.cc +++ b/cpp/src/arrow/engine/substrait/type_internal.cc @@ -46,7 +46,7 @@ Status CheckVariation(const TypeMessage& type) { template bool IsNullable(const TypeMessage& type) { // FIXME what can we do with NULLABILITY_UNSPECIFIED - return type.nullability() != substrait::Type::NULLABILITY_REQUIRED; + return type.nullability() != ::substrait::Type::NULLABILITY_REQUIRED; } template @@ -99,66 +99,66 @@ Result FieldsFromProto(int size, const Types& types, } // namespace Result, bool>> FromProto( - const substrait::Type& type, const ExtensionSet& ext_set) { + const ::substrait::Type& type, const ExtensionSet& ext_set) { switch (type.kind_case()) { - case substrait::Type::kBool: + case ::substrait::Type::kBool: return FromProtoImpl(type.bool_()); - case substrait::Type::kI8: + case ::substrait::Type::kI8: return FromProtoImpl(type.i8()); - case substrait::Type::kI16: + case ::substrait::Type::kI16: return FromProtoImpl(type.i16()); - case substrait::Type::kI32: + case ::substrait::Type::kI32: return FromProtoImpl(type.i32()); - case substrait::Type::kI64: + case ::substrait::Type::kI64: return FromProtoImpl(type.i64()); - case substrait::Type::kFp32: + case ::substrait::Type::kFp32: return FromProtoImpl(type.fp32()); - case substrait::Type::kFp64: + case ::substrait::Type::kFp64: return FromProtoImpl(type.fp64()); - case substrait::Type::kString: + case ::substrait::Type::kString: return FromProtoImpl(type.string()); - case substrait::Type::kBinary: + case ::substrait::Type::kBinary: return FromProtoImpl(type.binary()); - case substrait::Type::kTimestamp: + case ::substrait::Type::kTimestamp: return FromProtoImpl(type.timestamp(), TimeUnit::MICRO); - case substrait::Type::kTimestampTz: + case ::substrait::Type::kTimestampTz: return FromProtoImpl(type.timestamp_tz(), TimeUnit::MICRO, TimestampTzTimezoneString()); - case substrait::Type::kDate: + case ::substrait::Type::kDate: return FromProtoImpl(type.date()); - case substrait::Type::kTime: + case ::substrait::Type::kTime: return FromProtoImpl(type.time(), TimeUnit::MICRO); - case substrait::Type::kIntervalYear: + case ::substrait::Type::kIntervalYear: return FromProtoImpl(type.interval_year(), interval_year); - case 
substrait::Type::kIntervalDay: + case ::substrait::Type::kIntervalDay: return FromProtoImpl(type.interval_day(), interval_day); - case substrait::Type::kUuid: + case ::substrait::Type::kUuid: return FromProtoImpl(type.uuid(), uuid); - case substrait::Type::kFixedChar: + case ::substrait::Type::kFixedChar: return FromProtoImpl(type.fixed_char(), fixed_char, type.fixed_char().length()); - case substrait::Type::kVarchar: + case ::substrait::Type::kVarchar: return FromProtoImpl(type.varchar(), varchar, type.varchar().length()); - case substrait::Type::kFixedBinary: + case ::substrait::Type::kFixedBinary: return FromProtoImpl(type.fixed_binary(), type.fixed_binary().length()); - case substrait::Type::kDecimal: { + case ::substrait::Type::kDecimal: { const auto& decimal = type.decimal(); return FromProtoImpl(decimal, decimal.precision(), decimal.scale()); } - case substrait::Type::kStruct: { + case ::substrait::Type::kStruct: { const auto& struct_ = type.struct_(); ARROW_ASSIGN_OR_RAISE(auto fields, FieldsFromProto( @@ -168,7 +168,7 @@ Result, bool>> FromProto( return FromProtoImpl(struct_, std::move(fields)); } - case substrait::Type::kList: { + case ::substrait::Type::kList: { const auto& list = type.list(); if (!list.has_type()) { @@ -182,7 +182,7 @@ Result, bool>> FromProto( list, field("item", std::move(type_nullable.first), type_nullable.second)); } - case substrait::Type::kMap: { + case ::substrait::Type::kMap: { const auto& map = type.map(); static const std::array kMissing = {"key and value", "value", "key", @@ -206,7 +206,7 @@ Result, bool>> FromProto( field("value", std::move(value_nullable.first), value_nullable.second)); } - case substrait::Type::kUserDefinedTypeReference: { + case ::substrait::Type::kUserDefinedTypeReference: { uint32_t anchor = type.user_defined_type_reference(); ARROW_ASSIGN_OR_RAISE(auto type_record, ext_set.DecodeType(anchor)); return std::make_pair(std::move(type_record.type), true); @@ -226,18 +226,18 @@ struct DataTypeToProtoImpl { Status Visit(const NullType& t) { return EncodeUserDefined(t); } Status Visit(const BooleanType& t) { - return SetWith(&substrait::Type::set_allocated_bool_); + return SetWith(&::substrait::Type::set_allocated_bool_); } - Status Visit(const Int8Type& t) { return SetWith(&substrait::Type::set_allocated_i8); } + Status Visit(const Int8Type& t) { return SetWith(&::substrait::Type::set_allocated_i8); } Status Visit(const Int16Type& t) { - return SetWith(&substrait::Type::set_allocated_i16); + return SetWith(&::substrait::Type::set_allocated_i16); } Status Visit(const Int32Type& t) { - return SetWith(&substrait::Type::set_allocated_i32); + return SetWith(&::substrait::Type::set_allocated_i32); } Status Visit(const Int64Type& t) { - return SetWith(&substrait::Type::set_allocated_i64); + return SetWith(&::substrait::Type::set_allocated_i64); } Status Visit(const UInt8Type& t) { return EncodeUserDefined(t); } @@ -247,26 +247,26 @@ struct DataTypeToProtoImpl { Status Visit(const HalfFloatType& t) { return EncodeUserDefined(t); } Status Visit(const FloatType& t) { - return SetWith(&substrait::Type::set_allocated_fp32); + return SetWith(&::substrait::Type::set_allocated_fp32); } Status Visit(const DoubleType& t) { - return SetWith(&substrait::Type::set_allocated_fp64); + return SetWith(&::substrait::Type::set_allocated_fp64); } Status Visit(const StringType& t) { - return SetWith(&substrait::Type::set_allocated_string); + return SetWith(&::substrait::Type::set_allocated_string); } Status Visit(const BinaryType& t) { - return 
SetWith(&substrait::Type::set_allocated_binary); + return SetWith(&::substrait::Type::set_allocated_binary); } Status Visit(const FixedSizeBinaryType& t) { - SetWithThen(&substrait::Type::set_allocated_fixed_binary)->set_length(t.byte_width()); + SetWithThen(&::substrait::Type::set_allocated_fixed_binary)->set_length(t.byte_width()); return Status::OK(); } Status Visit(const Date32Type& t) { - return SetWith(&substrait::Type::set_allocated_date); + return SetWith(&::substrait::Type::set_allocated_date); } Status Visit(const Date64Type& t) { return NotImplemented(t); } @@ -274,10 +274,10 @@ struct DataTypeToProtoImpl { if (t.unit() != TimeUnit::MICRO) return NotImplemented(t); if (t.timezone() == "") { - return SetWith(&substrait::Type::set_allocated_timestamp); + return SetWith(&::substrait::Type::set_allocated_timestamp); } if (t.timezone() == TimestampTzTimezoneString()) { - return SetWith(&substrait::Type::set_allocated_timestamp_tz); + return SetWith(&::substrait::Type::set_allocated_timestamp_tz); } return NotImplemented(t); @@ -286,14 +286,14 @@ struct DataTypeToProtoImpl { Status Visit(const Time32Type& t) { return NotImplemented(t); } Status Visit(const Time64Type& t) { if (t.unit() != TimeUnit::MICRO) return NotImplemented(t); - return SetWith(&substrait::Type::set_allocated_time); + return SetWith(&::substrait::Type::set_allocated_time); } Status Visit(const MonthIntervalType& t) { return EncodeUserDefined(t); } Status Visit(const DayTimeIntervalType& t) { return EncodeUserDefined(t); } Status Visit(const Decimal128Type& t) { - auto dec = SetWithThen(&substrait::Type::set_allocated_decimal); + auto dec = SetWithThen(&::substrait::Type::set_allocated_decimal); dec->set_precision(t.precision()); dec->set_scale(t.scale()); return Status::OK(); @@ -304,18 +304,18 @@ struct DataTypeToProtoImpl { // FIXME assert default field name; custom ones won't roundtrip ARROW_ASSIGN_OR_RAISE( auto type, ToProto(*t.value_type(), t.value_field()->nullable(), ext_set_)); - SetWithThen(&substrait::Type::set_allocated_list)->set_allocated_type(type.release()); + SetWithThen(&::substrait::Type::set_allocated_list)->set_allocated_type(type.release()); return Status::OK(); } Status Visit(const StructType& t) { - auto types = SetWithThen(&substrait::Type::set_allocated_struct_)->mutable_types(); + auto types = SetWithThen(&::substrait::Type::set_allocated_struct_)->mutable_types(); types->Reserve(t.num_fields()); for (const auto& field : t.fields()) { if (field->metadata() != nullptr) { - return Status::Invalid("substrait::Type::Struct does not support field metadata"); + return Status::Invalid("::substrait::Type::Struct does not support field metadata"); } ARROW_ASSIGN_OR_RAISE(auto type, ToProto(*field->type(), field->nullable(), ext_set_)); @@ -330,7 +330,7 @@ struct DataTypeToProtoImpl { Status Visit(const MapType& t) { // FIXME assert default field names; custom ones won't roundtrip - auto map = SetWithThen(&substrait::Type::set_allocated_map); + auto map = SetWithThen(&::substrait::Type::set_allocated_map); ARROW_ASSIGN_OR_RAISE(auto key, ToProto(*t.key_type(), /*nullable=*/false, ext_set_)); map->set_allocated_key(key.release()); @@ -344,25 +344,25 @@ struct DataTypeToProtoImpl { Status Visit(const ExtensionType& t) { if (UnwrapUuid(t)) { - return SetWith(&substrait::Type::set_allocated_uuid); + return SetWith(&::substrait::Type::set_allocated_uuid); } if (auto length = UnwrapFixedChar(t)) { - SetWithThen(&substrait::Type::set_allocated_fixed_char)->set_length(*length); + 
SetWithThen(&::substrait::Type::set_allocated_fixed_char)->set_length(*length); return Status::OK(); } if (auto length = UnwrapVarChar(t)) { - SetWithThen(&substrait::Type::set_allocated_varchar)->set_length(*length); + SetWithThen(&::substrait::Type::set_allocated_varchar)->set_length(*length); return Status::OK(); } if (UnwrapIntervalYear(t)) { - return SetWith(&substrait::Type::set_allocated_interval_year); + return SetWith(&::substrait::Type::set_allocated_interval_year); } if (UnwrapIntervalDay(t)) { - return SetWith(&substrait::Type::set_allocated_interval_day); + return SetWith(&::substrait::Type::set_allocated_interval_day); } return NotImplemented(t); @@ -376,10 +376,10 @@ struct DataTypeToProtoImpl { Status Visit(const MonthDayNanoIntervalType& t) { return EncodeUserDefined(t); } template - Sub* SetWithThen(void (substrait::Type::*set_allocated_sub)(Sub*)) { + Sub* SetWithThen(void (::substrait::Type::*set_allocated_sub)(Sub*)) { auto sub = internal::make_unique(); - sub->set_nullability(nullable_ ? substrait::Type::NULLABILITY_NULLABLE - : substrait::Type::NULLABILITY_REQUIRED); + sub->set_nullability(nullable_ ? ::substrait::Type::NULLABILITY_NULLABLE + : ::substrait::Type::NULLABILITY_REQUIRED); auto out = sub.get(); (type_->*set_allocated_sub)(sub.release()); @@ -387,7 +387,7 @@ struct DataTypeToProtoImpl { } template - Status SetWith(void (substrait::Type::*set_allocated_sub)(Sub*)) { + Status SetWith(void (::substrait::Type::*set_allocated_sub)(Sub*)) { return SetWithThen(set_allocated_sub), Status::OK(); } @@ -399,25 +399,25 @@ struct DataTypeToProtoImpl { } Status NotImplemented(const DataType& t) { - return Status::NotImplemented("conversion to substrait::Type from ", t.ToString()); + return Status::NotImplemented("conversion to ::substrait::Type from ", t.ToString()); } Status operator()(const DataType& type) { return VisitTypeInline(type, this); } - substrait::Type* type_; + ::substrait::Type* type_; bool nullable_; ExtensionSet* ext_set_; }; } // namespace -Result> ToProto(const DataType& type, bool nullable, +Result> ToProto(const DataType& type, bool nullable, ExtensionSet* ext_set) { - auto out = internal::make_unique(); + auto out = internal::make_unique<::substrait::Type>(); RETURN_NOT_OK((DataTypeToProtoImpl{out.get(), nullable, ext_set})(type)); return std::move(out); } -Result> FromProto(const substrait::NamedStruct& named_struct, +Result> FromProto(const ::substrait::NamedStruct& named_struct, const ExtensionSet& ext_set) { if (!named_struct.has_struct_()) { return Status::Invalid("While converting ", named_struct.DebugString(), @@ -461,25 +461,25 @@ void ToProtoGetDepthFirstNames(const FieldVector& fields, } } // namespace -Result> ToProto(const Schema& schema, +Result> ToProto(const Schema& schema, ExtensionSet* ext_set) { if (schema.metadata()) { - return Status::Invalid("substrait::NamedStruct does not support schema metadata"); + return Status::Invalid("::substrait::NamedStruct does not support schema metadata"); } - auto named_struct = internal::make_unique(); + auto named_struct = internal::make_unique<::substrait::NamedStruct>(); auto names = named_struct->mutable_names(); names->Reserve(schema.num_fields()); ToProtoGetDepthFirstNames(schema.fields(), names); - auto struct_ = internal::make_unique(); + auto struct_ = internal::make_unique<::substrait::Type::Struct>(); auto types = struct_->mutable_types(); types->Reserve(schema.num_fields()); for (const auto& field : schema.fields()) { if (field->metadata() != nullptr) { - return 
Status::Invalid("substrait::NamedStruct does not support field metadata"); + return Status::Invalid("::substrait::NamedStruct does not support field metadata"); } ARROW_ASSIGN_OR_RAISE(auto type, ToProto(*field->type(), field->nullable(), ext_set)); From b6045da8e10cdfeda538b3c148a1426b0993b4d5 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Fri, 13 May 2022 09:08:31 +0530 Subject: [PATCH 67/74] fix style issue --- cpp/src/arrow/engine/substrait/type_internal.cc | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/cpp/src/arrow/engine/substrait/type_internal.cc b/cpp/src/arrow/engine/substrait/type_internal.cc index 6dfbf781af4..c7b94b41040 100644 --- a/cpp/src/arrow/engine/substrait/type_internal.cc +++ b/cpp/src/arrow/engine/substrait/type_internal.cc @@ -229,7 +229,9 @@ struct DataTypeToProtoImpl { return SetWith(&::substrait::Type::set_allocated_bool_); } - Status Visit(const Int8Type& t) { return SetWith(&::substrait::Type::set_allocated_i8); } + Status Visit(const Int8Type& t) { + return SetWith(&::substrait::Type::set_allocated_i8); + } Status Visit(const Int16Type& t) { return SetWith(&::substrait::Type::set_allocated_i16); } @@ -261,7 +263,8 @@ struct DataTypeToProtoImpl { } Status Visit(const FixedSizeBinaryType& t) { - SetWithThen(&::substrait::Type::set_allocated_fixed_binary)->set_length(t.byte_width()); + SetWithThen(&::substrait::Type::set_allocated_fixed_binary) + ->set_length(t.byte_width()); return Status::OK(); } @@ -304,7 +307,8 @@ struct DataTypeToProtoImpl { // FIXME assert default field name; custom ones won't roundtrip ARROW_ASSIGN_OR_RAISE( auto type, ToProto(*t.value_type(), t.value_field()->nullable(), ext_set_)); - SetWithThen(&::substrait::Type::set_allocated_list)->set_allocated_type(type.release()); + SetWithThen(&::substrait::Type::set_allocated_list) + ->set_allocated_type(type.release()); return Status::OK(); } @@ -315,7 +319,8 @@ struct DataTypeToProtoImpl { for (const auto& field : t.fields()) { if (field->metadata() != nullptr) { - return Status::Invalid("::substrait::Type::Struct does not support field metadata"); + return Status::Invalid( + "::substrait::Type::Struct does not support field metadata"); } ARROW_ASSIGN_OR_RAISE(auto type, ToProto(*field->type(), field->nullable(), ext_set_)); @@ -411,7 +416,7 @@ struct DataTypeToProtoImpl { } // namespace Result> ToProto(const DataType& type, bool nullable, - ExtensionSet* ext_set) { + ExtensionSet* ext_set) { auto out = internal::make_unique<::substrait::Type>(); RETURN_NOT_OK((DataTypeToProtoImpl{out.get(), nullable, ext_set})(type)); return std::move(out); @@ -462,7 +467,7 @@ void ToProtoGetDepthFirstNames(const FieldVector& fields, } // namespace Result> ToProto(const Schema& schema, - ExtensionSet* ext_set) { + ExtensionSet* ext_set) { if (schema.metadata()) { return Status::Invalid("::substrait::NamedStruct does not support schema metadata"); } From fc0772c9cb2a5cc7bc33b34a08c40a710ea6f459 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Tue, 17 May 2022 07:42:53 +0530 Subject: [PATCH 68/74] addressing review comments --- cpp/src/arrow/engine/substrait/util.cc | 3 --- python/pyarrow/_substrait.pyx | 3 ++- python/pyarrow/tests/test_substrait.py | 26 ++++++++++++-------------- 3 files changed, 14 insertions(+), 18 deletions(-) diff --git a/cpp/src/arrow/engine/substrait/util.cc b/cpp/src/arrow/engine/substrait/util.cc index 7789bbae775..d20116724e5 100644 --- a/cpp/src/arrow/engine/substrait/util.cc +++ b/cpp/src/arrow/engine/substrait/util.cc @@ -75,8 
+75,6 @@ class SubstraitExecutor { } RETURN_NOT_OK(plan_->Validate()); RETURN_NOT_OK(plan_->StartProducing()); - // schema of the output can be obtained by the output_schema - // of the input to the sink node. auto schema = sink_consumer_->schema(); std::shared_ptr sink_reader = compute::MakeGeneratorReader( std::move(schema), std::move(generator_), exec_context_.memory_pool()); @@ -95,7 +93,6 @@ class SubstraitExecutor { }; ARROW_ASSIGN_OR_RAISE(declarations_, engine::DeserializePlan(substrait_buffer, consumer_factory)); - return Status::OK(); } diff --git a/python/pyarrow/_substrait.pyx b/python/pyarrow/_substrait.pyx index ba37fa5d0ca..a3715a7dd2a 100644 --- a/python/pyarrow/_substrait.pyx +++ b/python/pyarrow/_substrait.pyx @@ -42,7 +42,8 @@ def run_query(plan): shared_ptr[CBuffer] c_buf_plan c_buf_plan = pyarrow_unwrap_buffer(plan) - c_res_reader = ExecuteSerializedPlan(deref(c_buf_plan)) + with nogil: + c_res_reader = ExecuteSerializedPlan(deref(c_buf_plan)) c_reader = GetResultValue(c_res_reader) diff --git a/python/pyarrow/tests/test_substrait.py b/python/pyarrow/tests/test_substrait.py index f4da373a70e..a121f1281c3 100644 --- a/python/pyarrow/tests/test_substrait.py +++ b/python/pyarrow/tests/test_substrait.py @@ -23,11 +23,6 @@ from pyarrow.lib import tobytes from pyarrow.lib import ArrowInvalid -try: - import pyarrow.parquet as pq -except ImportError: - pq = None - try: import pyarrow.substrait as substrait except ImportError: @@ -70,16 +65,19 @@ def test_run_serialized_query(tmpdir): } """ - schema = pa.schema([pa.field('foo', pa.int64())]) path = os.path.join(str(tmpdir), 'substrait_data.arrow') - with pa.OSFile(path, 'wb') as sink: - with pa.ipc.new_file(sink, schema) as writer: - batch = pa.record_batch( - [pa.array(range(5), type=pa.int64())], schema) - writer.write(batch) + # with pa.OSFile(path, 'wb') as sink: + # with pa.ipc.new_file(sink, schema) as writer: + # batch = pa.record_batch( + # [pa.array(range(5), type=pa.int64())], schema) + # writer.write(batch) - with pa.OSFile(path, 'rb') as source: - expected_table = pa.ipc.open_file(source).read_all() + # with pa.OSFile(path, 'rb') as source: + # expected_table = pa.ipc.open_file(source).read_all() + expected_table = pa.table([[1, 2, 3, 4, 5]], names=['foo']) + with pa.ipc.RecordBatchFileWriter(path, + schema=expected_table.schema) as writer: + writer.write_table(expected_table) query = tobytes(substrait_query.replace("FILENAME_PLACEHOLDER", path)) @@ -88,7 +86,7 @@ def test_run_serialized_query(tmpdir): reader = substrait.run_query(buf) res_tb = reader.read_all() - assert expected_table.num_rows == res_tb.num_rows + assert expected_table.select(["foo"]) == res_tb.select(["foo"]) def test_invalid_plan(): From f259e7bcec8a3642b3f87b838f39f35643a6bdfd Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Tue, 17 May 2022 08:07:05 +0530 Subject: [PATCH 69/74] removing comments and formatting --- python/pyarrow/tests/test_substrait.py | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/python/pyarrow/tests/test_substrait.py b/python/pyarrow/tests/test_substrait.py index a121f1281c3..056457bfd3e 100644 --- a/python/pyarrow/tests/test_substrait.py +++ b/python/pyarrow/tests/test_substrait.py @@ -66,18 +66,9 @@ def test_run_serialized_query(tmpdir): """ path = os.path.join(str(tmpdir), 'substrait_data.arrow') - # with pa.OSFile(path, 'wb') as sink: - # with pa.ipc.new_file(sink, schema) as writer: - # batch = pa.record_batch( - # [pa.array(range(5), type=pa.int64())], schema) - # 
writer.write(batch) - - # with pa.OSFile(path, 'rb') as source: - # expected_table = pa.ipc.open_file(source).read_all() - expected_table = pa.table([[1, 2, 3, 4, 5]], names=['foo']) - with pa.ipc.RecordBatchFileWriter(path, - schema=expected_table.schema) as writer: - writer.write_table(expected_table) + table = pa.table([[1, 2, 3, 4, 5]], names=['foo']) + with pa.ipc.RecordBatchFileWriter(path, schema=table.schema) as writer: + writer.write_table(table) query = tobytes(substrait_query.replace("FILENAME_PLACEHOLDER", path)) @@ -86,7 +77,7 @@ def test_run_serialized_query(tmpdir): reader = substrait.run_query(buf) res_tb = reader.read_all() - assert expected_table.select(["foo"]) == res_tb.select(["foo"]) + assert table.select(["foo"]) == res_tb.select(["foo"]) def test_invalid_plan(): From 1e21a0b4bb2d655ef872df55d5a521eae66daf99 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Fri, 20 May 2022 21:23:57 +0530 Subject: [PATCH 70/74] move libarrow_substrait and minor refactor --- python/pyarrow/_substrait.pyx | 2 +- python/pyarrow/{ => includes}/libarrow_substrait.pxd | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename python/pyarrow/{ => includes}/libarrow_substrait.pxd (100%) diff --git a/python/pyarrow/_substrait.pyx b/python/pyarrow/_substrait.pyx index a3715a7dd2a..7f079fb717b 100644 --- a/python/pyarrow/_substrait.pyx +++ b/python/pyarrow/_substrait.pyx @@ -21,7 +21,7 @@ from cython.operator cimport dereference as deref from pyarrow import Buffer from pyarrow.lib cimport * from pyarrow.includes.libarrow cimport * -from pyarrow.libarrow_substrait cimport * +from pyarrow.includes.libarrow_substrait cimport * def run_query(plan): diff --git a/python/pyarrow/libarrow_substrait.pxd b/python/pyarrow/includes/libarrow_substrait.pxd similarity index 100% rename from python/pyarrow/libarrow_substrait.pxd rename to python/pyarrow/includes/libarrow_substrait.pxd From ce2834a3991ec46ef058ee22a759f40e11005f1b Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Fri, 20 May 2022 21:39:43 +0530 Subject: [PATCH 71/74] refactor substrait consumer --- cpp/src/arrow/engine/substrait/util.cc | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/engine/substrait/util.cc b/cpp/src/arrow/engine/substrait/util.cc index d20116724e5..ff87cd7ec44 100644 --- a/cpp/src/arrow/engine/substrait/util.cc +++ b/cpp/src/arrow/engine/substrait/util.cc @@ -23,8 +23,6 @@ namespace arrow { namespace engine { -namespace substrait { - namespace { /// \brief A SinkNodeConsumer specialized to output ExecBatches via PushGenerator class SubstraitSinkConsumer : public compute::SinkNodeConsumer { @@ -57,6 +55,11 @@ class SubstraitSinkConsumer : public compute::SinkNodeConsumer { arrow::PushGenerator>::Producer producer_; std::shared_ptr schema_; }; +} // namespace + +namespace substrait { + +namespace { /// \brief An executor to run a Substrait Query /// This interface is provided as a utility when creating language From 167e8aab9a003344374a2b9653574ceb27815c0d Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Fri, 20 May 2022 22:04:13 +0530 Subject: [PATCH 72/74] todo docs added --- python/pyarrow/tests/test_substrait.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyarrow/tests/test_substrait.py b/python/pyarrow/tests/test_substrait.py index 056457bfd3e..8df35bbba44 100644 --- a/python/pyarrow/tests/test_substrait.py +++ b/python/pyarrow/tests/test_substrait.py @@ -64,7 +64,7 @@ def test_run_serialized_query(tmpdir): ] } """ - + # TODO: replace with 
ipc when the support is finalized in C++ path = os.path.join(str(tmpdir), 'substrait_data.arrow') table = pa.table([[1, 2, 3, 4, 5]], names=['foo']) with pa.ipc.RecordBatchFileWriter(path, schema=table.schema) as writer: From 00cc8d36659022d90f88341b7031821169c3e17d Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Sat, 21 May 2022 00:19:10 +0530 Subject: [PATCH 73/74] reverting the change --- cpp/src/arrow/engine/substrait/util.cc | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/cpp/src/arrow/engine/substrait/util.cc b/cpp/src/arrow/engine/substrait/util.cc index ff87cd7ec44..85119312307 100644 --- a/cpp/src/arrow/engine/substrait/util.cc +++ b/cpp/src/arrow/engine/substrait/util.cc @@ -23,7 +23,10 @@ namespace arrow { namespace engine { +namespace substrait { + namespace { + /// \brief A SinkNodeConsumer specialized to output ExecBatches via PushGenerator class SubstraitSinkConsumer : public compute::SinkNodeConsumer { public: @@ -55,11 +58,6 @@ class SubstraitSinkConsumer : public compute::SinkNodeConsumer { arrow::PushGenerator>::Producer producer_; std::shared_ptr schema_; }; -} // namespace - -namespace substrait { - -namespace { /// \brief An executor to run a Substrait Query /// This interface is provided as a utility when creating language From af47f038c43fd1a3fda6a54e56a57a3f5f8da22d Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Sat, 21 May 2022 01:05:24 +0530 Subject: [PATCH 74/74] fix build break --- cpp/src/arrow/engine/substrait/util.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/engine/substrait/util.cc b/cpp/src/arrow/engine/substrait/util.cc index 85119312307..bc2aa36856e 100644 --- a/cpp/src/arrow/engine/substrait/util.cc +++ b/cpp/src/arrow/engine/substrait/util.cc @@ -93,7 +93,7 @@ class SubstraitExecutor { return sink_consumer_; }; ARROW_ASSIGN_OR_RAISE(declarations_, - engine::DeserializePlan(substrait_buffer, consumer_factory)); + engine::DeserializePlans(substrait_buffer, consumer_factory)); return Status::OK(); }
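
For reference, the end-to-end flow these patches add can be exercised as in the sketch below, adapted from the test_run_serialized_query test above. This is a minimal sketch, assuming a PyArrow build configured with ARROW_SUBSTRAIT=ON; the tempfile handling stands in for pytest's tmpdir fixture, and the JSON plan follows the shape of the one in the test (the column name "foo" and single i64 field are taken from that test).

import os
import tempfile

import pyarrow as pa
import pyarrow.substrait as substrait
from pyarrow.lib import tobytes

substrait_query = """
{
  "relations": [
    {"rel": {
      "read": {
        "base_schema": {
          "struct": {"types": [{"i64": {}}]},
          "names": ["foo"]
        },
        "local_files": {
          "items": [
            {"uri_file": "file://FILENAME_PLACEHOLDER"}
          ]
        }
      }
    }}
  ]
}
"""

with tempfile.TemporaryDirectory() as tmpdir:
    # Write a single-column Arrow IPC file for the plan's read relation to scan.
    path = os.path.join(tmpdir, "substrait_data.arrow")
    table = pa.table([[1, 2, 3, 4, 5]], names=["foo"])
    with pa.ipc.RecordBatchFileWriter(path, schema=table.schema) as writer:
        writer.write_table(table)

    # Point the plan at the file, serialize the JSON plan into a Substrait buffer,
    # and execute it; run_query returns a RecordBatchReader over the results.
    query = tobytes(substrait_query.replace("FILENAME_PLACEHOLDER", path))
    buf = pa._substrait._parse_json_plan(query)
    reader = substrait.run_query(buf)
    result = reader.read_all()
    assert result.select(["foo"]) == table.select(["foo"])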