From 5ab85a24ece1d3b60fca56e4c6396564b202a1c7 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Thu, 15 Jul 2021 15:46:04 -0400 Subject: [PATCH 1/2] ARROW-11206: [C++][Compute][Python] Rename 'project' to 'make_struct' --- cpp/src/arrow/compute/api_scalar.cc | 25 +++---- cpp/src/arrow/compute/api_scalar.h | 12 ++-- cpp/src/arrow/compute/exec/expression.cc | 5 +- .../arrow/compute/exec/expression_internal.h | 7 +- cpp/src/arrow/compute/function_test.cc | 6 +- .../compute/kernels/scalar_if_else_test.cc | 8 +-- .../arrow/compute/kernels/scalar_nested.cc | 57 +++++++++------- .../compute/kernels/scalar_nested_test.cc | 65 +++++++++++-------- cpp/src/arrow/dataset/scanner.cc | 2 +- cpp/src/arrow/dataset/scanner_internal.h | 4 +- cpp/src/arrow/dataset/scanner_test.cc | 3 +- docs/source/cpp/compute.rst | 2 +- python/pyarrow/_compute.pyx | 6 +- python/pyarrow/compute.py | 2 +- python/pyarrow/includes/libarrow.pxd | 6 +- python/pyarrow/tests/test_compute.py | 2 +- 16 files changed, 115 insertions(+), 97 deletions(-) diff --git a/cpp/src/arrow/compute/api_scalar.cc b/cpp/src/arrow/compute/api_scalar.cc index 68df5f98b10..533e0e8cd0a 100644 --- a/cpp/src/arrow/compute/api_scalar.cc +++ b/cpp/src/arrow/compute/api_scalar.cc @@ -154,10 +154,10 @@ static auto kSliceOptionsType = GetFunctionOptionsType( DataMember("step", &SliceOptions::step)); static auto kCompareOptionsType = GetFunctionOptionsType(DataMember("op", &CompareOptions::op)); -static auto kProjectOptionsType = GetFunctionOptionsType( - DataMember("field_names", &ProjectOptions::field_names), - DataMember("field_nullability", &ProjectOptions::field_nullability), - DataMember("field_metadata", &ProjectOptions::field_metadata)); +static auto kMakeStructOptionsType = GetFunctionOptionsType( + DataMember("field_names", &MakeStructOptions::field_names), + DataMember("field_nullability", &MakeStructOptions::field_nullability), + DataMember("field_metadata", &MakeStructOptions::field_metadata)); static auto kDayOfWeekOptionsType = GetFunctionOptionsType( DataMember("one_based_numbering", &DayOfWeekOptions::one_based_numbering), DataMember("week_start", &DayOfWeekOptions::week_start)); @@ -265,21 +265,22 @@ CompareOptions::CompareOptions(CompareOperator op) CompareOptions::CompareOptions() : CompareOptions(CompareOperator::EQUAL) {} constexpr char CompareOptions::kTypeName[]; -ProjectOptions::ProjectOptions(std::vector n, std::vector r, - std::vector> m) - : FunctionOptions(internal::kProjectOptionsType), +MakeStructOptions::MakeStructOptions( + std::vector n, std::vector r, + std::vector> m) + : FunctionOptions(internal::kMakeStructOptionsType), field_names(std::move(n)), field_nullability(std::move(r)), field_metadata(std::move(m)) {} -ProjectOptions::ProjectOptions(std::vector n) - : FunctionOptions(internal::kProjectOptionsType), +MakeStructOptions::MakeStructOptions(std::vector n) + : FunctionOptions(internal::kMakeStructOptionsType), field_names(std::move(n)), field_nullability(field_names.size(), true), field_metadata(field_names.size(), NULLPTR) {} -ProjectOptions::ProjectOptions() : ProjectOptions(std::vector()) {} -constexpr char ProjectOptions::kTypeName[]; +MakeStructOptions::MakeStructOptions() : MakeStructOptions(std::vector()) {} +constexpr char MakeStructOptions::kTypeName[]; DayOfWeekOptions::DayOfWeekOptions(bool one_based_numbering, uint32_t week_start) : FunctionOptions(internal::kDayOfWeekOptionsType), @@ -304,7 +305,7 @@ void RegisterScalarOptions(FunctionRegistry* registry) { DCHECK_OK(registry->AddFunctionOptionsType(kTrimOptionsType)); DCHECK_OK(registry->AddFunctionOptionsType(kSliceOptionsType)); DCHECK_OK(registry->AddFunctionOptionsType(kCompareOptionsType)); - DCHECK_OK(registry->AddFunctionOptionsType(kProjectOptionsType)); + DCHECK_OK(registry->AddFunctionOptionsType(kMakeStructOptionsType)); DCHECK_OK(registry->AddFunctionOptionsType(kDayOfWeekOptionsType)); } } // namespace internal diff --git a/cpp/src/arrow/compute/api_scalar.h b/cpp/src/arrow/compute/api_scalar.h index bbaa4d13a21..1578bbf7236 100644 --- a/cpp/src/arrow/compute/api_scalar.h +++ b/cpp/src/arrow/compute/api_scalar.h @@ -226,13 +226,13 @@ class ARROW_EXPORT CompareOptions : public FunctionOptions { enum CompareOperator op; }; -class ARROW_EXPORT ProjectOptions : public FunctionOptions { +class ARROW_EXPORT MakeStructOptions : public FunctionOptions { public: - ProjectOptions(std::vector n, std::vector r, - std::vector> m); - explicit ProjectOptions(std::vector n); - ProjectOptions(); - constexpr static char const kTypeName[] = "ProjectOptions"; + MakeStructOptions(std::vector n, std::vector r, + std::vector> m); + explicit MakeStructOptions(std::vector n); + MakeStructOptions(); + constexpr static char const kTypeName[] = "MakeStructOptions"; /// Names for wrapped columns std::vector field_names; diff --git a/cpp/src/arrow/compute/exec/expression.cc b/cpp/src/arrow/compute/exec/expression.cc index bc9a9103f6d..4aab64a46a4 100644 --- a/cpp/src/arrow/compute/exec/expression.cc +++ b/cpp/src/arrow/compute/exec/expression.cc @@ -166,7 +166,7 @@ std::string Expression::ToString() const { return binary(std::move(op)); } - if (auto options = GetProjectOptions(*call)) { + if (auto options = GetMakeStructOptions(*call)) { std::string out = "{"; auto argument = call->arguments.begin(); for (const auto& field_name : options->field_names) { @@ -1122,7 +1122,8 @@ Result Deserialize(std::shared_ptr buffer) { } Expression project(std::vector values, std::vector names) { - return call("project", std::move(values), compute::ProjectOptions{std::move(names)}); + return call("make_struct", std::move(values), + compute::MakeStructOptions{std::move(names)}); } Expression equal(Expression lhs, Expression rhs) { diff --git a/cpp/src/arrow/compute/exec/expression_internal.h b/cpp/src/arrow/compute/exec/expression_internal.h index 51d242e8d66..dc38924d932 100644 --- a/cpp/src/arrow/compute/exec/expression_internal.h +++ b/cpp/src/arrow/compute/exec/expression_internal.h @@ -220,9 +220,10 @@ inline bool IsSetLookup(const std::string& function) { return function == "is_in" || function == "index_in"; } -inline const compute::ProjectOptions* GetProjectOptions(const Expression::Call& call) { - if (call.function_name != "project") return nullptr; - return checked_cast(call.options.get()); +inline const compute::MakeStructOptions* GetMakeStructOptions( + const Expression::Call& call) { + if (call.function_name != "make_struct") return nullptr; + return checked_cast(call.options.get()); } /// A helper for unboxing an Expression composed of associative function calls. diff --git a/cpp/src/arrow/compute/function_test.cc b/cpp/src/arrow/compute/function_test.cc index 752ade284b7..225f80736a6 100644 --- a/cpp/src/arrow/compute/function_test.cc +++ b/cpp/src/arrow/compute/function_test.cc @@ -86,10 +86,10 @@ TEST(FunctionOptions, Equality) { options.emplace_back(new CompareOptions(CompareOperator::EQUAL)); options.emplace_back(new CompareOptions(CompareOperator::LESS)); // N.B. we never actually use field_nullability or field_metadata in Arrow - options.emplace_back(new ProjectOptions({"col1"}, {true}, {})); - options.emplace_back(new ProjectOptions({"col1"}, {false}, {})); + options.emplace_back(new MakeStructOptions({"col1"}, {true}, {})); + options.emplace_back(new MakeStructOptions({"col1"}, {false}, {})); options.emplace_back( - new ProjectOptions({"col1"}, {false}, {key_value_metadata({{"key", "val"}})})); + new MakeStructOptions({"col1"}, {false}, {key_value_metadata({{"key", "val"}})})); options.emplace_back(new DayOfWeekOptions(false, 1)); options.emplace_back(new CastOptions(CastOptions::Safe(boolean()))); options.emplace_back(new CastOptions(CastOptions::Unsafe(int64()))); diff --git a/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc b/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc index cd2d04a13e0..48bdd39616e 100644 --- a/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc @@ -324,13 +324,7 @@ class TestCaseWhenNumeric : public ::testing::Test {}; TYPED_TEST_SUITE(TestCaseWhenNumeric, NumericBasedTypes); Datum MakeStruct(const std::vector& conds) { - ProjectOptions options; - options.field_names.resize(conds.size()); - options.field_metadata.resize(conds.size()); - for (const auto& datum : conds) { - options.field_nullability.push_back(datum.null_count() > 0); - } - EXPECT_OK_AND_ASSIGN(auto result, CallFunction("project", conds, &options)); + EXPECT_OK_AND_ASSIGN(auto result, CallFunction("make_struct", conds)); return result; } diff --git a/cpp/src/arrow/compute/kernels/scalar_nested.cc b/cpp/src/arrow/compute/kernels/scalar_nested.cc index e4ab3f9b418..e9f0696c8fd 100644 --- a/cpp/src/arrow/compute/kernels/scalar_nested.cc +++ b/cpp/src/arrow/compute/kernels/scalar_nested.cc @@ -62,15 +62,23 @@ const FunctionDoc list_value_length_doc{ "Null values emit a null in the output."), {"lists"}}; -Result ProjectResolve(KernelContext* ctx, - const std::vector& descrs) { - const auto& names = OptionsWrapper::Get(ctx).field_names; - const auto& nullable = OptionsWrapper::Get(ctx).field_nullability; - const auto& metadata = OptionsWrapper::Get(ctx).field_metadata; - - if (names.size() != descrs.size() || nullable.size() != descrs.size() || - metadata.size() != descrs.size()) { - return Status::Invalid("project() was passed ", descrs.size(), " arguments but ", +Result MakeStructResolve(KernelContext* ctx, + const std::vector& descrs) { + auto names = OptionsWrapper::Get(ctx).field_names; + auto nullable = OptionsWrapper::Get(ctx).field_nullability; + auto metadata = OptionsWrapper::Get(ctx).field_metadata; + + if (names.size() == 0) { + names.resize(descrs.size()); + nullable.resize(descrs.size(), true); + metadata.resize(descrs.size(), nullptr); + int i = 0; + for (auto& name : names) { + name = std::to_string(i++); + } + } else if (names.size() != descrs.size() || nullable.size() != descrs.size() || + metadata.size() != descrs.size()) { + return Status::Invalid("make_struct() was passed ", descrs.size(), " arguments but ", names.size(), " field names, ", nullable.size(), " nullability bits, and ", metadata.size(), " metadata dictionaries."); @@ -94,15 +102,16 @@ Result ProjectResolve(KernelContext* ctx, } } - fields[i] = field(names[i], descr.type, nullable[i], metadata[i]); + fields[i] = + field(std::move(names[i]), descr.type, nullable[i], std::move(metadata[i])); ++i; } return ValueDescr{struct_(std::move(fields)), shape}; } -Status ProjectExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { - ARROW_ASSIGN_OR_RAISE(auto descr, ProjectResolve(ctx, batch.GetDescriptors())); +Status MakeStructExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { + ARROW_ASSIGN_OR_RAISE(auto descr, MakeStructResolve(ctx, batch.GetDescriptors())); for (int i = 0; i < batch.num_values(); ++i) { const auto& field = checked_cast(*descr.type).field(i); @@ -139,11 +148,11 @@ Status ProjectExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { return Status::OK(); } -const FunctionDoc project_doc{"Wrap Arrays into a StructArray", - ("Names of the StructArray's fields are\n" - "specified through ProjectOptions."), - {"*args"}, - "ProjectOptions"}; +const FunctionDoc make_struct_doc{"Wrap Arrays into a StructArray", + ("Names of the StructArray's fields are\n" + "specified through MakeStructOptions."), + {"*args"}, + "MakeStructOptions"}; } // namespace @@ -156,15 +165,17 @@ void RegisterScalarNested(FunctionRegistry* registry) { ListValueLength)); DCHECK_OK(registry->AddFunction(std::move(list_value_length))); - auto project_function = - std::make_shared("project", Arity::VarArgs(), &project_doc); - ScalarKernel kernel{KernelSignature::Make({InputType{}}, OutputType{ProjectResolve}, + static MakeStructOptions kDefaultMakeStructOptions; + auto make_struct_function = std::make_shared( + "make_struct", Arity::VarArgs(), &make_struct_doc, &kDefaultMakeStructOptions); + + ScalarKernel kernel{KernelSignature::Make({InputType{}}, OutputType{MakeStructResolve}, /*is_varargs=*/true), - ProjectExec, OptionsWrapper::Init}; + MakeStructExec, OptionsWrapper::Init}; kernel.null_handling = NullHandling::OUTPUT_NOT_NULL; kernel.mem_allocation = MemAllocation::NO_PREALLOCATE; - DCHECK_OK(project_function->AddKernel(std::move(kernel))); - DCHECK_OK(registry->AddFunction(std::move(project_function))); + DCHECK_OK(make_struct_function->AddKernel(std::move(kernel))); + DCHECK_OK(registry->AddFunction(std::move(make_struct_function))); } } // namespace internal diff --git a/cpp/src/arrow/compute/kernels/scalar_nested_test.cc b/cpp/src/arrow/compute/kernels/scalar_nested_test.cc index 42de9bcdb50..ef489955fa6 100644 --- a/cpp/src/arrow/compute/kernels/scalar_nested_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_nested_test.cc @@ -22,6 +22,7 @@ #include "arrow/compute/kernels/test_util.h" #include "arrow/result.h" #include "arrow/testing/gtest_util.h" +#include "arrow/testing/matchers.h" #include "arrow/util/key_value_metadata.h" namespace arrow { @@ -39,48 +40,55 @@ TEST(TestScalarNested, ListValueLength) { } struct { + Result operator()(std::vector args) { + return CallFunction("make_struct", args); + } + template Result operator()(std::vector args, std::vector field_names, Options... options) { - ProjectOptions opts{field_names, options...}; - return CallFunction("project", args, &opts); + MakeStructOptions opts{field_names, options...}; + return CallFunction("make_struct", args, &opts); } -} Project; +} MakeStruct; -TEST(Project, Scalar) { +TEST(MakeStruct, Scalar) { auto i32 = MakeScalar(1); auto f64 = MakeScalar(2.5); auto str = MakeScalar("yo"); - ASSERT_OK_AND_ASSIGN(auto expected, - StructScalar::Make({i32, f64, str}, {"i", "f", "s"})); - ASSERT_OK_AND_EQ(Datum(expected), Project({i32, f64, str}, {"i", "f", "s"})); + EXPECT_THAT(MakeStruct({i32, f64, str}, {"i", "f", "s"}), + ResultWith(Datum(*StructScalar::Make({i32, f64, str}, {"i", "f", "s"})))); - // Three field names but one input value - ASSERT_RAISES(Invalid, Project({str}, {"i", "f", "s"})); + // Names default to field_index + EXPECT_THAT(MakeStruct({i32, f64, str}), + ResultWith(Datum(*StructScalar::Make({i32, f64, str}, {"0", "1", "2"})))); // No field names or input values is fine - expected.reset(new StructScalar{{}, struct_({})}); - ASSERT_OK_AND_EQ(Datum(expected), Project(/*args=*/{}, /*field_names=*/{})); + EXPECT_THAT(MakeStruct({}), ResultWith(Datum(*StructScalar::Make({}, {})))); + + // Three field names but one input value + EXPECT_THAT(MakeStruct({str}, {"i", "f", "s"}), Raises(StatusCode::Invalid)); } -TEST(Project, Array) { +TEST(MakeStruct, Array) { std::vector field_names{"i", "s"}; auto i32 = ArrayFromJSON(int32(), "[42, 13, 7]"); auto str = ArrayFromJSON(utf8(), R"(["aa", "aa", "aa"])"); - ASSERT_OK_AND_ASSIGN(Datum expected, StructArray::Make({i32, str}, field_names)); - ASSERT_OK_AND_EQ(expected, Project({i32, str}, field_names)); + EXPECT_THAT(MakeStruct({i32, str}, {"i", "s"}), + ResultWith(Datum(*StructArray::Make({i32, str}, field_names)))); // Scalars are broadcast to the length of the arrays - ASSERT_OK_AND_EQ(expected, Project({i32, MakeScalar("aa")}, field_names)); + EXPECT_THAT(MakeStruct({i32, MakeScalar("aa")}, {"i", "s"}), + ResultWith(Datum(*StructArray::Make({i32, str}, field_names)))); // Array length mismatch - ASSERT_RAISES(Invalid, Project({i32->Slice(1), str}, field_names)); + EXPECT_THAT(MakeStruct({i32->Slice(1), str}, field_names), Raises(StatusCode::Invalid)); } -TEST(Project, NullableMetadataPassedThru) { +TEST(MakeStruct, NullableMetadataPassedThru) { auto i32 = ArrayFromJSON(int32(), "[42, 13, 7]"); auto str = ArrayFromJSON(utf8(), R"(["aa", "aa", "aa"])"); @@ -90,7 +98,7 @@ TEST(Project, NullableMetadataPassedThru) { key_value_metadata({"a", "b"}, {"ALPHA", "BRAVO"}), nullptr}; ASSERT_OK_AND_ASSIGN(auto proj, - Project({i32, str}, field_names, nullability, metadata)); + MakeStruct({i32, str}, field_names, nullability, metadata)); AssertTypeEqual(*proj.type(), StructType({ field("i", int32(), /*nullable=*/true, metadata[0]), @@ -98,11 +106,12 @@ TEST(Project, NullableMetadataPassedThru) { })); // error: projecting an array containing nulls with nullable=false - str = ArrayFromJSON(utf8(), R"(["aa", null, "aa"])"); - ASSERT_RAISES(Invalid, Project({i32, str}, field_names, nullability, metadata)); + EXPECT_THAT(MakeStruct({i32, ArrayFromJSON(utf8(), R"(["aa", null, "aa"])")}, + field_names, nullability, metadata), + Raises(StatusCode::Invalid)); } -TEST(Project, ChunkedArray) { +TEST(MakeStruct, ChunkedArray) { std::vector field_names{"i", "s"}; auto i32_0 = ArrayFromJSON(int32(), "[42, 13, 7]"); @@ -122,16 +131,16 @@ TEST(Project, ChunkedArray) { ASSERT_OK_AND_ASSIGN(Datum expected, ChunkedArray::Make({expected_0, expected_1, expected_2})); - ASSERT_OK_AND_EQ(expected, Project({i32, str}, field_names)); + ASSERT_OK_AND_EQ(expected, MakeStruct({i32, str}, field_names)); // Scalars are broadcast to the length of the arrays - ASSERT_OK_AND_EQ(expected, Project({i32, MakeScalar("aa")}, field_names)); + ASSERT_OK_AND_EQ(expected, MakeStruct({i32, MakeScalar("aa")}, field_names)); // Array length mismatch - ASSERT_RAISES(Invalid, Project({i32->Slice(1), str}, field_names)); + ASSERT_RAISES(Invalid, MakeStruct({i32->Slice(1), str}, field_names)); } -TEST(Project, ChunkedArrayDifferentChunking) { +TEST(MakeStruct, ChunkedArrayDifferentChunking) { std::vector field_names{"i", "s"}; auto i32_0 = ArrayFromJSON(int32(), "[42, 13, 7]"); @@ -159,13 +168,13 @@ TEST(Project, ChunkedArrayDifferentChunking) { ASSERT_OK_AND_ASSIGN(Datum expected, ChunkedArray::Make(expected_chunks)); - ASSERT_OK_AND_EQ(expected, Project({i32, str}, field_names)); + ASSERT_OK_AND_EQ(expected, MakeStruct({i32, str}, field_names)); // Scalars are broadcast to the length of the arrays - ASSERT_OK_AND_EQ(expected, Project({i32, MakeScalar("aa")}, field_names)); + ASSERT_OK_AND_EQ(expected, MakeStruct({i32, MakeScalar("aa")}, field_names)); // Array length mismatch - ASSERT_RAISES(Invalid, Project({i32->Slice(1), str}, field_names)); + ASSERT_RAISES(Invalid, MakeStruct({i32->Slice(1), str}, field_names)); } } // namespace compute diff --git a/cpp/src/arrow/dataset/scanner.cc b/cpp/src/arrow/dataset/scanner.cc index 2f7a115bb4b..0a289985ca2 100644 --- a/cpp/src/arrow/dataset/scanner.cc +++ b/cpp/src/arrow/dataset/scanner.cc @@ -621,7 +621,7 @@ Result AsyncScanner::ScanBatchesUnorderedAsync( compute::MakeFilterNode(scan, "filter", scan_options_->filter)); auto exprs = scan_options_->projection.call()->arguments; - auto names = checked_cast( + auto names = checked_cast( scan_options_->projection.call()->options.get()) ->field_names; ARROW_ASSIGN_OR_RAISE( diff --git a/cpp/src/arrow/dataset/scanner_internal.h b/cpp/src/arrow/dataset/scanner_internal.h index 27b32aa6f19..a7ba070b2cf 100644 --- a/cpp/src/arrow/dataset/scanner_internal.h +++ b/cpp/src/arrow/dataset/scanner_internal.h @@ -225,7 +225,7 @@ inline Status SetProjection(ScanOptions* options, const compute::Expression& pro inline Status SetProjection(ScanOptions* options, std::vector exprs, std::vector names) { - compute::ProjectOptions project_options{std::move(names)}; + compute::MakeStructOptions project_options{std::move(names)}; for (size_t i = 0; i < exprs.size(); ++i) { if (auto ref = exprs[i].field_ref()) { @@ -239,7 +239,7 @@ inline Status SetProjection(ScanOptions* options, std::vector names) { diff --git a/cpp/src/arrow/dataset/scanner_test.cc b/cpp/src/arrow/dataset/scanner_test.cc index 74f558d1738..5dc83c662de 100644 --- a/cpp/src/arrow/dataset/scanner_test.cc +++ b/cpp/src/arrow/dataset/scanner_test.cc @@ -1362,7 +1362,8 @@ TEST(ScanNode, MinimalEndToEnd) { // just be a list of materialized field names) compute::Expression a_times_2 = call("multiply", {field_ref("a"), literal(2)}); ASSERT_OK_AND_ASSIGN(a_times_2, a_times_2.Bind(*dataset->schema())); - options->projection = call("project", {a_times_2}, compute::ProjectOptions{{"a * 2"}}); + options->projection = + call("make_struct", {a_times_2}, compute::MakeStructOptions{{"a * 2"}}); // construct the scan node ASSERT_OK_AND_ASSIGN(compute::ExecNode * scan, diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst index ed97faead74..3541b187ddd 100644 --- a/docs/source/cpp/compute.rst +++ b/docs/source/cpp/compute.rst @@ -920,7 +920,7 @@ Structural transforms (null if input is null). Output type is Int32 for List, Int64 for LargeList. * \(10) The output struct's field types are the types of its arguments. The - field names are specified using an instance of :struct:`ProjectOptions`. + field names are specified using an instance of :struct:`MakeStructOptions`. The output shape will be scalar if all inputs are scalar, otherwise any scalars will be broadcast to arrays. diff --git a/python/pyarrow/_compute.pyx b/python/pyarrow/_compute.pyx index d3267dc02d7..a16ea5d42ff 100644 --- a/python/pyarrow/_compute.pyx +++ b/python/pyarrow/_compute.pyx @@ -862,16 +862,16 @@ class PartitionNthOptions(_PartitionNthOptions): self._set_options(pivot) -cdef class _ProjectOptions(FunctionOptions): +cdef class _MakeStructOptions(FunctionOptions): def _set_options(self, field_names): cdef: vector[c_string] c_field_names for n in field_names: c_field_names.push_back(tobytes(n)) - self.wrapped.reset(new CProjectOptions(field_names)) + self.wrapped.reset(new CMakeStructOptions(field_names)) -class ProjectOptions(_ProjectOptions): +class MakeStructOptions(_MakeStructOptions): def __init__(self, field_names): self._set_options(field_names) diff --git a/python/pyarrow/compute.py b/python/pyarrow/compute.py index 15d1adcbafe..85f637fce5a 100644 --- a/python/pyarrow/compute.py +++ b/python/pyarrow/compute.py @@ -41,7 +41,7 @@ ModeOptions, PadOptions, PartitionNthOptions, - ProjectOptions, + MakeStructOptions, QuantileOptions, ReplaceSliceOptions, ReplaceSubstringOptions, diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 6977c26cac5..bd3bdb251f3 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -1971,9 +1971,9 @@ cdef extern from "arrow/compute/api.h" namespace "arrow::compute" nogil: CPartitionNthOptions(int64_t pivot) int64_t pivot - cdef cppclass CProjectOptions \ - "arrow::compute::ProjectOptions"(CFunctionOptions): - CProjectOptions(vector[c_string] field_names) + cdef cppclass CMakeStructOptions \ + "arrow::compute::MakeStructOptions"(CFunctionOptions): + CMakeStructOptions(vector[c_string] field_names) vector[c_string] field_names ctypedef enum CSortOrder" arrow::compute::SortOrder": diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py index b65970745ec..e17a7d1ceb2 100644 --- a/python/pyarrow/tests/test_compute.py +++ b/python/pyarrow/tests/test_compute.py @@ -119,7 +119,7 @@ def test_option_class_equality(): pc.MatchSubstringOptions("pattern"), pc.PadOptions(5, " "), pc.PartitionNthOptions(1), - pc.ProjectOptions([b"field", b"names"]), + pc.MakeStructOptions([b"field", b"names"]), pc.DayOfWeekOptions(False, 0), pc.ReplaceSliceOptions(start=0, stop=1, replacement="a"), pc.ReplaceSubstringOptions("a", "b"), From be7e9618a5c93512a18aa06b06a5e01aa82a34f6 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Thu, 15 Jul 2021 16:10:16 -0400 Subject: [PATCH 2/2] repair MakeStructOptions construction, add tests --- python/pyarrow/_compute.pyx | 2 +- python/pyarrow/tests/test_compute.py | 28 +++++++++++++++++++++++++++- 2 files changed, 28 insertions(+), 2 deletions(-) diff --git a/python/pyarrow/_compute.pyx b/python/pyarrow/_compute.pyx index a16ea5d42ff..46cfdc4e2ef 100644 --- a/python/pyarrow/_compute.pyx +++ b/python/pyarrow/_compute.pyx @@ -868,7 +868,7 @@ cdef class _MakeStructOptions(FunctionOptions): vector[c_string] c_field_names for n in field_names: c_field_names.push_back(tobytes(n)) - self.wrapped.reset(new CMakeStructOptions(field_names)) + self.wrapped.reset(new CMakeStructOptions(c_field_names)) class MakeStructOptions(_MakeStructOptions): diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py index e17a7d1ceb2..c98b3a224b8 100644 --- a/python/pyarrow/tests/test_compute.py +++ b/python/pyarrow/tests/test_compute.py @@ -119,7 +119,7 @@ def test_option_class_equality(): pc.MatchSubstringOptions("pattern"), pc.PadOptions(5, " "), pc.PartitionNthOptions(1), - pc.MakeStructOptions([b"field", b"names"]), + pc.MakeStructOptions(["field", "names"]), pc.DayOfWeekOptions(False, 0), pc.ReplaceSliceOptions(start=0, stop=1, replacement="a"), pc.ReplaceSubstringOptions("a", "b"), @@ -1645,3 +1645,29 @@ def test_min_max_element_wise(): assert result == pa.array([2, 3, None]) result = pc.min_element_wise(arr1, arr3, skip_nulls=False) assert result == pa.array([1, 2, None]) + + +def test_make_struct(): + assert pc.make_struct(1, 'a').as_py() == {'0': 1, '1': 'a'} + + assert pc.make_struct(1, 'a', field_names=['i', 's']).as_py() == { + 'i': 1, 's': 'a'} + + assert pc.make_struct([1, 2, 3], + "a b c".split()) == pa.StructArray.from_arrays([ + [1, 2, 3], + "a b c".split()], names='0 1'.split()) + + with pytest.raises(ValueError, match="Array arguments must all " + "be the same length"): + pc.make_struct([1, 2, 3, 4], "a b c".split()) + + with pytest.raises(ValueError, match="0 arguments but 2 field names"): + pc.make_struct(field_names=['one', 'two']) + + +def test_case_when(): + assert pc.case_when(pc.make_struct([True, False, None], + [False, True, None]), + [1, 2, 3], + [11, 12, 13]) == pa.array([1, 12, None])