From 8e318d02ee89b2a91cc0a71d8ec5208c2f61fed0 Mon Sep 17 00:00:00 2001 From: Simon Perkins Date: Wed, 9 Jun 2021 16:29:26 +0200 Subject: [PATCH 01/61] Complex Number Extension Type --- cpp/src/arrow/CMakeLists.txt | 1 + cpp/src/arrow/extension_type_test.cc | 28 +++++++- cpp/src/arrow/extensions/complex_type.cc | 88 ++++++++++++++++++++++++ cpp/src/arrow/extensions/complex_type.h | 44 ++++++++++++ cpp/src/arrow/python/numpy_convert.cc | 27 ++++++++ cpp/src/arrow/python/numpy_internal.h | 2 + cpp/src/arrow/python/python_to_arrow.cc | 17 +++++ 7 files changed, 206 insertions(+), 1 deletion(-) create mode 100644 cpp/src/arrow/extensions/complex_type.cc create mode 100644 cpp/src/arrow/extensions/complex_type.h diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index f6d5a540c98..333ca6b6b59 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -155,6 +155,7 @@ set(ARROW_SRCS array/diff.cc array/util.cc array/validate.cc + extensions/complex_type.cc builder.cc buffer.cc chunked_array.cc diff --git a/cpp/src/arrow/extension_type_test.cc b/cpp/src/arrow/extension_type_test.cc index cd1c3b9790e..2aba8512053 100644 --- a/cpp/src/arrow/extension_type_test.cc +++ b/cpp/src/arrow/extension_type_test.cc @@ -27,6 +27,7 @@ #include "arrow/array/array_nested.h" #include "arrow/array/util.h" #include "arrow/extension_type.h" +#include "arrow/extensions/complex_type.h" #include "arrow/io/memory.h" #include "arrow/ipc/options.h" #include "arrow/ipc/reader.h" @@ -178,15 +179,40 @@ class ExtStructType : public ExtensionType { class TestExtensionType : public ::testing::Test { public: - void SetUp() { ASSERT_OK(RegisterExtensionType(std::make_shared())); } + void SetUp() { + ASSERT_OK(RegisterExtensionType(std::make_shared())); + ASSERT_OK(RegisterExtensionType(std::make_shared(float32()))); + } void TearDown() { if (GetExtensionType("uuid")) { ASSERT_OK(UnregisterExtensionType("uuid")); } + if (GetExtensionType("complex")) { + 
ASSERT_OK(UnregisterExtensionType("complex")); + } } }; +TEST_F(TestExtensionType, ComplexTypeTest) { + auto registered_type = GetExtensionType("complex"); + ASSERT_NE(registered_type, nullptr); + + auto type = complex64(); + ASSERT_EQ(type->id(), Type::EXTENSION); + + const auto & ext_type = static_cast(*type); + std::string serialized = ext_type.Serialize(); + + ASSERT_OK_AND_ASSIGN(auto deserialized, + ext_type.Deserialize(fixed_size_list(float32(), 2), serialized)); + + ASSERT_TRUE(deserialized->Equals(*type)); + ASSERT_FALSE(deserialized->Equals(*fixed_size_list(float32(), 2))); + + //auto type2 = complex(int16()); +} + TEST_F(TestExtensionType, ExtensionTypeTest) { auto type_not_exist = GetExtensionType("uuid-unknown"); ASSERT_EQ(type_not_exist, nullptr); diff --git a/cpp/src/arrow/extensions/complex_type.cc b/cpp/src/arrow/extensions/complex_type.cc new file mode 100644 index 00000000000..8a62e40879f --- /dev/null +++ b/cpp/src/arrow/extensions/complex_type.cc @@ -0,0 +1,88 @@ +#include + +#include + +namespace arrow { + +std::shared_ptr ComplexType::MakeType(std::shared_ptr subtype) +{ + return fixed_size_list(FloatCast(subtype), 2); +} + +std::shared_ptr ComplexType::FloatCast(std::shared_ptr subtype) +{ + auto float_type = std::dynamic_pointer_cast(subtype); + + if(!float_type) + { + throw std::runtime_error("ComplexType subtype not floating point"); + } + + if(float_type->precision() != FloatingPointType::SINGLE && + float_type->precision() != FloatingPointType::DOUBLE) + { + throw std::runtime_error("Complex subtype must be single or double precision"); + } + + return float_type; +} + + +std::string ComplexType::name() const { + std::stringstream ss("complex"); + + switch(subtype()->precision()) + { + case FloatingPointType::SINGLE: + ss << "64"; + break; + case FloatingPointType::DOUBLE: + ss << "128"; + break; + case FloatingPointType::HALF: + default: + throw std::runtime_error("Complex Type must be single or double precision"); + break; + } + + 
return ss.str(); +} + +std::string ComplexType::extension_name() const { + return "complex"; +} + + +bool ComplexType::ExtensionEquals(const ExtensionType& other) const { + const auto& other_ext = static_cast(other); + if (other_ext.extension_name() != this->extension_name()) { + return false; + } + return this->subtype() == static_cast(other).subtype(); +} + +Result> ComplexType::Deserialize( + std::shared_ptr storage_type, + const std::string& serialized) const { + + auto ltype = std::static_pointer_cast(storage_type); + return std::make_shared(ltype->value_type()); +} + +std::string ComplexType::Serialize() const { +return ""; +} + +std::shared_ptr complex(std::shared_ptr subtype) { + return std::make_shared(subtype); +} + +std::shared_ptr complex64() { + return std::make_shared(float32()); +} + +std::shared_ptr complex128() { + return std::make_shared(float64()); +} + +}; \ No newline at end of file diff --git a/cpp/src/arrow/extensions/complex_type.h b/cpp/src/arrow/extensions/complex_type.h new file mode 100644 index 00000000000..076ef744da1 --- /dev/null +++ b/cpp/src/arrow/extensions/complex_type.h @@ -0,0 +1,44 @@ +#include "arrow/extension_type.h" + +namespace arrow { + +class ComplexArray : public ExtensionArray { + public: + using ExtensionArray::ExtensionArray; +}; + +class ComplexType : public ExtensionType { + private: + std::shared_ptr subtype_; + + static std::shared_ptr MakeType(std::shared_ptr subtype); + static std::shared_ptr FloatCast(std::shared_ptr subtype); + + public: + explicit ComplexType(std::shared_ptr subtype) : + ExtensionType(MakeType(subtype)), + subtype_(FloatCast(subtype)) {} + + std::shared_ptr subtype() const { return subtype_; } + std::string name() const override; + std::string extension_name() const override; + + bool ExtensionEquals(const ExtensionType& other) const override; + + std::shared_ptr MakeArray(std::shared_ptr data) const override { + return std::make_shared(data); + } + + Result> Deserialize( + std::shared_ptr 
storage_type, + const std::string& serialized) const override; + + std::string Serialize() const override; +}; + +std::shared_ptr complex(std::shared_ptr subtype); +std::shared_ptr complex64(); +std::shared_ptr complex128(); + + +}; \ No newline at end of file diff --git a/cpp/src/arrow/python/numpy_convert.cc b/cpp/src/arrow/python/numpy_convert.cc index bf4afb2a0a1..8202331cddf 100644 --- a/cpp/src/arrow/python/numpy_convert.cc +++ b/cpp/src/arrow/python/numpy_convert.cc @@ -30,6 +30,8 @@ #include "arrow/type.h" #include "arrow/util/logging.h" +#include "arrow/extensions/complex_type.h" + #include "arrow/python/common.h" #include "arrow/python/pyarrow.h" #include "arrow/python/type_traits.h" @@ -84,6 +86,12 @@ Status GetTensorType(PyObject* dtype, std::shared_ptr* out) { TO_ARROW_TYPE_CASE(FLOAT16, float16); TO_ARROW_TYPE_CASE(FLOAT32, float32); TO_ARROW_TYPE_CASE(FLOAT64, float64); + case NPY_COMPLEX64: + *out = complex(float32()); + break; + case NPY_COMPLEX128: + *out = complex(float64()); + break; default: { return Status::NotImplemented("Unsupported numpy type ", descr->type_num); } @@ -109,6 +117,23 @@ Status GetNumPyType(const DataType& type, int* type_num) { NUMPY_TYPE_CASE(HALF_FLOAT, FLOAT16); NUMPY_TYPE_CASE(FLOAT, FLOAT32); NUMPY_TYPE_CASE(DOUBLE, FLOAT64); + case Type::EXTENSION: { + const auto * ptr = dynamic_cast(&type); + + if(ptr == nullptr) { + // continue into the default branch + } else if(ptr->subtype()->Equals(float32())) { + *type_num = NPY_COMPLEX64; + break; + } else if(ptr->subtype()->Equals(float64())) { + *type_num = NPY_COMPLEX128; + break; + } else { + return Status::NotImplemented("Unsupported complex tensor type: ", ptr->ToString()); + break; + } + } + default: { return Status::NotImplemented("Unsupported tensor type: ", type.ToString()); } @@ -144,6 +169,8 @@ Status NumPyDtypeToArrow(PyArray_Descr* descr, std::shared_ptr* out) { TO_ARROW_TYPE_CASE(FLOAT16, float16); TO_ARROW_TYPE_CASE(FLOAT32, float32); TO_ARROW_TYPE_CASE(FLOAT64, 
float64); + TO_ARROW_TYPE_CASE(COMPLEX64, complex64); + TO_ARROW_TYPE_CASE(COMPLEX128, complex128); TO_ARROW_TYPE_CASE(STRING, binary); TO_ARROW_TYPE_CASE(UNICODE, utf8); case NPY_DATETIME: { diff --git a/cpp/src/arrow/python/numpy_internal.h b/cpp/src/arrow/python/numpy_internal.h index f43599eb3eb..58f61d5899e 100644 --- a/cpp/src/arrow/python/numpy_internal.h +++ b/cpp/src/arrow/python/numpy_internal.h @@ -102,6 +102,8 @@ static inline std::string GetNumPyTypeName(int npy_type) { TYPE_CASE(FLOAT16, "float16") TYPE_CASE(FLOAT32, "float32") TYPE_CASE(FLOAT64, "float64") + TYPE_CASE(COMPLEX64, "complex64") + TYPE_CASE(COMPLEX128, "complex128") TYPE_CASE(DATETIME, "datetime64") TYPE_CASE(TIMEDELTA, "timedelta64") TYPE_CASE(OBJECT, "object") diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc index b2d9f1cb5a3..bf45142f1f7 100644 --- a/cpp/src/arrow/python/python_to_arrow.cc +++ b/cpp/src/arrow/python/python_to_arrow.cc @@ -21,6 +21,7 @@ #include #include +#include #include #include #include @@ -33,6 +34,7 @@ #include "arrow/array/builder_dict.h" #include "arrow/array/builder_nested.h" #include "arrow/array/builder_primitive.h" +#include "arrow/extensions/complex_type.h" #include "arrow/chunked_array.h" #include "arrow/status.h" #include "arrow/type.h" @@ -164,6 +166,21 @@ class PyValue { return value; } + static Result> Convert(const ComplexType*, const O&, I obj) { + std::complex value; + + if (PyComplex_Check(obj)) { + value = std::complex( + PyComplex_RealAsDouble(obj), + PyComplex_ImagAsDouble(obj)); + RETURN_IF_PYERROR(); + } else { + return internal::InvalidValue(obj, "tried to convert to std::complex"); + } + + return value; + }; + static Result Convert(const Decimal128Type* type, const O&, I obj) { Decimal128 value; RETURN_NOT_OK(internal::DecimalFromPyObject(obj, *type, &value)); From bd2cc0b9f0c984c0a708a025955f380fc0894d7f Mon Sep 17 00:00:00 2001 From: Simon Perkins Date: Wed, 9 Jun 2021 17:05:02 +0200 Subject: 
[PATCH 02/61] lint --- cpp/src/arrow/extension_type_test.cc | 4 +- cpp/src/arrow/extensions/complex_type.cc | 98 ++++++++++-------------- cpp/src/arrow/extensions/complex_type.h | 28 ++++--- cpp/src/arrow/python/numpy_convert.cc | 11 +-- cpp/src/arrow/python/python_to_arrow.cc | 7 +- 5 files changed, 66 insertions(+), 82 deletions(-) diff --git a/cpp/src/arrow/extension_type_test.cc b/cpp/src/arrow/extension_type_test.cc index 2aba8512053..e2094e84cc0 100644 --- a/cpp/src/arrow/extension_type_test.cc +++ b/cpp/src/arrow/extension_type_test.cc @@ -201,7 +201,7 @@ TEST_F(TestExtensionType, ComplexTypeTest) { auto type = complex64(); ASSERT_EQ(type->id(), Type::EXTENSION); - const auto & ext_type = static_cast(*type); + const auto& ext_type = static_cast(*type); std::string serialized = ext_type.Serialize(); ASSERT_OK_AND_ASSIGN(auto deserialized, @@ -210,7 +210,7 @@ TEST_F(TestExtensionType, ComplexTypeTest) { ASSERT_TRUE(deserialized->Equals(*type)); ASSERT_FALSE(deserialized->Equals(*fixed_size_list(float32(), 2))); - //auto type2 = complex(int16()); + // auto type2 = complex(int16()); } TEST_F(TestExtensionType, ExtensionTypeTest) { diff --git a/cpp/src/arrow/extensions/complex_type.cc b/cpp/src/arrow/extensions/complex_type.cc index 8a62e40879f..6b1a1bd6b21 100644 --- a/cpp/src/arrow/extensions/complex_type.cc +++ b/cpp/src/arrow/extensions/complex_type.cc @@ -4,85 +4,71 @@ namespace arrow { -std::shared_ptr ComplexType::MakeType(std::shared_ptr subtype) -{ - return fixed_size_list(FloatCast(subtype), 2); +std::shared_ptr ComplexType::MakeType(std::shared_ptr subtype) { + return fixed_size_list(FloatCast(subtype), 2); } -std::shared_ptr ComplexType::FloatCast(std::shared_ptr subtype) -{ - auto float_type = std::dynamic_pointer_cast(subtype); +std::shared_ptr ComplexType::FloatCast( + std::shared_ptr subtype) { + auto float_type = std::dynamic_pointer_cast(subtype); - if(!float_type) - { - throw std::runtime_error("ComplexType subtype not floating point"); - } + 
if (!float_type) { + throw std::runtime_error("ComplexType subtype not floating point"); + } - if(float_type->precision() != FloatingPointType::SINGLE && - float_type->precision() != FloatingPointType::DOUBLE) - { - throw std::runtime_error("Complex subtype must be single or double precision"); - } + if (float_type->precision() != FloatingPointType::SINGLE && + float_type->precision() != FloatingPointType::DOUBLE) { + throw std::runtime_error("Complex subtype must be single or double precision"); + } - return float_type; + return float_type; } - std::string ComplexType::name() const { - std::stringstream ss("complex"); - - switch(subtype()->precision()) - { - case FloatingPointType::SINGLE: - ss << "64"; - break; - case FloatingPointType::DOUBLE: - ss << "128"; - break; - case FloatingPointType::HALF: - default: - throw std::runtime_error("Complex Type must be single or double precision"); - break; - } - - return ss.str(); -} - -std::string ComplexType::extension_name() const { - return "complex"; + std::stringstream ss("complex"); + + switch (subtype()->precision()) { + case FloatingPointType::SINGLE: + ss << "64"; + break; + case FloatingPointType::DOUBLE: + ss << "128"; + break; + case FloatingPointType::HALF: + default: + throw std::runtime_error("Complex Type must be single or double precision"); + break; + } + + return ss.str(); } +std::string ComplexType::extension_name() const { return "complex"; } bool ComplexType::ExtensionEquals(const ExtensionType& other) const { - const auto& other_ext = static_cast(other); - if (other_ext.extension_name() != this->extension_name()) { - return false; - } - return this->subtype() == static_cast(other).subtype(); + const auto& other_ext = static_cast(other); + if (other_ext.extension_name() != this->extension_name()) { + return false; + } + return this->subtype() == static_cast(other).subtype(); } Result> ComplexType::Deserialize( - std::shared_ptr storage_type, - const std::string& serialized) const { - - auto ltype = 
std::static_pointer_cast(storage_type); - return std::make_shared(ltype->value_type()); + std::shared_ptr storage_type, const std::string& serialized) const { + auto ltype = std::static_pointer_cast(storage_type); + return std::make_shared(ltype->value_type()); } -std::string ComplexType::Serialize() const { -return ""; -} +std::string ComplexType::Serialize() const { return ""; } std::shared_ptr complex(std::shared_ptr subtype) { return std::make_shared(subtype); } -std::shared_ptr complex64() { - return std::make_shared(float32()); -} +std::shared_ptr complex64() { return std::make_shared(float32()); } std::shared_ptr complex128() { - return std::make_shared(float64()); + return std::make_shared(float64()); } -}; \ No newline at end of file +}; // namespace arrow \ No newline at end of file diff --git a/cpp/src/arrow/extensions/complex_type.h b/cpp/src/arrow/extensions/complex_type.h index 076ef744da1..1b813c421c0 100644 --- a/cpp/src/arrow/extensions/complex_type.h +++ b/cpp/src/arrow/extensions/complex_type.h @@ -3,25 +3,24 @@ namespace arrow { class ComplexArray : public ExtensionArray { - public: - using ExtensionArray::ExtensionArray; + public: + using ExtensionArray::ExtensionArray; }; class ComplexType : public ExtensionType { - private: - std::shared_ptr subtype_; + private: + std::shared_ptr subtype_; - static std::shared_ptr MakeType(std::shared_ptr subtype); - static std::shared_ptr FloatCast(std::shared_ptr subtype); + static std::shared_ptr MakeType(std::shared_ptr subtype); + static std::shared_ptr FloatCast(std::shared_ptr subtype); - public: - explicit ComplexType(std::shared_ptr subtype) : - ExtensionType(MakeType(subtype)), - subtype_(FloatCast(subtype)) {} + public: + explicit ComplexType(std::shared_ptr subtype) + : ExtensionType(MakeType(subtype)), subtype_(FloatCast(subtype)) {} - std::shared_ptr subtype() const { return subtype_; } - std::string name() const override; - std::string extension_name() const override; + std::shared_ptr 
subtype() const { return subtype_; } + std::string name() const override; + std::string extension_name() const override; bool ExtensionEquals(const ExtensionType& other) const override; @@ -40,5 +39,4 @@ std::shared_ptr complex(std::shared_ptr subtype); std::shared_ptr complex64(); std::shared_ptr complex128(); - -}; \ No newline at end of file +}; // namespace arrow \ No newline at end of file diff --git a/cpp/src/arrow/python/numpy_convert.cc b/cpp/src/arrow/python/numpy_convert.cc index 8202331cddf..241b0323513 100644 --- a/cpp/src/arrow/python/numpy_convert.cc +++ b/cpp/src/arrow/python/numpy_convert.cc @@ -118,18 +118,19 @@ Status GetNumPyType(const DataType& type, int* type_num) { NUMPY_TYPE_CASE(FLOAT, FLOAT32); NUMPY_TYPE_CASE(DOUBLE, FLOAT64); case Type::EXTENSION: { - const auto * ptr = dynamic_cast(&type); + const auto* ptr = dynamic_cast(&type); - if(ptr == nullptr) { + if (ptr == nullptr) { // continue into the default branch - } else if(ptr->subtype()->Equals(float32())) { + } else if (ptr->subtype()->Equals(float32())) { *type_num = NPY_COMPLEX64; break; - } else if(ptr->subtype()->Equals(float64())) { + } else if (ptr->subtype()->Equals(float64())) { *type_num = NPY_COMPLEX128; break; } else { - return Status::NotImplemented("Unsupported complex tensor type: ", ptr->ToString()); + return Status::NotImplemented("Unsupported complex tensor type: ", + ptr->ToString()); break; } } diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc index bf45142f1f7..3739508b45f 100644 --- a/cpp/src/arrow/python/python_to_arrow.cc +++ b/cpp/src/arrow/python/python_to_arrow.cc @@ -34,8 +34,8 @@ #include "arrow/array/builder_dict.h" #include "arrow/array/builder_nested.h" #include "arrow/array/builder_primitive.h" -#include "arrow/extensions/complex_type.h" #include "arrow/chunked_array.h" +#include "arrow/extensions/complex_type.h" #include "arrow/status.h" #include "arrow/type.h" #include "arrow/type_traits.h" @@ -170,9 +170,8 
@@ class PyValue { std::complex value; if (PyComplex_Check(obj)) { - value = std::complex( - PyComplex_RealAsDouble(obj), - PyComplex_ImagAsDouble(obj)); + value = + std::complex(PyComplex_RealAsDouble(obj), PyComplex_ImagAsDouble(obj)); RETURN_IF_PYERROR(); } else { return internal::InvalidValue(obj, "tried to convert to std::complex"); From 2b6f0d106bd7d7864005a6604c077bf50a1aedb1 Mon Sep 17 00:00:00 2001 From: Simon Perkins Date: Wed, 9 Jun 2021 17:35:22 +0200 Subject: [PATCH 03/61] more linting --- cpp/src/arrow/extensions/complex_type.cc | 2 +- cpp/src/arrow/extensions/complex_type.h | 2 +- cpp/src/arrow/python/python_to_arrow.cc | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/extensions/complex_type.cc b/cpp/src/arrow/extensions/complex_type.cc index 6b1a1bd6b21..a8a6be9f0cc 100644 --- a/cpp/src/arrow/extensions/complex_type.cc +++ b/cpp/src/arrow/extensions/complex_type.cc @@ -71,4 +71,4 @@ std::shared_ptr complex128() { return std::make_shared(float64()); } -}; // namespace arrow \ No newline at end of file +}; // namespace arrow diff --git a/cpp/src/arrow/extensions/complex_type.h b/cpp/src/arrow/extensions/complex_type.h index 1b813c421c0..de0305392b1 100644 --- a/cpp/src/arrow/extensions/complex_type.h +++ b/cpp/src/arrow/extensions/complex_type.h @@ -39,4 +39,4 @@ std::shared_ptr complex(std::shared_ptr subtype); std::shared_ptr complex64(); std::shared_ptr complex128(); -}; // namespace arrow \ No newline at end of file +}; // namespace arrow diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc index 3739508b45f..cb40ca827aa 100644 --- a/cpp/src/arrow/python/python_to_arrow.cc +++ b/cpp/src/arrow/python/python_to_arrow.cc @@ -178,7 +178,7 @@ class PyValue { } return value; - }; + } static Result Convert(const Decimal128Type* type, const O&, I obj) { Decimal128 value; From 1db170d96424a4cfe1b7312a1ae814f4d3b50816 Mon Sep 17 00:00:00 2001 From: Simon Perkins Date: Wed, 9 Jun 
2021 17:54:52 +0200 Subject: [PATCH 04/61] Copyright headers --- cpp/src/arrow/extensions/complex_type.cc | 19 +++++++++++++++++++ cpp/src/arrow/extensions/complex_type.h | 19 +++++++++++++++++++ 2 files changed, 38 insertions(+) diff --git a/cpp/src/arrow/extensions/complex_type.cc b/cpp/src/arrow/extensions/complex_type.cc index a8a6be9f0cc..8e8d21f0e09 100644 --- a/cpp/src/arrow/extensions/complex_type.cc +++ b/cpp/src/arrow/extensions/complex_type.cc @@ -1,3 +1,22 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Complex Number Extension Type + #include #include diff --git a/cpp/src/arrow/extensions/complex_type.h b/cpp/src/arrow/extensions/complex_type.h index de0305392b1..97ce3f9afbe 100644 --- a/cpp/src/arrow/extensions/complex_type.h +++ b/cpp/src/arrow/extensions/complex_type.h @@ -1,3 +1,22 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Complex Number Extension Type + #include "arrow/extension_type.h" namespace arrow { From e4b930929faf4cb6a770e6194a240bc48576ad9a Mon Sep 17 00:00:00 2001 From: Simon Perkins Date: Mon, 21 Jun 2021 14:51:21 +0200 Subject: [PATCH 05/61] [skip ci] WIP --- cpp/src/arrow/extension_type_test.cc | 12 ++-- cpp/src/arrow/extensions/complex_type.cc | 66 ++++----------------- cpp/src/arrow/extensions/complex_type.h | 75 ++++++++++++++++++------ cpp/src/arrow/testing/gtest_util.cc | 2 + cpp/src/arrow/type.cc | 2 + cpp/src/arrow/type_fwd.h | 16 +++++ cpp/src/arrow/visitor_inline.h | 2 + 7 files changed, 97 insertions(+), 78 deletions(-) diff --git a/cpp/src/arrow/extension_type_test.cc b/cpp/src/arrow/extension_type_test.cc index e2094e84cc0..b20c30c6a54 100644 --- a/cpp/src/arrow/extension_type_test.cc +++ b/cpp/src/arrow/extension_type_test.cc @@ -181,21 +181,25 @@ class TestExtensionType : public ::testing::Test { public: void SetUp() { ASSERT_OK(RegisterExtensionType(std::make_shared())); - ASSERT_OK(RegisterExtensionType(std::make_shared(float32()))); + ASSERT_OK(RegisterExtensionType(std::make_shared())); + ASSERT_OK(RegisterExtensionType(std::make_shared())); } void TearDown() { if (GetExtensionType("uuid")) { ASSERT_OK(UnregisterExtensionType("uuid")); } - if (GetExtensionType("complex")) { - ASSERT_OK(UnregisterExtensionType("complex")); + if (GetExtensionType("arrow.complex64")) { + ASSERT_OK(UnregisterExtensionType("arrow.complex64")); + } + if (GetExtensionType("arrow.complex128")) { + ASSERT_OK(UnregisterExtensionType("arrow.complex128")); } 
} }; TEST_F(TestExtensionType, ComplexTypeTest) { - auto registered_type = GetExtensionType("complex"); + auto registered_type = GetExtensionType("arrow.complex64"); ASSERT_NE(registered_type, nullptr); auto type = complex64(); diff --git a/cpp/src/arrow/extensions/complex_type.cc b/cpp/src/arrow/extensions/complex_type.cc index 8e8d21f0e09..30ce7442062 100644 --- a/cpp/src/arrow/extensions/complex_type.cc +++ b/cpp/src/arrow/extensions/complex_type.cc @@ -19,75 +19,29 @@ #include -#include +#include "arrow/extensions/complex_type.h" namespace arrow { -std::shared_ptr ComplexType::MakeType(std::shared_ptr subtype) { - return fixed_size_list(FloatCast(subtype), 2); -} - -std::shared_ptr ComplexType::FloatCast( - std::shared_ptr subtype) { - auto float_type = std::dynamic_pointer_cast(subtype); - - if (!float_type) { - throw std::runtime_error("ComplexType subtype not floating point"); - } - - if (float_type->precision() != FloatingPointType::SINGLE && - float_type->precision() != FloatingPointType::DOUBLE) { - throw std::runtime_error("Complex subtype must be single or double precision"); - } - - return float_type; -} - -std::string ComplexType::name() const { - std::stringstream ss("complex"); - switch (subtype()->precision()) { - case FloatingPointType::SINGLE: - ss << "64"; - break; - case FloatingPointType::DOUBLE: - ss << "128"; - break; - case FloatingPointType::HALF: - default: - throw std::runtime_error("Complex Type must be single or double precision"); - break; - } - - return ss.str(); -} - -std::string ComplexType::extension_name() const { return "complex"; } - -bool ComplexType::ExtensionEquals(const ExtensionType& other) const { +bool ComplexFloatType::ExtensionEquals(const ExtensionType& other) const { const auto& other_ext = static_cast(other); - if (other_ext.extension_name() != this->extension_name()) { - return false; - } - return this->subtype() == static_cast(other).subtype(); + return other_ext.extension_name() == this->extension_name(); } 
-Result> ComplexType::Deserialize( - std::shared_ptr storage_type, const std::string& serialized) const { - auto ltype = std::static_pointer_cast(storage_type); - return std::make_shared(ltype->value_type()); +bool ComplexDoubleType::ExtensionEquals(const ExtensionType& other) const { + const auto& other_ext = static_cast(other); + return other_ext.extension_name() == this->extension_name(); } -std::string ComplexType::Serialize() const { return ""; } -std::shared_ptr complex(std::shared_ptr subtype) { - return std::make_shared(subtype); +std::shared_ptr complex64() { + return std::make_shared(); } -std::shared_ptr complex64() { return std::make_shared(float32()); } - std::shared_ptr complex128() { - return std::make_shared(float64()); + return std::make_shared(); } + }; // namespace arrow diff --git a/cpp/src/arrow/extensions/complex_type.h b/cpp/src/arrow/extensions/complex_type.h index 97ce3f9afbe..8b868fad90a 100644 --- a/cpp/src/arrow/extensions/complex_type.h +++ b/cpp/src/arrow/extensions/complex_type.h @@ -21,41 +21,80 @@ namespace arrow { -class ComplexArray : public ExtensionArray { + +std::shared_ptr complex64(); +std::shared_ptr complex128(); + + +class ComplexFloatArray : public ExtensionArray { public: using ExtensionArray::ExtensionArray; }; -class ComplexType : public ExtensionType { - private: - std::shared_ptr subtype_; +class ComplexFloatType : public ExtensionType { + public: + explicit ComplexFloatType() + : ExtensionType(fixed_size_list(float32(), 2)) {} + + std::string name() const override { + return "complex64"; + } + + std::string extension_name() const override { + return "arrow.complex64"; + } + + bool ExtensionEquals(const ExtensionType& other) const override; + + std::shared_ptr MakeArray(std::shared_ptr data) const override { + return std::make_shared(data); + } + + Result> Deserialize( + std::shared_ptr storage_type, + const std::string& serialized) const override { + return complex64(); + }; + + std::string Serialize() const 
override { + return ""; + } +}; + - static std::shared_ptr MakeType(std::shared_ptr subtype); - static std::shared_ptr FloatCast(std::shared_ptr subtype); +class ComplexDoubleArray : public ExtensionArray { + public: + using ExtensionArray::ExtensionArray; +}; +class ComplexDoubleType : public ExtensionType { public: - explicit ComplexType(std::shared_ptr subtype) - : ExtensionType(MakeType(subtype)), subtype_(FloatCast(subtype)) {} + explicit ComplexDoubleType() + : ExtensionType(fixed_size_list(float64(), 2)) {} + + std::string name() const override { + return "complex128"; + } - std::shared_ptr subtype() const { return subtype_; } - std::string name() const override; - std::string extension_name() const override; + std::string extension_name() const override { + return "arrow.complex128"; + } bool ExtensionEquals(const ExtensionType& other) const override; std::shared_ptr MakeArray(std::shared_ptr data) const override { - return std::make_shared(data); + return std::make_shared(data); } Result> Deserialize( std::shared_ptr storage_type, - const std::string& serialized) const override; + const std::string& serialized) const override { + return complex128(); + }; - std::string Serialize() const override; + std::string Serialize() const override { + return ""; + } }; -std::shared_ptr complex(std::shared_ptr subtype); -std::shared_ptr complex64(); -std::shared_ptr complex128(); - }; // namespace arrow diff --git a/cpp/src/arrow/testing/gtest_util.cc b/cpp/src/arrow/testing/gtest_util.cc index eb0edd56566..85c1129e3a8 100644 --- a/cpp/src/arrow/testing/gtest_util.cc +++ b/cpp/src/arrow/testing/gtest_util.cc @@ -73,6 +73,8 @@ std::vector AllTypeIds() { Type::HALF_FLOAT, Type::FLOAT, Type::DOUBLE, + Type::COMPLEX_FLOAT, + Type::COMPLEX_DOUBLE, Type::DECIMAL128, Type::DECIMAL256, Type::DATE32, diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index 344585446fc..4489c116e6d 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -130,6 +130,8 @@ 
std::string ToString(Type::type id) { TO_STRING_CASE(HALF_FLOAT) TO_STRING_CASE(FLOAT) TO_STRING_CASE(DOUBLE) + TO_STRING_CASE(COMPLEX_FLOAT) + TO_STRING_CASE(COMPLEX_DOUBLE) TO_STRING_CASE(DECIMAL128) TO_STRING_CASE(DECIMAL256) TO_STRING_CASE(DATE32) diff --git a/cpp/src/arrow/type_fwd.h b/cpp/src/arrow/type_fwd.h index 7e564106bbe..14d0b7480ad 100644 --- a/cpp/src/arrow/type_fwd.h +++ b/cpp/src/arrow/type_fwd.h @@ -207,6 +207,12 @@ _NUMERIC_TYPE_DECL(HalfFloat) _NUMERIC_TYPE_DECL(Float) _NUMERIC_TYPE_DECL(Double) +class ComplexFloatType; +class ComplexFloatArray; + +class ComplexDoubleType; +class ComplexDoubleArray; + #undef _NUMERIC_TYPE_DECL enum class DateUnit : char { DAY = 0, MILLI = 1 }; @@ -394,6 +400,12 @@ struct Type { /// Like LIST, but with 64-bit offsets LARGE_LIST, + // Single-precision 32-bit complex numbers + COMPLEX_FLOAT, + + // Double-precision 64-bit complex numbers + COMPLEX_DOUBLE, + // Leave this at the end MAX_ID }; @@ -430,6 +442,10 @@ std::shared_ptr ARROW_EXPORT float16(); std::shared_ptr ARROW_EXPORT float32(); /// \brief Return a DoubleType instance std::shared_ptr ARROW_EXPORT float64(); +/// \brief Return a ComplexFloatType instance +std::shared_ptr ARROW_EXPORT complex64(); +/// \brief Return a ComplexDoubleType instance +std::shared_ptr ARROW_EXPORT complex128(); /// \brief Return a StringType instance std::shared_ptr ARROW_EXPORT utf8(); /// \brief Return a LargeStringType instance diff --git a/cpp/src/arrow/visitor_inline.h b/cpp/src/arrow/visitor_inline.h index 132c35aeaa1..eba2522044f 100644 --- a/cpp/src/arrow/visitor_inline.h +++ b/cpp/src/arrow/visitor_inline.h @@ -56,6 +56,8 @@ namespace arrow { ARROW_GENERATE_FOR_ALL_NUMERIC_TYPES(ACTION); \ ACTION(String); \ ACTION(Binary); \ + ACTION(ComplexFloat); \ + ACTION(ComplexDouble); \ ACTION(LargeString); \ ACTION(LargeBinary); \ ACTION(FixedSizeBinary); \ From fe1e1456d026ee286412802fa8696fb8a7897ed5 Mon Sep 17 00:00:00 2001 From: Simon Perkins Date: Mon, 21 Jun 2021 15:33:59 
+0200 Subject: [PATCH 06/61] Add Type Traits --- cpp/src/arrow/type_traits.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/cpp/src/arrow/type_traits.h b/cpp/src/arrow/type_traits.h index b74aa3b0adb..f6b8135ebc5 100644 --- a/cpp/src/arrow/type_traits.h +++ b/cpp/src/arrow/type_traits.h @@ -362,6 +362,30 @@ struct CTypeTraits using ArrowType = DayTimeIntervalType; }; +template <> +struct TypeTraits { + using ArrayType = ComplexFloatArray; + using BuilderType = FixedSizeListBuilder; + using ScalarType = FixedSizeListScalar; + using OffsetType = Int32Type; + using OffsetArrayType = Int32Array; + using OffsetBuilderType = Int32Builder; + using OffsetScalarType = Int32Scalar; + constexpr static bool is_parameter_free = true; +}; + +template <> +struct TypeTraits { + using ArrayType = ComplexDoubleArray; + using BuilderType = FixedSizeListBuilder; + using ScalarType = FixedSizeListScalar; + using OffsetType = Int32Type; + using OffsetArrayType = Int32Array; + using OffsetBuilderType = Int32Builder; + using OffsetScalarType = Int32Scalar; + constexpr static bool is_parameter_free = true; +}; + template <> struct TypeTraits { using ArrayType = ListArray; From 76c8ef6dffce53f8de174b0a16efdcc79d2866f0 Mon Sep 17 00:00:00 2001 From: Simon Perkins Date: Mon, 21 Jun 2021 15:34:23 +0200 Subject: [PATCH 07/61] [skip ci] #include complex_type.h --- cpp/src/arrow/visitor_inline.h | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/src/arrow/visitor_inline.h b/cpp/src/arrow/visitor_inline.h index eba2522044f..665217221b1 100644 --- a/cpp/src/arrow/visitor_inline.h +++ b/cpp/src/arrow/visitor_inline.h @@ -23,6 +23,7 @@ #include "arrow/array.h" #include "arrow/extension_type.h" +#include "arrow/extensions/complex_type.h" #include "arrow/scalar.h" #include "arrow/status.h" #include "arrow/type.h" From 5b72101949941d9f2d3b2c8cbae94c347ab7d308 Mon Sep 17 00:00:00 2001 From: Simon Perkins Date: Mon, 21 Jun 2021 15:48:54 +0200 Subject: [PATCH 08/61] Fix 
header include --- cpp/src/arrow/extensions/complex_type.h | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/src/arrow/extensions/complex_type.h b/cpp/src/arrow/extensions/complex_type.h index 8b868fad90a..22f584b1066 100644 --- a/cpp/src/arrow/extensions/complex_type.h +++ b/cpp/src/arrow/extensions/complex_type.h @@ -16,6 +16,7 @@ // under the License. // Complex Number Extension Type +#pragma once #include "arrow/extension_type.h" From 0d7524eece221570e8c4f9cd32eee89622be9179 Mon Sep 17 00:00:00 2001 From: Simon Perkins Date: Mon, 21 Jun 2021 15:49:23 +0200 Subject: [PATCH 09/61] Remove Complex{Float,Double} Types in Visitor Actions --- cpp/src/arrow/visitor_inline.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/visitor_inline.h b/cpp/src/arrow/visitor_inline.h index 665217221b1..303c4361633 100644 --- a/cpp/src/arrow/visitor_inline.h +++ b/cpp/src/arrow/visitor_inline.h @@ -51,15 +51,17 @@ namespace arrow { ACTION(Float); \ ACTION(Double) + +// ACTION(ComplexFloat); +// ACTION(ComplexDouble); + #define ARROW_GENERATE_FOR_ALL_TYPES(ACTION) \ ACTION(Null); \ ACTION(Boolean); \ ARROW_GENERATE_FOR_ALL_NUMERIC_TYPES(ACTION); \ ACTION(String); \ ACTION(Binary); \ - ACTION(ComplexFloat); \ - ACTION(ComplexDouble); \ - ACTION(LargeString); \ + ACTION(LargeString); \ ACTION(LargeBinary); \ ACTION(FixedSizeBinary); \ ACTION(Duration); \ From ae58a3a8764537ea9912f3036028787ec8c181cf Mon Sep 17 00:00:00 2001 From: Simon Perkins Date: Mon, 21 Jun 2021 16:10:57 +0200 Subject: [PATCH 10/61] cleanup --- cpp/src/arrow/visitor_inline.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/cpp/src/arrow/visitor_inline.h b/cpp/src/arrow/visitor_inline.h index 303c4361633..dbd8c09112d 100644 --- a/cpp/src/arrow/visitor_inline.h +++ b/cpp/src/arrow/visitor_inline.h @@ -52,16 +52,13 @@ namespace arrow { ACTION(Double) -// ACTION(ComplexFloat); -// ACTION(ComplexDouble); - #define ARROW_GENERATE_FOR_ALL_TYPES(ACTION) \ 
ACTION(Null); \ ACTION(Boolean); \ ARROW_GENERATE_FOR_ALL_NUMERIC_TYPES(ACTION); \ ACTION(String); \ ACTION(Binary); \ - ACTION(LargeString); \ + ACTION(LargeString); \ ACTION(LargeBinary); \ ACTION(FixedSizeBinary); \ ACTION(Duration); \ From dc2125b8712ddab2e5638a6469636116a8e77c54 Mon Sep 17 00:00:00 2001 From: Simon Perkins Date: Mon, 21 Jun 2021 17:31:13 +0200 Subject: [PATCH 11/61] Remove COMPLEX_{FLOAT,DOUBLE} from Type --- cpp/src/arrow/testing/gtest_util.cc | 2 -- cpp/src/arrow/type.cc | 2 -- cpp/src/arrow/type_fwd.h | 6 ------ cpp/src/arrow/type_traits.h | 24 ------------------------ 4 files changed, 34 deletions(-) diff --git a/cpp/src/arrow/testing/gtest_util.cc b/cpp/src/arrow/testing/gtest_util.cc index 85c1129e3a8..eb0edd56566 100644 --- a/cpp/src/arrow/testing/gtest_util.cc +++ b/cpp/src/arrow/testing/gtest_util.cc @@ -73,8 +73,6 @@ std::vector AllTypeIds() { Type::HALF_FLOAT, Type::FLOAT, Type::DOUBLE, - Type::COMPLEX_FLOAT, - Type::COMPLEX_DOUBLE, Type::DECIMAL128, Type::DECIMAL256, Type::DATE32, diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index 4489c116e6d..344585446fc 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -130,8 +130,6 @@ std::string ToString(Type::type id) { TO_STRING_CASE(HALF_FLOAT) TO_STRING_CASE(FLOAT) TO_STRING_CASE(DOUBLE) - TO_STRING_CASE(COMPLEX_FLOAT) - TO_STRING_CASE(COMPLEX_DOUBLE) TO_STRING_CASE(DECIMAL128) TO_STRING_CASE(DECIMAL256) TO_STRING_CASE(DATE32) diff --git a/cpp/src/arrow/type_fwd.h b/cpp/src/arrow/type_fwd.h index 14d0b7480ad..3001efef7be 100644 --- a/cpp/src/arrow/type_fwd.h +++ b/cpp/src/arrow/type_fwd.h @@ -400,12 +400,6 @@ struct Type { /// Like LIST, but with 64-bit offsets LARGE_LIST, - // Single-precision 32-bit complex numbers - COMPLEX_FLOAT, - - // Double-precision 64-bit complex numbers - COMPLEX_DOUBLE, - // Leave this at the end MAX_ID }; diff --git a/cpp/src/arrow/type_traits.h b/cpp/src/arrow/type_traits.h index f6b8135ebc5..b74aa3b0adb 100644 --- 
a/cpp/src/arrow/type_traits.h +++ b/cpp/src/arrow/type_traits.h @@ -362,30 +362,6 @@ struct CTypeTraits using ArrowType = DayTimeIntervalType; }; -template <> -struct TypeTraits { - using ArrayType = ComplexFloatArray; - using BuilderType = FixedSizeListBuilder; - using ScalarType = FixedSizeListScalar; - using OffsetType = Int32Type; - using OffsetArrayType = Int32Array; - using OffsetBuilderType = Int32Builder; - using OffsetScalarType = Int32Scalar; - constexpr static bool is_parameter_free = true; -}; - -template <> -struct TypeTraits { - using ArrayType = ComplexDoubleArray; - using BuilderType = FixedSizeListBuilder; - using ScalarType = FixedSizeListScalar; - using OffsetType = Int32Type; - using OffsetArrayType = Int32Array; - using OffsetBuilderType = Int32Builder; - using OffsetScalarType = Int32Scalar; - constexpr static bool is_parameter_free = true; -}; - template <> struct TypeTraits { using ArrayType = ListArray; From fbe7508cdf7479cac2c0ce74683b074aba884b1a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Tue, 15 Jun 2021 18:36:56 +0900 Subject: [PATCH 12/61] ARROW-13080: [Release] Generate the API docs in ubuntu 20.10 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pass ubuntu version as a docker build variable instead of a container runtime environment variable. 
Closes #10532 from kszucs/post-docs-ubuntu-version Authored-by: Krisztián Szűcs Signed-off-by: Sutou Kouhei --- dev/release/post-09-docs.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/dev/release/post-09-docs.sh b/dev/release/post-09-docs.sh index c9f75b48b2c..8751b22887f 100755 --- a/dev/release/post-09-docs.sh +++ b/dev/release/post-09-docs.sh @@ -43,10 +43,9 @@ popd pushd "${ARROW_DIR}" git checkout "${release_tag}" -archery docker run \ +UBUNTU=20.10 archery docker run \ -v "${ARROW_SITE_DIR}/docs:/build/docs" \ -e ARROW_DOCS_VERSION="${version}" \ - -e UBUNTU=20.10 \ ubuntu-docs : ${PUSH:=1} From 5521edf2d1d37ae3542c342842a680d7b53eac2d Mon Sep 17 00:00:00 2001 From: Alessandro Molina Date: Tue, 15 Jun 2021 11:56:52 +0200 Subject: [PATCH 13/61] ARROW-12431: [Python] Mask is inverted when creating FixedSizeBinaryArray Closes #10199 from amol-/ARROW-12431 Authored-by: Alessandro Molina Signed-off-by: Antoine Pitrou --- cpp/src/arrow/python/numpy_to_arrow.cc | 15 +++++++-- python/pyarrow/tests/test_array.py | 45 ++++++++++++++++++++++++++ python/pyarrow/tests/test_pandas.py | 2 +- 3 files changed, 59 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/python/numpy_to_arrow.cc b/cpp/src/arrow/python/numpy_to_arrow.cc index c17e70823d5..a382f766333 100644 --- a/cpp/src/arrow/python/numpy_to_arrow.cc +++ b/cpp/src/arrow/python/numpy_to_arrow.cc @@ -594,9 +594,20 @@ Status NumPyConverter::Visit(const FixedSizeBinaryType& type) { if (mask_ != nullptr) { Ndarray1DIndexer mask_values(mask_); - RETURN_NOT_OK(builder.AppendValues(data, length_, mask_values.data())); + RETURN_NOT_OK(builder.Reserve(length_)); + for (int64_t i = 0; i < length_; ++i) { + if (mask_values[i]) { + RETURN_NOT_OK(builder.AppendNull()); + } else { + RETURN_NOT_OK(builder.Append(data)); + } + data += stride_; + } } else { - RETURN_NOT_OK(builder.AppendValues(data, length_)); + for (int64_t i = 0; i < length_; ++i) { + RETURN_NOT_OK(builder.Append(data)); + data += 
stride_; + } } std::shared_ptr result; diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index 086ed4cb160..30500bc3c5b 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -2714,6 +2714,51 @@ def test_array_masked(): assert arr.type == pa.int64() +def test_binary_array_masked(): + # ARROW-12431 + masked_basic = pa.array([b'\x05'], type=pa.binary(1), + mask=np.array([False])) + assert [b'\x05'] == masked_basic.to_pylist() + + # Fixed Length Binary + masked = pa.array(np.array([b'\x05']), type=pa.binary(1), + mask=np.array([False])) + assert [b'\x05'] == masked.to_pylist() + + masked_nulls = pa.array(np.array([b'\x05']), type=pa.binary(1), + mask=np.array([True])) + assert [None] == masked_nulls.to_pylist() + + # Variable Length Binary + masked = pa.array(np.array([b'\x05']), type=pa.binary(), + mask=np.array([False])) + assert [b'\x05'] == masked.to_pylist() + + masked_nulls = pa.array(np.array([b'\x05']), type=pa.binary(), + mask=np.array([True])) + assert [None] == masked_nulls.to_pylist() + + # Fixed Length Binary, copy + npa = np.array([b'aaa', b'bbb', b'ccc']*10) + arrow_array = pa.array(npa, type=pa.binary(3), + mask=np.array([False, False, False]*10)) + npa[npa == b"bbb"] = b"XXX" + assert ([b'aaa', b'bbb', b'ccc']*10) == arrow_array.to_pylist() + + +def test_binary_array_strided(): + # Masked + nparray = np.array([b"ab", b"cd", b"ef"]) + arrow_array = pa.array(nparray[::2], pa.binary(2), + mask=np.array([False, False])) + assert [b"ab", b"ef"] == arrow_array.to_pylist() + + # Unmasked + nparray = np.array([b"ab", b"cd", b"ef"]) + arrow_array = pa.array(nparray[::2], pa.binary(2)) + assert [b"ab", b"ef"] == arrow_array.to_pylist() + + def test_array_invalid_mask_raises(): # ARROW-10742 cases = [ diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py index 77c18b839c6..7f904433fa2 100644 --- a/python/pyarrow/tests/test_pandas.py +++ 
b/python/pyarrow/tests/test_pandas.py @@ -1705,7 +1705,7 @@ def test_numpy_string_array_to_fixed_size_binary(self): expected = pa.array(list(arr), type=pa.binary(3)) assert converted.equals(expected) - mask = np.array([True, False, True]) + mask = np.array([False, True, False]) converted = pa.array(arr, type=pa.binary(3), mask=mask) expected = pa.array([b'foo', None, b'baz'], type=pa.binary(3)) assert converted.equals(expected) From 40585c1ff28aaf55d4ec74cc1b58c35d39ae5d81 Mon Sep 17 00:00:00 2001 From: Yibo Cai Date: Tue, 15 Jun 2021 16:44:37 +0200 Subject: [PATCH 14/61] ARROW-13003: [C++] Fix key map unaligned access Closes #10489 from cyb70289/13003-unaligned-access Authored-by: Yibo Cai Signed-off-by: Antoine Pitrou --- cpp/src/arrow/compute/exec/key_compare.cc | 21 +++---- cpp/src/arrow/compute/exec/key_map.cc | 71 +++++++++++++---------- cpp/src/arrow/compute/exec/util.cc | 16 ++--- 3 files changed, 59 insertions(+), 49 deletions(-) diff --git a/cpp/src/arrow/compute/exec/key_compare.cc b/cpp/src/arrow/compute/exec/key_compare.cc index f8d74859b01..7a5b0be9990 100644 --- a/cpp/src/arrow/compute/exec/key_compare.cc +++ b/cpp/src/arrow/compute/exec/key_compare.cc @@ -21,6 +21,7 @@ #include #include "arrow/compute/exec/util.h" +#include "arrow/util/ubsan.h" namespace arrow { namespace compute { @@ -170,19 +171,19 @@ void KeyCompare::CompareFixedLengthImp(uint32_t num_rows_already_processed, // if (num_64bit_words == 0) { for (; istripe < num_loops_less_one; ++istripe) { - uint64_t key_left = key_left_ptr[istripe]; - uint64_t key_right = key_right_ptr[istripe]; + uint64_t key_left = util::SafeLoad(&key_left_ptr[istripe]); + uint64_t key_right = util::SafeLoad(&key_right_ptr[istripe]); result_or |= (key_left ^ key_right); } } else if (num_64bit_words == 2) { - uint64_t key_left = key_left_ptr[istripe]; - uint64_t key_right = key_right_ptr[istripe]; + uint64_t key_left = util::SafeLoad(&key_left_ptr[istripe]); + uint64_t key_right = 
util::SafeLoad(&key_right_ptr[istripe]); result_or |= (key_left ^ key_right); ++istripe; } - uint64_t key_left = key_left_ptr[istripe]; - uint64_t key_right = key_right_ptr[istripe]; + uint64_t key_left = util::SafeLoad(&key_left_ptr[istripe]); + uint64_t key_right = util::SafeLoad(&key_right_ptr[istripe]); result_or |= (tail_mask & (key_left ^ key_right)); int result = (result_or == 0 ? 0xff : 0); @@ -246,16 +247,16 @@ void KeyCompare::CompareVaryingLengthImp( int32_t istripe; // length can be zero for (istripe = 0; istripe < (static_cast(length) + 7) / 8 - 1; ++istripe) { - uint64_t key_left = key_left_ptr[istripe]; - uint64_t key_right = key_right_ptr[istripe]; + uint64_t key_left = util::SafeLoad(&key_left_ptr[istripe]); + uint64_t key_right = util::SafeLoad(&key_right_ptr[istripe]); result_or |= (key_left ^ key_right); } uint32_t length_remaining = length - static_cast(istripe) * 8; uint64_t tail_mask = tail_masks[length_remaining]; - uint64_t key_left = key_left_ptr[istripe]; - uint64_t key_right = key_right_ptr[istripe]; + uint64_t key_left = util::SafeLoad(&key_left_ptr[istripe]); + uint64_t key_right = util::SafeLoad(&key_right_ptr[istripe]); result_or |= (tail_mask & (key_left ^ key_right)); int result = (result_or == 0 ? 
0xff : 0); diff --git a/cpp/src/arrow/compute/exec/key_map.cc b/cpp/src/arrow/compute/exec/key_map.cc index c48487793e0..ac47c04403c 100644 --- a/cpp/src/arrow/compute/exec/key_map.cc +++ b/cpp/src/arrow/compute/exec/key_map.cc @@ -24,6 +24,7 @@ #include "arrow/util/bit_util.h" #include "arrow/util/bitmap_ops.h" +#include "arrow/util/ubsan.h" namespace arrow { @@ -153,7 +154,7 @@ void SwissTable::lookup_1(const uint16_t* selection, const int num_keys, for (int i = 0; i < num_keys; ++i) { int id; if (use_selection) { - id = selection[i]; + id = util::SafeLoad(&selection[i]); } else { id = i; } @@ -168,7 +169,7 @@ void SwissTable::lookup_1(const uint16_t* selection, const int num_keys, uint32_t num_block_bytes = num_groupid_bits + 8; const uint8_t* blockbase = reinterpret_cast(blocks_) + static_cast(iblock) * num_block_bytes; - uint64_t block = *reinterpret_cast(blockbase); + uint64_t block = util::SafeLoadAs(blockbase); // Call helper functions to obtain the output triplet: // - match (of a stamp) found flag @@ -182,8 +183,8 @@ void SwissTable::lookup_1(const uint16_t* selection, const int num_keys, uint64_t islot = next_slot_to_visit(iblock, islot_in_block, match_found); out_match_bitvector[id / 8] |= match_found << (id & 7); - out_groupids[id] = static_cast(groupid); - out_slot_ids[id] = static_cast(islot); + util::SafeStore(&out_groupids[id], static_cast(groupid)); + util::SafeStore(&out_slot_ids[id], static_cast(islot)); } } @@ -239,7 +240,7 @@ Status SwissTable::lookup_2(const uint32_t* hashes, uint32_t* inout_num_selected uint16_t* ids[3]{inout_selection, ids_for_comparison_buf.mutable_data(), ids_inserted_buf.mutable_data()}; auto push_id = [&num_ids, &ids](int category, int id) { - ids[category][num_ids[category]++] = static_cast(id); + util::SafeStore(&ids[category][num_ids[category]++], static_cast(id)); }; uint64_t num_groupid_bits = num_groupid_bits_from_log_blocks(log_blocks_); @@ -256,9 +257,9 @@ Status SwissTable::lookup_2(const uint32_t* hashes, 
uint32_t* inout_num_selected num_inserted_ + num_ids[category_inserted] < num_groups_limit; ++num_processed) { // row id in original batch - int id = inout_selection[num_processed]; + int id = util::SafeLoad(&inout_selection[num_processed]); - uint64_t slot_id = wrap_global_slot_id(inout_next_slot_ids[id]); + uint64_t slot_id = wrap_global_slot_id(util::SafeLoad(&inout_next_slot_ids[id])); uint64_t block_id = slot_id >> 3; uint32_t hash = hashes[id]; uint8_t* blockbase = blocks_ + num_block_bytes * block_id; @@ -278,11 +279,13 @@ Status SwissTable::lookup_2(const uint32_t* hashes, uint32_t* inout_num_selected // In that case we can insert group id value using aligned 64-bit word access. ARROW_DCHECK(num_groupid_bits == 8 || num_groupid_bits == 16 || num_groupid_bits == 32 || num_groupid_bits == 64); - reinterpret_cast(blockbase + 8)[groupid_bit_offset >> 6] |= - (static_cast(group_id) << (groupid_bit_offset & 63)); + uint64_t* ptr = + &reinterpret_cast(blockbase + 8)[groupid_bit_offset >> 6]; + util::SafeStore(ptr, util::SafeLoad(ptr) | (static_cast(group_id) + << (groupid_bit_offset & 63))); hashes_[slot_id] = hash; - out_group_ids[id] = group_id; + util::SafeStore(&out_group_ids[id], group_id); push_id(category_inserted, id); } else { // We search for a slot with a matching stamp within a single block. 
@@ -298,8 +301,8 @@ Status SwissTable::lookup_2(const uint32_t* hashes, uint32_t* inout_num_selected ARROW_DCHECK(new_groupid < num_inserted_ + num_ids[category_inserted]); new_slot = static_cast(next_slot_to_visit(block_id, new_slot, new_match_found)); - inout_next_slot_ids[id] = new_slot; - out_group_ids[id] = new_groupid; + util::SafeStore(&inout_next_slot_ids[id], new_slot); + util::SafeStore(&out_group_ids[id], new_groupid); push_id(new_match_found, id); } } @@ -410,7 +413,8 @@ Status SwissTable::map(const int num_keys, const uint32_t* hashes, // for (uint32_t i = 0; i < num_ids; ++i) { // First slot in the new starting block - slot_ids[ids[i]] = (hashes[ids[i]] >> (bits_hash_ - log_blocks_)) * 8; + const int16_t id = util::SafeLoad(&ids[i]); + util::SafeStore(&slot_ids[id], (hashes[id] >> (bits_hash_ - log_blocks_)) * 8); } } } while (num_ids > 0); @@ -457,9 +461,8 @@ Status SwissTable::grow_double() { static_cast(CountLeadingZeros(block & kHighBitOfEachByte) >> 3); int full_slots_new[2]; full_slots_new[0] = full_slots_new[1] = 0; - *reinterpret_cast(double_block_base_new) = kHighBitOfEachByte; - *reinterpret_cast(double_block_base_new + block_size_after) = - kHighBitOfEachByte; + util::SafeStore(double_block_base_new, kHighBitOfEachByte); + util::SafeStore(double_block_base_new + block_size_after, kHighBitOfEachByte); for (int j = 0; j < full_slots; ++j) { uint64_t slot_id = i * 8 + j; @@ -474,18 +477,20 @@ Status SwissTable::grow_double() { uint8_t stamp_new = hash >> ((bits_hash_ - log_blocks_after - bits_stamp_)) & stamp_mask; uint64_t group_id_bit_offs = j * num_group_id_bits_before; - uint64_t group_id = (*reinterpret_cast(block_base + 8 + - (group_id_bit_offs >> 3)) >> - (group_id_bit_offs & 7)) & - group_id_mask_before; + uint64_t group_id = + (util::SafeLoadAs(block_base + 8 + (group_id_bit_offs >> 3)) >> + (group_id_bit_offs & 7)) & + group_id_mask_before; uint64_t slot_id_new = i * 16 + ihalf * 8 + full_slots_new[ihalf]; hashes_new[slot_id_new] = 
hash; uint8_t* block_base_new = double_block_base_new + ihalf * block_size_after; block_base_new[7 - full_slots_new[ihalf]] = stamp_new; int group_id_bit_offs_new = full_slots_new[ihalf] * num_group_id_bits_after; - *reinterpret_cast(block_base_new + 8 + (group_id_bit_offs_new >> 3)) |= - (group_id << (group_id_bit_offs_new & 7)); + uint64_t* ptr = + reinterpret_cast(block_base_new + 8 + (group_id_bit_offs_new >> 3)); + util::SafeStore(ptr, + util::SafeLoad(ptr) | (group_id << (group_id_bit_offs_new & 7))); full_slots_new[ihalf]++; } } @@ -495,7 +500,7 @@ Status SwissTable::grow_double() { for (int i = 0; i < (1 << log_blocks_); ++i) { // How many full slots in this block uint8_t* block_base = blocks_ + i * block_size_before; - uint64_t block = *reinterpret_cast(block_base); + uint64_t block = util::SafeLoadAs(block_base); int full_slots = static_cast(CountLeadingZeros(block & kHighBitOfEachByte) >> 3); for (int j = 0; j < full_slots; ++j) { @@ -508,21 +513,21 @@ Status SwissTable::grow_double() { } uint64_t group_id_bit_offs = j * num_group_id_bits_before; - uint64_t group_id = (*reinterpret_cast(block_base + 8 + - (group_id_bit_offs >> 3)) >> - (group_id_bit_offs & 7)) & - group_id_mask_before; + uint64_t group_id = + (util::SafeLoadAs(block_base + 8 + (group_id_bit_offs >> 3)) >> + (group_id_bit_offs & 7)) & + group_id_mask_before; uint8_t stamp_new = hash >> ((bits_hash_ - log_blocks_after - bits_stamp_)) & stamp_mask; uint8_t* block_base_new = blocks_new + block_id_new * block_size_after; - uint64_t block_new = *reinterpret_cast(block_base_new); + uint64_t block_new = util::SafeLoadAs(block_base_new); int full_slots_new = static_cast(CountLeadingZeros(block_new & kHighBitOfEachByte) >> 3); while (full_slots_new == 8) { block_id_new = (block_id_new + 1) & ((1 << log_blocks_after) - 1); block_base_new = blocks_new + block_id_new * block_size_after; - block_new = *reinterpret_cast(block_base_new); + block_new = util::SafeLoadAs(block_base_new); full_slots_new = 
static_cast(CountLeadingZeros(block_new & kHighBitOfEachByte) >> 3); } @@ -530,8 +535,10 @@ Status SwissTable::grow_double() { hashes_new[block_id_new * 8 + full_slots_new] = hash; block_base_new[7 - full_slots_new] = stamp_new; int group_id_bit_offs_new = full_slots_new * num_group_id_bits_after; - *reinterpret_cast(block_base_new + 8 + (group_id_bit_offs_new >> 3)) |= - (group_id << (group_id_bit_offs_new & 7)); + uint64_t* ptr = + reinterpret_cast(block_base_new + 8 + (group_id_bit_offs_new >> 3)); + util::SafeStore(ptr, + util::SafeLoad(ptr) | (group_id << (group_id_bit_offs_new & 7))); } } @@ -567,7 +574,7 @@ Status SwissTable::init(int64_t hardware_flags, MemoryPool* pool, // Initialize all status bytes to represent an empty slot. for (uint64_t i = 0; i < (static_cast(1) << log_blocks_); ++i) { - *reinterpret_cast(blocks_ + i * block_bytes) = kHighBitOfEachByte; + util::SafeStore(blocks_ + i * block_bytes, kHighBitOfEachByte); } uint64_t num_slots = 1ULL << (log_blocks_ + 3); diff --git a/cpp/src/arrow/compute/exec/util.cc b/cpp/src/arrow/compute/exec/util.cc index 5f1c0776c56..88303348645 100644 --- a/cpp/src/arrow/compute/exec/util.cc +++ b/cpp/src/arrow/compute/exec/util.cc @@ -19,6 +19,7 @@ #include "arrow/util/bit_util.h" #include "arrow/util/bitmap_ops.h" +#include "arrow/util/ubsan.h" namespace arrow { @@ -66,7 +67,7 @@ void BitUtil::bits_to_indexes_internal(int64_t hardware_flags, const int num_bit #endif *num_indexes = 0; for (int i = 0; i < num_bits / unroll; ++i) { - uint64_t word = reinterpret_cast(bits)[i]; + uint64_t word = util::SafeLoad(&reinterpret_cast(bits)[i]); if (bit_to_search == 0) { word = ~word; } @@ -81,7 +82,8 @@ void BitUtil::bits_to_indexes_internal(int64_t hardware_flags, const int num_bit #endif // Optionally process the last partial word with masking out bits outside range if (tail) { - uint64_t word = reinterpret_cast(bits)[num_bits / unroll]; + uint64_t word = + util::SafeLoad(&reinterpret_cast(bits)[num_bits / unroll]); if 
(bit_to_search == 0) { word = ~word; } @@ -144,7 +146,7 @@ void BitUtil::bits_to_bytes_internal(const int num_bits, const uint8_t* bits, unpacked |= (bits_next & 1); unpacked &= 0x0101010101010101ULL; unpacked *= 255; - reinterpret_cast(bytes)[i] = unpacked; + util::SafeStore(&reinterpret_cast(bytes)[i], unpacked); } } @@ -153,7 +155,7 @@ void BitUtil::bytes_to_bits_internal(const int num_bits, const uint8_t* bytes, constexpr int unroll = 8; // Process 8 bits at a time for (int i = 0; i < (num_bits + unroll - 1) / unroll; ++i) { - uint64_t bytes_next = reinterpret_cast(bytes)[i]; + uint64_t bytes_next = util::SafeLoad(&reinterpret_cast(bytes)[i]); bytes_next &= 0x0101010101010101ULL; bytes_next |= (bytes_next >> 7); // Pairs of adjacent output bits in individual bytes bytes_next |= (bytes_next >> 14); // 4 adjacent output bits in individual bytes @@ -184,7 +186,7 @@ void BitUtil::bits_to_bytes(int64_t hardware_flags, const int num_bits, unpacked |= (bits_next & 1); unpacked &= 0x0101010101010101ULL; unpacked *= 255; - reinterpret_cast(bytes)[i] = unpacked; + util::SafeStore(&reinterpret_cast(bytes)[i], unpacked); } } @@ -201,7 +203,7 @@ void BitUtil::bytes_to_bits(int64_t hardware_flags, const int num_bits, // Process 8 bits at a time constexpr int unroll = 8; for (int i = num_processed / unroll; i < (num_bits + unroll - 1) / unroll; ++i) { - uint64_t bytes_next = reinterpret_cast(bytes)[i]; + uint64_t bytes_next = util::SafeLoad(&reinterpret_cast(bytes)[i]); bytes_next &= 0x0101010101010101ULL; bytes_next |= (bytes_next >> 7); // Pairs of adjacent output bits in individual bytes bytes_next |= (bytes_next >> 14); // 4 adjacent output bits in individual bytes @@ -220,7 +222,7 @@ bool BitUtil::are_all_bytes_zero(int64_t hardware_flags, const uint8_t* bytes, uint64_t result_or = 0; uint32_t i; for (i = 0; i < num_bytes / 8; ++i) { - uint64_t x = reinterpret_cast(bytes)[i]; + uint64_t x = util::SafeLoad(&reinterpret_cast(bytes)[i]); result_or |= x; } if (num_bytes % 8 
> 0) { From 522696b164efee500d7046a65657ba61e34162a8 Mon Sep 17 00:00:00 2001 From: David Li Date: Tue, 15 Jun 2021 11:22:25 -0400 Subject: [PATCH 15/61] ARROW-12597: [C++] Enable per-row-group parallelism in async Parquet reader This adds an OptionalParallelForAsync which lets us have per-row-group parallelism without nested parallelism in the async Parquet reader. This also uses TransferAlways, taking care of ARROW-12916. `enable_parallel_column_conversion` is kept as it still affects the threaded scanner. Closes #10482 from lidavidm/arrow-12597 Authored-by: David Li Signed-off-by: David Li --- cpp/src/arrow/dataset/file_parquet.cc | 6 +- cpp/src/arrow/dataset/file_parquet.h | 3 +- cpp/src/arrow/dataset/test_util.h | 14 +++- cpp/src/arrow/ipc/reader.cc | 9 +-- cpp/src/arrow/util/parallel.h | 37 +++++++++ cpp/src/arrow/util/vector.h | 13 +++ cpp/src/parquet/arrow/reader.cc | 110 ++++++++++++++------------ 7 files changed, 128 insertions(+), 64 deletions(-) diff --git a/cpp/src/arrow/dataset/file_parquet.cc b/cpp/src/arrow/dataset/file_parquet.cc index 8c325d21da1..0ebbd0a5333 100644 --- a/cpp/src/arrow/dataset/file_parquet.cc +++ b/cpp/src/arrow/dataset/file_parquet.cc @@ -362,11 +362,7 @@ Future> ParquetFileFormat::GetReader parquet_scan_options->arrow_reader_properties->cache_options()); arrow_properties.set_io_context( parquet_scan_options->arrow_reader_properties->io_context()); - // TODO: ARROW-12597 will let us enable parallel conversion - if (!options->use_threads) { - arrow_properties.set_use_threads( - parquet_scan_options->enable_parallel_column_conversion); - } + arrow_properties.set_use_threads(options->use_threads); std::unique_ptr arrow_reader; RETURN_NOT_OK(parquet::arrow::FileReader::Make(options->pool, std::move(reader), std::move(arrow_properties), diff --git a/cpp/src/arrow/dataset/file_parquet.h b/cpp/src/arrow/dataset/file_parquet.h index 8286e2776cb..347f4032046 100644 --- a/cpp/src/arrow/dataset/file_parquet.h +++ 
b/cpp/src/arrow/dataset/file_parquet.h @@ -222,7 +222,8 @@ class ARROW_DS_EXPORT ParquetFragmentScanOptions : public FragmentScanOptions { /// EXPERIMENTAL: Parallelize conversion across columns. This option is ignored if a /// scan is already parallelized across input files to avoid thread contention. This /// option will be removed after support is added for simultaneous parallelization - /// across files and columns. + /// across files and columns. Only affects the threaded reader; the async reader + /// will parallelize across columns if use_threads is enabled. bool enable_parallel_column_conversion = false; }; diff --git a/cpp/src/arrow/dataset/test_util.h b/cpp/src/arrow/dataset/test_util.h index 1e4222eec8c..39223eba35b 100644 --- a/cpp/src/arrow/dataset/test_util.h +++ b/cpp/src/arrow/dataset/test_util.h @@ -310,6 +310,7 @@ class DatasetFixtureMixinWithParam : public DatasetFixtureMixin, struct TestFormatParams { bool use_async; + bool use_threads; int num_batches; int items_per_batch; @@ -318,7 +319,8 @@ struct TestFormatParams { std::string ToString() const { // GTest requires this to be alphanumeric std::stringstream ss; - ss << (use_async ? "Async" : "Sync") << num_batches << "b" << items_per_batch << "r"; + ss << (use_async ? "Async" : "Sync") << (use_threads ? 
"Threaded" : "Serial") + << num_batches << "b" << items_per_batch << "r"; return ss.str(); } @@ -328,8 +330,12 @@ struct TestFormatParams { } static std::vector Values() { - std::vector values{{/*async=*/false, 16, 1024}, - {/*async=*/true, 16, 1024}}; + std::vector values; + for (const bool async : std::vector{true, false}) { + for (const bool use_threads : std::vector{true, false}) { + values.push_back(TestFormatParams{async, use_threads, 16, 1024}); + } + } return values; } }; @@ -511,6 +517,7 @@ class FileFormatScanMixin : public FileFormatFixtureMixin, auto dataset = std::make_shared(schema, FragmentVector{fragment}); ScannerBuilder builder(dataset, opts_); ARROW_EXPECT_OK(builder.UseAsync(GetParam().use_async)); + ARROW_EXPECT_OK(builder.UseThreads(GetParam().use_threads)); EXPECT_OK_AND_ASSIGN(auto scanner, builder.Finish()); EXPECT_OK_AND_ASSIGN(auto batch_it, scanner->ScanBatches()); return MakeMapIterator([](TaggedRecordBatch tagged) { return tagged.record_batch; }, @@ -519,6 +526,7 @@ class FileFormatScanMixin : public FileFormatFixtureMixin, // Scan the fragment directly, without using the scanner. 
RecordBatchIterator PhysicalBatches(std::shared_ptr fragment) { + opts_->use_threads = GetParam().use_threads; if (GetParam().use_async) { EXPECT_OK_AND_ASSIGN(auto batch_gen, fragment->ScanBatchesAsync(opts_)); EXPECT_OK_AND_ASSIGN(auto batch_it, MakeGeneratorIterator(std::move(batch_gen))); diff --git a/cpp/src/arrow/ipc/reader.cc b/cpp/src/arrow/ipc/reader.cc index 7c26bce913d..a3c345cc440 100644 --- a/cpp/src/arrow/ipc/reader.cc +++ b/cpp/src/arrow/ipc/reader.cc @@ -54,6 +54,7 @@ #include "arrow/util/string.h" #include "arrow/util/thread_pool.h" #include "arrow/util/ubsan.h" +#include "arrow/util/vector.h" #include "arrow/visitor_inline.h" #include "generated/File_generated.h" // IWYU pragma: export @@ -1368,12 +1369,10 @@ Future IpcFileRecordBatchGenerator::operator( auto read_messages = All(std::move(messages)); if (executor_) read_messages = executor_->Transfer(read_messages); read_dictionaries_ = read_messages.Then( - [=](const std::vector>> maybe_messages) + [=](const std::vector>>& maybe_messages) -> Status { - std::vector> messages(state->num_dictionaries()); - for (size_t i = 0; i < messages.size(); i++) { - ARROW_ASSIGN_OR_RAISE(messages[i], maybe_messages[i]); - } + ARROW_ASSIGN_OR_RAISE(auto messages, + arrow::internal::UnwrapOrRaise(maybe_messages)); return ReadDictionaries(state.get(), std::move(messages)); }); } diff --git a/cpp/src/arrow/util/parallel.h b/cpp/src/arrow/util/parallel.h index e56a71b91af..80f60fbdb36 100644 --- a/cpp/src/arrow/util/parallel.h +++ b/cpp/src/arrow/util/parallel.h @@ -21,7 +21,9 @@ #include #include "arrow/status.h" +#include "arrow/util/functional.h" #include "arrow/util/thread_pool.h" +#include "arrow/util/vector.h" namespace arrow { namespace internal { @@ -44,6 +46,21 @@ Status ParallelFor(int num_tasks, FUNCTION&& func, return st; } +template ::ValueType> +Future> ParallelForAsync( + std::vector inputs, FUNCTION&& func, + Executor* executor = internal::GetCpuThreadPool()) { + std::vector> futures(inputs.size()); 
+ for (size_t i = 0; i < inputs.size(); ++i) { + ARROW_ASSIGN_OR_RAISE(futures[i], executor->Submit(func, i, std::move(inputs[i]))); + } + return All(std::move(futures)) + .Then([](const std::vector>& results) -> Result> { + return UnwrapOrRaise(results); + }); +} + // A parallelizer that takes a `Status(int)` function and calls it with // arguments between 0 and `num_tasks - 1`, in sequence or in parallel, // depending on the input boolean. @@ -61,5 +78,25 @@ Status OptionalParallelFor(bool use_threads, int num_tasks, FUNCTION&& func, } } +// A parallelizer that takes a `Result(int index, T item)` function and +// calls it with each item from the input array, in sequence or in parallel, +// depending on the input boolean. + +template ::ValueType> +Future> OptionalParallelForAsync( + bool use_threads, std::vector inputs, FUNCTION&& func, + Executor* executor = internal::GetCpuThreadPool()) { + if (use_threads) { + return ParallelForAsync(std::move(inputs), std::forward(func), executor); + } else { + std::vector result(inputs.size()); + for (size_t i = 0; i < inputs.size(); ++i) { + ARROW_ASSIGN_OR_RAISE(result[i], func(i, inputs[i])); + } + return result; + } +} + } // namespace internal } // namespace arrow diff --git a/cpp/src/arrow/util/vector.h b/cpp/src/arrow/util/vector.h index b9f2e2a45aa..3ef0074aa9d 100644 --- a/cpp/src/arrow/util/vector.h +++ b/cpp/src/arrow/util/vector.h @@ -133,5 +133,18 @@ Result> UnwrapOrRaise(std::vector>&& results) { return std::move(out); } +template +Result> UnwrapOrRaise(const std::vector>& results) { + std::vector out; + out.reserve(results.size()); + for (const auto& result : results) { + if (!result.ok()) { + return result.status(); + } + out.push_back(result.ValueUnsafe()); + } + return std::move(out); +} + } // namespace internal } // namespace arrow diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc index 14eb7495805..4f5f79c964a 100644 --- a/cpp/src/parquet/arrow/reader.cc +++ 
b/cpp/src/parquet/arrow/reader.cc @@ -293,10 +293,12 @@ class FileReaderImpl : public FileReader { const std::vector& indices, std::shared_ptr* table) override; - // Helper method used by ReadRowGroups/Generator - read the given row groups/columns, - // skipping bounds checks and pre-buffering. - Status DecodeRowGroups(const std::vector& row_groups, - const std::vector& indices, std::shared_ptr
* table); + // Helper method used by ReadRowGroups - read the given row groups/columns, skipping + // bounds checks and pre-buffering. Takes a shared_ptr to self to keep the reader + // alive in async contexts. + Future> DecodeRowGroups( + std::shared_ptr self, const std::vector& row_groups, + const std::vector& column_indices, ::arrow::internal::Executor* cpu_executor); Status ReadRowGroups(const std::vector& row_groups, std::shared_ptr
* table) override { @@ -1007,10 +1009,9 @@ class RowGroupGenerator { return SubmitRead(cpu_executor_, reader, row_group, column_indices); } auto ready = reader->parquet_reader()->WhenBuffered({row_group}, column_indices); - // TODO(ARROW-12916): always transfer here - if (cpu_executor_) ready = cpu_executor_->Transfer(ready); - return ready.Then([=]() -> ::arrow::Result { - return ReadOneRowGroup(reader, row_group, column_indices); + if (cpu_executor_) ready = cpu_executor_->TransferAlways(ready); + return ready.Then([=]() -> ::arrow::Future { + return ReadOneRowGroup(cpu_executor_, reader, row_group, column_indices); }); } @@ -1024,31 +1025,25 @@ class RowGroupGenerator { ::arrow::internal::Executor* cpu_executor, std::shared_ptr self, const int row_group, const std::vector& column_indices) { if (!cpu_executor) { - return Future::MakeFinished( - ReadOneRowGroup(self, row_group, column_indices)); + return ReadOneRowGroup(cpu_executor, self, row_group, column_indices); } // If we have an executor, then force transfer (even if I/O was complete) - return ::arrow::DeferNotOk( - cpu_executor->Submit(ReadOneRowGroup, self, row_group, column_indices)); + return ::arrow::DeferNotOk(cpu_executor->Submit(ReadOneRowGroup, cpu_executor, self, + row_group, column_indices)); } - static ::arrow::Result ReadOneRowGroup( - std::shared_ptr self, const int row_group, - const std::vector& column_indices) { - std::shared_ptr<::arrow::Table> table; + static ::arrow::Future ReadOneRowGroup( + ::arrow::internal::Executor* cpu_executor, std::shared_ptr self, + const int row_group, const std::vector& column_indices) { // Skips bound checks/pre-buffering, since we've done that already - RETURN_NOT_OK(self->DecodeRowGroups({row_group}, column_indices, &table)); - auto table_reader = std::make_shared<::arrow::TableBatchReader>(*table); - ::arrow::RecordBatchVector batches; - while (true) { - std::shared_ptr<::arrow::RecordBatch> batch; - RETURN_NOT_OK(table_reader->ReadNext(&batch)); - if 
(!batch) { - break; - } - batches.push_back(batch); - } - return ::arrow::MakeVectorGenerator(std::move(batches)); + return self->DecodeRowGroups(self, {row_group}, column_indices, cpu_executor) + .Then([](const std::shared_ptr
& table) + -> ::arrow::Result { + ::arrow::TableBatchReader table_reader(*table); + ::arrow::RecordBatchVector batches; + RETURN_NOT_OK(table_reader.ReadAll(&batches)); + return ::arrow::MakeVectorGenerator(std::move(batches)); + }); } std::shared_ptr arrow_reader_; @@ -1104,34 +1099,49 @@ Status FileReaderImpl::ReadRowGroups(const std::vector& row_groups, END_PARQUET_CATCH_EXCEPTIONS } - return DecodeRowGroups(row_groups, column_indices, out); + auto fut = DecodeRowGroups(/*self=*/nullptr, row_groups, column_indices, + /*cpu_executor=*/nullptr); + ARROW_ASSIGN_OR_RAISE(*out, fut.MoveResult()); + return Status::OK(); } -// Also used by RowGroupGenerator - skip bounds check/pre-buffer to avoid doing that twice -Status FileReaderImpl::DecodeRowGroups(const std::vector& row_groups, - const std::vector& column_indices, - std::shared_ptr
* out) { +Future> FileReaderImpl::DecodeRowGroups( + std::shared_ptr self, const std::vector& row_groups, + const std::vector& column_indices, ::arrow::internal::Executor* cpu_executor) { + // `self` is used solely to keep `this` alive in an async context - but we use this + // in a sync context too so use `this` over `self` std::vector> readers; std::shared_ptr<::arrow::Schema> result_schema; RETURN_NOT_OK(GetFieldReaders(column_indices, row_groups, &readers, &result_schema)); - - ::arrow::ChunkedArrayVector columns(readers.size()); - RETURN_NOT_OK(::arrow::internal::OptionalParallelFor( - reader_properties_.use_threads(), static_cast(readers.size()), [&](int i) { - return ReadColumn(static_cast(i), row_groups, readers[i].get(), &columns[i]); - })); - - int64_t num_rows = 0; - if (!columns.empty()) { - num_rows = columns[0]->length(); - } else { - for (int i : row_groups) { - num_rows += parquet_reader()->metadata()->RowGroup(i)->num_rows(); + // OptionalParallelForAsync requires an executor + if (!cpu_executor) cpu_executor = ::arrow::internal::GetCpuThreadPool(); + + auto read_column = [row_groups, self, this](size_t i, + std::shared_ptr reader) + -> ::arrow::Result> { + std::shared_ptr<::arrow::ChunkedArray> column; + RETURN_NOT_OK(ReadColumn(static_cast(i), row_groups, reader.get(), &column)); + return column; + }; + auto make_table = [result_schema, row_groups, self, + this](const ::arrow::ChunkedArrayVector& columns) + -> ::arrow::Result> { + int64_t num_rows = 0; + if (!columns.empty()) { + num_rows = columns[0]->length(); + } else { + for (int i : row_groups) { + num_rows += parquet_reader()->metadata()->RowGroup(i)->num_rows(); + } } - } - - *out = Table::Make(std::move(result_schema), std::move(columns), num_rows); - return (*out)->Validate(); + auto table = Table::Make(std::move(result_schema), columns, num_rows); + RETURN_NOT_OK(table->Validate()); + return table; + }; + return 
::arrow::internal::OptionalParallelForAsync(reader_properties_.use_threads(), + std::move(readers), read_column, + cpu_executor) + .Then(std::move(make_table)); } std::shared_ptr FileReaderImpl::RowGroup(int row_group_index) { From f216d62ddd89f804250424e6358e58e69b7b5770 Mon Sep 17 00:00:00 2001 From: Karik Isichei Date: Tue, 15 Jun 2021 18:25:11 +0200 Subject: [PATCH 16/61] ARROW-12096: [C++] Allows users to define arrow timestamp unit for Parquet INT96 timestamp Have added functionality in C++ code to allow users to define the arrow timestamp unit when reading parquet INT96 types. This avoids the overflow bug when trying to convert INT96 values which have dates which are out of bounds for Arrow NS Timestamp. See added test: `TestArrowReadWrite.DownsampleDeprecatedInt96` which demonstrates use and expected results. Main discussion of changes in [JIRA Issue ARROW-12096](https://issues.apache.org/jira/browse/ARROW-12096). Closes #10461 from isichei/ARROW-12096 Lead-authored-by: Karik Isichei Co-authored-by: Antoine Pitrou Signed-off-by: Antoine Pitrou --- .../parquet/arrow/arrow_reader_writer_test.cc | 56 +++++++++++++++++++ cpp/src/parquet/arrow/reader_internal.cc | 43 +++++++++----- cpp/src/parquet/arrow/schema.cc | 4 +- cpp/src/parquet/arrow/schema_internal.cc | 19 +++---- cpp/src/parquet/arrow/schema_internal.h | 7 ++- cpp/src/parquet/properties.h | 14 ++++- cpp/src/parquet/types.h | 43 ++++++++++++-- 7 files changed, 150 insertions(+), 36 deletions(-) diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc index 677458ce37e..6c82b8dee78 100644 --- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc +++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc @@ -558,6 +558,35 @@ void ReadSingleColumnFileStatistics(std::unique_ptr file_reader, ASSERT_OK(StatisticsAsScalars(*statistics, min, max)); } +void DownsampleInt96RoundTrip(std::shared_ptr arrow_vector_in, + std::shared_ptr arrow_vector_out, + 
::arrow::TimeUnit::type unit) { + // Create single input table of NS to be written to parquet with INT96 + auto input_schema = + ::arrow::schema({::arrow::field("f", ::arrow::timestamp(TimeUnit::NANO))}); + auto input = Table::Make(input_schema, {arrow_vector_in}); + + // Create an expected schema for each resulting table (one for each "downsampled" ts) + auto ex_schema = ::arrow::schema({::arrow::field("f", ::arrow::timestamp(unit))}); + auto ex_result = Table::Make(ex_schema, {arrow_vector_out}); + + std::shared_ptr
result; + + ArrowReaderProperties arrow_reader_prop; + arrow_reader_prop.set_coerce_int96_timestamp_unit(unit); + + ASSERT_NO_FATAL_FAILURE(DoRoundtrip( + input, input->num_rows(), &result, default_writer_properties(), + ArrowWriterProperties::Builder().enable_deprecated_int96_timestamps()->build(), + arrow_reader_prop)); + + ASSERT_NO_FATAL_FAILURE(::arrow::AssertSchemaEqual(*ex_result->schema(), + *result->schema(), + /*check_metadata=*/false)); + + ASSERT_NO_FATAL_FAILURE(::arrow::AssertTablesEqual(*ex_result, *result)); +} + // Non-template base class for TestParquetIO, to avoid code duplication class ParquetIOTestBase : public ::testing::Test { public: @@ -1671,6 +1700,33 @@ TEST(TestArrowReadWrite, UseDeprecatedInt96) { ASSERT_NO_FATAL_FAILURE(::arrow::AssertTablesEqual(*ex_result, *result)); } +TEST(TestArrowReadWrite, DownsampleDeprecatedInt96) { + using ::arrow::ArrayFromJSON; + using ::arrow::field; + using ::arrow::schema; + + // Timestamp values at 2000-01-01 00:00:00, + // then with increment unit of 1ns, 1us, 1ms and 1s. 
+ auto a_nano = + ArrayFromJSON(timestamp(TimeUnit::NANO), + "[946684800000000000, 946684800000000001, 946684800000001000, " + "946684800001000000, 946684801000000000]"); + auto a_micro = ArrayFromJSON(timestamp(TimeUnit::MICRO), + "[946684800000000, 946684800000000, 946684800000001, " + "946684800001000, 946684801000000]"); + auto a_milli = ArrayFromJSON( + timestamp(TimeUnit::MILLI), + "[946684800000, 946684800000, 946684800000, 946684800001, 946684801000]"); + auto a_second = + ArrayFromJSON(timestamp(TimeUnit::SECOND), + "[946684800, 946684800, 946684800, 946684800, 946684801]"); + + ASSERT_NO_FATAL_FAILURE(DownsampleInt96RoundTrip(a_nano, a_nano, TimeUnit::NANO)); + ASSERT_NO_FATAL_FAILURE(DownsampleInt96RoundTrip(a_nano, a_micro, TimeUnit::MICRO)); + ASSERT_NO_FATAL_FAILURE(DownsampleInt96RoundTrip(a_nano, a_milli, TimeUnit::MILLI)); + ASSERT_NO_FATAL_FAILURE(DownsampleInt96RoundTrip(a_nano, a_second, TimeUnit::SECOND)); +} + TEST(TestArrowReadWrite, CoerceTimestamps) { using ::arrow::ArrayFromVector; using ::arrow::field; diff --git a/cpp/src/parquet/arrow/reader_internal.cc b/cpp/src/parquet/arrow/reader_internal.cc index 1410a5f89e2..0ffa3e89970 100644 --- a/cpp/src/parquet/arrow/reader_internal.cc +++ b/cpp/src/parquet/arrow/reader_internal.cc @@ -353,7 +353,8 @@ Status TransferBool(RecordReader* reader, MemoryPool* pool, Datum* out) { } Status TransferInt96(RecordReader* reader, MemoryPool* pool, - const std::shared_ptr& type, Datum* out) { + const std::shared_ptr& type, Datum* out, + const ::arrow::TimeUnit::type int96_arrow_time_unit) { int64_t length = reader->values_written(); auto values = reinterpret_cast(reader->values()); ARROW_ASSIGN_OR_RAISE(auto data, @@ -365,7 +366,20 @@ Status TransferInt96(RecordReader* reader, MemoryPool* pool, // isn't representable as a 64-bit Unix timestamp. 
*data_ptr++ = 0; } else { - *data_ptr++ = Int96GetNanoSeconds(values[i]); + switch (int96_arrow_time_unit) { + case ::arrow::TimeUnit::NANO: + *data_ptr++ = Int96GetNanoSeconds(values[i]); + break; + case ::arrow::TimeUnit::MICRO: + *data_ptr++ = Int96GetMicroSeconds(values[i]); + break; + case ::arrow::TimeUnit::MILLI: + *data_ptr++ = Int96GetMilliSeconds(values[i]); + break; + case ::arrow::TimeUnit::SECOND: + *data_ptr++ = Int96GetSeconds(values[i]); + break; + } } } *out = std::make_shared(type, length, std::move(data), @@ -742,20 +756,19 @@ Status TransferColumnData(RecordReader* reader, std::shared_ptr value_ case ::arrow::Type::TIMESTAMP: { const ::arrow::TimestampType& timestamp_type = checked_cast<::arrow::TimestampType&>(*value_type); - switch (timestamp_type.unit()) { - case ::arrow::TimeUnit::MILLI: - case ::arrow::TimeUnit::MICRO: { - result = TransferZeroCopy(reader, value_type); - } break; - case ::arrow::TimeUnit::NANO: { - if (descr->physical_type() == ::parquet::Type::INT96) { - RETURN_NOT_OK(TransferInt96(reader, pool, value_type, &result)); - } else { + if (descr->physical_type() == ::parquet::Type::INT96) { + RETURN_NOT_OK( + TransferInt96(reader, pool, value_type, &result, timestamp_type.unit())); + } else { + switch (timestamp_type.unit()) { + case ::arrow::TimeUnit::MILLI: + case ::arrow::TimeUnit::MICRO: + case ::arrow::TimeUnit::NANO: result = TransferZeroCopy(reader, value_type); - } - } break; - default: - return Status::NotImplemented("TimeUnit not supported"); + break; + default: + return Status::NotImplemented("TimeUnit not supported"); + } } } break; default: diff --git a/cpp/src/parquet/arrow/schema.cc b/cpp/src/parquet/arrow/schema.cc index 7610ce17605..eb7fd628dfc 100644 --- a/cpp/src/parquet/arrow/schema.cc +++ b/cpp/src/parquet/arrow/schema.cc @@ -454,7 +454,9 @@ bool IsDictionaryReadSupported(const ArrowType& type) { ::arrow::Result> GetTypeForNode( int column_index, const schema::PrimitiveNode& primitive_node, 
SchemaTreeContext* ctx) { - ASSIGN_OR_RAISE(std::shared_ptr storage_type, GetArrowType(primitive_node)); + ASSIGN_OR_RAISE( + std::shared_ptr storage_type, + GetArrowType(primitive_node, ctx->properties.coerce_int96_timestamp_unit())); if (ctx->properties.read_dictionary(column_index) && IsDictionaryReadSupported(*storage_type)) { return ::arrow::dictionary(::arrow::int32(), storage_type); diff --git a/cpp/src/parquet/arrow/schema_internal.cc b/cpp/src/parquet/arrow/schema_internal.cc index fbdfa09a040..064bf4f55cc 100644 --- a/cpp/src/parquet/arrow/schema_internal.cc +++ b/cpp/src/parquet/arrow/schema_internal.cc @@ -179,9 +179,9 @@ Result> FromInt64(const LogicalType& logical_type) { } } -Result> GetArrowType(Type::type physical_type, - const LogicalType& logical_type, - int type_length) { +Result> GetArrowType( + Type::type physical_type, const LogicalType& logical_type, int type_length, + const ::arrow::TimeUnit::type int96_arrow_time_unit) { if (logical_type.is_invalid() || logical_type.is_null()) { return ::arrow::null(); } @@ -194,7 +194,7 @@ Result> GetArrowType(Type::type physical_type, case ParquetType::INT64: return FromInt64(logical_type); case ParquetType::INT96: - return ::arrow::timestamp(::arrow::TimeUnit::NANO); + return ::arrow::timestamp(int96_arrow_time_unit); case ParquetType::FLOAT: return ::arrow::float32(); case ParquetType::DOUBLE: @@ -211,14 +211,11 @@ Result> GetArrowType(Type::type physical_type, } } -Result> GetArrowType(const schema::PrimitiveNode& primitive) { +Result> GetArrowType( + const schema::PrimitiveNode& primitive, + const ::arrow::TimeUnit::type int96_arrow_time_unit) { return GetArrowType(primitive.physical_type(), *primitive.logical_type(), - primitive.type_length()); -} - -Result> GetArrowType(const ColumnDescriptor& descriptor) { - return GetArrowType(descriptor.physical_type(), *descriptor.logical_type(), - descriptor.type_length()); + primitive.type_length(), int96_arrow_time_unit); } } // namespace arrow diff --git 
a/cpp/src/parquet/arrow/schema_internal.h b/cpp/src/parquet/arrow/schema_internal.h index ec0d9571304..fb837c3ee6c 100644 --- a/cpp/src/parquet/arrow/schema_internal.h +++ b/cpp/src/parquet/arrow/schema_internal.h @@ -40,9 +40,12 @@ Result> GetArrowType(Type::type physical_type int type_length); Result> GetArrowType( - const schema::PrimitiveNode& primitive); + Type::type physical_type, const LogicalType& logical_type, int type_length, + ::arrow::TimeUnit::type int96_arrow_time_unit = ::arrow::TimeUnit::NANO); + Result> GetArrowType( - const ColumnDescriptor& descriptor); + const schema::PrimitiveNode& primitive, + ::arrow::TimeUnit::type int96_arrow_time_unit = ::arrow::TimeUnit::NANO); } // namespace arrow } // namespace parquet diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index 5018fff9531..d217b8efa52 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -575,7 +575,8 @@ class PARQUET_EXPORT ArrowReaderProperties { read_dict_indices_(), batch_size_(kArrowDefaultBatchSize), pre_buffer_(false), - cache_options_(::arrow::io::CacheOptions::Defaults()) {} + cache_options_(::arrow::io::CacheOptions::Defaults()), + coerce_int96_timestamp_unit_(::arrow::TimeUnit::NANO) {} void set_use_threads(bool use_threads) { use_threads_ = use_threads; } @@ -620,6 +621,16 @@ class PARQUET_EXPORT ArrowReaderProperties { const ::arrow::io::IOContext& io_context() const { return io_context_; } + /// Set timestamp unit to use for deprecated INT96-encoded timestamps + /// (default is NANO). 
+ void set_coerce_int96_timestamp_unit(::arrow::TimeUnit::type unit) { + coerce_int96_timestamp_unit_ = unit; + } + + ::arrow::TimeUnit::type coerce_int96_timestamp_unit() const { + return coerce_int96_timestamp_unit_; + } + private: bool use_threads_; std::unordered_set read_dict_indices_; @@ -627,6 +638,7 @@ class PARQUET_EXPORT ArrowReaderProperties { bool pre_buffer_; ::arrow::io::IOContext io_context_; ::arrow::io::CacheOptions cache_options_; + ::arrow::TimeUnit::type coerce_int96_timestamp_unit_; }; /// EXPERIMENTAL: Constructs the default ArrowReaderProperties diff --git a/cpp/src/parquet/types.h b/cpp/src/parquet/types.h index 4529dbe6133..6bd67f1ee5f 100644 --- a/cpp/src/parquet/types.h +++ b/cpp/src/parquet/types.h @@ -591,15 +591,46 @@ static inline void Int96SetNanoSeconds(parquet::Int96& i96, int64_t nanoseconds) std::memcpy(&i96.value, &nanoseconds, sizeof(nanoseconds)); } -static inline int64_t Int96GetNanoSeconds(const parquet::Int96& i96) { +struct DecodedInt96 { + uint64_t days_since_epoch; + uint64_t nanoseconds; +}; + +static inline DecodedInt96 DecodeInt96Timestamp(const parquet::Int96& i96) { // We do the computations in the unsigned domain to avoid unsigned behaviour // on overflow. 
- uint64_t days_since_epoch = - i96.value[2] - static_cast(kJulianToUnixEpochDays); - uint64_t nanoseconds = 0; + DecodedInt96 result; + result.days_since_epoch = i96.value[2] - static_cast(kJulianToUnixEpochDays); + result.nanoseconds = 0; + + memcpy(&result.nanoseconds, &i96.value, sizeof(uint64_t)); + return result; +} + +static inline int64_t Int96GetNanoSeconds(const parquet::Int96& i96) { + const auto decoded = DecodeInt96Timestamp(i96); + return static_cast(decoded.days_since_epoch * kNanosecondsPerDay + + decoded.nanoseconds); +} + +static inline int64_t Int96GetMicroSeconds(const parquet::Int96& i96) { + const auto decoded = DecodeInt96Timestamp(i96); + uint64_t microseconds = decoded.nanoseconds / static_cast(1000); + return static_cast(decoded.days_since_epoch * kMicrosecondsPerDay + + microseconds); +} + +static inline int64_t Int96GetMilliSeconds(const parquet::Int96& i96) { + const auto decoded = DecodeInt96Timestamp(i96); + uint64_t milliseconds = decoded.nanoseconds / static_cast(1000000); + return static_cast(decoded.days_since_epoch * kMillisecondsPerDay + + milliseconds); +} - memcpy(&nanoseconds, &i96.value, sizeof(uint64_t)); - return static_cast(days_since_epoch * kNanosecondsPerDay + nanoseconds); +static inline int64_t Int96GetSeconds(const parquet::Int96& i96) { + const auto decoded = DecodeInt96Timestamp(i96); + uint64_t seconds = decoded.nanoseconds / static_cast(1000000000); + return static_cast(decoded.days_since_epoch * kSecondsPerDay + seconds); } static inline std::string Int96ToString(const Int96& a) { From 71bcfae48d6b951d2bf6e2f5d57cff4bd75ce33f Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Tue, 15 Jun 2021 18:33:01 +0200 Subject: [PATCH 17/61] ARROW-13027: [C++] Fix ASAN stack traces in CI Before change: ``` Direct leak of 65536 byte(s) in 1 object(s) allocated from: #0 0x522f09 in #1 0x7f28ae5826f4 in #2 0x7f28ae57fa5d in #3 0x7f28ae58cb0f in #4 0x7f28ae58bda0 in ... 
``` After change: ``` Direct leak of 65536 byte(s) in 1 object(s) allocated from: #0 0x522f09 in posix_memalign (/build/cpp/debug/arrow-dataset-file-csv-test+0x522f09) #1 0x7f28ae5826f4 in arrow::(anonymous namespace)::SystemAllocator::AllocateAligned(long, unsigned char**) /arrow/cpp/src/arrow/memory_pool.cc:213:24 #2 0x7f28ae57fa5d in arrow::BaseMemoryPoolImpl::Allocate(long, unsigned char**) /arrow/cpp/src/arrow/memory_pool.cc:405:5 #3 0x7f28ae58cb0f in arrow::PoolBuffer::Reserve(long) /arrow/cpp/src/arrow/memory_pool.cc:717:9 #4 0x7f28ae58bda0 in arrow::PoolBuffer::Resize(long, bool) /arrow/cpp/src/arrow/memory_pool.cc:741:7 ... ``` Closes #10498 from westonpace/feature/ARROW-13027--c-fix-asan-stack-traces-in-ci Authored-by: Weston Pace Signed-off-by: Antoine Pitrou --- ci/docker/ubuntu-20.04-cpp.dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/ci/docker/ubuntu-20.04-cpp.dockerfile b/ci/docker/ubuntu-20.04-cpp.dockerfile index c75c013799a..c2a468d9e35 100644 --- a/ci/docker/ubuntu-20.04-cpp.dockerfile +++ b/ci/docker/ubuntu-20.04-cpp.dockerfile @@ -127,6 +127,7 @@ ENV ARROW_BUILD_TESTS=ON \ ARROW_WITH_SNAPPY=ON \ ARROW_WITH_ZLIB=ON \ ARROW_WITH_ZSTD=ON \ + ASAN_SYMBOLIZER_PATH=/usr/lib/llvm-${llvm}/bin/llvm-symbolizer \ AWSSDK_SOURCE=BUNDLED \ GTest_SOURCE=BUNDLED \ gRPC_SOURCE=BUNDLED \ From 322d2c81acd12330a8309f612406195183d13cdf Mon Sep 17 00:00:00 2001 From: Giordon Stark Date: Tue, 15 Jun 2021 21:24:49 +0200 Subject: [PATCH 18/61] ARROW-13085: [Python] Document compatible toolchains for python bindings This is a documentation-only PR that adds an additional note for users compiling C++ extensions using the shared libraries bundled with the python package. Adding this note on the toolchain will help resolve (confusing?) segfaults that occur. Before (toolchain) change: - segfault when running the minimal cpp example After (toolchain) change: - no segfault when running the minimal cpp example Please see the linked JIRA for more details.
Closes #10535 from kratsg/docs/pythonBindingExtensions Lead-authored-by: Giordon Stark Co-authored-by: Antoine Pitrou Signed-off-by: Antoine Pitrou --- docs/source/python/extending.rst | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/docs/source/python/extending.rst b/docs/source/python/extending.rst index 738a7369f70..5f6ddb154e6 100644 --- a/docs/source/python/extending.rst +++ b/docs/source/python/extending.rst @@ -466,3 +466,14 @@ installed. This function will attempt to create symlinks like pip install pyarrow python -c "import pyarrow; pyarrow.create_library_symlinks()" + +Toolchain Compatibility (Linux) +""""""""""""""""""""""""""""""" + +The Python wheels for Linux are built using the +`PyPA manylinux images `_ which use +the CentOS `devtoolset-8` or `devtoolset-9` depending on which manylinux +wheel version (2010 or 2014) is being used. In addition to the other notes +above, if you are compiling C++ using these shared libraries, you will need +to make sure you use a compatible toolchain as well or you might see a +segfault during runtime. 
From b33a1a9c803fa4b44b684c280782629a46cdb7db Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Wed, 16 Jun 2021 08:39:22 +0900 Subject: [PATCH 19/61] ARROW-13082: [CI] Forward R argument to ubuntu-docs build MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `R=4.1 archery docker run ubuntu-docs` Closes #10534 from kszucs/forward-r-arg-to-docs-build Authored-by: Krisztián Szűcs Signed-off-by: Sutou Kouhei --- docker-compose.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/docker-compose.yml b/docker-compose.yml index 1133bfa3b29..fa0f0a28ad1 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1294,6 +1294,7 @@ services: cache_from: - ${REPO}:${ARCH}-ubuntu-${UBUNTU}-docs args: + r: ${R} jdk: ${JDK} node: ${NODE} base: ${REPO}:${ARCH}-ubuntu-${UBUNTU}-python-3 From e5d450e6b4fe06022ea82e631eec18338c87d7b5 Mon Sep 17 00:00:00 2001 From: Diana Clarke Date: Wed, 16 Jun 2021 01:32:21 +0000 Subject: [PATCH 20/61] ARROW-13073: [Developer] archery benchmark list: unexpected keyword 'benchmark_filter' ``` $ archery benchmark list Traceback (most recent call last): File "/Users/diana/envs/arrow/bin/archery", line 33, in sys.exit(load_entry_point('archery', 'console_scripts', 'archery')()) File "/Users/diana/envs/arrow/lib/python3.9/site-packages/click/core.py", line 1137, in __call__ return self.main(*args, **kwargs) File "/Users/diana/envs/arrow/lib/python3.9/site-packages/click/core.py", line 1062, in main rv = self.invoke(ctx) File "/Users/diana/envs/arrow/lib/python3.9/site-packages/click/core.py", line 1668, in invoke return _process_result(sub_ctx.command.invoke(sub_ctx)) File "/Users/diana/envs/arrow/lib/python3.9/site-packages/click/core.py", line 1668, in invoke return _process_result(sub_ctx.command.invoke(sub_ctx)) File "/Users/diana/envs/arrow/lib/python3.9/site-packages/click/core.py", line 1404, in invoke return ctx.invoke(self.callback, **ctx.params) File 
"/Users/diana/envs/arrow/lib/python3.9/site-packages/click/core.py", line 763, in invoke return __callback(*args, **kwargs) File "/Users/diana/envs/arrow/lib/python3.9/site-packages/click/decorators.py", line 26, in new_func return f(get_current_context(), *args, **kwargs) File "/Users/diana/workspace/arrow/dev/archery/archery/cli.py", line 430, in benchmark_list conf = CppBenchmarkRunner.default_configuration( File "/Users/diana/workspace/arrow/dev/archery/archery/benchmark/runner.py", line 118, in default_configuration return CppConfiguration( TypeError: __init__() got an unexpected keyword argument 'benchmark_filter' ``` Closes #10528 from dianaclarke/ARROW-13073 Authored-by: Diana Clarke Signed-off-by: Yibo Cai --- dev/archery/archery/cli.py | 1 - 1 file changed, 1 deletion(-) diff --git a/dev/archery/archery/cli.py b/dev/archery/archery/cli.py index c35b0864900..9442b2917e0 100644 --- a/dev/archery/archery/cli.py +++ b/dev/archery/archery/cli.py @@ -416,7 +416,6 @@ def benchmark_filter_options(cmd): @click.argument("rev_or_path", metavar="[]", default="WORKSPACE", required=False) @benchmark_common_options -@benchmark_filter_options @click.pass_context def benchmark_list(ctx, rev_or_path, src, preserve, output, cmake_extras, java_home, java_options, build_extras, benchmark_extras, From 4fc96b54902d3737857dc567a160c6336b6bbf3e Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Wed, 16 Jun 2021 14:03:23 +0900 Subject: [PATCH 21/61] ARROW-11782: [GLib][Ruby][Dataset] Remove bindings for internal classes Closes #10533 from kou/glib-dataset-factory Authored-by: Sutou Kouhei Signed-off-by: Sutou Kouhei --- .../arrow-dataset-glib/arrow-dataset-glib.h | 2 + .../arrow-dataset-glib/arrow-dataset-glib.hpp | 2 + c_glib/arrow-dataset-glib/dataset-factory.cpp | 468 ++++++++++++++++ c_glib/arrow-dataset-glib/dataset-factory.h | 98 ++++ c_glib/arrow-dataset-glib/dataset-factory.hpp | 27 + c_glib/arrow-dataset-glib/dataset.cpp | 365 ++++++++++++ 
c_glib/arrow-dataset-glib/dataset.h | 65 +++ c_glib/arrow-dataset-glib/dataset.hpp | 48 ++ c_glib/arrow-dataset-glib/meson.build | 6 + c_glib/arrow-dataset-glib/scanner.cpp | 527 +++++------------- c_glib/arrow-dataset-glib/scanner.h | 77 +-- c_glib/arrow-dataset-glib/scanner.hpp | 20 +- c_glib/arrow-glib/basic-array.cpp | 6 +- .../arrow-dataset-glib-docs.xml | 20 +- .../test-file-system-dataset-factory.rb | 55 ++ .../test/dataset/test-file-system-dataset.rb | 23 +- .../test/dataset/test-in-memory-scan-task.rb | 59 -- c_glib/test/dataset/test-scan-options.rb | 47 -- c_glib/test/dataset/test-scanner.rb | 48 ++ c_glib/test/helper/buildable.rb | 19 +- .../test/helper/writable.rb | 27 +- c_glib/test/run-test.rb | 5 +- cpp/src/arrow/dataset/discovery.h | 15 +- .../{scan-options.rb => dataset.rb} | 20 +- .../lib/arrow-dataset/in-memory-scan-task.rb | 35 -- .../lib/arrow-dataset/loader.rb | 3 +- ruby/red-arrow-dataset/test/helper.rb | 2 + ...options.rb => test-file-system-dataset.rb} | 28 +- 28 files changed, 1462 insertions(+), 655 deletions(-) create mode 100644 c_glib/arrow-dataset-glib/dataset-factory.cpp create mode 100644 c_glib/arrow-dataset-glib/dataset-factory.h create mode 100644 c_glib/arrow-dataset-glib/dataset-factory.hpp create mode 100644 c_glib/arrow-dataset-glib/dataset.cpp create mode 100644 c_glib/arrow-dataset-glib/dataset.h create mode 100644 c_glib/arrow-dataset-glib/dataset.hpp create mode 100644 c_glib/test/dataset/test-file-system-dataset-factory.rb rename ruby/red-arrow-dataset/test/test-in-memory-scan-task.rb => c_glib/test/dataset/test-file-system-dataset.rb (64%) delete mode 100644 c_glib/test/dataset/test-in-memory-scan-task.rb delete mode 100644 c_glib/test/dataset/test-scan-options.rb create mode 100644 c_glib/test/dataset/test-scanner.rb rename ruby/red-arrow-dataset/lib/arrow-dataset/in-memory-fragment.rb => c_glib/test/helper/writable.rb (63%) rename ruby/red-arrow-dataset/lib/arrow-dataset/{scan-options.rb => dataset.rb} (69%) delete 
mode 100644 ruby/red-arrow-dataset/lib/arrow-dataset/in-memory-scan-task.rb rename ruby/red-arrow-dataset/test/{test-scan-options.rb => test-file-system-dataset.rb} (58%) diff --git a/c_glib/arrow-dataset-glib/arrow-dataset-glib.h b/c_glib/arrow-dataset-glib/arrow-dataset-glib.h index ff160452845..03e56516112 100644 --- a/c_glib/arrow-dataset-glib/arrow-dataset-glib.h +++ b/c_glib/arrow-dataset-glib/arrow-dataset-glib.h @@ -21,6 +21,8 @@ #include +#include +#include #include #include #include diff --git a/c_glib/arrow-dataset-glib/arrow-dataset-glib.hpp b/c_glib/arrow-dataset-glib/arrow-dataset-glib.hpp index c221825bc2a..65341b9b77e 100644 --- a/c_glib/arrow-dataset-glib/arrow-dataset-glib.hpp +++ b/c_glib/arrow-dataset-glib/arrow-dataset-glib.hpp @@ -21,6 +21,8 @@ #include +#include +#include #include #include #include diff --git a/c_glib/arrow-dataset-glib/dataset-factory.cpp b/c_glib/arrow-dataset-glib/dataset-factory.cpp new file mode 100644 index 00000000000..146db69adfc --- /dev/null +++ b/c_glib/arrow-dataset-glib/dataset-factory.cpp @@ -0,0 +1,468 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#include +#include + +#include +#include +#include + +G_BEGIN_DECLS + +/** + * SECTION: dataset-factory + * @section_id: dataset-factory + * @title: Dataset factory related classes + * @include: arrow-dataset-glib/arrow-dataset-glib.h + * + * #GADatasetDatasetFactory is a base class for dataset factories. + * + * #GADatasetFileSystemDatasetFactory is a class for + * #GADatasetFileSystemDataset factory. + * + * Since: 5.0.0 + */ + +typedef struct GADatasetDatasetFactoryPrivate_ { + std::shared_ptr factory; +} GADatasetDatasetFactoryPrivate; + +enum { + PROP_DATASET_FACTORY = 1, +}; + +G_DEFINE_ABSTRACT_TYPE_WITH_PRIVATE(GADatasetDatasetFactory, + gadataset_dataset_factory, + G_TYPE_OBJECT) + +#define GADATASET_DATASET_FACTORY_GET_PRIVATE(obj) \ + static_cast( \ + gadataset_dataset_factory_get_instance_private( \ + GADATASET_DATASET_FACTORY(obj))) + +static void +gadataset_dataset_factory_finalize(GObject *object) +{ + auto priv = GADATASET_DATASET_FACTORY_GET_PRIVATE(object); + priv->factory.~shared_ptr(); + G_OBJECT_CLASS(gadataset_dataset_factory_parent_class)->finalize(object); +} + +static void +gadataset_dataset_factory_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) +{ + auto priv = GADATASET_DATASET_FACTORY_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_DATASET_FACTORY: + { + auto arrow_factory_pointer = + static_cast *>( + g_value_get_pointer(value)); + if (arrow_factory_pointer) { + priv->factory = *arrow_factory_pointer; + } + } + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +gadataset_dataset_factory_init(GADatasetDatasetFactory *object) +{ + auto priv = GADATASET_DATASET_FACTORY_GET_PRIVATE(object); + new(&priv->factory) std::shared_ptr; +} + +static void +gadataset_dataset_factory_class_init(GADatasetDatasetFactoryClass *klass) +{ + auto gobject_class = G_OBJECT_CLASS(klass); + gobject_class->finalize = 
gadataset_dataset_factory_finalize; + gobject_class->set_property = gadataset_dataset_factory_set_property; + + GParamSpec *spec; + spec = g_param_spec_pointer("dataset-factory", + "Dataset factory", + "The raw " + "std::shared *", + static_cast(G_PARAM_WRITABLE | + G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, PROP_DATASET_FACTORY, spec); +} + +/** + * gadataset_dataset_factory_finish: + * @factory: A #GADatasetDatasetFactory. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: (transfer full) (nullable): + * A newly created #GADatasetDataset on success, %NULL on error. + * + * Since: 5.0.0 + */ +GADatasetDataset * +gadataset_dataset_factory_finish(GADatasetDatasetFactory *factory, + GError **error) +{ + auto arrow_factory = gadataset_dataset_factory_get_raw(factory); + auto arrow_dataset_result = arrow_factory->Finish(); + if (garrow::check(error, arrow_dataset_result, "[dataset-factory][finish]")) { + auto arrow_dataset = *arrow_dataset_result; + return gadataset_dataset_new_raw(&arrow_dataset); + } else { + return NULL; + } +} + + +typedef struct GADatasetFileSystemDatasetFactoryPrivate_ { + GADatasetFileFormat *format; + GArrowFileSystem *file_system; + GList *files; + arrow::dataset::FileSystemFactoryOptions options; +} GADatasetFileSystemDatasetFactoryPrivate; + +enum { + PROP_FORMAT = 1, + PROP_FILE_SYSTEM, +}; + +G_DEFINE_TYPE_WITH_PRIVATE(GADatasetFileSystemDatasetFactory, + gadataset_file_system_dataset_factory, + GADATASET_TYPE_DATASET_FACTORY) + +#define GADATASET_FILE_SYSTEM_DATASET_FACTORY_GET_PRIVATE(obj) \ + static_cast( \ + gadataset_file_system_dataset_factory_get_instance_private( \ + GADATASET_FILE_SYSTEM_DATASET_FACTORY(obj))) + +static void +gadataset_file_system_dataset_factory_dispose(GObject *object) +{ + auto priv = GADATASET_FILE_SYSTEM_DATASET_FACTORY_GET_PRIVATE(object); + + if (priv->format) { + g_object_unref(priv->format); + priv->format = NULL; + } + + if (priv->file_system) 
{ + g_object_unref(priv->file_system); + priv->file_system = NULL; + } + + if (priv->files) { + g_list_free_full(priv->files, g_object_unref); + priv->files = NULL; + } + + G_OBJECT_CLASS( + gadataset_file_system_dataset_factory_parent_class)->dispose(object); +} + +static void +gadataset_file_system_dataset_factory_finalize(GObject *object) +{ + auto priv = GADATASET_FILE_SYSTEM_DATASET_FACTORY_GET_PRIVATE(object); + priv->options.~FileSystemFactoryOptions(); + G_OBJECT_CLASS( + gadataset_file_system_dataset_factory_parent_class)->finalize(object); +} + +static void +gadataset_file_system_dataset_factory_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) +{ + auto priv = GADATASET_FILE_SYSTEM_DATASET_FACTORY_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_FORMAT: + priv->format = GADATASET_FILE_FORMAT(g_value_dup_object(value)); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +gadataset_file_system_dataset_factory_get_property(GObject *object, + guint prop_id, + GValue *value, + GParamSpec *pspec) +{ + auto priv = GADATASET_FILE_SYSTEM_DATASET_FACTORY_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_FORMAT: + g_value_set_object(value, priv->format); + break; + case PROP_FILE_SYSTEM: + g_value_set_object(value, priv->file_system); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +gadataset_file_system_dataset_factory_init( + GADatasetFileSystemDatasetFactory *object) +{ + auto priv = GADATASET_FILE_SYSTEM_DATASET_FACTORY_GET_PRIVATE(object); + new(&priv->options) arrow::dataset::FileSystemFactoryOptions; +} + +static void +gadataset_file_system_dataset_factory_class_init( + GADatasetFileSystemDatasetFactoryClass *klass) +{ + auto gobject_class = G_OBJECT_CLASS(klass); + gobject_class->dispose = gadataset_file_system_dataset_factory_dispose; + gobject_class->finalize = 
gadataset_file_system_dataset_factory_finalize; + gobject_class->set_property = gadataset_file_system_dataset_factory_set_property; + gobject_class->get_property = gadataset_file_system_dataset_factory_get_property; + + GParamSpec *spec; + /** + * GADatasetFileSystemDatasetFactory:format: + * + * Format passed to #GADatasetFileSystemDataset. + * + * Since: 5.0.0 + */ + spec = g_param_spec_object("format", + "Format", + "Format passed to GADatasetFileSystemDataset", + GADATASET_TYPE_FILE_FORMAT, + static_cast(G_PARAM_READWRITE | + G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, PROP_FORMAT, spec); + + /** + * GADatasetFileSystemDatasetFactory:file-system: + * + * File system passed to #GADatasetFileSystemDataset. + * + * Since: 5.0.0 + */ + spec = g_param_spec_object("file-system", + "File system", + "File system passed to GADatasetFileSystemDataset", + GARROW_TYPE_FILE_SYSTEM, + static_cast(G_PARAM_READABLE)); + g_object_class_install_property(gobject_class, PROP_FILE_SYSTEM, spec); +} + +/** + * gadataset_file_system_factory_new: + * @format: A #GADatasetFileFormat. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: A newly created #GADatasetDatasetFileSystemFactory on success, + * %NULL on error. + * + * Since: 5.0.0 + */ +GADatasetFileSystemDatasetFactory * +gadataset_file_system_dataset_factory_new(GADatasetFileFormat *format) +{ + return GADATASET_FILE_SYSTEM_DATASET_FACTORY( + g_object_new(GADATASET_TYPE_FILE_SYSTEM_DATASET_FACTORY, + "format", format, + NULL)); +} + +/** + * gadataset_file_system_dataset_factory_set_file_system: + * @factory: A #GADatasetFileSystemDatasetFactory. + * @file_system: A #GArrowFileSystem. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: %TRUE on success, %FALSE otherwise. 
+ * + * Since: 5.0.0 + */ +gboolean +gadataset_file_system_dataset_factory_set_file_system( + GADatasetFileSystemDatasetFactory *factory, + GArrowFileSystem *file_system, + GError **error) +{ + const gchar *context = "[file-system-dataset-factory][set-file-system]"; + auto priv = GADATASET_FILE_SYSTEM_DATASET_FACTORY_GET_PRIVATE(factory); + if (priv->file_system) { + garrow::check(error, + arrow::Status::Invalid("file system is already set"), + context); + return FALSE; + } + priv->file_system = file_system; + g_object_ref(priv->file_system); + return TRUE; +} + +/** + * gadataset_file_system_dataset_factory_set_file_system_uri: + * @factory: A #GADatasetFileSystemDatasetFactory. + * @uri: An URI for file system. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: %TRUE on success, %FALSE otherwise. + * + * Since: 5.0.0 + */ +gboolean +gadataset_file_system_dataset_factory_set_file_system_uri( + GADatasetFileSystemDatasetFactory *factory, + const gchar *uri, + GError **error) +{ + const gchar *context = "[file-system-dataset-factory][set-file-system-uri]"; + auto priv = GADATASET_FILE_SYSTEM_DATASET_FACTORY_GET_PRIVATE(factory); + if (priv->file_system) { + garrow::check(error, + arrow::Status::Invalid("file system is already set"), + context); + return FALSE; + } + std::string internal_path; + auto arrow_file_system_result = + arrow::fs::FileSystemFromUri(uri, &internal_path); + if (!garrow::check(error, arrow_file_system_result, context)) { + return FALSE; + } + auto arrow_file_system = *arrow_file_system_result; + auto arrow_file_info_result = arrow_file_system->GetFileInfo(internal_path); + if (!garrow::check(error, arrow_file_info_result, context)) { + return FALSE; + } + priv->file_system = garrow_file_system_new_raw(&arrow_file_system); + auto file_info = garrow_file_info_new_raw(*arrow_file_info_result); + priv->files = g_list_prepend(priv->files, file_info); + return TRUE; +} + +/** + * 
gadataset_file_system_dataset_factory_add_path: + * @factory: A #GADatasetFileSystemDatasetFactory. + * @path: A path to be added. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: %TRUE on success, %FALSE otherwise. + * + * Since: 5.0.0 + */ +gboolean +gadataset_file_system_dataset_factory_add_path( + GADatasetFileSystemDatasetFactory *factory, + const gchar *path, + GError **error) +{ + const gchar *context = "[file-system-dataset-factory][add-path]"; + auto priv = GADATASET_FILE_SYSTEM_DATASET_FACTORY_GET_PRIVATE(factory); + if (!priv->file_system) { + garrow::check(error, + arrow::Status::Invalid("file system isn't set"), + context); + return FALSE; + } + auto arrow_file_system = garrow_file_system_get_raw(priv->file_system); + auto arrow_file_info_result = arrow_file_system->GetFileInfo(path); + if (!garrow::check(error, arrow_file_info_result, context)) { + return FALSE; + } + auto file_info = garrow_file_info_new_raw(*arrow_file_info_result); + priv->files = g_list_prepend(priv->files, file_info); + return TRUE; +} + +/** + * gadataset_file_system_dataset_factory_finish: + * @factory: A #GADatasetFileSystemDatasetFactory. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: (transfer full) (nullable): + * A newly created #GADatasetFileSystemDataset on success, %NULL on error. 
+ * + * Since: 5.0.0 + */ +GADatasetFileSystemDataset * +gadataset_file_system_dataset_factory_finish( + GADatasetFileSystemDatasetFactory *factory, + GError **error) +{ + const gchar *context = "[file-system-dataset-factory][finish]"; + auto priv = GADATASET_FILE_SYSTEM_DATASET_FACTORY_GET_PRIVATE(factory); + if (!priv->file_system) { + garrow::check(error, + arrow::Status::Invalid("file system isn't set"), + context); + return NULL; + } + auto arrow_file_system = garrow_file_system_get_raw(priv->file_system); + auto arrow_format = gadataset_file_format_get_raw(priv->format); + std::vector arrow_files; + priv->files = g_list_reverse(priv->files); + for (auto node = priv->files; node; node = node->next) { + auto file = GARROW_FILE_INFO(node->data); + arrow_files.push_back(*garrow_file_info_get_raw(file)); + } + priv->files = g_list_reverse(priv->files); + auto arrow_factory_result = + arrow::dataset::FileSystemDatasetFactory::Make(arrow_file_system, + arrow_files, + arrow_format, + priv->options); + if (!garrow::check(error, arrow_factory_result, context)) { + return NULL; + } + auto arrow_dataset_result = (*arrow_factory_result)->Finish(); + if (!garrow::check(error, arrow_dataset_result, context)) { + return NULL; + } + auto arrow_dataset = *arrow_dataset_result; + return GADATASET_FILE_SYSTEM_DATASET( + gadataset_dataset_new_raw(&arrow_dataset, + "dataset", &arrow_dataset, + "file-system", priv->file_system, + "format", priv->format, + NULL)); +} + + +G_END_DECLS + +std::shared_ptr +gadataset_dataset_factory_get_raw(GADatasetDatasetFactory *factory) +{ + auto priv = GADATASET_DATASET_FACTORY_GET_PRIVATE(factory); + return priv->factory; +} diff --git a/c_glib/arrow-dataset-glib/dataset-factory.h b/c_glib/arrow-dataset-glib/dataset-factory.h new file mode 100644 index 00000000000..e2ee3ed9806 --- /dev/null +++ b/c_glib/arrow-dataset-glib/dataset-factory.h @@ -0,0 +1,98 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor 
license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +#include + +G_BEGIN_DECLS + +#define GADATASET_TYPE_DATASET_FACTORY (gadataset_dataset_factory_get_type()) +G_DECLARE_DERIVABLE_TYPE(GADatasetDatasetFactory, + gadataset_dataset_factory, + GADATASET, + DATASET_FACTORY, + GObject) +struct _GADatasetDatasetFactoryClass +{ + GObjectClass parent_class; +}; + +GARROW_AVAILABLE_IN_5_0 +GADatasetDataset * +gadataset_dataset_factory_finish(GADatasetDatasetFactory *factory, + GError **error); + + +#define GADATASET_TYPE_FILE_SYSTEM_DATASET_FACTORY \ + (gadataset_file_system_dataset_factory_get_type()) +G_DECLARE_DERIVABLE_TYPE(GADatasetFileSystemDatasetFactory, + gadataset_file_system_dataset_factory, + GADATASET, + FILE_SYSTEM_DATASET_FACTORY, + GADatasetDatasetFactory) +struct _GADatasetFileSystemDatasetFactoryClass +{ + GADatasetDatasetFactoryClass parent_class; +}; + +GARROW_AVAILABLE_IN_5_0 +GADatasetFileSystemDatasetFactory * +gadataset_file_system_dataset_factory_new(GADatasetFileFormat *file_format); +GARROW_AVAILABLE_IN_5_0 +gboolean +gadataset_file_system_dataset_factory_set_file_system( + GADatasetFileSystemDatasetFactory *factory, + GArrowFileSystem *file_system, + GError **error); +gboolean +gadataset_file_system_dataset_factory_set_file_system_uri( + 
GADatasetFileSystemDatasetFactory *factory, + const gchar *uri, + GError **error); + +GARROW_AVAILABLE_IN_5_0 +gboolean +gadataset_file_system_dataset_factory_add_path( + GADatasetFileSystemDatasetFactory *factory, + const gchar *path, + GError **error); +/* +GARROW_AVAILABLE_IN_5_0 +gboolean +gadataset_file_system_dataset_factory_add_file( + GADatasetFileSystemDatasetFactory *factory, + GArrowFileInfo *file, + GError **error); +GARROW_AVAILABLE_IN_5_0 +gboolean +gadataset_file_system_dataset_factory_add_selector( + GADatasetFileSystemDatasetFactory *factory, + GArrorFileSelector *selector, + GError **error); +*/ + +GARROW_AVAILABLE_IN_5_0 +GADatasetFileSystemDataset * +gadataset_file_system_dataset_factory_finish( + GADatasetFileSystemDatasetFactory *factory, + GError **error); + + +G_END_DECLS diff --git a/c_glib/arrow-dataset-glib/dataset-factory.hpp b/c_glib/arrow-dataset-glib/dataset-factory.hpp new file mode 100644 index 00000000000..114db35bc59 --- /dev/null +++ b/c_glib/arrow-dataset-glib/dataset-factory.hpp @@ -0,0 +1,27 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#pragma once + +#include + +#include + +std::shared_ptr +gadataset_dataset_factory_get_raw(GADatasetDatasetFactory *factory); diff --git a/c_glib/arrow-dataset-glib/dataset.cpp b/c_glib/arrow-dataset-glib/dataset.cpp new file mode 100644 index 00000000000..3bd62f99ef3 --- /dev/null +++ b/c_glib/arrow-dataset-glib/dataset.cpp @@ -0,0 +1,365 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include +#include + +#include +#include +#include + +G_BEGIN_DECLS + +/** + * SECTION: dataset + * @section_id: dataset + * @title: Dataset related classes + * @include: arrow-dataset-glib/arrow-dataset-glib.h + * + * #GADatasetDataset is a base class for datasets. + * + * #GADatasetFileSystemDataset is a class for file system dataset. + * + * #GADatasetFileFormat is a base class for file formats. + * + * #GADatasetCSVFileFormat is a class for CSV file format. + * + * #GADatasetIPCFileFormat is a class for IPC file format. + * + * #GADatasetParquetFileFormat is a class for Apache Parquet file format. 
+ * + * Since: 5.0.0 + */ + +typedef struct GADatasetDatasetPrivate_ { + std::shared_ptr dataset; +} GADatasetDatasetPrivate; + +enum { + PROP_DATASET = 1, +}; + +G_DEFINE_ABSTRACT_TYPE_WITH_PRIVATE(GADatasetDataset, + gadataset_dataset, + G_TYPE_OBJECT) + +#define GADATASET_DATASET_GET_PRIVATE(obj) \ + static_cast( \ + gadataset_dataset_get_instance_private( \ + GADATASET_DATASET(obj))) + +static void +gadataset_dataset_finalize(GObject *object) +{ + auto priv = GADATASET_DATASET_GET_PRIVATE(object); + priv->dataset.~shared_ptr(); + G_OBJECT_CLASS(gadataset_dataset_parent_class)->finalize(object); +} + +static void +gadataset_dataset_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) +{ + auto priv = GADATASET_DATASET_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_DATASET: + priv->dataset = + *static_cast *>( + g_value_get_pointer(value)); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +gadataset_dataset_init(GADatasetDataset *object) +{ + auto priv = GADATASET_DATASET_GET_PRIVATE(object); + new(&priv->dataset) std::shared_ptr; +} + +static void +gadataset_dataset_class_init(GADatasetDatasetClass *klass) +{ + auto gobject_class = G_OBJECT_CLASS(klass); + gobject_class->finalize = gadataset_dataset_finalize; + gobject_class->set_property = gadataset_dataset_set_property; + + GParamSpec *spec; + spec = g_param_spec_pointer("dataset", + "Dataset", + "The raw " + "std::shared *", + static_cast(G_PARAM_WRITABLE | + G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, PROP_DATASET, spec); +} + +/** + * gadataset_dataset_begin_scan: + * @dataset: A #GADatasetDataset. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: (transfer full) (nullable): + * A newly created #GADatasetScannerBuilder on success, %NULL on error. 
+ * + * Since: 5.0.0 + */ +GADatasetScannerBuilder * +gadataset_dataset_begin_scan(GADatasetDataset *dataset, + GError **error) +{ + return gadataset_scanner_builder_new(dataset, error); +} + +/** + * gadataset_dataset_to_table: + * @dataset: A #GADatasetDataset. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: (transfer full) (nullable): + * A loaded #GArrowTable on success, %NULL on error. + * + * Since: 5.0.0 + */ +GArrowTable * +gadataset_dataset_to_table(GADatasetDataset *dataset, + GError **error) +{ + auto arrow_dataset = gadataset_dataset_get_raw(dataset); + auto arrow_scanner_builder_result = arrow_dataset->NewScan(); + if (!garrow::check(error, + arrow_scanner_builder_result, + "[dataset][to-table]")) { + return NULL; + } + auto arrow_scanner_builder = *arrow_scanner_builder_result; + auto arrow_scanner_result = arrow_scanner_builder->Finish(); + if (!garrow::check(error, + arrow_scanner_result, + "[dataset][to-table]")) { + return NULL; + } + auto arrow_scanner = *arrow_scanner_result; + auto arrow_table_result = arrow_scanner->ToTable(); + if (!garrow::check(error, + arrow_scanner_result, + "[dataset][to-table]")) { + return NULL; + } + return garrow_table_new_raw(&(*arrow_table_result)); +} + +/** + * gadataset_dataset_get_type_name: + * @dataset: A #GADatasetDataset. + * + * Returns: The type name of @dataset. + * + * It should be freed with g_free() when no longer needed. 
+ * + * Since: 5.0.0 + */ +gchar * +gadataset_dataset_get_type_name(GADatasetDataset *dataset) +{ + const auto arrow_dataset = gadataset_dataset_get_raw(dataset); + const auto &type_name = arrow_dataset->type_name(); + return g_strndup(type_name.data(), type_name.size()); +} + + +typedef struct GADatasetFileSystemDatasetPrivate_ { + GADatasetFileFormat *format; + GArrowFileSystem *file_system; +} GADatasetFileSystemDatasetPrivate; + +enum { + PROP_FORMAT = 1, + PROP_FILE_SYSTEM, +}; + +G_DEFINE_TYPE_WITH_PRIVATE(GADatasetFileSystemDataset, + gadataset_file_system_dataset, + GADATASET_TYPE_DATASET) + +#define GADATASET_FILE_SYSTEM_DATASET_GET_PRIVATE(obj) \ + static_cast( \ + gadataset_file_system_dataset_get_instance_private( \ + GADATASET_FILE_SYSTEM_DATASET(obj))) + +static void +gadataset_file_system_dataset_dispose(GObject *object) +{ + auto priv = GADATASET_FILE_SYSTEM_DATASET_GET_PRIVATE(object); + + if (priv->format) { + g_object_unref(priv->format); + priv->format = NULL; + } + + if (priv->file_system) { + g_object_unref(priv->file_system); + priv->file_system = NULL; + } + + G_OBJECT_CLASS(gadataset_file_system_dataset_parent_class)->dispose(object); +} + +static void +gadataset_file_system_dataset_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) +{ + auto priv = GADATASET_FILE_SYSTEM_DATASET_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_FORMAT: + priv->format = GADATASET_FILE_FORMAT(g_value_dup_object(value)); + break; + case PROP_FILE_SYSTEM: + priv->file_system = GARROW_FILE_SYSTEM(g_value_dup_object(value)); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +gadataset_file_system_dataset_get_property(GObject *object, + guint prop_id, + GValue *value, + GParamSpec *pspec) +{ + auto priv = GADATASET_FILE_SYSTEM_DATASET_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_FORMAT: + g_value_set_object(value, priv->format); + break; + case 
PROP_FILE_SYSTEM: + g_value_set_object(value, priv->file_system); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +gadataset_file_system_dataset_init(GADatasetFileSystemDataset *object) +{ +} + +static void +gadataset_file_system_dataset_class_init(GADatasetFileSystemDatasetClass *klass) +{ + auto gobject_class = G_OBJECT_CLASS(klass); + gobject_class->dispose = gadataset_file_system_dataset_dispose; + gobject_class->set_property = gadataset_file_system_dataset_set_property; + gobject_class->get_property = gadataset_file_system_dataset_get_property; + + GParamSpec *spec; + /** + * GADatasetFileSystemDataset:format: + * + * Format of the dataset. + * + * Since: 5.0.0 + */ + spec = g_param_spec_object("format", + "Format", + "Format of the dataset", + GADATASET_TYPE_FILE_FORMAT, + static_cast(G_PARAM_READWRITE | + G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, PROP_FORMAT, spec); + + /** + * GADatasetFileSystemDataset:file-system: + * + * File system of the dataset. + * + * Since: 5.0.0 + */ + spec = g_param_spec_object("file-system", + "File system", + "File system of the dataset", + GARROW_TYPE_FILE_SYSTEM, + static_cast(G_PARAM_READWRITE | + G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, PROP_FILE_SYSTEM, spec); +} + + +G_END_DECLS + +GADatasetDataset * +gadataset_dataset_new_raw( + std::shared_ptr *arrow_dataset) +{ + return gadataset_dataset_new_raw(arrow_dataset, + "dataset", arrow_dataset, + NULL); +} + +GADatasetDataset * +gadataset_dataset_new_raw( + std::shared_ptr *arrow_dataset, + const gchar *first_property_name, + ...) 
+{ + va_list args; + va_start(args, first_property_name); + auto array = gadataset_dataset_new_raw_valist(arrow_dataset, + first_property_name, + args); + va_end(args); + return array; +} + +GADatasetDataset * +gadataset_dataset_new_raw_valist( + std::shared_ptr *arrow_dataset, + const gchar *first_property_name, + va_list args) +{ + GType type = GADATASET_TYPE_DATASET; + const auto type_name = (*arrow_dataset)->type_name(); + if (type_name == "filesystem") { + type = GADATASET_TYPE_FILE_SYSTEM_DATASET; + } + return GADATASET_DATASET(g_object_new_valist(type, + first_property_name, + args)); +} + +std::shared_ptr +gadataset_dataset_get_raw(GADatasetDataset *dataset) +{ + auto priv = GADATASET_DATASET_GET_PRIVATE(dataset); + return priv->dataset; +} diff --git a/c_glib/arrow-dataset-glib/dataset.h b/c_glib/arrow-dataset-glib/dataset.h new file mode 100644 index 00000000000..97cf35d74d7 --- /dev/null +++ b/c_glib/arrow-dataset-glib/dataset.h @@ -0,0 +1,65 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#pragma once + +#include + +G_BEGIN_DECLS + +typedef struct _GADatasetScannerBuilder GADatasetScannerBuilder; + +#define GADATASET_TYPE_DATASET (gadataset_dataset_get_type()) +G_DECLARE_DERIVABLE_TYPE(GADatasetDataset, + gadataset_dataset, + GADATASET, + DATASET, + GObject) +struct _GADatasetDatasetClass +{ + GObjectClass parent_class; +}; + +GARROW_AVAILABLE_IN_5_0 +GADatasetScannerBuilder * +gadataset_dataset_begin_scan(GADatasetDataset *dataset, + GError **error); +GARROW_AVAILABLE_IN_5_0 +GArrowTable * +gadataset_dataset_to_table(GADatasetDataset *dataset, + GError **error); +GARROW_AVAILABLE_IN_5_0 +gchar * +gadataset_dataset_get_type_name(GADatasetDataset *dataset); + + +#define GADATASET_TYPE_FILE_SYSTEM_DATASET \ + (gadataset_file_system_dataset_get_type()) +G_DECLARE_DERIVABLE_TYPE(GADatasetFileSystemDataset, + gadataset_file_system_dataset, + GADATASET, + FILE_SYSTEM_DATASET, + GADatasetDataset) +struct _GADatasetFileSystemDatasetClass +{ + GADatasetDatasetClass parent_class; +}; + + +G_END_DECLS diff --git a/c_glib/arrow-dataset-glib/dataset.hpp b/c_glib/arrow-dataset-glib/dataset.hpp new file mode 100644 index 00000000000..94dddd2eb7a --- /dev/null +++ b/c_glib/arrow-dataset-glib/dataset.hpp @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +#include + +#include + +GADatasetDataset * +gadataset_dataset_new_raw( + std::shared_ptr *arrow_dataset); +GADatasetDataset * +gadataset_dataset_new_raw( + std::shared_ptr *arrow_dataset, + const gchar *first_property_name, + ...); +GADatasetDataset * +gadataset_dataset_new_raw_valist( + std::shared_ptr *arrow_dataset, + const gchar *first_property_name, + va_list arg); +std::shared_ptr +gadataset_dataset_get_raw(GADatasetDataset *dataset); + +GADatasetFileFormat * +gadataset_file_format_new_raw( + std::shared_ptr *arrow_format); +std::shared_ptr +gadataset_dataset_get_raw(GADatasetDataset *dataset); + + diff --git a/c_glib/arrow-dataset-glib/meson.build b/c_glib/arrow-dataset-glib/meson.build index 04dc420b057..b3f617330cf 100644 --- a/c_glib/arrow-dataset-glib/meson.build +++ b/c_glib/arrow-dataset-glib/meson.build @@ -18,6 +18,8 @@ # under the License. sources = files( + 'dataset-factory.cpp', + 'dataset.cpp', 'file-format.cpp', 'fragment.cpp', 'scanner.cpp', @@ -25,6 +27,8 @@ sources = files( c_headers = files( 'arrow-dataset-glib.h', + 'dataset-factory.h', + 'dataset.h', 'file-format.h', 'fragment.h', 'scanner.h', @@ -32,6 +36,8 @@ c_headers = files( cpp_headers = files( 'arrow-dataset-glib.hpp', + 'dataset-factory.hpp', + 'dataset.hpp', 'file-format.hpp', 'fragment.hpp', 'scanner.hpp', diff --git a/c_glib/arrow-dataset-glib/scanner.cpp b/c_glib/arrow-dataset-glib/scanner.cpp index 04778c8ae99..7f8d8be5fdb 100644 --- a/c_glib/arrow-dataset-glib/scanner.cpp +++ b/c_glib/arrow-dataset-glib/scanner.cpp @@ -17,13 +17,10 @@ * under the License. 
*/ -#include - #include -#include -#include +#include -#include +#include #include G_BEGIN_DECLS @@ -31,72 +28,55 @@ G_BEGIN_DECLS /** * SECTION: scanner * @section_id: scanner - * @title: Scanner classes + * @title: Scanner related classes * @include: arrow-dataset-glib/arrow-dataset-glib.h * - * #GADatasetScanOptions is a class for a set of scan options. - * - * #GADatasetScanTask is an abstract class for a scan task. + * #GADatasetScanner is a class for scanning dataset. * - * #GADatasetInMemoryScanTask is a class for a scan task of record batches. + * #GADatasetScannerBuilder is a class for building a scanner. * - * Since: 1.0.0 + * Since: 5.0.0 */ -/* arrow::dataset::ScanOptions */ - -typedef struct GADatasetScanOptionsPrivate_ { - std::shared_ptr scan_options; -} GADatasetScanOptionsPrivate; +typedef struct GADatasetScannerPrivate_ { + std::shared_ptr scanner; +} GADatasetScannerPrivate; enum { - PROP_SCAN_OPTIONS = 1, - PROP_FILTER, - PROP_EVALUATOR, - PROP_PROJECTOR, - PROP_BATCH_SIZE, - PROP_USE_THREADS, + PROP_SCANNER = 1, }; -G_DEFINE_TYPE_WITH_PRIVATE(GADatasetScanOptions, - gadataset_scan_options, +G_DEFINE_TYPE_WITH_PRIVATE(GADatasetScanner, + gadataset_scanner, G_TYPE_OBJECT) -#define GADATASET_SCAN_OPTIONS_GET_PRIVATE(obj) \ - static_cast( \ - gadataset_scan_options_get_instance_private( \ - GADATASET_SCAN_OPTIONS(obj))) +#define GADATASET_SCANNER_GET_PRIVATE(obj) \ + static_cast( \ + gadataset_scanner_get_instance_private( \ + GADATASET_SCANNER(obj))) static void -gadataset_scan_options_finalize(GObject *object) +gadataset_scanner_finalize(GObject *object) { - auto priv = GADATASET_SCAN_OPTIONS_GET_PRIVATE(object); - - priv->scan_options.~shared_ptr(); - - G_OBJECT_CLASS(gadataset_scan_options_parent_class)->finalize(object); + auto priv = GADATASET_SCANNER_GET_PRIVATE(object); + priv->scanner.~shared_ptr(); + G_OBJECT_CLASS(gadataset_scanner_parent_class)->finalize(object); } static void -gadataset_scan_options_set_property(GObject *object, - 
guint prop_id, - const GValue *value, - GParamSpec *pspec) +gadataset_scanner_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) { - auto priv = GADATASET_SCAN_OPTIONS_GET_PRIVATE(object); + auto priv = GADATASET_SCANNER_GET_PRIVATE(object); switch (prop_id) { - case PROP_SCAN_OPTIONS: - priv->scan_options = - *static_cast *>( + case PROP_SCANNER: + priv->scanner = + *static_cast *>( g_value_get_pointer(value)); break; - case PROP_BATCH_SIZE: - priv->scan_options->batch_size = g_value_get_int64(value); - break; - case PROP_USE_THREADS: - priv->scan_options->use_threads = g_value_get_boolean(value); - break; default: G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); break; @@ -104,193 +84,92 @@ gadataset_scan_options_set_property(GObject *object, } static void -gadataset_scan_options_get_property(GObject *object, - guint prop_id, - GValue *value, - GParamSpec *pspec) +gadataset_scanner_init(GADatasetScanner *object) { - auto priv = GADATASET_SCAN_OPTIONS_GET_PRIVATE(object); - - switch (prop_id) { - case PROP_BATCH_SIZE: - g_value_set_int64(value, priv->scan_options->batch_size); - break; - case PROP_USE_THREADS: - g_value_set_boolean(value, priv->scan_options->use_threads); - break; - default: - G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); - break; - } + auto priv = GADATASET_SCANNER_GET_PRIVATE(object); + new(&priv->scanner) std::shared_ptr; } static void -gadataset_scan_options_init(GADatasetScanOptions *object) +gadataset_scanner_class_init(GADatasetScannerClass *klass) { - auto priv = GADATASET_SCAN_OPTIONS_GET_PRIVATE(object); - new(&priv->scan_options) std::shared_ptr; -} + auto gobject_class = G_OBJECT_CLASS(klass); + gobject_class->finalize = gadataset_scanner_finalize; + gobject_class->set_property = gadataset_scanner_set_property; -static void -gadataset_scan_options_class_init(GADatasetScanOptionsClass *klass) -{ - GObjectClass *gobject_class; GParamSpec *spec; - - gobject_class = 
G_OBJECT_CLASS(klass); - - gobject_class->finalize = gadataset_scan_options_finalize; - gobject_class->set_property = gadataset_scan_options_set_property; - gobject_class->get_property = gadataset_scan_options_get_property; - - auto scan_options = std::make_shared(); - - spec = g_param_spec_pointer("scan-options", - "ScanOptions", - "The raw std::shared *", + spec = g_param_spec_pointer("scanner", + "Scanner", + "The raw std::shared *", static_cast(G_PARAM_WRITABLE | G_PARAM_CONSTRUCT_ONLY)); - g_object_class_install_property(gobject_class, PROP_SCAN_OPTIONS, spec); - - // TODO: PROP_FILTER - // TODO: PROP_EVALUATOR - // TODO: PROP_PROJECTOR - - /** - * GADatasetScanOptions:batch-size: - * - * Maximum row count for scanned batches. - * - * Since: 1.0.0 - */ - spec = g_param_spec_int64("batch-size", - "Batch size", - "Maximum row count for scanned batches", - 0, - G_MAXINT64, - scan_options->batch_size, - static_cast(G_PARAM_READWRITE)); - g_object_class_install_property(gobject_class, PROP_BATCH_SIZE, spec); - - /** - * GADatasetScanOptions:use-threads: - * - * Indicate if the Scanner should make use of a ThreadPool. - * - * Since: 4.0.0 - */ - spec = g_param_spec_boolean("use-threads", - "Use threads", - "Indicate if the Scanner should make use of a ThreadPool", - scan_options->use_threads, - static_cast(G_PARAM_READWRITE)); - g_object_class_install_property(gobject_class, PROP_USE_THREADS, spec); + g_object_class_install_property(gobject_class, PROP_SCANNER, spec); } /** - * gadataset_scan_options_new: - * @schema: A #GArrowSchema. - * - * Returns: A newly created #GADatasetScanOptions. 
- * - * Since: 1.0.0 - */ -GADatasetScanOptions * -gadataset_scan_options_new(GArrowSchema *schema) -{ - auto arrow_schema = garrow_schema_get_raw(schema); - auto arrow_scan_options = std::make_shared(); - arrow_scan_options->dataset_schema = arrow_schema; - return gadataset_scan_options_new_raw(&arrow_scan_options); -} - -/** - * gadataset_scan_options_get_schema: - * @scan_options: A #GADatasetScanOptions. + * gadataset_scanner_to_table: + * @scanner: A #GADatasetScanner. + * @error: (nullable): Return location for a #GError or %NULL. * - * Returns: (transfer full): A #GArrowSchema. + * Returns: (transfer full) (nullable): + * A newly created #GArrowTable on success, %NULL on error. * - * Since: 1.0.0 + * Since: 5.0.0 */ -GArrowSchema * -gadataset_scan_options_get_schema(GADatasetScanOptions *scan_options) +GArrowTable * +gadataset_scanner_to_table(GADatasetScanner *scanner, + GError **error) { - auto priv = GADATASET_SCAN_OPTIONS_GET_PRIVATE(scan_options); - auto arrow_schema = priv->scan_options->dataset_schema; - return garrow_schema_new_raw(&arrow_schema); + auto arrow_scanner = gadataset_scanner_get_raw(scanner); + auto arrow_table_result = arrow_scanner->ToTable(); + if (garrow::check(error, arrow_table_result, "[scanner][to-table]")) { + auto arrow_table = *arrow_table_result; + return garrow_table_new_raw(&arrow_table); + } else { + return NULL; + } } -/* arrow::dataset::ScanTask */ -typedef struct GADatasetScanTaskPrivate_ { - std::shared_ptr scan_task; - GADatasetScanOptions *options; - GADatasetFragment *fragment; -} GADatasetScanTaskPrivate; +typedef struct GADatasetScannerBuilderPrivate_ { + std::shared_ptr scanner_builder; +} GADatasetScannerBuilderPrivate; enum { - PROP_SCAN_TASK = 1, - PROP_OPTIONS, - PROP_FRAGMENT, + PROP_SCANNER_BUILDER = 1, }; -G_DEFINE_ABSTRACT_TYPE_WITH_PRIVATE(GADatasetScanTask, - gadataset_scan_task, - G_TYPE_OBJECT) - -#define GADATASET_SCAN_TASK_GET_PRIVATE(obj) \ - static_cast( \ - 
gadataset_scan_task_get_instance_private( \ - GADATASET_SCAN_TASK(obj))) - -static void -gadataset_scan_task_dispose(GObject *object) -{ - auto priv = GADATASET_SCAN_TASK_GET_PRIVATE(object); - - if (priv->options) { - g_object_unref(priv->options); - priv->options = NULL; - } +G_DEFINE_TYPE_WITH_PRIVATE(GADatasetScannerBuilder, + gadataset_scanner_builder, + G_TYPE_OBJECT) - if (priv->fragment) { - g_object_unref(priv->fragment); - priv->fragment = NULL; - } - - G_OBJECT_CLASS(gadataset_scan_task_parent_class)->dispose(object); -} +#define GADATASET_SCANNER_BUILDER_GET_PRIVATE(obj) \ + static_cast( \ + gadataset_scanner_builder_get_instance_private( \ + GADATASET_SCANNER_BUILDER(obj))) static void -gadataset_scan_task_finalize(GObject *object) +gadataset_scanner_builder_finalize(GObject *object) { - auto priv = GADATASET_SCAN_TASK_GET_PRIVATE(object); - - priv->scan_task.~shared_ptr(); - - G_OBJECT_CLASS(gadataset_scan_task_parent_class)->finalize(object); + auto priv = GADATASET_SCANNER_BUILDER_GET_PRIVATE(object); + priv->scanner_builder.~shared_ptr(); + G_OBJECT_CLASS(gadataset_scanner_builder_parent_class)->finalize(object); } static void -gadataset_scan_task_set_property(GObject *object, - guint prop_id, - const GValue *value, - GParamSpec *pspec) +gadataset_scanner_builder_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) { - auto priv = GADATASET_SCAN_TASK_GET_PRIVATE(object); + auto priv = GADATASET_SCANNER_BUILDER_GET_PRIVATE(object); switch (prop_id) { - case PROP_SCAN_TASK: - priv->scan_task = - *static_cast *>( + case PROP_SCANNER_BUILDER: + priv->scanner_builder = + *static_cast *>( g_value_get_pointer(value)); break; - case PROP_OPTIONS: - priv->options = GADATASET_SCAN_OPTIONS(g_value_dup_object(value)); - break; - case PROP_FRAGMENT: - priv->fragment = GADATASET_FRAGMENT(g_value_dup_object(value)); - break; default: G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); break; @@ -298,230 +177,112 @@ 
gadataset_scan_task_set_property(GObject *object, } static void -gadataset_scan_task_get_property(GObject *object, - guint prop_id, - GValue *value, - GParamSpec *pspec) +gadataset_scanner_builder_init(GADatasetScannerBuilder *object) { - auto priv = GADATASET_SCAN_TASK_GET_PRIVATE(object); - - switch (prop_id) { - case PROP_OPTIONS: - g_value_set_object(value, priv->options); - break; - case PROP_FRAGMENT: - g_value_set_object(value, priv->fragment); - break; - default: - G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); - break; - } + auto priv = GADATASET_SCANNER_BUILDER_GET_PRIVATE(object); + new(&priv->scanner_builder) std::shared_ptr; } static void -gadataset_scan_task_init(GADatasetScanTask *object) -{ - auto priv = GADATASET_SCAN_TASK_GET_PRIVATE(object); - new(&priv->scan_task) std::shared_ptr; -} - -static void -gadataset_scan_task_class_init(GADatasetScanTaskClass *klass) +gadataset_scanner_builder_class_init(GADatasetScannerBuilderClass *klass) { auto gobject_class = G_OBJECT_CLASS(klass); - - gobject_class->dispose = gadataset_scan_task_dispose; - gobject_class->finalize = gadataset_scan_task_finalize; - gobject_class->set_property = gadataset_scan_task_set_property; - gobject_class->get_property = gadataset_scan_task_get_property; + gobject_class->finalize = gadataset_scanner_builder_finalize; + gobject_class->set_property = gadataset_scanner_builder_set_property; GParamSpec *spec; - spec = g_param_spec_pointer("scan-task", - "ScanTask", - "The raw std::shared *", + spec = g_param_spec_pointer("scanner-builder", + "Scanner builder", + "The raw " + "std::shared *", static_cast(G_PARAM_WRITABLE | G_PARAM_CONSTRUCT_ONLY)); - g_object_class_install_property(gobject_class, PROP_SCAN_TASK, spec); - - /** - * GADatasetScanTask:options: - * - * The options of the scan task. 
- * - * Since: 1.0.0 - */ - spec = g_param_spec_object("options", - "Options", - "The options of the scan task", - GADATASET_TYPE_SCAN_OPTIONS, - static_cast(G_PARAM_READWRITE | - G_PARAM_CONSTRUCT_ONLY)); - g_object_class_install_property(gobject_class, PROP_OPTIONS, spec); - - /** - * GADatasetScanTask:fragment: - * - * The fragment of the scan task. - * - * Since: 4.0.0 - */ - spec = g_param_spec_object("fragment", - "Fragment", - "The fragment of the scan task", - GADATASET_TYPE_FRAGMENT, - static_cast(G_PARAM_READWRITE | - G_PARAM_CONSTRUCT_ONLY)); - g_object_class_install_property(gobject_class, PROP_FRAGMENT, spec); + g_object_class_install_property(gobject_class, PROP_SCANNER_BUILDER, spec); } /** - * gadataset_scan_task_get_options: - * @scan_task: A #GADatasetScanTask. - * - * Returns: (transfer full): A #GADatasetScanOptions. - * - * Since: 1.0.0 - */ -GADatasetScanOptions * -gadataset_scan_task_get_options(GADatasetScanTask *scan_task) -{ - auto priv = GADATASET_SCAN_TASK_GET_PRIVATE(scan_task); - if (priv->options) { - g_object_ref(priv->options); - return priv->options; - } - - auto arrow_options = priv->scan_task->options(); - return gadataset_scan_options_new_raw(&arrow_options); -} - -/** - * gadataset_scan_task_get_fragment: - * @scan_task: A #GADatasetFragment. + * gadataset_scanner_builder_new: + * @dataset: A #GADatasetDatast to be scanned. + * @error: (nullable): Return location for a #GError or %NULL. * - * Returns: (transfer full): A #GADatasetFragment. + * Returns: (nullable): A newly created #GADatasetScannerBuilder on success, + * %NULL on error. 
* - * Since: 4.0.0 + * Since: 5.0.0 */ -GADatasetFragment * -gadataset_scan_task_get_fragment(GADatasetScanTask *scan_task) +GADatasetScannerBuilder * +gadataset_scanner_builder_new(GADatasetDataset *dataset, GError **error) { - auto priv = GADATASET_SCAN_TASK_GET_PRIVATE(scan_task); - if (priv->fragment) { - g_object_ref(priv->fragment); - return priv->fragment; + auto arrow_dataset = gadataset_dataset_get_raw(dataset); + auto arrow_scanner_builder_result = arrow_dataset->NewScan(); + if (garrow::check(error, + arrow_scanner_builder_result, + "[scanner-builder][new]")) { + auto arrow_scanner_builder = *arrow_scanner_builder_result; + return gadataset_scanner_builder_new_raw(&arrow_scanner_builder); + } else { + return NULL; } - - auto arrow_fragment = priv->scan_task->fragment(); - return gadataset_fragment_new_raw(&arrow_fragment); } /** - * gadataset_scan_task_execute: - * @scan_task: A #GADatasetScanTask. + * gadataset_scanner_builder_finish: + * @builder: A #GADatasetScannerBuilder. * @error: (nullable): Return location for a #GError or %NULL. * - * Returns: (nullable) (transfer full): A newly created #GArrowRecordBatchIterator, - * or %NULL on error. + * Returns: (transfer full) (nullable): + * A newly created #GADatasetScanner on success, %NULL on error. 
* - * Since: 1.0.0 + * Since: 5.0.0 */ -GArrowRecordBatchIterator * -gadataset_scan_task_execute(GADatasetScanTask *scan_task, - GError **error) +GADatasetScanner * +gadataset_scanner_builder_finish(GADatasetScannerBuilder *builder, + GError **error) { - auto priv = GADATASET_SCAN_TASK_GET_PRIVATE(scan_task); - auto arrow_result = priv->scan_task->Execute(); - if (garrow::check(error, arrow_result, "[datasets][scan-task][execute]")) { - auto arrow_record_batch_iteraor = std::move(*arrow_result); - return garrow_record_batch_iterator_new_raw(&arrow_record_batch_iteraor); + auto arrow_builder = gadataset_scanner_builder_get_raw(builder); + auto arrow_scanner_result = arrow_builder->Finish(); + if (garrow::check(error, arrow_scanner_result, "[scanner-builder][finish]")) { + auto arrow_scanner = *arrow_scanner_result; + return gadataset_scanner_new_raw(&arrow_scanner); } else { return NULL; } } -/* arrow::dataset::InMemoryScanTask */ - -G_DEFINE_TYPE(GADatasetInMemoryScanTask, - gadataset_in_memory_scan_task, - GADATASET_TYPE_SCAN_TASK) - -static void -gadataset_in_memory_scan_task_init(GADatasetInMemoryScanTask *object) -{ -} -static void -gadataset_in_memory_scan_task_class_init(GADatasetInMemoryScanTaskClass *klass) -{ -} +G_END_DECLS -/** - * gadataset_in_memory_scan_task_new: - * @record_batches: (array length=n_record_batches): - * (element-type GArrowRecordBatch): The record batches of the table. - * @n_record_batches: The number of record batches. - * @options: A #GADatasetScanOptions. - * @fragment: A #GADatasetInMemoryFragment. - * - * Returns: A newly created #GADatasetInMemoryScanTask. 
- * - * Since: 1.0.0 - */ -GADatasetInMemoryScanTask * -gadataset_in_memory_scan_task_new(GArrowRecordBatch **record_batches, - gsize n_record_batches, - GADatasetScanOptions *options, - GADatasetInMemoryFragment *fragment) +GADatasetScanner * +gadataset_scanner_new_raw( + std::shared_ptr *arrow_scanner) { - std::vector> arrow_record_batches; - arrow_record_batches.reserve(n_record_batches); - for (gsize i = 0; i < n_record_batches; ++i) { - auto arrow_record_batch = garrow_record_batch_get_raw(record_batches[i]); - arrow_record_batches.push_back(arrow_record_batch); - } - auto arrow_options = gadataset_scan_options_get_raw(options); - auto arrow_fragment = gadataset_fragment_get_raw(GADATASET_FRAGMENT(fragment)); - auto arrow_in_memory_scan_task = - std::make_shared(arrow_record_batches, - arrow_options, - arrow_fragment); - return gadataset_in_memory_scan_task_new_raw(&arrow_in_memory_scan_task, - options, - fragment); + auto scanner = + GADATASET_SCANNER(g_object_new(GADATASET_TYPE_SCANNER, + "scanner", arrow_scanner, + NULL)); + return scanner; } -G_END_DECLS - -GADatasetScanOptions * -gadataset_scan_options_new_raw( - std::shared_ptr *arrow_scan_options) +std::shared_ptr +gadataset_scanner_get_raw(GADatasetScanner *scanner) { - auto scan_options = - GADATASET_SCAN_OPTIONS(g_object_new(GADATASET_TYPE_SCAN_OPTIONS, - "scan-options", arrow_scan_options, - NULL)); - return scan_options; + auto priv = GADATASET_SCANNER_GET_PRIVATE(scanner); + return priv->scanner; } -std::shared_ptr -gadataset_scan_options_get_raw(GADatasetScanOptions *scan_options) +GADatasetScannerBuilder * +gadataset_scanner_builder_new_raw( + std::shared_ptr *arrow_scanner_builder) { - auto priv = GADATASET_SCAN_OPTIONS_GET_PRIVATE(scan_options); - return priv->scan_options; + return GADATASET_SCANNER_BUILDER( + g_object_new(GADATASET_TYPE_SCANNER_BUILDER, + "scanner-builder", arrow_scanner_builder, + NULL)); } -GADatasetInMemoryScanTask * -gadataset_in_memory_scan_task_new_raw( - 
std::shared_ptr *arrow_in_memory_scan_task, - GADatasetScanOptions *options, - GADatasetInMemoryFragment *fragment) +std::shared_ptr +gadataset_scanner_builder_get_raw(GADatasetScannerBuilder *scanner_builder) { - auto in_memory_scan_task = - GADATASET_IN_MEMORY_SCAN_TASK(g_object_new(GADATASET_TYPE_IN_MEMORY_SCAN_TASK, - "scan-task", arrow_in_memory_scan_task, - "options", options, - "fragment", fragment, - NULL)); - return in_memory_scan_task; + auto priv = GADATASET_SCANNER_BUILDER_GET_PRIVATE(scanner_builder); + return priv->scanner_builder; } diff --git a/c_glib/arrow-dataset-glib/scanner.h b/c_glib/arrow-dataset-glib/scanner.h index 90a60363e82..446815d6db1 100644 --- a/c_glib/arrow-dataset-glib/scanner.h +++ b/c_glib/arrow-dataset-glib/scanner.h @@ -19,76 +19,45 @@ #pragma once -#include - +#include #include G_BEGIN_DECLS -/* arrow::dataset::ScanOptions */ - -#define GADATASET_TYPE_SCAN_OPTIONS (gadataset_scan_options_get_type()) -G_DECLARE_DERIVABLE_TYPE(GADatasetScanOptions, - gadataset_scan_options, +#define GADATASET_TYPE_SCANNER (gadataset_scanner_get_type()) +G_DECLARE_DERIVABLE_TYPE(GADatasetScanner, + gadataset_scanner, GADATASET, - SCAN_OPTIONS, + SCANNER, GObject) -struct _GADatasetScanOptionsClass +struct _GADatasetScannerClass { GObjectClass parent_class; }; +GARROW_AVAILABLE_IN_5_0 +GArrowTable * +gadataset_scanner_to_table(GADatasetScanner *scanner, + GError **error); -GARROW_AVAILABLE_IN_1_0 -GADatasetScanOptions * -gadataset_scan_options_new(GArrowSchema *schema); -GARROW_AVAILABLE_IN_1_0 -GArrowSchema * -gadataset_scan_options_get_schema(GADatasetScanOptions *scan_options); - -/* arrow::dataset::ScanTask */ - -#define GADATASET_TYPE_SCAN_TASK (gadataset_scan_task_get_type()) -G_DECLARE_DERIVABLE_TYPE(GADatasetScanTask, - gadataset_scan_task, +#define GADATASET_TYPE_SCANNER_BUILDER (gadataset_scanner_builder_get_type()) +G_DECLARE_DERIVABLE_TYPE(GADatasetScannerBuilder, + gadataset_scanner_builder, GADATASET, - SCAN_TASK, + SCANNER_BUILDER, 
GObject) -struct _GADatasetScanTaskClass +struct _GADatasetScannerBuilderClass { GObjectClass parent_class; }; -GARROW_AVAILABLE_IN_1_0 -GADatasetScanOptions * -gadataset_scan_task_get_options(GADatasetScanTask *scan_task); -GARROW_AVAILABLE_IN_4_0 -GADatasetFragment * -gadataset_scan_task_get_fragment(GADatasetScanTask *scan_task); -GARROW_AVAILABLE_IN_1_0 -GArrowRecordBatchIterator * -gadataset_scan_task_execute(GADatasetScanTask *scan_task, - GError **error); - -/* arrow::dataset::InMemoryScanTask */ - -#define GADATASET_TYPE_IN_MEMORY_SCAN_TASK \ - (gadataset_in_memory_scan_task_get_type()) -G_DECLARE_DERIVABLE_TYPE(GADatasetInMemoryScanTask, - gadataset_in_memory_scan_task, - GADATASET, - IN_MEMORY_SCAN_TASK, - GADatasetScanTask) -struct _GADatasetInMemoryScanTaskClass -{ - GADatasetScanTaskClass parent_class; -}; - -GARROW_AVAILABLE_IN_1_0 -GADatasetInMemoryScanTask * -gadataset_in_memory_scan_task_new(GArrowRecordBatch **record_batches, - gsize n_record_batches, - GADatasetScanOptions *options, - GADatasetInMemoryFragment *fragment); +GARROW_AVAILABLE_IN_5_0 +GADatasetScannerBuilder * +gadataset_scanner_builder_new(GADatasetDataset *dataset, + GError **error); +GARROW_AVAILABLE_IN_5_0 +GADatasetScanner * +gadataset_scanner_builder_finish(GADatasetScannerBuilder *builder, + GError **error); G_END_DECLS diff --git a/c_glib/arrow-dataset-glib/scanner.hpp b/c_glib/arrow-dataset-glib/scanner.hpp index ad3ac6a03cd..663ab6fc44b 100644 --- a/c_glib/arrow-dataset-glib/scanner.hpp +++ b/c_glib/arrow-dataset-glib/scanner.hpp @@ -24,14 +24,14 @@ #include #include -GADatasetScanOptions * -gadataset_scan_options_new_raw( - std::shared_ptr *arrow_scan_options); -std::shared_ptr -gadataset_scan_options_get_raw(GADatasetScanOptions *scan_options); +GADatasetScanner * +gadataset_scanner_new_raw( + std::shared_ptr *arrow_scanner); +std::shared_ptr +gadataset_scanner_get_raw(GADatasetScanner *scanner); -GADatasetInMemoryScanTask * -gadataset_in_memory_scan_task_new_raw( - 
std::shared_ptr *arrow_in_memory_scan_task, - GADatasetScanOptions *scan_options, - GADatasetInMemoryFragment *fragment); +GADatasetScannerBuilder * +gadataset_scanner_builder_new_raw( + std::shared_ptr *arrow_scanner_builder); +std::shared_ptr +gadataset_scanner_builder_get_raw(GADatasetScannerBuilder *scanner_builder); diff --git a/c_glib/arrow-glib/basic-array.cpp b/c_glib/arrow-glib/basic-array.cpp index d5b221a36b0..1eb65b88964 100644 --- a/c_glib/arrow-glib/basic-array.cpp +++ b/c_glib/arrow-glib/basic-array.cpp @@ -221,9 +221,9 @@ garrow_equal_options_set_property(GObject *object, static void garrow_equal_options_get_property(GObject *object, - guint prop_id, - GValue *value, - GParamSpec *pspec) + guint prop_id, + GValue *value, + GParamSpec *pspec) { auto priv = GARROW_EQUAL_OPTIONS_GET_PRIVATE(object); diff --git a/c_glib/doc/arrow-dataset-glib/arrow-dataset-glib-docs.xml b/c_glib/doc/arrow-dataset-glib/arrow-dataset-glib-docs.xml index 9a1ae059378..3e8da5bd9d1 100644 --- a/c_glib/doc/arrow-dataset-glib/arrow-dataset-glib-docs.xml +++ b/c_glib/doc/arrow-dataset-glib/arrow-dataset-glib-docs.xml @@ -36,9 +36,15 @@ - - Read - + + Data + + Dataset + + Dataset factory + + + Scan Fragment @@ -60,6 +66,10 @@ Index of deprecated API + + Index of new symbols in 4.0.0 + + Index of new symbols in 4.0.0 @@ -68,9 +78,5 @@ Index of new symbols in 3.0.0 - - Index of new symbols in 1.0.0 - - diff --git a/c_glib/test/dataset/test-file-system-dataset-factory.rb b/c_glib/test/dataset/test-file-system-dataset-factory.rb new file mode 100644 index 00000000000..9ef629c222e --- /dev/null +++ b/c_glib/test/dataset/test-file-system-dataset-factory.rb @@ -0,0 +1,55 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +class TestDatasetFileSystemDatasetFactory < Test::Unit::TestCase + include Helper::Buildable + include Helper::Writable + + def setup + omit("Arrow Dataset is required") unless defined?(ArrowDataset) + Dir.mktmpdir do |tmpdir| + @dir = tmpdir + @path = File.join(@dir, "table.arrow") + @table = build_table(visible: [ + build_boolean_array([true, false, true]), + build_boolean_array([false, true, false, true]), + ], + point: [ + build_int32_array([1, 2, 3]), + build_int32_array([-1, -2, -3, -4]), + ]) + @format = ArrowDataset::IPCFileFormat.new + write_table(@table, @path) + yield + end + end + + def test_file_system + factory = ArrowDataset::FileSystemDatasetFactory.new(@format) + factory.file_system = Arrow::LocalFileSystem.new + factory.add_path(File.expand_path(@path)) + dataset = factory.finish + assert_equal(@table, dataset.to_table) + end + + def test_file_system_uri + factory = ArrowDataset::FileSystemDatasetFactory.new(@format) + factory.file_system_uri = build_file_uri(@path) + dataset = factory.finish + assert_equal(@table, dataset.to_table) + end +end diff --git a/ruby/red-arrow-dataset/test/test-in-memory-scan-task.rb b/c_glib/test/dataset/test-file-system-dataset.rb similarity index 64% rename from ruby/red-arrow-dataset/test/test-in-memory-scan-task.rb rename to c_glib/test/dataset/test-file-system-dataset.rb index 37f041d3159..6d6ec3b18c6 100644 --- 
a/ruby/red-arrow-dataset/test/test-in-memory-scan-task.rb +++ b/c_glib/test/dataset/test-file-system-dataset.rb @@ -15,19 +15,20 @@ # specific language governing permissions and limitations # under the License. -class TestInMemoryScanTask < Test::Unit::TestCase +class TestDatasetFileSystemDataset < Test::Unit::TestCase def setup - @record_batches = [ - Arrow::RecordBatch.new(visible: [true, false, true], - point: [1, 2, 3]), - ] + omit("Arrow Dataset is required") unless defined?(ArrowDataset) + Dir.mktmpdir do |tmpdir| + @dir = tmpdir + format = ArrowDataset::IPCFileFormat.new + factory = ArrowDataset::FileSystemDatasetFactory.new(format) + factory.file_system = Arrow::LocalFileSystem.new + @dataset = factory.finish + yield + end end - sub_test_case(".new") do - test("[[Arrow::RecordBatch]]") do - scan_task = ArrowDataset::InMemoryScanTask.new(@record_batches) - assert_equal(@record_batches, - scan_task.execute.to_a) - end + def test_type_name + assert_equal("filesystem", @dataset.type_name) end end diff --git a/c_glib/test/dataset/test-in-memory-scan-task.rb b/c_glib/test/dataset/test-in-memory-scan-task.rb deleted file mode 100644 index 06e3d0d2424..00000000000 --- a/c_glib/test/dataset/test-in-memory-scan-task.rb +++ /dev/null @@ -1,59 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. 
See the License for the -# specific language governing permissions and limitations -# under the License. - -class TestDatasetInMemoryScanTask < Test::Unit::TestCase - include Helper::Buildable - - def setup - omit("Arrow Dataset is required") unless defined?(ArrowDataset) - fields = [ - Arrow::Field.new("visible", Arrow::BooleanDataType.new), - Arrow::Field.new("point", Arrow::Int32DataType.new), - ] - @schema = Arrow::Schema.new(fields) - @record_batches = [ - [ - build_boolean_array([true, false, true]), - build_int32_array([1, 2, 3]), - ], - [ - build_boolean_array([false, true, false, true]), - build_int32_array([-1, -2, -3, -4]), - ] - ].collect do |columns| - Arrow::RecordBatch.new(@schema, columns[0].length, columns) - end - - @scan_options = ArrowDataset::ScanOptions.new(@schema) - - @fragment = ArrowDataset::InMemoryFragment.new(@schema, - @record_batches) - - @scan_task = ArrowDataset::InMemoryScanTask.new(@record_batches, - @scan_options, - @fragment) - end - - def test_scan_options - assert_equal(@scan_options, @scan_task.options) - end - - def test_execute - assert_equal(@record_batches, - @scan_task.execute.to_list) - end -end diff --git a/c_glib/test/dataset/test-scan-options.rb b/c_glib/test/dataset/test-scan-options.rb deleted file mode 100644 index 0536b2a7cca..00000000000 --- a/c_glib/test/dataset/test-scan-options.rb +++ /dev/null @@ -1,47 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -class TestDatasetScanOptions < Test::Unit::TestCase - def setup - omit("Arrow Dataset is required") unless defined?(ArrowDataset) - @schema = Arrow::Schema.new([]) - @scan_options = ArrowDataset::ScanOptions.new(@schema) - end - - def test_schema - assert_equal(@schema, - @scan_options.schema) - end - - def test_batch_size - assert_equal(1<<20, - @scan_options.batch_size) - @scan_options.batch_size = 42 - assert_equal(42, - @scan_options.batch_size) - end - - def test_use_threads - assert do - not @scan_options.use_threads? - end - @scan_options.use_threads = true - assert do - @scan_options.use_threads? - end - end -end diff --git a/c_glib/test/dataset/test-scanner.rb b/c_glib/test/dataset/test-scanner.rb new file mode 100644 index 00000000000..f7702d4905f --- /dev/null +++ b/c_glib/test/dataset/test-scanner.rb @@ -0,0 +1,48 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. + +class TestDatasetScanner < Test::Unit::TestCase + include Helper::Buildable + include Helper::Writable + + def setup + omit("Arrow Dataset is required") unless defined?(ArrowDataset) + Dir.mktmpdir do |tmpdir| + path = File.join(tmpdir, "table.arrow") + @table = build_table(visible: [ + build_boolean_array([true, false, true]), + build_boolean_array([false, true, false, true]), + ], + point: [ + build_int32_array([1, 2, 3]), + build_int32_array([-1, -2, -3, -4]), + ]) + @format = ArrowDataset::IPCFileFormat.new + write_table(@table, path) + factory = ArrowDataset::FileSystemDatasetFactory.new(@format) + factory.file_system_uri = build_file_uri(path) + @dataset = factory.finish + builder = @dataset.begin_scan + @scanner = builder.finish + yield + end + end + + def test_to_table + assert_equal(@table, @scanner.to_table) + end +end diff --git a/c_glib/test/helper/buildable.rb b/c_glib/test/helper/buildable.rb index 04ae22f8715..356fa651c6a 100644 --- a/c_glib/test/helper/buildable.rb +++ b/c_glib/test/helper/buildable.rb @@ -205,7 +205,15 @@ def append_to_builder(builder, value) def build_table(columns) fields = [] chunked_arrays = [] - columns.each do |name, chunked_array| + columns.each do |name, data| + case data + when Arrow::Array + chunked_array = Arrow::ChunkedArray.new([data]) + when Array + chunked_array = Arrow::ChunkedArray.new(data) + else + chunked_array = data + end fields << Arrow::Field.new(name, chunked_array.value_data_type) chunked_arrays << chunked_array end @@ -222,6 +230,15 @@ def build_record_batch(columns) Arrow::RecordBatch.new(schema, n_rows, columns.values) end + def build_file_uri(path) + absolute_path = File.expand_path(path) + if absolute_path.start_with?("/") + "file://#{absolute_path}" + else + "file:///#{absolute_path}" + end + end + private def build_array(builder, values) values.each do |value| diff --git 
a/ruby/red-arrow-dataset/lib/arrow-dataset/in-memory-fragment.rb b/c_glib/test/helper/writable.rb similarity index 63% rename from ruby/red-arrow-dataset/lib/arrow-dataset/in-memory-fragment.rb rename to c_glib/test/helper/writable.rb index 917d6c79d0d..0053e972f91 100644 --- a/ruby/red-arrow-dataset/lib/arrow-dataset/in-memory-fragment.rb +++ b/c_glib/test/helper/writable.rb @@ -15,18 +15,25 @@ # specific language governing permissions and limitations # under the License. -module ArrowDataset - class InMemoryFragment - alias_method :initialize_raw, :initialize - private :initialize_raw - def initialize(schema, record_batches) - record_batches = record_batches.collect do |record_batch| - unless record_batch.is_a?(Arrow::RecordBatch) - record_batch = Arrow::RecordBatch.new(record_batch) +module Helper + module Writable + def write_table(table, path, type: :file) + output = Arrow::FileOutputStream.new(path, false) + begin + if type == :file + writer_class = Arrow::RecordBatchFileWriter + else + writer_class = Arrow::RecordBatchStreamWriter end - record_batch + writer = writer_class.new(output, table.schema) + begin + writer.write_table(table) + ensure + writer.close + end + ensure + output.close end - initialize_raw(schema, record_batches) end end end diff --git a/c_glib/test/run-test.rb b/c_glib/test/run-test.rb index 044cb33a019..9c6af05224e 100755 --- a/c_glib/test/run-test.rb +++ b/c_glib/test/run-test.rb @@ -83,10 +83,11 @@ class BooleanScalar require_relative "helper/buildable" require_relative "helper/data-type" require_relative "helper/fixture" -require_relative "helper/omittable" -require_relative "helper/plasma-store" if defined?(ArrowFlight) require_relative "helper/flight-server" end +require_relative "helper/omittable" +require_relative "helper/plasma-store" +require_relative "helper/writable" exit(Test::Unit::AutoRunner.run(true, test_dir.to_s)) diff --git a/cpp/src/arrow/dataset/discovery.h b/cpp/src/arrow/dataset/discovery.h index 
5559638448f..40c02051955 100644 --- a/cpp/src/arrow/dataset/discovery.h +++ b/cpp/src/arrow/dataset/discovery.h @@ -237,16 +237,23 @@ class ARROW_DS_EXPORT FileSystemDatasetFactory : public DatasetFactory { std::shared_ptr format, FileSystemFactoryOptions options); + /// \brief Build a FileSystemDatasetFactory from an explicit list of + /// file information. + /// + /// \param[in] filesystem passed to FileSystemDataset + /// \param[in] files passed to FileSystemDataset + /// \param[in] format passed to FileSystemDataset + /// \param[in] options see FileSystemFactoryOptions for more information. + static Result> Make( + std::shared_ptr filesystem, const std::vector& files, + std::shared_ptr format, FileSystemFactoryOptions options); + Result>> InspectSchemas( InspectOptions options) override; Result> Finish(FinishOptions options) override; protected: - static Result> Make( - std::shared_ptr filesystem, const std::vector& files, - std::shared_ptr format, FileSystemFactoryOptions options); - FileSystemDatasetFactory(std::vector files, std::shared_ptr filesystem, std::shared_ptr format, diff --git a/ruby/red-arrow-dataset/lib/arrow-dataset/scan-options.rb b/ruby/red-arrow-dataset/lib/arrow-dataset/dataset.rb similarity index 69% rename from ruby/red-arrow-dataset/lib/arrow-dataset/scan-options.rb rename to ruby/red-arrow-dataset/lib/arrow-dataset/dataset.rb index 1467743655b..a658fc3f2e0 100644 --- a/ruby/red-arrow-dataset/lib/arrow-dataset/scan-options.rb +++ b/ruby/red-arrow-dataset/lib/arrow-dataset/dataset.rb @@ -16,21 +16,13 @@ # under the License. 
module ArrowDataset - class ScanOptions + class Dataset class << self - def try_convert(value) - case value - when Hash - return nil unless value.key?(:schema) - options = new(value[:schema]) - value.each do |name, value| - next if name == :schema - options.__send__("#{name}=", value) - end - options - else - nil - end + def build(*args) + factory_class = ArrowDataset.const_get("#{name}Factory") + factory = factory_class.new(*args) + yield(factory) + factory.finish end end end diff --git a/ruby/red-arrow-dataset/lib/arrow-dataset/in-memory-scan-task.rb b/ruby/red-arrow-dataset/lib/arrow-dataset/in-memory-scan-task.rb deleted file mode 100644 index 5e127e179c6..00000000000 --- a/ruby/red-arrow-dataset/lib/arrow-dataset/in-memory-scan-task.rb +++ /dev/null @@ -1,35 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -module ArrowDataset - class InMemoryScanTask - alias_method :initialize_raw, :initialize - private :initialize_raw - def initialize(record_batches, **options) - record_batches = record_batches.collect do |record_batch| - unless record_batch.is_a?(Arrow::RecordBatch) - record_batch = Arrow::RecordBatch.new(record_batch) - end - record_batch - end - options[:schema] ||= record_batches.first.schema - fragment = options.delete(:fragment) - fragment ||= InMemoryFragment.new(options[:schema], record_batches) - initialize_raw(record_batches, options, fragment) - end - end -end diff --git a/ruby/red-arrow-dataset/lib/arrow-dataset/loader.rb b/ruby/red-arrow-dataset/lib/arrow-dataset/loader.rb index fcac52d268f..6a0dc5079d8 100644 --- a/ruby/red-arrow-dataset/lib/arrow-dataset/loader.rb +++ b/ruby/red-arrow-dataset/lib/arrow-dataset/loader.rb @@ -29,8 +29,7 @@ def post_load(repository, namespace) end def require_libraries - require "arrow-dataset/in-memory-scan-task" - require "arrow-dataset/scan-options" + require "arrow-dataset/dataset" end end end diff --git a/ruby/red-arrow-dataset/test/helper.rb b/ruby/red-arrow-dataset/test/helper.rb index 795df3beb01..7231eb1cb64 100644 --- a/ruby/red-arrow-dataset/test/helper.rb +++ b/ruby/red-arrow-dataset/test/helper.rb @@ -17,4 +17,6 @@ require "arrow-dataset" +require "tmpdir" + require "test-unit" diff --git a/ruby/red-arrow-dataset/test/test-scan-options.rb b/ruby/red-arrow-dataset/test/test-file-system-dataset.rb similarity index 58% rename from ruby/red-arrow-dataset/test/test-scan-options.rb rename to ruby/red-arrow-dataset/test/test-file-system-dataset.rb index a9a947ff88d..17cbcb88d74 100644 --- a/ruby/red-arrow-dataset/test/test-scan-options.rb +++ b/ruby/red-arrow-dataset/test/test-file-system-dataset.rb @@ -15,22 +15,24 @@ # specific language governing permissions and limitations # under the License. 
-class TestScanOptions < Test::Unit::TestCase +class TestFileSystemDataset < Test::Unit::TestCase def setup - @record_batches = [ - Arrow::RecordBatch.new(visible: [true, false, true], - point: [1, 2, 3]), - ] - @schema = @record_batches.first.schema + Dir.mktmpdir do |tmpdir| + @dir = tmpdir + @path = File.join(@dir, "table.arrow") + @table = Arrow::Table.new(visible: [true, false, true], + point: [1, 2, 3]) + @table.save(@path) + @format = ArrowDataset::IPCFileFormat.new + yield + end end - sub_test_case(".try_convert") do - def test_hash - batch_size = 1024 - context = ArrowDataset::ScanOptions.try_convert(schema: @schema, - batch_size: batch_size) - assert_equal([@schema, batch_size], - [context.schema, context.batch_size]) + test(".build") do + dataset = ArrowDataset::FileSystemDataset.build(@format) do |factory| + factory.file_system = Arrow::LocalFileSystem.new + factory.add_path(File.expand_path(@path)) end + assert_equal(@table, dataset.to_table) end end From cc0bd605e57ee39c522c262327359aa341de73bf Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Wed, 16 Jun 2021 11:00:11 +0200 Subject: [PATCH 22/61] ARROW-13036: [Doc] Mention recommended file extension(s) for Arrow IPC See JIRA Closes #10512 from westonpace/feature/ARROW-13036--doc-mention-recommended-file-extension-s-for-ar Authored-by: Weston Pace Signed-off-by: Antoine Pitrou --- docs/source/format/Columnar.rst | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/docs/source/format/Columnar.rst b/docs/source/format/Columnar.rst index 102c3a73317..52920a49b35 100644 --- a/docs/source/format/Columnar.rst +++ b/docs/source/format/Columnar.rst @@ -1006,19 +1006,21 @@ message flatbuffer is read, you can then read the message body. The stream writer can signal end-of-stream (EOS) either by writing 8 bytes containing the 4-byte continuation indicator (``0xFFFFFFFF``) followed by 0 -metadata length (``0x00000000``) or closing the stream interface. 
+metadata length (``0x00000000``) or closing the stream interface. We +recommend the ".arrows" file extension for the streaming format although +in many cases these streams will not ever be stored as files. IPC File Format --------------- -We define a "file format" supporting random access that is build with -the stream format. The file starts and ends with a magic string -``ARROW1`` (plus padding). What follows in the file is identical to -the stream format. At the end of the file, we write a *footer* -containing a redundant copy of the schema (which is a part of the -streaming format) plus memory offsets and sizes for each of the data -blocks in the file. This enables random access any record batch in the -file. See `File.fbs`_ for the precise details of the file footer. +We define a "file format" supporting random access that is an extension of +the stream format. The file starts and ends with a magic string ``ARROW1`` +(plus padding). What follows in the file is identical to the stream format. +At the end of the file, we write a *footer* containing a redundant copy of +the schema (which is a part of the streaming format) plus memory offsets and +sizes for each of the data blocks in the file. This enables random access to +any record batch in the file. See `File.fbs`_ for the precise details of the +file footer. Schematically we have: :: @@ -1034,8 +1036,9 @@ should be defined in a ``DictionaryBatch`` before they are used in a ``RecordBatch``, as long as the keys are defined somewhere in the file. Further more, it is invalid to have more than one **non-delta** dictionary batch per dictionary ID (i.e. dictionary replacement is not -supported). Delta dictionaries are applied in the order they appear in -the file footer. +supported). Delta dictionaries are applied in the order they appear in +the file footer. We recommend the ".arrow" extension for files created with +this format. 
Dictionary Messages ------------------- From 85a4052097a0d26e930d2a404ba86ecf3db1633d Mon Sep 17 00:00:00 2001 From: Nate Clark Date: Wed, 16 Jun 2021 12:58:39 +0200 Subject: [PATCH 23/61] ARROW-12995: [C++] Add validation to CSV options Closes #10505 from n3world/ARROW-12995-Validate_csv_opts Authored-by: Nate Clark Signed-off-by: Antoine Pitrou --- cpp/src/arrow/csv/options.cc | 43 ++++++++++++++++ cpp/src/arrow/csv/options.h | 14 ++++++ cpp/src/arrow/csv/reader.cc | 8 +++ cpp/src/arrow/csv/writer.cc | 2 + python/pyarrow/_csv.pyx | 13 +++++ python/pyarrow/includes/libarrow.pxd | 8 +++ python/pyarrow/tests/test_csv.py | 74 ++++++++++++++++++++++++++++ 7 files changed, 162 insertions(+) diff --git a/cpp/src/arrow/csv/options.cc b/cpp/src/arrow/csv/options.cc index a515abf2cf4..c71cfdaf295 100644 --- a/cpp/src/arrow/csv/options.cc +++ b/cpp/src/arrow/csv/options.cc @@ -22,6 +22,19 @@ namespace csv { ParseOptions ParseOptions::Defaults() { return ParseOptions(); } +Status ParseOptions::Validate() const { + if (ARROW_PREDICT_FALSE(delimiter == '\n' || delimiter == '\r')) { + return Status::Invalid("ParseOptions: delimiter cannot be \\r or \\n"); + } + if (ARROW_PREDICT_FALSE(quoting && (quote_char == '\n' || quote_char == '\r'))) { + return Status::Invalid("ParseOptions: quote_char cannot be \\r or \\n"); + } + if (ARROW_PREDICT_FALSE(escaping && (escape_char == '\n' || escape_char == '\r'))) { + return Status::Invalid("ParseOptions: escape_char cannot be \\r or \\n"); + } + return Status::OK(); +} + ConvertOptions ConvertOptions::Defaults() { auto options = ConvertOptions(); // Same default null / true / false spellings as in Pandas. 
@@ -33,8 +46,38 @@ ConvertOptions ConvertOptions::Defaults() { return options; } +Status ConvertOptions::Validate() const { return Status::OK(); } + ReadOptions ReadOptions::Defaults() { return ReadOptions(); } + +Status ReadOptions::Validate() const { + if (ARROW_PREDICT_FALSE(block_size < 1)) { + // Min is 1 because some tests use really small block sizes + return Status::Invalid("ReadOptions: block_size must be at least 1: ", block_size); + } + if (ARROW_PREDICT_FALSE(skip_rows < 0)) { + return Status::Invalid("ReadOptions: skip_rows cannot be negative: ", skip_rows); + } + if (ARROW_PREDICT_FALSE(skip_rows_after_names < 0)) { + return Status::Invalid("ReadOptions: skip_rows_after_names cannot be negative: ", + skip_rows_after_names); + } + if (ARROW_PREDICT_FALSE(autogenerate_column_names && !column_names.empty())) { + return Status::Invalid( + "ReadOptions: autogenerate_column_names cannot be true when column_names are " + "provided"); + } + return Status::OK(); +} + WriteOptions WriteOptions::Defaults() { return WriteOptions(); } +Status WriteOptions::Validate() const { + if (ARROW_PREDICT_FALSE(batch_size < 1)) { + return Status::Invalid("WriteOptions: batch_size must be at least 1: ", batch_size); + } + return Status::OK(); +} + } // namespace csv } // namespace arrow diff --git a/cpp/src/arrow/csv/options.h b/cpp/src/arrow/csv/options.h index d9c94a03f86..790c47fc3f4 100644 --- a/cpp/src/arrow/csv/options.h +++ b/cpp/src/arrow/csv/options.h @@ -24,6 +24,7 @@ #include #include "arrow/csv/type_fwd.h" +#include "arrow/status.h" #include "arrow/util/visibility.h" namespace arrow { @@ -59,6 +60,9 @@ struct ARROW_EXPORT ParseOptions { /// Create parsing options with default values static ParseOptions Defaults(); + + /// \brief Test that all set options are valid + Status Validate() const; }; struct ARROW_EXPORT ConvertOptions { @@ -112,6 +116,9 @@ struct ARROW_EXPORT ConvertOptions { /// Create conversion options with default values, including conventional /// 
values for `null_values`, `true_values` and `false_values` static ConvertOptions Defaults(); + + /// \brief Test that all set options are valid + Status Validate() const; }; struct ARROW_EXPORT ReadOptions { @@ -124,6 +131,7 @@ struct ARROW_EXPORT ReadOptions { /// /// This will determine multi-threading granularity as well as /// the size of individual record batches. + /// Minimum valid value for block size is 1 int32_t block_size = 1 << 20; // 1 MB /// Number of header rows to skip (not including the row of column names, if any) @@ -143,6 +151,9 @@ struct ARROW_EXPORT ReadOptions { /// Create read options with default values static ReadOptions Defaults(); + + /// \brief Test that all set options are valid + Status Validate() const; }; /// Experimental @@ -158,6 +169,9 @@ struct ARROW_EXPORT WriteOptions { /// Create write options with default values static WriteOptions Defaults(); + + /// \brief Test that all set options are valid + Status Validate() const; }; } // namespace csv diff --git a/cpp/src/arrow/csv/reader.cc b/cpp/src/arrow/csv/reader.cc index 068e06178c8..f221ffcadd9 100644 --- a/cpp/src/arrow/csv/reader.cc +++ b/cpp/src/arrow/csv/reader.cc @@ -1033,6 +1033,9 @@ Result> MakeTableReader( MemoryPool* pool, io::IOContext io_context, std::shared_ptr input, const ReadOptions& read_options, const ParseOptions& parse_options, const ConvertOptions& convert_options) { + RETURN_NOT_OK(parse_options.Validate()); + RETURN_NOT_OK(read_options.Validate()); + RETURN_NOT_OK(convert_options.Validate()); std::shared_ptr reader; if (read_options.use_threads) { auto cpu_executor = internal::GetCpuThreadPool(); @@ -1051,6 +1054,9 @@ Future> MakeStreamingReader( io::IOContext io_context, std::shared_ptr input, internal::Executor* cpu_executor, const ReadOptions& read_options, const ParseOptions& parse_options, const ConvertOptions& convert_options) { + RETURN_NOT_OK(parse_options.Validate()); + RETURN_NOT_OK(read_options.Validate()); + 
RETURN_NOT_OK(convert_options.Validate()); std::shared_ptr reader; reader = std::make_shared( io_context, cpu_executor, input, read_options, parse_options, convert_options, @@ -1182,6 +1188,8 @@ Future CountRowsAsync(io::IOContext io_context, internal::Executor* cpu_executor, const ReadOptions& read_options, const ParseOptions& parse_options) { + RETURN_NOT_OK(parse_options.Validate()); + RETURN_NOT_OK(read_options.Validate()); auto counter = std::make_shared( io_context, cpu_executor, std::move(input), read_options, parse_options); return counter->Count(); diff --git a/cpp/src/arrow/csv/writer.cc b/cpp/src/arrow/csv/writer.cc index ddd59b46fc1..e1c34a77ae9 100644 --- a/cpp/src/arrow/csv/writer.cc +++ b/cpp/src/arrow/csv/writer.cc @@ -414,6 +414,7 @@ class CSVConverter { Status WriteCSV(const Table& table, const WriteOptions& options, MemoryPool* pool, arrow::io::OutputStream* output) { + RETURN_NOT_OK(options.Validate()); if (pool == nullptr) { pool = default_memory_pool(); } @@ -424,6 +425,7 @@ Status WriteCSV(const Table& table, const WriteOptions& options, MemoryPool* poo Status WriteCSV(const RecordBatch& batch, const WriteOptions& options, MemoryPool* pool, arrow::io::OutputStream* output) { + RETURN_NOT_OK(options.Validate()); if (pool == nullptr) { pool = default_memory_pool(); } diff --git a/python/pyarrow/_csv.pyx b/python/pyarrow/_csv.pyx index e7dda3fb953..8ede8272c07 100644 --- a/python/pyarrow/_csv.pyx +++ b/python/pyarrow/_csv.pyx @@ -58,6 +58,7 @@ cdef class ReadOptions(_Weakrefable): How much bytes to process at a time from the input stream. This will determine multi-threading granularity as well as the size of individual record batches or table chunks. + Minimum valid value for block size is 1 skip_rows: int, optional (default 0) The number of rows to skip before the column names (if any) and the CSV data. 
@@ -189,6 +190,9 @@ cdef class ReadOptions(_Weakrefable): def skip_rows_after_names(self, value): deref(self.options).skip_rows_after_names = value + def validate(self): + check_status(deref(self.options).Validate()) + def equals(self, ReadOptions other): return ( self.use_threads == other.use_threads and @@ -359,6 +363,9 @@ cdef class ParseOptions(_Weakrefable): def ignore_empty_lines(self, value): deref(self.options).ignore_empty_lines = value + def validate(self): + check_status(deref(self.options).Validate()) + def equals(self, ParseOptions other): return ( self.delimiter == other.delimiter and @@ -680,6 +687,9 @@ cdef class ConvertOptions(_Weakrefable): out.options.reset(new CCSVConvertOptions(move(options))) return out + def validate(self): + check_status(deref(self.options).Validate()) + def equals(self, ConvertOptions other): return ( self.check_utf8 == other.check_utf8 and @@ -941,6 +951,9 @@ cdef class WriteOptions(_Weakrefable): def batch_size(self, value): self.options.batch_size = value + def validate(self): + check_status(self.options.Validate()) + cdef _get_write_options(WriteOptions write_options, CCSVWriteOptions* out): if write_options is None: diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 35a2034eba4..b1fb04a1f8e 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -1592,6 +1592,8 @@ cdef extern from "arrow/csv/api.h" namespace "arrow::csv" nogil: @staticmethod CCSVParseOptions Defaults() + CStatus Validate() + cdef cppclass CCSVConvertOptions" arrow::csv::ConvertOptions": c_bool check_utf8 unordered_map[c_string, shared_ptr[CDataType]] column_types @@ -1613,6 +1615,8 @@ cdef extern from "arrow/csv/api.h" namespace "arrow::csv" nogil: @staticmethod CCSVConvertOptions Defaults() + CStatus Validate() + cdef cppclass CCSVReadOptions" arrow::csv::ReadOptions": c_bool use_threads int32_t block_size @@ -1627,6 +1631,8 @@ cdef extern from "arrow/csv/api.h" 
namespace "arrow::csv" nogil: @staticmethod CCSVReadOptions Defaults() + CStatus Validate() + cdef cppclass CCSVWriteOptions" arrow::csv::WriteOptions": c_bool include_header int32_t batch_size @@ -1634,6 +1640,8 @@ cdef extern from "arrow/csv/api.h" namespace "arrow::csv" nogil: @staticmethod CCSVWriteOptions Defaults() + CStatus Validate() + cdef cppclass CCSVReader" arrow::csv::TableReader": @staticmethod CResult[shared_ptr[CCSVReader]] Make( diff --git a/python/pyarrow/tests/test_csv.py b/python/pyarrow/tests/test_csv.py index 32c0353fada..48cdff75f97 100644 --- a/python/pyarrow/tests/test_csv.py +++ b/python/pyarrow/tests/test_csv.py @@ -132,6 +132,34 @@ def test_read_options(): opts = cls(block_size=1234) assert opts.block_size == 1234 + opts.validate() + + match = "ReadOptions: block_size must be at least 1: 0" + with pytest.raises(pa.ArrowInvalid, match=match): + opts = cls() + opts.block_size = 0 + opts.validate() + + match = "ReadOptions: skip_rows cannot be negative: -1" + with pytest.raises(pa.ArrowInvalid, match=match): + opts = cls() + opts.skip_rows = -1 + opts.validate() + + match = "ReadOptions: skip_rows_after_names cannot be negative: -1" + with pytest.raises(pa.ArrowInvalid, match=match): + opts = cls() + opts.skip_rows_after_names = -1 + opts.validate() + + match = "ReadOptions: autogenerate_column_names cannot be true when" \ + " column_names are provided" + with pytest.raises(pa.ArrowInvalid, match=match): + opts = cls() + opts.autogenerate_column_names = True + opts.column_names = ('a', 'b') + opts.validate() + def test_parse_options(): cls = ParseOptions @@ -150,6 +178,44 @@ def test_parse_options(): newlines_in_values=True, ignore_empty_lines=False) + cls().validate() + opts = cls() + opts.delimiter = "\t" + opts.validate() + + match = "ParseOptions: delimiter cannot be \\\\r or \\\\n" + with pytest.raises(pa.ArrowInvalid, match=match): + opts = cls() + opts.delimiter = "\n" + opts.validate() + + with pytest.raises(pa.ArrowInvalid, 
match=match): + opts = cls() + opts.delimiter = "\r" + opts.validate() + + match = "ParseOptions: quote_char cannot be \\\\r or \\\\n" + with pytest.raises(pa.ArrowInvalid, match=match): + opts = cls() + opts.quote_char = "\n" + opts.validate() + + with pytest.raises(pa.ArrowInvalid, match=match): + opts = cls() + opts.quote_char = "\r" + opts.validate() + + match = "ParseOptions: escape_char cannot be \\\\r or \\\\n" + with pytest.raises(pa.ArrowInvalid, match=match): + opts = cls() + opts.escape_char = "\n" + opts.validate() + + with pytest.raises(pa.ArrowInvalid, match=match): + opts = cls() + opts.escape_char = "\r" + opts.validate() + def test_convert_options(): cls = ConvertOptions @@ -238,6 +304,14 @@ def test_write_options(): opts = cls(batch_size=9876) assert opts.batch_size == 9876 + opts.validate() + + match = "WriteOptions: batch_size must be at least 1: 0" + with pytest.raises(pa.ArrowInvalid, match=match): + opts = cls() + opts.batch_size = 0 + opts.validate() + class BaseTestCSVRead: From 44495dbca134574d89c75a408f9b8d24dd76819a Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Wed, 16 Jun 2021 14:29:03 +0200 Subject: [PATCH 24/61] ARROW-13090: [Python] Fix create_dir() implementation in FSSpecHandler Recent fsspec versions have started raising FileExistsError if the target directory already exists. Ignore the error, as create_dir() is supposed to succeed in that case.
Closes #10540 from pitrou/ARROW-13090-fsspec-create-dir Authored-by: Antoine Pitrou Signed-off-by: Joris Van den Bossche --- python/pyarrow/fs.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/python/pyarrow/fs.py b/python/pyarrow/fs.py index fe505530751..1b86e4b7e0f 100644 --- a/python/pyarrow/fs.py +++ b/python/pyarrow/fs.py @@ -263,7 +263,10 @@ def get_file_info_selector(self, selector): def create_dir(self, path, recursive): # mkdir also raises FileNotFoundError when base directory is not found - self.fs.mkdir(path, create_parents=recursive) + try: + self.fs.mkdir(path, create_parents=recursive) + except FileExistsError: + pass def delete_dir(self, path): self.fs.rm(path, recursive=True) From 43bafb875dd4578f72e449fd7c54c88c9df29dff Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Wed, 16 Jun 2021 20:05:33 +0200 Subject: [PATCH 25/61] ARROW-10115: [C++] Add CSV option to treat quoted strings as always non-null The option is only applicable to string and binary columns. 
Closes #10503 from pitrou/ARROW-10115-csv-quoted-nulls Authored-by: Antoine Pitrou Signed-off-by: Antoine Pitrou --- cpp/src/arrow/csv/converter.cc | 1 + cpp/src/arrow/csv/converter_test.cc | 132 +++++++++++++++++---------- cpp/src/arrow/csv/options.h | 7 ++ python/pyarrow/_csv.pyx | 40 ++++++-- python/pyarrow/includes/libarrow.pxd | 1 + python/pyarrow/tests/test_csv.py | 16 +++- 6 files changed, 138 insertions(+), 59 deletions(-) diff --git a/cpp/src/arrow/csv/converter.cc b/cpp/src/arrow/csv/converter.cc index feebf374e38..cb72b22b405 100644 --- a/cpp/src/arrow/csv/converter.cc +++ b/cpp/src/arrow/csv/converter.cc @@ -185,6 +185,7 @@ struct BinaryValueDecoder : public ValueDecoder { bool IsNull(const uint8_t* data, uint32_t size, bool quoted) { return options_.strings_can_be_null && + (!quoted || options_.quoted_strings_can_be_null) && ValueDecoder::IsNull(data, size, false /* quoted */); } }; diff --git a/cpp/src/arrow/csv/converter_test.cc b/cpp/src/arrow/csv/converter_test.cc index e12e3d17a83..4bed649d558 100644 --- a/cpp/src/arrow/csv/converter_test.cc +++ b/cpp/src/arrow/csv/converter_test.cc @@ -174,67 +174,105 @@ void AssertConversionError(const std::shared_ptr& type, // Converter tests template -static void TestBinaryConversionBasics() { - auto type = TypeTraits::type_singleton(); - AssertConversion(type, {"ab,cdé\n", ",\xffgh\n"}, - {{"ab", ""}, {"cdé", "\xffgh"}}); -} - -TEST(BinaryConversion, Basics) { TestBinaryConversionBasics(); } +class BinaryConversionTestBase : public testing::Test { + public: + std::shared_ptr type() { return TypeTraits::type_singleton(); } -TEST(LargeBinaryConversion, Basics) { TestBinaryConversionBasics(); } + void TestNulls() { + auto type = this->type(); + AssertConversion(type, {"ab,N/A\n", "NULL,\n"}, + {{"ab", "NULL"}, {"N/A", ""}}, + {{true, true}, {true, true}}); -TEST(BinaryConversion, Nulls) { - AssertConversion(binary(), {"ab,N/A\n", "NULL,\n"}, - {{"ab", "NULL"}, {"N/A", ""}}, - {{true, true}, {true, true}}); + 
auto options = ConvertOptions::Defaults(); + options.strings_can_be_null = true; + AssertConversion(type, {"ab,N/A\n", "NULL,\n"}, + {{"ab", ""}, {"", ""}}, + {{true, false}, {false, false}}, options); + AssertConversion(type, {"ab,\"N/A\"\n", "\"NULL\",\"\"\n"}, + {{"ab", ""}, {"", ""}}, + {{true, false}, {false, false}}, options); + options.quoted_strings_can_be_null = false; + AssertConversion(type, {"ab,N/A\n", "NULL,\n"}, + {{"ab", ""}, {"", ""}}, + {{true, false}, {false, false}}, options); + AssertConversion(type, {"ab,\"N/A\"\n", "\"NULL\",\"\"\n"}, + {{"ab", "NULL"}, {"N/A", ""}}, + {{true, true}, {true, true}}, options); + } - auto options = ConvertOptions::Defaults(); - options.strings_can_be_null = true; - AssertConversion(binary(), {"ab,N/A\n", "NULL,\n"}, - {{"ab", ""}, {"", ""}}, - {{true, false}, {false, false}}, options); -} + void TestCustomNulls() { + auto type = this->type(); + auto options = ConvertOptions::Defaults(); + options.null_values = {"xxx", "zzz"}; + AssertConversion(type, {"ab,N/A\n", "xxx,\"zzz\"\n"}, + {{"ab", "xxx"}, {"N/A", "zzz"}}, + {{true, true}, {true, true}}, options); + + options.strings_can_be_null = true; + AssertConversion(type, {"ab,N/A\n", "xxx,\"zzz\"\n"}, + {{"ab", ""}, {"N/A", ""}}, + {{true, false}, {true, false}}, options); + options.quoted_strings_can_be_null = false; + AssertConversion(type, {"ab,N/A\n", "xxx,\"zzz\"\n"}, + {{"ab", ""}, {"N/A", "zzz"}}, + {{true, false}, {true, true}}, options); + } +}; template -static void TestStringConversionBasics() { - auto type = TypeTraits::type_singleton(); - AssertConversion(type, {"ab,cdé\n", ",gh\n"}, - {{"ab", ""}, {"cdé", "gh"}}); +class BinaryConversionTest : public BinaryConversionTestBase { + public: + void TestBasics() { + auto type = this->type(); + AssertConversion(type, {"ab,cdé\n", ",\xffgh\n"}, + {{"ab", ""}, {"cdé", "\xffgh"}}); + } +}; - auto options = ConvertOptions::Defaults(); - options.check_utf8 = false; - AssertConversion(type, {"ab,cdé\n", 
",\xffgh\n"}, - {{"ab", ""}, {"cdé", "\xffgh"}}, options, - /*validate_full=*/false); -} +using BinaryTestTypes = ::testing::Types; -TEST(StringConversion, Basics) { TestStringConversionBasics(); } +TYPED_TEST_SUITE(BinaryConversionTest, BinaryTestTypes); -TEST(LargeStringConversion, Basics) { TestStringConversionBasics(); } +TYPED_TEST(BinaryConversionTest, Basics) { this->TestBasics(); } -TEST(StringConversion, Nulls) { - AssertConversion(utf8(), {"ab,N/A\n", "NULL,\n"}, - {{"ab", "NULL"}, {"N/A", ""}}, - {{true, true}, {true, true}}); +TYPED_TEST(BinaryConversionTest, Nulls) { this->TestNulls(); } - auto options = ConvertOptions::Defaults(); - options.strings_can_be_null = true; - AssertConversion(utf8(), {"ab,N/A\n", "NULL,\n"}, - {{"ab", ""}, {"", ""}}, - {{true, false}, {false, false}}, options); -} +TYPED_TEST(BinaryConversionTest, CustomNulls) { this->TestCustomNulls(); } template -static void TestStringConversionErrors() { - auto type = TypeTraits::type_singleton(); - // Invalid UTF8 in column 0 - AssertConversionError(type, {"ab,cdé\n", "\xff,gh\n"}, {0}); -} +class StringConversionTest : public BinaryConversionTestBase { + public: + void TestBasics() { + auto type = TypeTraits::type_singleton(); + AssertConversion(type, {"ab,cdé\n", ",gh\n"}, + {{"ab", ""}, {"cdé", "gh"}}); + } + + void TestInvalidUtf8() { + auto type = TypeTraits::type_singleton(); + // Invalid UTF8 in column 0 + AssertConversionError(type, {"ab,cdé\n", "\xff,gh\n"}, {0}); + + auto options = ConvertOptions::Defaults(); + options.check_utf8 = false; + AssertConversion(type, {"ab,cdé\n", ",\xffgh\n"}, + {{"ab", ""}, {"cdé", "\xffgh"}}, options, + /*validate_full=*/false); + } +}; + +using StringTestTypes = ::testing::Types; + +TYPED_TEST_SUITE(StringConversionTest, StringTestTypes); + +TYPED_TEST(StringConversionTest, Basics) { this->TestBasics(); } + +TYPED_TEST(StringConversionTest, Nulls) { this->TestNulls(); } -TEST(StringConversion, Errors) { TestStringConversionErrors(); } 
+TYPED_TEST(StringConversionTest, CustomNulls) { this->TestCustomNulls(); } -TEST(LargeStringConversion, Errors) { TestStringConversionErrors(); } +TYPED_TEST(StringConversionTest, InvalidUtf8) { this->TestInvalidUtf8(); } TEST(FixedSizeBinaryConversion, Basics) { AssertConversion( diff --git a/cpp/src/arrow/csv/options.h b/cpp/src/arrow/csv/options.h index 790c47fc3f4..1e423fd76db 100644 --- a/cpp/src/arrow/csv/options.h +++ b/cpp/src/arrow/csv/options.h @@ -78,11 +78,18 @@ struct ARROW_EXPORT ConvertOptions { std::vector true_values; /// Recognized spellings for boolean false values std::vector false_values; + /// Whether string / binary columns can have null values. /// /// If true, then strings in "null_values" are considered null for string columns. /// If false, then all strings are valid string values. bool strings_can_be_null = false; + /// Whether string / binary columns can have quoted null values. + /// + /// If true *and* `strings_can_be_null` is true, then quoted strings in + /// "null_values" are also considered null for string columns. Otherwise, + /// quoted strings are never considered null. + bool quoted_strings_can_be_null = true; /// Whether to try to automatically dict-encode string / binary data. /// If true, then when type inference detects a string or binary column, diff --git a/python/pyarrow/_csv.pyx b/python/pyarrow/_csv.pyx index 8ede8272c07..01cabc1d8b0 100644 --- a/python/pyarrow/_csv.pyx +++ b/python/pyarrow/_csv.pyx @@ -447,6 +447,12 @@ cdef class ConvertOptions(_Weakrefable): If true, then strings in null_values are considered null for string columns. If false, then all strings are valid string values. + quoted_strings_can_be_null: bool, optional (default True) + Whether string / binary columns can have quoted null values. + If true *and* strings_can_be_null is true, then strings in + null_values are considered null for string columns, even when + quoted. + Otherwise, then all quoted strings are valid string values. 
auto_dict_encode: bool, optional (default False) Whether to try to automatically dict-encode string / binary data. If true, then when type inference detects a string or binary column, @@ -478,9 +484,10 @@ cdef class ConvertOptions(_Weakrefable): def __init__(self, *, check_utf8=None, column_types=None, null_values=None, true_values=None, false_values=None, - strings_can_be_null=None, include_columns=None, - include_missing_columns=None, auto_dict_encode=None, - auto_dict_max_cardinality=None, timestamp_parsers=None): + strings_can_be_null=None, quoted_strings_can_be_null=None, + include_columns=None, include_missing_columns=None, + auto_dict_encode=None, auto_dict_max_cardinality=None, + timestamp_parsers=None): if check_utf8 is not None: self.check_utf8 = check_utf8 if column_types is not None: @@ -493,6 +500,8 @@ cdef class ConvertOptions(_Weakrefable): self.false_values = false_values if strings_can_be_null is not None: self.strings_can_be_null = strings_can_be_null + if quoted_strings_can_be_null is not None: + self.quoted_strings_can_be_null = quoted_strings_can_be_null if include_columns is not None: self.include_columns = include_columns if include_missing_columns is not None: @@ -526,6 +535,17 @@ cdef class ConvertOptions(_Weakrefable): def strings_can_be_null(self, value): deref(self.options).strings_can_be_null = value + @property + def quoted_strings_can_be_null(self): + """ + Whether string / binary columns can have quoted null values. 
+ """ + return deref(self.options).quoted_strings_can_be_null + + @quoted_strings_can_be_null.setter + def quoted_strings_can_be_null(self, value): + deref(self.options).quoted_strings_can_be_null = value + @property def column_types(self): """ @@ -699,6 +719,8 @@ cdef class ConvertOptions(_Weakrefable): self.false_values == other.false_values and self.timestamp_parsers == other.timestamp_parsers and self.strings_can_be_null == other.strings_can_be_null and + self.quoted_strings_can_be_null == + other.quoted_strings_can_be_null and self.auto_dict_encode == other.auto_dict_encode and self.auto_dict_max_cardinality == other.auto_dict_max_cardinality and @@ -709,16 +731,16 @@ cdef class ConvertOptions(_Weakrefable): def __getstate__(self): return (self.check_utf8, self.column_types, self.null_values, self.true_values, self.false_values, self.timestamp_parsers, - self.strings_can_be_null, self.auto_dict_encode, - self.auto_dict_max_cardinality, self.include_columns, - self.include_missing_columns) + self.strings_can_be_null, self.quoted_strings_can_be_null, + self.auto_dict_encode, self.auto_dict_max_cardinality, + self.include_columns, self.include_missing_columns) def __setstate__(self, state): (self.check_utf8, self.column_types, self.null_values, self.true_values, self.false_values, self.timestamp_parsers, - self.strings_can_be_null, self.auto_dict_encode, - self.auto_dict_max_cardinality, self.include_columns, - self.include_missing_columns) = state + self.strings_can_be_null, self.quoted_strings_can_be_null, + self.auto_dict_encode, self.auto_dict_max_cardinality, + self.include_columns, self.include_missing_columns) = state def __eq__(self, other): try: diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index b1fb04a1f8e..072062385ca 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -1601,6 +1601,7 @@ cdef extern from "arrow/csv/api.h" namespace "arrow::csv" nogil: 
vector[c_string] true_values vector[c_string] false_values c_bool strings_can_be_null + c_bool quoted_strings_can_be_null vector[shared_ptr[CTimestampParser]] timestamp_parsers c_bool auto_dict_encode diff --git a/python/pyarrow/tests/test_csv.py b/python/pyarrow/tests/test_csv.py index 48cdff75f97..482973a7258 100644 --- a/python/pyarrow/tests/test_csv.py +++ b/python/pyarrow/tests/test_csv.py @@ -224,14 +224,16 @@ def test_convert_options(): check_options_class( cls, check_utf8=[True, False], strings_can_be_null=[False, True], + quoted_strings_can_be_null=[True, False], include_columns=[[], ['def', 'abc']], include_missing_columns=[False, True], auto_dict_encode=[False, True], timestamp_parsers=[[], [ISO8601, '%y-%m']]) check_options_class_pickling( - cls, check_utf8=True, - strings_can_be_null=False, + cls, check_utf8=False, + strings_can_be_null=True, + quoted_strings_can_be_null=False, include_columns=['def', 'abc'], include_missing_columns=False, auto_dict_encode=True, @@ -828,7 +830,7 @@ def test_auto_dict_encode(self): def test_custom_nulls(self): # Infer nulls with custom values opts = ConvertOptions(null_values=['Xxx', 'Zzz']) - rows = b"a,b,c,d\nZzz,Xxx,1,2\nXxx,#N/A,,Zzz\n" + rows = b"""a,b,c,d\nZzz,"Xxx",1,2\nXxx,#N/A,,Zzz\n""" table = self.read_bytes(rows, convert_options=opts) schema = pa.schema([('a', pa.null()), ('b', pa.string()), @@ -851,6 +853,14 @@ def test_custom_nulls(self): 'c': ["1", ""], 'd': [2, None], } + opts.quoted_strings_can_be_null = False + table = self.read_bytes(rows, convert_options=opts) + assert table.to_pydict() == { + 'a': [None, None], + 'b': ["Xxx", "#N/A"], + 'c': ["1", ""], + 'd': [2, None], + } opts = ConvertOptions(null_values=[]) rows = b"a,b\n#N/A,\n" From ca7f9ee34ec1573526c984b8a20932218b9fc3e9 Mon Sep 17 00:00:00 2001 From: David Li Date: Wed, 16 Jun 2021 20:19:27 +0200 Subject: [PATCH 26/61] ARROW-12709: [C++] Add binary_join_element_wise This adds a variadic scalar string join kernel, using the last argument 
(min 1 argument) as the separator. An options class allows emitting null (the default), skipping null non-separator arguments, or replacing null non-separator arguments with another string (mimicking libcudf). Closes #10520 from lidavidm/arrow-12709 Lead-authored-by: David Li Co-authored-by: Antoine Pitrou Signed-off-by: Antoine Pitrou --- cpp/src/arrow/compute/api_scalar.cc | 8 +- cpp/src/arrow/compute/api_scalar.h | 23 +- .../arrow/compute/kernels/scalar_compare.cc | 16 +- .../compute/kernels/scalar_compare_test.cc | 184 +++++++------- .../arrow/compute/kernels/scalar_string.cc | 239 +++++++++++++++++- .../kernels/scalar_string_benchmark.cc | 43 ++++ .../compute/kernels/scalar_string_test.cc | 119 +++++++++ docs/source/cpp/compute.rst | 22 +- docs/source/python/api/compute.rst | 21 +- python/pyarrow/_compute.pyx | 31 +++ python/pyarrow/compute.py | 1 + python/pyarrow/includes/libarrow.pxd | 16 ++ python/pyarrow/tests/test_compute.py | 52 +++- 13 files changed, 643 insertions(+), 132 deletions(-) diff --git a/cpp/src/arrow/compute/api_scalar.cc b/cpp/src/arrow/compute/api_scalar.cc index dba71456c29..db1cac290cf 100644 --- a/cpp/src/arrow/compute/api_scalar.cc +++ b/cpp/src/arrow/compute/api_scalar.cc @@ -63,14 +63,14 @@ SCALAR_ARITHMETIC_BINARY(Multiply, "multiply", "multiply_checked") SCALAR_ARITHMETIC_BINARY(Divide, "divide", "divide_checked") SCALAR_ARITHMETIC_BINARY(Power, "power", "power_checked") -Result ElementWiseMax(const std::vector& args, +Result MaxElementWise(const std::vector& args, ElementWiseAggregateOptions options, ExecContext* ctx) { - return CallFunction("element_wise_max", args, &options, ctx); + return CallFunction("max_element_wise", args, &options, ctx); } -Result ElementWiseMin(const std::vector& args, +Result MinElementWise(const std::vector& args, ElementWiseAggregateOptions options, ExecContext* ctx) { - return CallFunction("element_wise_min", args, &options, ctx); + return CallFunction("min_element_wise", args, &options, ctx); } // 
---------------------------------------------------------------------- diff --git a/cpp/src/arrow/compute/api_scalar.h b/cpp/src/arrow/compute/api_scalar.h index 6e9a9340f2c..082876b356b 100644 --- a/cpp/src/arrow/compute/api_scalar.h +++ b/cpp/src/arrow/compute/api_scalar.h @@ -48,6 +48,25 @@ struct ARROW_EXPORT ElementWiseAggregateOptions : public FunctionOptions { bool skip_nulls; }; +/// Options for var_args_join. +struct ARROW_EXPORT JoinOptions : public FunctionOptions { + /// How to handle null values. (A null separator always results in a null output.) + enum NullHandlingBehavior { + /// A null in any input results in a null in the output. + EMIT_NULL, + /// Nulls in inputs are skipped. + SKIP, + /// Nulls in inputs are replaced with the replacement string. + REPLACE, + }; + explicit JoinOptions(NullHandlingBehavior null_handling = EMIT_NULL, + std::string null_replacement = "") + : null_handling(null_handling), null_replacement(std::move(null_replacement)) {} + static JoinOptions Defaults() { return JoinOptions(); } + NullHandlingBehavior null_handling; + std::string null_replacement; +}; + struct ARROW_EXPORT MatchSubstringOptions : public FunctionOptions { explicit MatchSubstringOptions(std::string pattern, bool ignore_case = false) : pattern(std::move(pattern)), ignore_case(ignore_case) {} @@ -287,7 +306,7 @@ Result Power(const Datum& left, const Datum& right, /// \param[in] ctx the function execution context, optional /// \return the element-wise maximum ARROW_EXPORT -Result ElementWiseMax( +Result MaxElementWise( const std::vector& args, ElementWiseAggregateOptions options = ElementWiseAggregateOptions::Defaults(), ExecContext* ctx = NULLPTR); @@ -300,7 +319,7 @@ Result ElementWiseMax( /// \param[in] ctx the function execution context, optional /// \return the element-wise minimum ARROW_EXPORT -Result ElementWiseMin( +Result MinElementWise( const std::vector& args, ElementWiseAggregateOptions options = ElementWiseAggregateOptions::Defaults(), 
ExecContext* ctx = NULLPTR); diff --git a/cpp/src/arrow/compute/kernels/scalar_compare.cc b/cpp/src/arrow/compute/kernels/scalar_compare.cc index 6763b6793f3..041c6a282f9 100644 --- a/cpp/src/arrow/compute/kernels/scalar_compare.cc +++ b/cpp/src/arrow/compute/kernels/scalar_compare.cc @@ -467,14 +467,14 @@ const FunctionDoc less_equal_doc{ ("A null on either side emits a null comparison result."), {"x", "y"}}; -const FunctionDoc element_wise_min_doc{ +const FunctionDoc min_element_wise_doc{ "Find the element-wise minimum value", ("Nulls will be ignored (default) or propagated. " "NaN will be taken over null, but not over any valid float."), {"*args"}, "ElementWiseAggregateOptions"}; -const FunctionDoc element_wise_max_doc{ +const FunctionDoc max_element_wise_doc{ "Find the element-wise maximum value", ("Nulls will be ignored (default) or propagated. " "NaN will be taken over null, but not over any valid float."), @@ -501,13 +501,13 @@ void RegisterScalarComparison(FunctionRegistry* registry) { // ---------------------------------------------------------------------- // Variadic element-wise functions - auto element_wise_min = - MakeScalarMinMax("element_wise_min", &element_wise_min_doc); - DCHECK_OK(registry->AddFunction(std::move(element_wise_min))); + auto min_element_wise = + MakeScalarMinMax("min_element_wise", &min_element_wise_doc); + DCHECK_OK(registry->AddFunction(std::move(min_element_wise))); - auto element_wise_max = - MakeScalarMinMax("element_wise_max", &element_wise_max_doc); - DCHECK_OK(registry->AddFunction(std::move(element_wise_max))); + auto max_element_wise = + MakeScalarMinMax("max_element_wise", &max_element_wise_doc); + DCHECK_OK(registry->AddFunction(std::move(max_element_wise))); } } // namespace internal diff --git a/cpp/src/arrow/compute/kernels/scalar_compare_test.cc b/cpp/src/arrow/compute/kernels/scalar_compare_test.cc index 6318a891d3a..50327e82032 100644 --- a/cpp/src/arrow/compute/kernels/scalar_compare_test.cc +++ 
b/cpp/src/arrow/compute/kernels/scalar_compare_test.cc @@ -729,90 +729,90 @@ TYPED_TEST_SUITE(TestVarArgsCompareNumeric, NumericBasedTypes); TYPED_TEST_SUITE(TestVarArgsCompareFloating, RealArrowTypes); TYPED_TEST_SUITE(TestVarArgsCompareParametricTemporal, ParametricTemporalTypes); -TYPED_TEST(TestVarArgsCompareNumeric, ElementWiseMin) { - this->AssertNullScalar(ElementWiseMin, {}); - this->AssertNullScalar(ElementWiseMin, {this->scalar("null"), this->scalar("null")}); +TYPED_TEST(TestVarArgsCompareNumeric, MinElementWise) { + this->AssertNullScalar(MinElementWise, {}); + this->AssertNullScalar(MinElementWise, {this->scalar("null"), this->scalar("null")}); - this->Assert(ElementWiseMin, this->scalar("0"), {this->scalar("0")}); - this->Assert(ElementWiseMin, this->scalar("0"), + this->Assert(MinElementWise, this->scalar("0"), {this->scalar("0")}); + this->Assert(MinElementWise, this->scalar("0"), {this->scalar("2"), this->scalar("0"), this->scalar("1")}); this->Assert( - ElementWiseMin, this->scalar("0"), + MinElementWise, this->scalar("0"), {this->scalar("2"), this->scalar("0"), this->scalar("1"), this->scalar("null")}); - this->Assert(ElementWiseMin, this->scalar("1"), + this->Assert(MinElementWise, this->scalar("1"), {this->scalar("null"), this->scalar("null"), this->scalar("1"), this->scalar("null")}); - this->Assert(ElementWiseMin, (this->array("[]")), {this->array("[]")}); - this->Assert(ElementWiseMin, this->array("[1, 2, 3, null]"), + this->Assert(MinElementWise, (this->array("[]")), {this->array("[]")}); + this->Assert(MinElementWise, this->array("[1, 2, 3, null]"), {this->array("[1, 2, 3, null]")}); - this->Assert(ElementWiseMin, this->array("[1, 2, 2, 2]"), + this->Assert(MinElementWise, this->array("[1, 2, 2, 2]"), {this->array("[1, 2, 3, 4]"), this->scalar("2")}); - this->Assert(ElementWiseMin, this->array("[1, 2, 2, 2]"), + this->Assert(MinElementWise, this->array("[1, 2, 2, 2]"), {this->array("[1, null, 3, 4]"), this->scalar("2")}); - 
this->Assert(ElementWiseMin, this->array("[1, 2, 2, 2]"), + this->Assert(MinElementWise, this->array("[1, 2, 2, 2]"), {this->array("[1, null, 3, 4]"), this->scalar("2"), this->scalar("4")}); - this->Assert(ElementWiseMin, this->array("[1, 2, 2, 2]"), + this->Assert(MinElementWise, this->array("[1, 2, 2, 2]"), {this->array("[1, null, 3, 4]"), this->scalar("null"), this->scalar("2")}); - this->Assert(ElementWiseMin, this->array("[1, 2, 2, 2]"), + this->Assert(MinElementWise, this->array("[1, 2, 2, 2]"), {this->array("[1, 2, 3, 4]"), this->array("[2, 2, 2, 2]")}); - this->Assert(ElementWiseMin, this->array("[1, 2, 2, 2]"), + this->Assert(MinElementWise, this->array("[1, 2, 2, 2]"), {this->array("[1, 2, 3, 4]"), this->array("[2, null, 2, 2]")}); - this->Assert(ElementWiseMin, this->array("[1, 2, 2, 2]"), + this->Assert(MinElementWise, this->array("[1, 2, 2, 2]"), {this->array("[1, null, 3, 4]"), this->array("[2, 2, 2, 2]")}); - this->Assert(ElementWiseMin, this->array("[1, 2, null, 6]"), + this->Assert(MinElementWise, this->array("[1, 2, null, 6]"), {this->array("[1, 2, null, null]"), this->array("[4, null, null, 6]")}); - this->Assert(ElementWiseMin, this->array("[1, 2, null, 6]"), + this->Assert(MinElementWise, this->array("[1, 2, null, 6]"), {this->array("[4, null, null, 6]"), this->array("[1, 2, null, null]")}); - this->Assert(ElementWiseMin, this->array("[1, 2, 3, 4]"), + this->Assert(MinElementWise, this->array("[1, 2, 3, 4]"), {this->array("[1, 2, 3, 4]"), this->array("[null, null, null, null]")}); - this->Assert(ElementWiseMin, this->array("[1, 2, 3, 4]"), + this->Assert(MinElementWise, this->array("[1, 2, 3, 4]"), {this->array("[null, null, null, null]"), this->array("[1, 2, 3, 4]")}); - this->Assert(ElementWiseMin, this->array("[1, 1, 1, 1]"), + this->Assert(MinElementWise, this->array("[1, 1, 1, 1]"), {this->scalar("1"), this->array("[1, 2, 3, 4]")}); - this->Assert(ElementWiseMin, this->array("[1, 1, 1, 1]"), + this->Assert(MinElementWise, this->array("[1, 
1, 1, 1]"), {this->scalar("1"), this->array("[null, null, null, null]")}); - this->Assert(ElementWiseMin, this->array("[1, 1, 1, 1]"), + this->Assert(MinElementWise, this->array("[1, 1, 1, 1]"), {this->scalar("null"), this->array("[1, 1, 1, 1]")}); - this->Assert(ElementWiseMin, this->array("[null, null, null, null]"), + this->Assert(MinElementWise, this->array("[null, null, null, null]"), {this->scalar("null"), this->array("[null, null, null, null]")}); // Test null handling this->element_wise_aggregate_options_.skip_nulls = false; - this->AssertNullScalar(ElementWiseMin, {this->scalar("null"), this->scalar("null")}); - this->AssertNullScalar(ElementWiseMin, {this->scalar("0"), this->scalar("null")}); + this->AssertNullScalar(MinElementWise, {this->scalar("null"), this->scalar("null")}); + this->AssertNullScalar(MinElementWise, {this->scalar("0"), this->scalar("null")}); - this->Assert(ElementWiseMin, this->array("[1, null, 2, 2]"), + this->Assert(MinElementWise, this->array("[1, null, 2, 2]"), {this->array("[1, null, 3, 4]"), this->scalar("2"), this->scalar("4")}); - this->Assert(ElementWiseMin, this->array("[null, null, null, null]"), + this->Assert(MinElementWise, this->array("[null, null, null, null]"), {this->array("[1, null, 3, 4]"), this->scalar("null"), this->scalar("2")}); - this->Assert(ElementWiseMin, this->array("[1, null, 2, 2]"), + this->Assert(MinElementWise, this->array("[1, null, 2, 2]"), {this->array("[1, 2, 3, 4]"), this->array("[2, null, 2, 2]")}); - this->Assert(ElementWiseMin, this->array("[null, null, null, null]"), + this->Assert(MinElementWise, this->array("[null, null, null, null]"), {this->scalar("1"), this->array("[null, null, null, null]")}); - this->Assert(ElementWiseMin, this->array("[null, null, null, null]"), + this->Assert(MinElementWise, this->array("[null, null, null, null]"), {this->scalar("null"), this->array("[1, 1, 1, 1]")}); } -TYPED_TEST(TestVarArgsCompareFloating, ElementWiseMin) { +TYPED_TEST(TestVarArgsCompareFloating, 
MinElementWise) { auto Check = [this](const std::string& expected, const std::vector& inputs) { std::vector args; for (const auto& input : inputs) { args.emplace_back(this->scalar(input)); } - this->Assert(ElementWiseMin, this->scalar(expected), args); + this->Assert(MinElementWise, this->scalar(expected), args); args.clear(); for (const auto& input : inputs) { args.emplace_back(this->array("[" + input + "]")); } - this->Assert(ElementWiseMin, this->array("[" + expected + "]"), args); + this->Assert(MinElementWise, this->array("[" + expected + "]"), args); }; Check("-0.0", {"0.0", "-0.0"}); Check("-0.0", {"1.0", "-0.0", "0.0"}); @@ -828,111 +828,111 @@ TYPED_TEST(TestVarArgsCompareFloating, ElementWiseMin) { Check("-Inf", {"0", "-Inf"}); } -TYPED_TEST(TestVarArgsCompareParametricTemporal, ElementWiseMin) { +TYPED_TEST(TestVarArgsCompareParametricTemporal, MinElementWise) { // Temporal kernel is implemented with numeric kernel underneath - this->AssertNullScalar(ElementWiseMin, {}); - this->AssertNullScalar(ElementWiseMin, {this->scalar("null"), this->scalar("null")}); + this->AssertNullScalar(MinElementWise, {}); + this->AssertNullScalar(MinElementWise, {this->scalar("null"), this->scalar("null")}); - this->Assert(ElementWiseMin, this->scalar("0"), {this->scalar("0")}); - this->Assert(ElementWiseMin, this->scalar("0"), {this->scalar("2"), this->scalar("0")}); - this->Assert(ElementWiseMin, this->scalar("0"), + this->Assert(MinElementWise, this->scalar("0"), {this->scalar("0")}); + this->Assert(MinElementWise, this->scalar("0"), {this->scalar("2"), this->scalar("0")}); + this->Assert(MinElementWise, this->scalar("0"), {this->scalar("0"), this->scalar("null")}); - this->Assert(ElementWiseMin, (this->array("[]")), {this->array("[]")}); - this->Assert(ElementWiseMin, this->array("[1, 2, 3, null]"), + this->Assert(MinElementWise, (this->array("[]")), {this->array("[]")}); + this->Assert(MinElementWise, this->array("[1, 2, 3, null]"), {this->array("[1, 2, 3, null]")}); - 
this->Assert(ElementWiseMin, this->array("[1, 2, 2, 2]"), + this->Assert(MinElementWise, this->array("[1, 2, 2, 2]"), {this->array("[1, null, 3, 4]"), this->scalar("null"), this->scalar("2")}); - this->Assert(ElementWiseMin, this->array("[1, 2, 3, 2]"), + this->Assert(MinElementWise, this->array("[1, 2, 3, 2]"), {this->array("[1, null, 3, 4]"), this->array("[2, 2, null, 2]")}); } -TYPED_TEST(TestVarArgsCompareNumeric, ElementWiseMax) { - this->AssertNullScalar(ElementWiseMax, {}); - this->AssertNullScalar(ElementWiseMax, {this->scalar("null"), this->scalar("null")}); +TYPED_TEST(TestVarArgsCompareNumeric, MaxElementWise) { + this->AssertNullScalar(MaxElementWise, {}); + this->AssertNullScalar(MaxElementWise, {this->scalar("null"), this->scalar("null")}); - this->Assert(ElementWiseMax, this->scalar("0"), {this->scalar("0")}); - this->Assert(ElementWiseMax, this->scalar("2"), + this->Assert(MaxElementWise, this->scalar("0"), {this->scalar("0")}); + this->Assert(MaxElementWise, this->scalar("2"), {this->scalar("2"), this->scalar("0"), this->scalar("1")}); this->Assert( - ElementWiseMax, this->scalar("2"), + MaxElementWise, this->scalar("2"), {this->scalar("2"), this->scalar("0"), this->scalar("1"), this->scalar("null")}); - this->Assert(ElementWiseMax, this->scalar("1"), + this->Assert(MaxElementWise, this->scalar("1"), {this->scalar("null"), this->scalar("null"), this->scalar("1"), this->scalar("null")}); - this->Assert(ElementWiseMax, (this->array("[]")), {this->array("[]")}); - this->Assert(ElementWiseMax, this->array("[1, 2, 3, null]"), + this->Assert(MaxElementWise, (this->array("[]")), {this->array("[]")}); + this->Assert(MaxElementWise, this->array("[1, 2, 3, null]"), {this->array("[1, 2, 3, null]")}); - this->Assert(ElementWiseMax, this->array("[2, 2, 3, 4]"), + this->Assert(MaxElementWise, this->array("[2, 2, 3, 4]"), {this->array("[1, 2, 3, 4]"), this->scalar("2")}); - this->Assert(ElementWiseMax, this->array("[2, 2, 3, 4]"), + this->Assert(MaxElementWise, 
this->array("[2, 2, 3, 4]"), {this->array("[1, null, 3, 4]"), this->scalar("2")}); - this->Assert(ElementWiseMax, this->array("[4, 4, 4, 4]"), + this->Assert(MaxElementWise, this->array("[4, 4, 4, 4]"), {this->array("[1, null, 3, 4]"), this->scalar("2"), this->scalar("4")}); - this->Assert(ElementWiseMax, this->array("[2, 2, 3, 4]"), + this->Assert(MaxElementWise, this->array("[2, 2, 3, 4]"), {this->array("[1, null, 3, 4]"), this->scalar("null"), this->scalar("2")}); - this->Assert(ElementWiseMax, this->array("[2, 2, 3, 4]"), + this->Assert(MaxElementWise, this->array("[2, 2, 3, 4]"), {this->array("[1, 2, 3, 4]"), this->array("[2, 2, 2, 2]")}); - this->Assert(ElementWiseMax, this->array("[2, 2, 3, 4]"), + this->Assert(MaxElementWise, this->array("[2, 2, 3, 4]"), {this->array("[1, 2, 3, 4]"), this->array("[2, null, 2, 2]")}); - this->Assert(ElementWiseMax, this->array("[2, 2, 3, 4]"), + this->Assert(MaxElementWise, this->array("[2, 2, 3, 4]"), {this->array("[1, null, 3, 4]"), this->array("[2, 2, 2, 2]")}); - this->Assert(ElementWiseMax, this->array("[4, 2, null, 6]"), + this->Assert(MaxElementWise, this->array("[4, 2, null, 6]"), {this->array("[1, 2, null, null]"), this->array("[4, null, null, 6]")}); - this->Assert(ElementWiseMax, this->array("[4, 2, null, 6]"), + this->Assert(MaxElementWise, this->array("[4, 2, null, 6]"), {this->array("[4, null, null, 6]"), this->array("[1, 2, null, null]")}); - this->Assert(ElementWiseMax, this->array("[1, 2, 3, 4]"), + this->Assert(MaxElementWise, this->array("[1, 2, 3, 4]"), {this->array("[1, 2, 3, 4]"), this->array("[null, null, null, null]")}); - this->Assert(ElementWiseMax, this->array("[1, 2, 3, 4]"), + this->Assert(MaxElementWise, this->array("[1, 2, 3, 4]"), {this->array("[null, null, null, null]"), this->array("[1, 2, 3, 4]")}); - this->Assert(ElementWiseMax, this->array("[1, 2, 3, 4]"), + this->Assert(MaxElementWise, this->array("[1, 2, 3, 4]"), {this->scalar("1"), this->array("[1, 2, 3, 4]")}); - 
this->Assert(ElementWiseMax, this->array("[1, 1, 1, 1]"), + this->Assert(MaxElementWise, this->array("[1, 1, 1, 1]"), {this->scalar("1"), this->array("[null, null, null, null]")}); - this->Assert(ElementWiseMax, this->array("[1, 1, 1, 1]"), + this->Assert(MaxElementWise, this->array("[1, 1, 1, 1]"), {this->scalar("null"), this->array("[1, 1, 1, 1]")}); - this->Assert(ElementWiseMax, this->array("[null, null, null, null]"), + this->Assert(MaxElementWise, this->array("[null, null, null, null]"), {this->scalar("null"), this->array("[null, null, null, null]")}); // Test null handling this->element_wise_aggregate_options_.skip_nulls = false; - this->AssertNullScalar(ElementWiseMax, {this->scalar("null"), this->scalar("null")}); - this->AssertNullScalar(ElementWiseMax, {this->scalar("0"), this->scalar("null")}); + this->AssertNullScalar(MaxElementWise, {this->scalar("null"), this->scalar("null")}); + this->AssertNullScalar(MaxElementWise, {this->scalar("0"), this->scalar("null")}); - this->Assert(ElementWiseMax, this->array("[4, null, 4, 4]"), + this->Assert(MaxElementWise, this->array("[4, null, 4, 4]"), {this->array("[1, null, 3, 4]"), this->scalar("2"), this->scalar("4")}); - this->Assert(ElementWiseMax, this->array("[null, null, null, null]"), + this->Assert(MaxElementWise, this->array("[null, null, null, null]"), {this->array("[1, null, 3, 4]"), this->scalar("null"), this->scalar("2")}); - this->Assert(ElementWiseMax, this->array("[2, null, 3, 4]"), + this->Assert(MaxElementWise, this->array("[2, null, 3, 4]"), {this->array("[1, 2, 3, 4]"), this->array("[2, null, 2, 2]")}); - this->Assert(ElementWiseMax, this->array("[null, null, null, null]"), + this->Assert(MaxElementWise, this->array("[null, null, null, null]"), {this->scalar("1"), this->array("[null, null, null, null]")}); - this->Assert(ElementWiseMax, this->array("[null, null, null, null]"), + this->Assert(MaxElementWise, this->array("[null, null, null, null]"), {this->scalar("null"), this->array("[1, 1, 1, 
1]")}); } -TYPED_TEST(TestVarArgsCompareFloating, ElementWiseMax) { +TYPED_TEST(TestVarArgsCompareFloating, MaxElementWise) { auto Check = [this](const std::string& expected, const std::vector& inputs) { std::vector args; for (const auto& input : inputs) { args.emplace_back(this->scalar(input)); } - this->Assert(ElementWiseMax, this->scalar(expected), args); + this->Assert(MaxElementWise, this->scalar(expected), args); args.clear(); for (const auto& input : inputs) { args.emplace_back(this->array("[" + input + "]")); } - this->Assert(ElementWiseMax, this->array("[" + expected + "]"), args); + this->Assert(MaxElementWise, this->array("[" + expected + "]"), args); }; Check("0.0", {"0.0", "-0.0"}); Check("1.0", {"1.0", "-0.0", "0.0"}); @@ -948,34 +948,34 @@ TYPED_TEST(TestVarArgsCompareFloating, ElementWiseMax) { Check("0", {"0", "-Inf"}); } -TYPED_TEST(TestVarArgsCompareParametricTemporal, ElementWiseMax) { +TYPED_TEST(TestVarArgsCompareParametricTemporal, MaxElementWise) { // Temporal kernel is implemented with numeric kernel underneath - this->AssertNullScalar(ElementWiseMax, {}); - this->AssertNullScalar(ElementWiseMax, {this->scalar("null"), this->scalar("null")}); + this->AssertNullScalar(MaxElementWise, {}); + this->AssertNullScalar(MaxElementWise, {this->scalar("null"), this->scalar("null")}); - this->Assert(ElementWiseMax, this->scalar("0"), {this->scalar("0")}); - this->Assert(ElementWiseMax, this->scalar("2"), {this->scalar("2"), this->scalar("0")}); - this->Assert(ElementWiseMax, this->scalar("0"), + this->Assert(MaxElementWise, this->scalar("0"), {this->scalar("0")}); + this->Assert(MaxElementWise, this->scalar("2"), {this->scalar("2"), this->scalar("0")}); + this->Assert(MaxElementWise, this->scalar("0"), {this->scalar("0"), this->scalar("null")}); - this->Assert(ElementWiseMax, (this->array("[]")), {this->array("[]")}); - this->Assert(ElementWiseMax, this->array("[1, 2, 3, null]"), + this->Assert(MaxElementWise, (this->array("[]")), 
{this->array("[]")}); + this->Assert(MaxElementWise, this->array("[1, 2, 3, null]"), {this->array("[1, 2, 3, null]")}); - this->Assert(ElementWiseMax, this->array("[2, 2, 3, 4]"), + this->Assert(MaxElementWise, this->array("[2, 2, 3, 4]"), {this->array("[1, null, 3, 4]"), this->scalar("null"), this->scalar("2")}); - this->Assert(ElementWiseMax, this->array("[2, 2, 3, 4]"), + this->Assert(MaxElementWise, this->array("[2, 2, 3, 4]"), {this->array("[1, null, 3, 4]"), this->array("[2, 2, null, 2]")}); } -TEST(TestElementWiseMaxElementWiseMin, CommonTimestamp) { +TEST(TestMaxElementWiseMinElementWise, CommonTimestamp) { { auto t1 = std::make_shared(TimeUnit::SECOND); auto t2 = std::make_shared(TimeUnit::MILLI); auto expected = MakeScalar(t2, 1000).ValueOrDie(); ASSERT_OK_AND_ASSIGN(auto actual, - ElementWiseMin({Datum(MakeScalar(t1, 1).ValueOrDie()), + MinElementWise({Datum(MakeScalar(t1, 1).ValueOrDie()), Datum(MakeScalar(t2, 12000).ValueOrDie())})); AssertScalarsEqual(*expected, *actual.scalar(), /*verbose=*/true); } @@ -984,7 +984,7 @@ TEST(TestElementWiseMaxElementWiseMin, CommonTimestamp) { auto t2 = std::make_shared(TimeUnit::SECOND); auto expected = MakeScalar(t2, 86401).ValueOrDie(); ASSERT_OK_AND_ASSIGN(auto actual, - ElementWiseMax({Datum(MakeScalar(t1, 1).ValueOrDie()), + MaxElementWise({Datum(MakeScalar(t1, 1).ValueOrDie()), Datum(MakeScalar(t2, 86401).ValueOrDie())})); AssertScalarsEqual(*expected, *actual.scalar(), /*verbose=*/true); } @@ -994,7 +994,7 @@ TEST(TestElementWiseMaxElementWiseMin, CommonTimestamp) { auto t3 = std::make_shared(TimeUnit::SECOND); auto expected = MakeScalar(t3, 86400).ValueOrDie(); ASSERT_OK_AND_ASSIGN( - auto actual, ElementWiseMin({Datum(MakeScalar(t1, 1).ValueOrDie()), + auto actual, MinElementWise({Datum(MakeScalar(t1, 1).ValueOrDie()), Datum(MakeScalar(t2, 2 * 86400000).ValueOrDie())})); AssertScalarsEqual(*expected, *actual.scalar(), /*verbose=*/true); } diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc 
b/cpp/src/arrow/compute/kernels/scalar_string.cc index cd054fcea0e..3f63bf2c405 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string.cc @@ -3344,12 +3344,227 @@ struct BinaryJoin { } }; +using BinaryJoinElementWiseState = OptionsWrapper; + +template +struct BinaryJoinElementWise { + using ArrayType = typename TypeTraits::ArrayType; + using BuilderType = typename TypeTraits::BuilderType; + using offset_type = typename Type::offset_type; + + static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { + JoinOptions options = BinaryJoinElementWiseState::Get(ctx); + // Last argument is the separator (for consistency with binary_join) + if (std::all_of(batch.values.begin(), batch.values.end(), + [](const Datum& d) { return d.is_scalar(); })) { + return ExecOnlyScalar(ctx, options, batch, out); + } + return ExecContainingArrays(ctx, options, batch, out); + } + + static Status ExecOnlyScalar(KernelContext* ctx, const JoinOptions& options, + const ExecBatch& batch, Datum* out) { + BaseBinaryScalar* output = checked_cast(out->scalar().get()); + const size_t num_args = batch.values.size(); + if (num_args == 1) { + // Only separator, no values + ARROW_ASSIGN_OR_RAISE(output->value, ctx->Allocate(0)); + output->is_valid = batch.values[0].scalar()->is_valid; + return Status::OK(); + } + + int64_t final_size = CalculateRowSize(options, batch, 0); + if (final_size < 0) { + ARROW_ASSIGN_OR_RAISE(output->value, ctx->Allocate(0)); + output->is_valid = false; + return Status::OK(); + } + ARROW_ASSIGN_OR_RAISE(output->value, ctx->Allocate(final_size)); + const auto separator = UnboxScalar::Unbox(*batch.values.back().scalar()); + uint8_t* buf = output->value->mutable_data(); + bool first = true; + for (size_t i = 0; i < num_args - 1; i++) { + const Scalar& scalar = *batch[i].scalar(); + util::string_view s; + if (scalar.is_valid) { + s = UnboxScalar::Unbox(scalar); + } else { + switch (options.null_handling) { + 
case JoinOptions::EMIT_NULL: + // Handled by CalculateRowSize + DCHECK(false) << "unreachable"; + break; + case JoinOptions::SKIP: + continue; + case JoinOptions::REPLACE: + s = options.null_replacement; + break; + } + } + if (!first) { + buf = std::copy(separator.begin(), separator.end(), buf); + } + first = false; + buf = std::copy(s.begin(), s.end(), buf); + } + output->is_valid = true; + DCHECK_EQ(final_size, buf - output->value->mutable_data()); + return Status::OK(); + } + + static Status ExecContainingArrays(KernelContext* ctx, const JoinOptions& options, + const ExecBatch& batch, Datum* out) { + // Presize data to avoid reallocations + int64_t final_size = 0; + for (int64_t i = 0; i < batch.length; i++) { + auto size = CalculateRowSize(options, batch, i); + if (size > 0) final_size += size; + } + BuilderType builder(ctx->memory_pool()); + RETURN_NOT_OK(builder.Reserve(batch.length)); + RETURN_NOT_OK(builder.ReserveData(final_size)); + + std::vector valid_cols(batch.values.size()); + for (size_t row = 0; row < static_cast(batch.length); row++) { + size_t num_valid = 0; // Not counting separator + for (size_t col = 0; col < batch.values.size(); col++) { + if (batch[col].is_scalar()) { + const auto& scalar = *batch[col].scalar(); + if (scalar.is_valid) { + valid_cols[col] = UnboxScalar::Unbox(scalar); + if (col < batch.values.size() - 1) num_valid++; + } else { + valid_cols[col] = util::string_view(); + } + } else { + const ArrayData& array = *batch[col].array(); + if (!array.MayHaveNulls() || + BitUtil::GetBit(array.buffers[0]->data(), array.offset + row)) { + const offset_type* offsets = array.GetValues(1); + const uint8_t* data = array.GetValues(2, /*absolute_offset=*/0); + const int64_t length = offsets[row + 1] - offsets[row]; + valid_cols[col] = util::string_view( + reinterpret_cast(data + offsets[row]), length); + if (col < batch.values.size() - 1) num_valid++; + } else { + valid_cols[col] = util::string_view(); + } + } + } + + if 
(!valid_cols.back().data()) { + // Separator is null + builder.UnsafeAppendNull(); + continue; + } else if (batch.values.size() == 1) { + // Only given separator + builder.UnsafeAppendEmptyValue(); + continue; + } else if (num_valid < batch.values.size() - 1) { + // We had some nulls + if (options.null_handling == JoinOptions::EMIT_NULL) { + builder.UnsafeAppendNull(); + continue; + } + } + const auto separator = valid_cols.back(); + bool first = true; + for (size_t col = 0; col < batch.values.size() - 1; col++) { + util::string_view value = valid_cols[col]; + if (!value.data()) { + switch (options.null_handling) { + case JoinOptions::EMIT_NULL: + DCHECK(false) << "unreachable"; + break; + case JoinOptions::SKIP: + continue; + case JoinOptions::REPLACE: + value = options.null_replacement; + break; + } + } + if (first) { + builder.UnsafeAppend(value); + first = false; + continue; + } + builder.UnsafeExtendCurrent(separator); + builder.UnsafeExtendCurrent(value); + } + } + + std::shared_ptr string_array; + RETURN_NOT_OK(builder.Finish(&string_array)); + *out = *string_array->data(); + out->mutable_array()->type = batch[0].type(); + DCHECK_EQ(batch.length, out->array()->length); + DCHECK_EQ(final_size, + checked_cast(*string_array).total_values_length()); + return Status::OK(); + } + + // Compute the length of the output for the given position, or -1 if it would be null. 
+ static int64_t CalculateRowSize(const JoinOptions& options, const ExecBatch& batch, + const int64_t index) { + const auto num_args = batch.values.size(); + int64_t final_size = 0; + int64_t num_non_null_args = 0; + for (size_t i = 0; i < num_args; i++) { + int64_t element_size = 0; + bool valid = true; + if (batch[i].is_scalar()) { + const Scalar& scalar = *batch[i].scalar(); + valid = scalar.is_valid; + element_size = UnboxScalar::Unbox(scalar).size(); + } else { + const ArrayData& array = *batch[i].array(); + valid = !array.MayHaveNulls() || + BitUtil::GetBit(array.buffers[0]->data(), array.offset + index); + const offset_type* offsets = array.GetValues(1); + element_size = offsets[index + 1] - offsets[index]; + } + if (i == num_args - 1) { + if (!valid) return -1; + if (num_non_null_args > 1) { + // Add separator size (only if there were values to join) + final_size += (num_non_null_args - 1) * element_size; + } + break; + } + if (!valid) { + switch (options.null_handling) { + case JoinOptions::EMIT_NULL: + return -1; + case JoinOptions::SKIP: + continue; + case JoinOptions::REPLACE: + element_size = options.null_replacement.size(); + break; + } + } + num_non_null_args++; + final_size += element_size; + } + return final_size; + } +}; + const FunctionDoc binary_join_doc( "Join a list of strings together with a `separator` to form a single string", ("Insert `separator` between `list` elements, and concatenate them.\n" "Any null input and any null `list` element emits a null output.\n"), {"list", "separator"}); +const FunctionDoc binary_join_element_wise_doc( + "Join string arguments into one, using the last argument as the separator", + ("Insert the last argument of `strings` between the rest of the elements, " + "and concatenate them.\n" + "Any null separator element emits a null output. 
Null elements either " + "emit a null (the default), are skipped, or replaced with a given string.\n"), + {"*strings"}, "JoinOptions"); + +const auto kDefaultJoinOptions = JoinOptions::Defaults(); + template void AddBinaryJoinForListType(ScalarFunction* func) { for (const std::shared_ptr& ty : BaseBinaryTypes()) { @@ -3360,11 +3575,25 @@ void AddBinaryJoinForListType(ScalarFunction* func) { } void AddBinaryJoin(FunctionRegistry* registry) { - auto func = - std::make_shared("binary_join", Arity::Binary(), &binary_join_doc); - AddBinaryJoinForListType(func.get()); - AddBinaryJoinForListType(func.get()); - DCHECK_OK(registry->AddFunction(std::move(func))); + { + auto func = std::make_shared("binary_join", Arity::Binary(), + &binary_join_doc); + AddBinaryJoinForListType(func.get()); + AddBinaryJoinForListType(func.get()); + DCHECK_OK(registry->AddFunction(std::move(func))); + } + { + auto func = std::make_shared( + "binary_join_element_wise", Arity::VarArgs(/*min_args=*/1), + &binary_join_element_wise_doc, &kDefaultJoinOptions); + for (const auto& ty : BaseBinaryTypes()) { + DCHECK_OK( + func->AddKernel({InputType(ty)}, ty, + GenerateTypeAgnosticVarBinaryBase(ty), + BinaryJoinElementWiseState::Init)); + } + DCHECK_OK(registry->AddFunction(std::move(func))); + } } template