From c49815b5c6207d0afa786f1d8433e7fa8118303c Mon Sep 17 00:00:00 2001 From: Joel Lubinitsky Date: Tue, 30 Jul 2024 15:32:17 -0400 Subject: [PATCH 01/21] initial bool8 --- cpp/src/arrow/CMakeLists.txt | 1 + cpp/src/arrow/extension/CMakeLists.txt | 6 ++ cpp/src/arrow/extension/bool8.cc | 66 ++++++++++++++++ cpp/src/arrow/extension/bool8.h | 48 ++++++++++++ cpp/src/arrow/extension/bool8_test.cc | 100 +++++++++++++++++++++++++ 5 files changed, 221 insertions(+) create mode 100644 cpp/src/arrow/extension/bool8.cc create mode 100644 cpp/src/arrow/extension/bool8.h create mode 100644 cpp/src/arrow/extension/bool8_test.cc diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 67d2c19f98a..81109605650 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -906,6 +906,7 @@ endif() if(ARROW_JSON) arrow_add_object_library(ARROW_JSON + extension/bool8.cc extension/fixed_shape_tensor.cc extension/opaque.cc json/options.cc diff --git a/cpp/src/arrow/extension/CMakeLists.txt b/cpp/src/arrow/extension/CMakeLists.txt index 6741ab602f5..fcd5fa529ab 100644 --- a/cpp/src/arrow/extension/CMakeLists.txt +++ b/cpp/src/arrow/extension/CMakeLists.txt @@ -15,6 +15,12 @@ # specific language governing permissions and limitations # under the License. +add_arrow_test(test + SOURCES + bool8_test.cc + PREFIX + "arrow-extension-bool8") + add_arrow_test(test SOURCES fixed_shape_tensor_test.cc diff --git a/cpp/src/arrow/extension/bool8.cc b/cpp/src/arrow/extension/bool8.cc new file mode 100644 index 00000000000..cf9217d93d8 --- /dev/null +++ b/cpp/src/arrow/extension/bool8.cc @@ -0,0 +1,66 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include + +#include "arrow/extension/bool8.h" +#include "arrow/util/logging.h" + +namespace arrow::extension { + +bool Bool8Type::ExtensionEquals(const ExtensionType& other) const { + return extension_name() == other.extension_name(); +} + +std::string Bool8Type::ToString(bool show_metadata) const { + std::stringstream ss; + ss << "extension<" << this->extension_name() << ">"; + return ss.str(); +} + +std::string Bool8Type::Serialize() const { + return ""; +} + +Result> Bool8Type::Deserialize( + std::shared_ptr storage_type, const std::string& serialized_data) const { + if (storage_type->id() != Type::INT8) { + return Status::Invalid("Expected INT8 storage type, got ", + storage_type->ToString()); + } + return bool8(); +} + +std::shared_ptr Bool8Type::MakeArray( + std::shared_ptr data) const { + DCHECK_EQ(data->type->id(), Type::EXTENSION); + DCHECK_EQ("arrow.bool8", + internal::checked_cast(*data->type).extension_name()); + return std::make_shared(data); +} + +Result> Bool8Type::Make() { + return std::make_shared(); +} + +std::shared_ptr bool8() { + auto maybe_type = Bool8Type::Make(); + ARROW_DCHECK_OK(maybe_type.status()); + return maybe_type.MoveValueUnsafe(); +} + +} // namespace arrow::extension diff --git a/cpp/src/arrow/extension/bool8.h b/cpp/src/arrow/extension/bool8.h new file mode 100644 index 00000000000..90787f1d998 --- /dev/null +++ b/cpp/src/arrow/extension/bool8.h @@ -0,0 +1,48 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/extension_type.h" + +namespace arrow { +namespace extension { + +class ARROW_EXPORT Bool8Array : public ExtensionArray {}; + +class ARROW_EXPORT Bool8Type : public ExtensionType { + public: + Bool8Type(): ExtensionType(int8()) {} + + std::string extension_name() const override { return "arrow.bool8"; } + std::string ToString(bool show_metadata = false) const override; + + bool ExtensionEquals(const ExtensionType& other) const override; + + std::string Serialize() const override; + + Result> Deserialize( + std::shared_ptr storage_type, + const std::string& serialized_data) const override; + + std::shared_ptr MakeArray(std::shared_ptr data) const override; + + static Result> Make(); +}; + +ARROW_EXPORT std::shared_ptr bool8(); + +} // namespace extension +} // namespace arrow diff --git a/cpp/src/arrow/extension/bool8_test.cc b/cpp/src/arrow/extension/bool8_test.cc new file mode 100644 index 00000000000..4f58a970e93 --- /dev/null +++ b/cpp/src/arrow/extension/bool8_test.cc @@ -0,0 +1,100 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/extension/bool8.h" +#include "arrow/io/memory.h" +#include "arrow/ipc/reader.h" +#include "arrow/ipc/writer.h" +#include "arrow/testing/extension_type.h" +#include "arrow/testing/gtest_util.h" + +namespace arrow { + +TEST(Bool8Type, Basics) { + auto type = internal::checked_pointer_cast( + extension::bool8()); + auto type2 = internal::checked_pointer_cast( + extension::bool8()); + ASSERT_EQ("arrow.bool8", type->extension_name()); + ASSERT_EQ(*type, *type); + ASSERT_NE(*arrow::null(), *type); + ASSERT_EQ(*type, *type2); + ASSERT_EQ(*arrow::int8(), *type->storage_type()); + ASSERT_EQ("", type->Serialize()); + ASSERT_EQ("extension", type->ToString(false)); +} + +TEST(Bool8Type, CreateFromArray) { + auto type = internal::checked_pointer_cast( + extension::bool8()); + auto storage = ArrayFromJSON(int8(), "[-1,0,1,2,null]"); + auto array = ExtensionType::WrapArray(type, storage); + ASSERT_EQ(5, array->length()); + ASSERT_EQ(1, array->null_count()); +} + +TEST(Bool8Type, Deserialize) { + auto type = internal::checked_pointer_cast( + extension::bool8()); + ASSERT_OK_AND_ASSIGN(auto deserialized, + type->Deserialize(type->storage_type(), "")); + ASSERT_EQ(*type, *deserialized); + ASSERT_OK_AND_ASSIGN(deserialized, + type->Deserialize(type->storage_type(), "doesn't matter")); + ASSERT_EQ(*type, *deserialized); + ASSERT_NOT_OK(type->Deserialize(uint8(), "")); + ASSERT_EQ(*type, *deserialized); +} + +TEST(Bool8Type, MetadataRoundTrip) { + auto type = internal::checked_pointer_cast( + extension::bool8()); + std::string serialized = type->Serialize(); + ASSERT_OK_AND_ASSIGN(auto deserialized, + type->Deserialize(type->storage_type(), serialized)); + ASSERT_EQ(*type, *deserialized); +} + +TEST(Bool8Type, BatchRoundTrip) { + auto type = internal::checked_pointer_cast( + extension::bool8()); + ExtensionTypeGuard guard(type); + + auto storage = ArrayFromJSON(int8(), "[-1,0,1,2,null]"); + auto array = ExtensionType::WrapArray(type, storage); + auto batch = + RecordBatch::Make(schema({field("field", type)}), array->length(), {array}); + + std::shared_ptr written; + { + ASSERT_OK_AND_ASSIGN(auto out_stream, io::BufferOutputStream::Create()); + ASSERT_OK(ipc::WriteRecordBatchStream({batch}, ipc::IpcWriteOptions::Defaults(), + out_stream.get())); + + ASSERT_OK_AND_ASSIGN(auto complete_ipc_stream, out_stream->Finish()); + + io::BufferReader reader(complete_ipc_stream); + std::shared_ptr batch_reader; + ASSERT_OK_AND_ASSIGN(batch_reader, ipc::RecordBatchStreamReader::Open(&reader)); + ASSERT_OK(batch_reader->ReadNext(&written)); + } + + ASSERT_EQ(*batch->schema(), *written->schema()); + ASSERT_BATCHES_EQUAL(*batch, *written); +} + +} // namespace arrow \ No newline at end of file From c51cb628749b0721b6aae7ceb3bef8b70f1c926f Mon Sep 17 00:00:00 2001 From: Joel Lubinitsky Date: Thu, 1 Aug 2024 14:16:51 -0500 Subject: [PATCH 02/21] fix makearray and add comments --- cpp/src/arrow/extension/bool8.cc | 2 +- cpp/src/arrow/extension/bool8.h | 14 +++++++++++++- cpp/src/arrow/extension/bool8_test.cc | 2 +- 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/extension/bool8.cc b/cpp/src/arrow/extension/bool8.cc index cf9217d93d8..e5119f42bc5 100644 --- a/cpp/src/arrow/extension/bool8.cc +++ b/cpp/src/arrow/extension/bool8.cc @@ -50,7 +50,7 @@ std::shared_ptr Bool8Type::MakeArray( DCHECK_EQ(data->type->id(), Type::EXTENSION); DCHECK_EQ("arrow.bool8", internal::checked_cast(*data->type).extension_name()); - return std::make_shared(data); + return std::make_shared(data); } Result> Bool8Type::Make() { diff --git a/cpp/src/arrow/extension/bool8.h b/cpp/src/arrow/extension/bool8.h index 90787f1d998..2cd57d03689 100644 --- a/cpp/src/arrow/extension/bool8.h +++ b/cpp/src/arrow/extension/bool8.h @@ -20,10 +20,20 @@ namespace arrow { namespace extension { -class ARROW_EXPORT Bool8Array : public ExtensionArray {}; +/// \brief Bool8 is an alternate representation for boolean +/// arrays using 8 bits instead of 1 bit per value. The underlying +/// storage type is int8. +class ARROW_EXPORT Bool8Array : public ExtensionArray { + public: + using ExtensionArray::ExtensionArray; +}; +/// \brief Bool8 is an alternate representation for boolean +/// arrays using 8 bits instead of 1 bit per value. The underlying +/// storage type is int8. class ARROW_EXPORT Bool8Type : public ExtensionType { public: + /// \brief Construct a Bool8Type. Bool8Type(): ExtensionType(int8()) {} std::string extension_name() const override { return "arrow.bool8"; } @@ -37,11 +47,13 @@ class ARROW_EXPORT Bool8Type : public ExtensionType { std::shared_ptr storage_type, const std::string& serialized_data) const override; + /// Create a Bool8Array from ArrayData std::shared_ptr MakeArray(std::shared_ptr data) const override; static Result> Make(); }; +/// \brief Return a Bool8Type instance. ARROW_EXPORT std::shared_ptr bool8(); } // namespace extension diff --git a/cpp/src/arrow/extension/bool8_test.cc b/cpp/src/arrow/extension/bool8_test.cc index 4f58a970e93..f3cbf1c3dc7 100644 --- a/cpp/src/arrow/extension/bool8_test.cc +++ b/cpp/src/arrow/extension/bool8_test.cc @@ -97,4 +97,4 @@ TEST(Bool8Type, BatchRoundTrip) { ASSERT_BATCHES_EQUAL(*batch, *written); } -} // namespace arrow \ No newline at end of file +} // namespace arrow From ca7b56d4336e68ffaa69064a00ff7b87772bbfff Mon Sep 17 00:00:00 2001 From: Joel Lubinitsky Date: Thu, 1 Aug 2024 18:38:35 -0500 Subject: [PATCH 03/21] add python bindings --- python/pyarrow/__init__.py | 7 ++- python/pyarrow/array.pxi | 24 ++++++++ python/pyarrow/includes/libarrow.pxd | 11 ++++ python/pyarrow/lib.pxd | 3 + python/pyarrow/public-api.pxi | 2 + python/pyarrow/scalar.pxi | 4 ++ python/pyarrow/tests/test_extension_type.py | 58 ++++++++++++++++++++ python/pyarrow/tests/test_misc.py | 3 + python/pyarrow/types.pxi | 61 +++++++++++++++++++++ 9 files changed, 170 insertions(+), 3 deletions(-) diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py index aa7bab9f97e..807bcdc3150 100644 --- a/python/pyarrow/__init__.py +++ b/python/pyarrow/__init__.py @@ -174,6 +174,7 @@ def print_entry(label, value): run_end_encoded, fixed_shape_tensor, opaque, + bool8, field, type_for_alias, DataType, DictionaryType, StructType, @@ -184,7 +185,7 @@ def print_entry(label, value): FixedSizeBinaryType, Decimal128Type, Decimal256Type, BaseExtensionType, ExtensionType, RunEndEncodedType, FixedShapeTensorType, OpaqueType, - PyExtensionType, UnknownExtensionType, + Bool8Type, PyExtensionType, UnknownExtensionType, register_extension_type, unregister_extension_type, DictionaryMemo, KeyValueMetadata, @@ -218,7 +219,7 @@ def print_entry(label, value): MonthDayNanoIntervalArray, Decimal128Array, Decimal256Array, StructArray, ExtensionArray, RunEndEncodedArray, FixedShapeTensorArray, OpaqueArray, - scalar, NA, _NULL as NULL, Scalar, + Bool8Array, scalar, NA, _NULL as NULL, Scalar, NullScalar, BooleanScalar, Int8Scalar, Int16Scalar, Int32Scalar, Int64Scalar, UInt8Scalar, UInt16Scalar, UInt32Scalar, UInt64Scalar, @@ -235,7 +236,7 @@ def print_entry(label, value): FixedSizeBinaryScalar, DictionaryScalar, MapScalar, StructScalar, UnionScalar, RunEndEncodedScalar, ExtensionScalar, - FixedShapeTensorScalar, OpaqueScalar) + FixedShapeTensorScalar, OpaqueScalar, Bool8Scalar) # Buffers, allocation from pyarrow.lib import (DeviceAllocationType, Device, MemoryManager, diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 6c40a21db96..a41576c34c7 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -4447,6 +4447,30 @@ cdef class FixedShapeTensorArray(ExtensionArray): FixedSizeListArray.from_arrays(values, shape[1:].prod()) ) +cdef class Bool8Array(ExtensionArray): + """ + Concrete class for bool8 extension arrays. + Examples + -------- + Define the extension type for an bool8 array + >>> import pyarrow as pa + >>> bool8_type = pa.bool8() + Create an extension array + >>> arr = [-1, 0, 1, 2, None] + >>> storage = pa.array(arr, pa.int8()) + >>> pa.ExtensionArray.from_storage(bool8_type, storage) + + [ + -1, + 0, + 1, + 2, + null + ] + """ + + def to_numpy(self): + return self.storage.to_numpy().view(np.bool_) cdef class OpaqueArray(ExtensionArray): """ diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 9b008d150f1..1c36840abba 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -2895,6 +2895,17 @@ cdef extern from "arrow/extension/opaque.h" namespace "arrow::extension" nogil: pass +cdef extern from "arrow/extension/bool8.h" namespace "arrow::extension" nogil: + cdef cppclass CBool8Type \ + " arrow::extension::Bool8Type"(CExtensionType): + + @staticmethod + CResult[shared_ptr[CDataType]] Make() + + cdef cppclass CBool8Array \ + " arrow::extension::Bool8Array"(CExtensionArray): + pass + cdef extern from "arrow/util/compression.h" namespace "arrow" nogil: cdef enum CCompressionType" arrow::Compression::type": CCompressionType_UNCOMPRESSED" arrow::Compression::UNCOMPRESSED" diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd index 2cb302d20a8..e3625c18152 100644 --- a/python/pyarrow/lib.pxd +++ b/python/pyarrow/lib.pxd @@ -214,6 +214,9 @@ cdef class FixedShapeTensorType(BaseExtensionType): cdef: const CFixedShapeTensorType* tensor_ext_type +cdef class Bool8Type(BaseExtensionType): + cdef: + const CBool8Type* bool8_ext_type cdef class OpaqueType(BaseExtensionType): cdef: diff --git a/python/pyarrow/public-api.pxi b/python/pyarrow/public-api.pxi index 2f9fc1c5542..19a26bd6c68 100644 --- a/python/pyarrow/public-api.pxi +++ b/python/pyarrow/public-api.pxi @@ -126,6 +126,8 @@ cdef api object pyarrow_wrap_data_type( out = FixedShapeTensorType.__new__(FixedShapeTensorType) elif ext_type.extension_name() == b"arrow.opaque": out = OpaqueType.__new__(OpaqueType) + elif ext_type.extension_name() == b"arrow.bool8": + out = Bool8Type.__new__(Bool8Type) else: out = BaseExtensionType.__new__(BaseExtensionType) else: diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi index 12a99c2aece..584b32dce5d 100644 --- a/python/pyarrow/scalar.pxi +++ b/python/pyarrow/scalar.pxi @@ -1084,6 +1084,10 @@ cdef class FixedShapeTensorScalar(ExtensionScalar): ctensor = GetResultValue(c_type.MakeTensor(scalar)) return pyarrow_wrap_tensor(ctensor) +cdef class Bool8Scalar(ExtensionScalar): + """ + Concrete class for bool8 extension scalar. + """ cdef class OpaqueScalar(ExtensionScalar): """ diff --git a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py index 58c54189f22..958b618faf7 100644 --- a/python/pyarrow/tests/test_extension_type.py +++ b/python/pyarrow/tests/test_extension_type.py @@ -1707,3 +1707,61 @@ def test_opaque_type(pickle_module, storage_type, storage): # cast extension type -> storage type inner = arr.cast(storage_type) assert inner == storage + +def test_bool8_type(pickle_module): + bool8_type = pa.bool8() + storage_type = pa.int8() + assert bool8_type.extension_name == "arrow.bool8" + assert bool8_type.storage_type == storage_type + assert str(bool8_type) == "extension" + + assert bool8_type == bool8_type + assert bool8_type == pa.bool8() + assert bool8_type != storage_type + + # Pickle roundtrip + result = pickle_module.loads(pickle_module.dumps(bool8_type)) + assert result == bool8_type + + # IPC roundtrip + bool8_arr_class = bool8_type.__arrow_ext_class__() + storage = pa.array([-1, 0, 1, 2, None], storage_type) + arr = pa.ExtensionArray.from_storage(bool8_type, storage) + assert isinstance(arr, bool8_arr_class) + + with registered_extension_type(bool8_type): + buf = ipc_write_batch(pa.RecordBatch.from_arrays([arr], ["ext"])) + batch = ipc_read_batch(buf) + + assert batch.column(0).type.extension_name == "arrow.bool8" + assert isinstance(batch.column(0), bool8_arr_class) + + # cast storage -> extension type + result = storage.cast(bool8_type) + assert result == arr + + # cast extension type -> storage type + inner = arr.cast(storage_type) + assert inner == storage + + # cast extension type -> arrow boolean type + bool_type = pa.bool_() + arrow_bool_arr = pa.array([True, False, True, True, None], bool_type) + cast_bool_arr = arr.cast(bool_type) + assert cast_bool_arr == arrow_bool_arr + + # cast arrow boolean type -> extension type, expecting canonical values + cast_bool8_arr = arrow_bool_arr.cast(bool8_type) + canonical_storage = pa.array([1, 0, 1, 1, None], storage_type) + canonical_bool8_arr = pa.ExtensionArray.from_storage(bool8_type, canonical_storage) + assert cast_bool8_arr == canonical_bool8_arr + + # zero-copy convert to numpy if non-null + with pytest.raises(pa.ArrowInvalid, match="Needed to copy 1 chunks with 1 nulls, but zero_copy_only was True"): + arr.to_numpy() + + arr_np_bool = np.array([True, False, True, True], dtype=np.bool_) + arr_no_nulls = pa.ExtensionArray.from_storage(bool8_type, pa.array([-1, 0, 1, 2], storage_type)) + arr_to_np = arr_no_nulls.to_numpy() + assert np.array_equal(arr_to_np, arr_np_bool) + assert arr_to_np.ctypes.data == arr_no_nulls.buffers()[1].address # zero-copy diff --git a/python/pyarrow/tests/test_misc.py b/python/pyarrow/tests/test_misc.py index 9a55a38177f..5d3471c7c35 100644 --- a/python/pyarrow/tests/test_misc.py +++ b/python/pyarrow/tests/test_misc.py @@ -250,6 +250,9 @@ def test_set_timezone_db_path_non_windows(): pa.OpaqueArray, pa.OpaqueScalar, pa.OpaqueType, + pa.Bool8Array, + pa.Bool8Scalar, + pa.Bool8Type, ]) def test_extension_type_constructor_errors(klass): # ARROW-2638: prevent calling extension class constructors directly diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index 93d68fb8478..97af82c7ea4 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -1836,6 +1836,32 @@ cdef class FixedShapeTensorType(BaseExtensionType): def __arrow_ext_scalar_class__(self): return FixedShapeTensorScalar +cdef class Bool8Type(BaseExtensionType): + """ + Concrete class for bool8 extension type. + Bool8 is an alternate representation for boolean + arrays using 8 bits instead of 1 bit per value. The underlying + storage type is int8. + Examples + -------- + Create an instance of bool8 extension type: + >>> import pyarrow as pa + >>> pa.bool8() + Bool8Type(extension) + """ + + cdef void init(self, const shared_ptr[CDataType]& type) except *: + BaseExtensionType.init(self, type) + self.bool8_ext_type = type.get() + + def __arrow_ext_class__(self): + return Bool8Array + + def __reduce__(self): + return bool8, () + + def __arrow_ext_scalar_class__(self): + return Bool8Scalar cdef class OpaqueType(BaseExtensionType): """ @@ -5277,6 +5303,41 @@ def fixed_shape_tensor(DataType value_type, shape, dim_names=None, permutation=N return out +def bool8(): + """ + Create instance of bool8 extension type. + Examples + -------- + Create an instance of bool8 extension type: + >>> import pyarrow as pa + >>> type = pa.bool8() + >>> type + Bool8Type(extension) + Inspect the data type: + >>> type.storage_type + DataType(int8) + Create a table with a bool8 array: + >>> arr = [-1, 0, 1, 2, None] + >>> storage = pa.array(arr, pa.int8()) + >>> other = pa.ExtensionArray.from_storage(type, storage) + >>> pa.table([other], names=["unknown_col"]) + pyarrow.Table + unknown_col: extension + ---- + unknown_col: [[True, False, True, True, null]] + Returns + ------- + type : Bool8Type + """ + + cdef Bool8Type out = Bool8Type.__new__(Bool8Type) + + with nogil: + c_type = GetResultValue(CBool8Type.Make()) + + out.init(c_type) + + return out def opaque(DataType storage_type, str type_name not None, str vendor_name not None): """ From 35d1ad914b9b36821b4594a94599fffa5e9d7d8d Mon Sep 17 00:00:00 2001 From: Joel Lubinitsky Date: Fri, 2 Aug 2024 10:11:06 -0500 Subject: [PATCH 04/21] handle non-zero-copy conversion to numpy --- python/pyarrow/array.pxi | 10 ++++++++-- python/pyarrow/tests/test_extension_type.py | 12 ++++++++---- 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index a41576c34c7..91f8e0c4db8 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -4469,8 +4469,14 @@ cdef class Bool8Array(ExtensionArray): ] """ - def to_numpy(self): - return self.storage.to_numpy().view(np.bool_) + def to_numpy(self, zero_copy_only=True, writable=False): + try: + return self.storage.to_numpy().view(np.bool_) + except ArrowInvalid as e: + if zero_copy_only: + raise e + + return _pc().not_equal(self.storage, 0).to_numpy(zero_copy_only=zero_copy_only, writable=writable) cdef class OpaqueArray(ExtensionArray): """ diff --git a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py index 958b618faf7..c69b5159dab 100644 --- a/python/pyarrow/tests/test_extension_type.py +++ b/python/pyarrow/tests/test_extension_type.py @@ -1756,12 +1756,16 @@ def test_bool8_type(pickle_module): canonical_bool8_arr = pa.ExtensionArray.from_storage(bool8_type, canonical_storage) assert cast_bool8_arr == canonical_bool8_arr - # zero-copy convert to numpy if non-null + # convert to numpy + ## cannot zero-copy with nulls with pytest.raises(pa.ArrowInvalid, match="Needed to copy 1 chunks with 1 nulls, but zero_copy_only was True"): arr.to_numpy() + + ## nullable conversion possible with a copy, but dest dtype is object + assert np.array_equal(arr.to_numpy(zero_copy_only=False), np.array([True, False, True, True, None], dtype=np.object_)) - arr_np_bool = np.array([True, False, True, True], dtype=np.bool_) + ## zero-copy possible with non-null array arr_no_nulls = pa.ExtensionArray.from_storage(bool8_type, pa.array([-1, 0, 1, 2], storage_type)) arr_to_np = arr_no_nulls.to_numpy() - assert np.array_equal(arr_to_np, arr_np_bool) - assert arr_to_np.ctypes.data == arr_no_nulls.buffers()[1].address # zero-copy + assert np.array_equal(arr_to_np, np.array([True, False, True, True], dtype=np.bool_)) + assert arr_to_np.ctypes.data == arr_no_nulls.buffers()[1].address # same underlying buffer From f07af50e6e6b71e1ff3f2e9de9571c58e6a76d7d Mon Sep 17 00:00:00 2001 From: Joel Lubinitsky Date: Fri, 2 Aug 2024 13:00:18 -0500 Subject: [PATCH 05/21] add from_numpy, improve tests --- python/pyarrow/array.pxi | 79 +++++++++++++++------ python/pyarrow/tests/test_extension_type.py | 64 +++++++++++++---- 2 files changed, 109 insertions(+), 34 deletions(-) diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 91f8e0c4db8..f25d6cb2b26 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -4447,6 +4447,35 @@ cdef class FixedShapeTensorArray(ExtensionArray): FixedSizeListArray.from_arrays(values, shape[1:].prod()) ) + +cdef class OpaqueArray(ExtensionArray): + """ + Concrete class for opaque extension arrays. + + Examples + -------- + Define the extension type for an opaque array + + >>> import pyarrow as pa + >>> opaque_type = pa.opaque( + ... pa.binary(), + ... type_name="geometry", + ... vendor_name="postgis", + ... ) + + Create an extension array + + >>> arr = [None, b"data"] + >>> storage = pa.array(arr, pa.binary()) + >>> pa.ExtensionArray.from_storage(opaque_type, storage) + + [ + null, + 64617461 + ] + """ + + cdef class Bool8Array(ExtensionArray): """ Concrete class for bool8 extension arrays. @@ -4478,32 +4507,38 @@ cdef class Bool8Array(ExtensionArray): return _pc().not_equal(self.storage, 0).to_numpy(zero_copy_only=zero_copy_only, writable=writable) -cdef class OpaqueArray(ExtensionArray): - """ - Concrete class for opaque extension arrays. + @staticmethod + def from_numpy(obj): + """ + Convert numpy array to a bool8 extension array without making a copy. + The input array must be 1-dimensional, with either bool_ or int8 dtype. - Examples - -------- - Define the extension type for an opaque array + Parameters + ---------- + obj : numpy.ndarray - >>> import pyarrow as pa - >>> opaque_type = pa.opaque( - ... pa.binary(), - ... type_name="geometry", - ... vendor_name="postgis", - ... ) + Examples + -------- + >>> import pyarrow as pa + >>> import numpy as np + >>> arr = np.array([True, False, True], dtype=np.bool_) + >>> pa.Bool8Array.from_numpy(arr) + + [ + 1, + 0, + 1 + ] + """ - Create an extension array + if obj.ndim != 1: + raise ValueError(f"Cannot convert {obj.ndim}-D array to bool8 array") + + if obj.dtype not in [np.bool_, np.int8]: + raise TypeError(f"Array dtype {obj.dtype} incompatible with bool8 storage") - >>> arr = [None, b"data"] - >>> storage = pa.array(arr, pa.binary()) - >>> pa.ExtensionArray.from_storage(opaque_type, storage) - - [ - null, - 64617461 - ] - """ + buf = foreign_buffer(obj.ctypes.data, obj.size) + return Array.from_buffers(bool8(), obj.size, [None, buf]) cdef dict _array_classes = { diff --git a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py index c69b5159dab..aeed093b5f3 100644 --- a/python/pyarrow/tests/test_extension_type.py +++ b/python/pyarrow/tests/test_extension_type.py @@ -1744,28 +1744,68 @@ def test_bool8_type(pickle_module): inner = arr.cast(storage_type) assert inner == storage +def test_bool8_to_bool_conversion(): + bool_arr = pa.array([True, False, True, True, None], pa.bool_()) + bool8_arr = pa.ExtensionArray.from_storage( + pa.bool8(), + pa.array([-1, 0, 1, 2, None], pa.int8()), + ) + # cast extension type -> arrow boolean type - bool_type = pa.bool_() - arrow_bool_arr = pa.array([True, False, True, True, None], bool_type) - cast_bool_arr = arr.cast(bool_type) - assert cast_bool_arr == arrow_bool_arr + assert bool8_arr.cast(pa.bool_()) == bool_arr # cast arrow boolean type -> extension type, expecting canonical values - cast_bool8_arr = arrow_bool_arr.cast(bool8_type) - canonical_storage = pa.array([1, 0, 1, 1, None], storage_type) - canonical_bool8_arr = pa.ExtensionArray.from_storage(bool8_type, canonical_storage) - assert cast_bool8_arr == canonical_bool8_arr + canonical_storage = pa.array([1, 0, 1, 1, None], pa.int8()) + canonical_bool8_arr = pa.ExtensionArray.from_storage(pa.bool8(), canonical_storage) + assert bool_arr.cast(pa.bool8()) == canonical_bool8_arr + +def test_bool8_to_numpy_conversion(): + arr = pa.ExtensionArray.from_storage( + pa.bool8(), + pa.array([-1, 0, 1, 2, None], pa.int8()), + ) - # convert to numpy ## cannot zero-copy with nulls with pytest.raises(pa.ArrowInvalid, match="Needed to copy 1 chunks with 1 nulls, but zero_copy_only was True"): arr.to_numpy() ## nullable conversion possible with a copy, but dest dtype is object - assert np.array_equal(arr.to_numpy(zero_copy_only=False), np.array([True, False, True, True, None], dtype=np.object_)) + assert np.array_equal( + arr.to_numpy(zero_copy_only=False), + np.array([True, False, True, True, None], dtype=np.object_), + ) ## zero-copy possible with non-null array - arr_no_nulls = pa.ExtensionArray.from_storage(bool8_type, pa.array([-1, 0, 1, 2], storage_type)) + np_arr_no_nulls = np.array([True, False, True, True], dtype=np.bool_) + arr_no_nulls = pa.ExtensionArray.from_storage( + pa.bool8(), + pa.array([-1, 0, 1, 2], pa.int8()), + ) + + arr_to_np = arr_no_nulls.to_numpy() - assert np.array_equal(arr_to_np, np.array([True, False, True, True], dtype=np.bool_)) + assert np.array_equal(arr_to_np, np_arr_no_nulls) assert arr_to_np.ctypes.data == arr_no_nulls.buffers()[1].address # same underlying buffer + + +def test_bool8_from_numpy_conversion(): + np_arr_no_nulls = np.array([True, False, True, True], dtype=np.bool_) + canonical_bool8_arr_no_nulls = pa.ExtensionArray.from_storage( + pa.bool8(), + pa.array([1, 0, 1, 1], pa.int8()), + ) + + arr_from_np = pa.Bool8Array.from_numpy(np_arr_no_nulls) + assert arr_from_np == canonical_bool8_arr_no_nulls + assert arr_from_np.buffers()[1].address == np_arr_no_nulls.ctypes.data # same underlying buffer + + # conversion only valid for 1-D arrays + with pytest.raises(ValueError, match="Cannot convert 2-D array to bool8 array"): + pa.Bool8Array.from_numpy(np.array([[True, False], [False, True]], dtype=np.bool_)) + + with pytest.raises(ValueError, match="Cannot convert 0-D array to bool8 array"): + pa.Bool8Array.from_numpy(np.bool_()) + + # must use compatible storage type + with pytest.raises(TypeError, match="Array dtype float64 incompatible with bool8 storage"): + pa.Bool8Array.from_numpy(np.array([1, 2, 3], dtype=np.float64)) From 3f45772540b78ba4c0741c829da46c7c6e2de3ed Mon Sep 17 00:00:00 2001 From: Joel Lubinitsky Date: Fri, 2 Aug 2024 13:13:36 -0500 Subject: [PATCH 06/21] simplify bool8 constructor --- cpp/src/arrow/extension/bool8.cc | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/cpp/src/arrow/extension/bool8.cc b/cpp/src/arrow/extension/bool8.cc index e5119f42bc5..21da7c9a7c9 100644 --- a/cpp/src/arrow/extension/bool8.cc +++ b/cpp/src/arrow/extension/bool8.cc @@ -58,9 +58,7 @@ Result> Bool8Type::Make() { } std::shared_ptr bool8() { - auto maybe_type = Bool8Type::Make(); - ARROW_DCHECK_OK(maybe_type.status()); - return maybe_type.MoveValueUnsafe(); + return std::make_shared(); } } // namespace arrow::extension From 03fa22fb7b7636778caa82e89b2de6baf5f27c6e Mon Sep 17 00:00:00 2001 From: Joel Lubinitsky Date: Tue, 6 Aug 2024 11:54:43 -0400 Subject: [PATCH 07/21] fix precommit --- cpp/src/arrow/extension/bool8.cc | 14 ++----- cpp/src/arrow/extension/bool8.h | 4 +- cpp/src/arrow/extension/bool8_test.cc | 25 +++++------- python/pyarrow/tests/test_extension_type.py | 44 +++++++++++++++------ 4 files changed, 47 insertions(+), 40 deletions(-) diff --git a/cpp/src/arrow/extension/bool8.cc b/cpp/src/arrow/extension/bool8.cc index 21da7c9a7c9..5be6b0082fe 100644 --- a/cpp/src/arrow/extension/bool8.cc +++ b/cpp/src/arrow/extension/bool8.cc @@ -32,21 +32,17 @@ std::string Bool8Type::ToString(bool show_metadata) const { return ss.str(); } -std::string Bool8Type::Serialize() const { - return ""; -} +std::string Bool8Type::Serialize() const { return ""; } Result> Bool8Type::Deserialize( std::shared_ptr storage_type, const std::string& serialized_data) const { if (storage_type->id() != Type::INT8) { - return Status::Invalid("Expected INT8 storage type, got ", - storage_type->ToString()); + return Status::Invalid("Expected INT8 storage type, got ", storage_type->ToString()); } return bool8(); } -std::shared_ptr Bool8Type::MakeArray( - std::shared_ptr data) const { +std::shared_ptr Bool8Type::MakeArray(std::shared_ptr data) const { DCHECK_EQ(data->type->id(), Type::EXTENSION); DCHECK_EQ("arrow.bool8", internal::checked_cast(*data->type).extension_name()); @@ -57,8 +53,6 @@ Result> Bool8Type::Make() { return std::make_shared(); } -std::shared_ptr bool8() { - return std::make_shared(); -} +std::shared_ptr bool8() { return std::make_shared(); } } // namespace arrow::extension diff --git a/cpp/src/arrow/extension/bool8.h b/cpp/src/arrow/extension/bool8.h index 2cd57d03689..8121d3e036c 100644 --- a/cpp/src/arrow/extension/bool8.h +++ b/cpp/src/arrow/extension/bool8.h @@ -33,8 +33,8 @@ class ARROW_EXPORT Bool8Array : public ExtensionArray { /// storage type is int8. class ARROW_EXPORT Bool8Type : public ExtensionType { public: - /// \brief Construct a Bool8Type. - Bool8Type(): ExtensionType(int8()) {} + /// \brief Construct a Bool8Type. + Bool8Type() : ExtensionType(int8()) {} std::string extension_name() const override { return "arrow.bool8"; } std::string ToString(bool show_metadata = false) const override; diff --git a/cpp/src/arrow/extension/bool8_test.cc b/cpp/src/arrow/extension/bool8_test.cc index f3cbf1c3dc7..2cdbf42c394 100644 --- a/cpp/src/arrow/extension/bool8_test.cc +++ b/cpp/src/arrow/extension/bool8_test.cc @@ -25,10 +25,8 @@ namespace arrow { TEST(Bool8Type, Basics) { - auto type = internal::checked_pointer_cast( - extension::bool8()); - auto type2 = internal::checked_pointer_cast( - extension::bool8()); + auto type = internal::checked_pointer_cast(extension::bool8()); + auto type2 = internal::checked_pointer_cast(extension::bool8()); ASSERT_EQ("arrow.bool8", type->extension_name()); ASSERT_EQ(*type, *type); ASSERT_NE(*arrow::null(), *type); @@ -39,8 +37,7 @@ TEST(Bool8Type, Basics) { } TEST(Bool8Type, CreateFromArray) { - auto type = internal::checked_pointer_cast( - extension::bool8()); + auto type = internal::checked_pointer_cast(extension::bool8()); auto storage = ArrayFromJSON(int8(), "[-1,0,1,2,null]"); auto array = ExtensionType::WrapArray(type, storage); ASSERT_EQ(5, array->length()); @@ -48,30 +45,26 @@ TEST(Bool8Type, CreateFromArray) { } TEST(Bool8Type, Deserialize) { - auto type = internal::checked_pointer_cast( - extension::bool8()); - ASSERT_OK_AND_ASSIGN(auto deserialized, - type->Deserialize(type->storage_type(), "")); + auto type = internal::checked_pointer_cast(extension::bool8()); + ASSERT_OK_AND_ASSIGN(auto deserialized, type->Deserialize(type->storage_type(), "")); ASSERT_EQ(*type, *deserialized); ASSERT_OK_AND_ASSIGN(deserialized, - type->Deserialize(type->storage_type(), "doesn't matter")); + type->Deserialize(type->storage_type(), "doesn't matter")); ASSERT_EQ(*type, *deserialized); ASSERT_NOT_OK(type->Deserialize(uint8(), "")); ASSERT_EQ(*type, *deserialized); } TEST(Bool8Type, MetadataRoundTrip) { - auto type = internal::checked_pointer_cast( - extension::bool8()); + auto type = internal::checked_pointer_cast(extension::bool8()); std::string serialized = type->Serialize(); ASSERT_OK_AND_ASSIGN(auto deserialized, - type->Deserialize(type->storage_type(), serialized)); + type->Deserialize(type->storage_type(), serialized)); ASSERT_EQ(*type, *deserialized); } TEST(Bool8Type, BatchRoundTrip) { - auto type = internal::checked_pointer_cast( - extension::bool8()); + auto type = internal::checked_pointer_cast(extension::bool8()); ExtensionTypeGuard guard(type); auto storage = ArrayFromJSON(int8(), "[-1,0,1,2,null]"); diff --git a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py index aeed093b5f3..7e2695e663f 100644 --- a/python/pyarrow/tests/test_extension_type.py +++ b/python/pyarrow/tests/test_extension_type.py @@ -1708,6 +1708,7 @@ def test_opaque_type(pickle_module, storage_type, storage): inner = arr.cast(storage_type) assert inner == storage + def test_bool8_type(pickle_module): bool8_type = pa.bool8() storage_type = pa.int8() @@ -1744,6 +1745,7 @@ def test_bool8_type(pickle_module): inner = arr.cast(storage_type) assert inner == storage + def test_bool8_to_bool_conversion(): bool_arr = pa.array([True, False, True, True, None], pa.bool_()) bool8_arr = pa.ExtensionArray.from_storage( @@ -1759,33 +1761,38 @@ def test_bool8_to_bool_conversion(): canonical_bool8_arr = pa.ExtensionArray.from_storage(pa.bool8(), canonical_storage) assert bool_arr.cast(pa.bool8()) == canonical_bool8_arr + def test_bool8_to_numpy_conversion(): arr = pa.ExtensionArray.from_storage( pa.bool8(), pa.array([-1, 0, 1, 2, None], pa.int8()), ) - ## cannot zero-copy with nulls - with pytest.raises(pa.ArrowInvalid, match="Needed to copy 1 chunks with 1 nulls, but zero_copy_only was True"): + # cannot zero-copy with nulls + with pytest.raises( + pa.ArrowInvalid, + match="Needed to copy 1 chunks with 1 nulls, but zero_copy_only was True", + ): arr.to_numpy() - - ## nullable conversion possible with a copy, but dest dtype is object + + # nullable conversion possible with a copy, but dest dtype is object assert np.array_equal( arr.to_numpy(zero_copy_only=False), np.array([True, False, True, True, None], dtype=np.object_), ) - ## zero-copy possible with non-null array + # zero-copy possible with non-null array np_arr_no_nulls = np.array([True, False, True, True], dtype=np.bool_) arr_no_nulls = pa.ExtensionArray.from_storage( pa.bool8(), pa.array([-1, 0, 1, 2], pa.int8()), ) - arr_to_np = arr_no_nulls.to_numpy() assert np.array_equal(arr_to_np, np_arr_no_nulls) - assert arr_to_np.ctypes.data == arr_no_nulls.buffers()[1].address # same underlying buffer + + # same underlying buffer + assert arr_to_np.ctypes.data == arr_no_nulls.buffers()[1].address def test_bool8_from_numpy_conversion(): @@ -1797,15 +1804,28 @@ def test_bool8_from_numpy_conversion(): arr_from_np = pa.Bool8Array.from_numpy(np_arr_no_nulls) assert arr_from_np == canonical_bool8_arr_no_nulls - assert arr_from_np.buffers()[1].address == np_arr_no_nulls.ctypes.data # same underlying buffer + + # same underlying buffer + assert arr_from_np.buffers()[1].address == np_arr_no_nulls.ctypes.data # conversion only valid for 1-D arrays - with pytest.raises(ValueError, match="Cannot convert 2-D array to bool8 array"): - pa.Bool8Array.from_numpy(np.array([[True, False], [False, True]], dtype=np.bool_)) + with pytest.raises( + ValueError, + match="Cannot convert 2-D array to bool8 array", + ): + pa.Bool8Array.from_numpy( + np.array([[True, False], [False, True]], dtype=np.bool_), + ) - with pytest.raises(ValueError, match="Cannot convert 0-D array to bool8 array"): + with pytest.raises( + ValueError, + match="Cannot convert 0-D array to bool8 array", + ): pa.Bool8Array.from_numpy(np.bool_()) # must use compatible storage type - with pytest.raises(TypeError, match="Array dtype float64 incompatible with bool8 storage"): + with pytest.raises( + TypeError, + match="Array dtype float64 incompatible with bool8 storage", + ): pa.Bool8Array.from_numpy(np.array([1, 2, 3], dtype=np.float64)) From 15fc298213287615a04a414c767a51a72ae43f39 Mon Sep 17 00:00:00 2001 From: Joel Lubinitsky Date: Tue, 6 Aug 2024 14:08:14 -0400 Subject: [PATCH 08/21] fix docstring formatting --- python/pyarrow/array.pxi | 4 ++++ python/pyarrow/includes/libarrow.pxd | 6 ++---- python/pyarrow/types.pxi | 10 ++++++++++ 3 files changed, 16 insertions(+), 4 deletions(-) diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index f25d6cb2b26..547bcb6f2d5 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -4479,12 +4479,16 @@ cdef class OpaqueArray(ExtensionArray): cdef class Bool8Array(ExtensionArray): """ Concrete class for bool8 extension arrays. + Examples -------- Define the extension type for an bool8 array + >>> import pyarrow as pa >>> bool8_type = pa.bool8() + Create an extension array + >>> arr = [-1, 0, 1, 2, None] >>> storage = pa.array(arr, pa.int8()) >>> pa.ExtensionArray.from_storage(bool8_type, storage) diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 1c36840abba..a54a1db292f 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -2896,14 +2896,12 @@ cdef extern from "arrow/extension/opaque.h" namespace "arrow::extension" nogil: cdef extern from "arrow/extension/bool8.h" namespace "arrow::extension" nogil: - cdef cppclass CBool8Type \ - " arrow::extension::Bool8Type"(CExtensionType): + cdef cppclass CBool8Type" arrow::extension::Bool8Type"(CExtensionType): @staticmethod CResult[shared_ptr[CDataType]] Make() - cdef cppclass CBool8Array \ - " arrow::extension::Bool8Array"(CExtensionArray): + cdef cppclass CBool8Array" arrow::extension::Bool8Array"(CExtensionArray): pass cdef extern from "arrow/util/compression.h" namespace "arrow" nogil: diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index 97af82c7ea4..e54c1a5d040 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -1839,12 +1839,15 @@ cdef class FixedShapeTensorType(BaseExtensionType): cdef class Bool8Type(BaseExtensionType): """ Concrete class for bool8 extension type. + Bool8 is an alternate representation for boolean arrays using 8 bits instead of 1 bit per value. The underlying storage type is int8. + Examples -------- Create an instance of bool8 extension type: + >>> import pyarrow as pa >>> pa.bool8() Bool8Type(extension) @@ -5306,17 +5309,23 @@ def fixed_shape_tensor(DataType value_type, shape, dim_names=None, permutation=N def bool8(): """ Create instance of bool8 extension type. + Examples -------- Create an instance of bool8 extension type: + >>> import pyarrow as pa >>> type = pa.bool8() >>> type Bool8Type(extension) + Inspect the data type: + >>> type.storage_type DataType(int8) + Create a table with a bool8 array: + >>> arr = [-1, 0, 1, 2, None] >>> storage = pa.array(arr, pa.int8()) >>> other = pa.ExtensionArray.from_storage(type, storage) @@ -5325,6 +5334,7 @@ def bool8(): unknown_col: extension ---- unknown_col: [[True, False, True, True, null]] + Returns ------- type : Bool8Type From 84c161b66db3eb8c5e129e3bf2f4a59446082786 Mon Sep 17 00:00:00 2001 From: Joel Lubinitsky Date: Tue, 6 Aug 2024 16:58:00 -0400 Subject: [PATCH 09/21] from_numpy using from_storage --- python/pyarrow/array.pxi | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 547bcb6f2d5..dbccf819938 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -4511,6 +4511,22 @@ cdef class Bool8Array(ExtensionArray): return _pc().not_equal(self.storage, 0).to_numpy(zero_copy_only=zero_copy_only, writable=writable) + @staticmethod + def from_storage(Int8Array storage): + """ + Construct Bool8Array from Int8Array storage. + + Parameters + ---------- + storage : Int8Array + The underlying storage for the result array. + + Returns + ------- + bool8_array : Bool8Array + """ + return ExtensionArray.from_storage(bool8(), storage) + @staticmethod def from_numpy(obj): """ @@ -4521,6 +4537,10 @@ cdef class Bool8Array(ExtensionArray): ---------- obj : numpy.ndarray + Returns + ------- + bool8_array : Bool8Array + Examples -------- >>> import pyarrow as pa @@ -4541,8 +4561,8 @@ cdef class Bool8Array(ExtensionArray): if obj.dtype not in [np.bool_, np.int8]: raise TypeError(f"Array dtype {obj.dtype} incompatible with bool8 storage") - buf = foreign_buffer(obj.ctypes.data, obj.size) - return Array.from_buffers(bool8(), obj.size, [None, buf]) + storage_arr = array(obj.view(np.int8), type=int8()) + return Bool8Array.from_storage(storage_arr) cdef dict _array_classes = { From 0cea57123e9ab8f391461fb3415d1781016f198d Mon Sep 17 00:00:00 2001 From: Joel Lubinitsky Date: Tue, 6 Aug 2024 17:11:39 -0400 Subject: [PATCH 10/21] impl as_py() for scalar --- python/pyarrow/scalar.pxi | 16 ++++++++++++---- python/pyarrow/tests/test_extension_type.py | 7 +++++++ 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi index 584b32dce5d..549495b957a 100644 --- a/python/pyarrow/scalar.pxi +++ b/python/pyarrow/scalar.pxi @@ -1084,10 +1084,6 @@ cdef class FixedShapeTensorScalar(ExtensionScalar): ctensor = GetResultValue(c_type.MakeTensor(scalar)) return pyarrow_wrap_tensor(ctensor) -cdef class Bool8Scalar(ExtensionScalar): - """ - Concrete class for bool8 extension scalar. - """ cdef class OpaqueScalar(ExtensionScalar): """ @@ -1095,6 +1091,18 @@ cdef class OpaqueScalar(ExtensionScalar): """ +cdef class Bool8Scalar(ExtensionScalar): + """ + Concrete class for bool8 extension scalar. + """ + + def as_py(self): + """ + Return this scalar as a Python object. + """ + py_val = super().as_py() + return None if py_val is None else py_val != 0 + cdef dict _scalar_classes = { _Type_BOOL: BooleanScalar, _Type_UINT8: UInt8Scalar, diff --git a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py index 7e2695e663f..6bce97883ba 100644 --- a/python/pyarrow/tests/test_extension_type.py +++ b/python/pyarrow/tests/test_extension_type.py @@ -1829,3 +1829,10 @@ def test_bool8_from_numpy_conversion(): match="Array dtype float64 incompatible with bool8 storage", ): pa.Bool8Array.from_numpy(np.array([1, 2, 3], dtype=np.float64)) + +def test_bool8_scalar(): + assert pa.ExtensionScalar.from_storage(pa.bool8(), 0).as_py() == False + assert pa.ExtensionScalar.from_storage(pa.bool8(), 1).as_py() == True + assert pa.ExtensionScalar.from_storage(pa.bool8(), 2).as_py() == True + assert pa.ExtensionScalar.from_storage(pa.bool8(), -1).as_py() == True + assert pa.ExtensionScalar.from_storage(pa.bool8(), None).as_py() is None From 71a5d965c2ec10d8cafcc222015a79fffb54b364 Mon Sep 17 00:00:00 2001 From: Joel Lubinitsky Date: Tue, 6 Aug 2024 17:13:50 -0400 Subject: [PATCH 11/21] fix pre-commit --- python/pyarrow/tests/test_extension_type.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py index 6bce97883ba..704d97a5b4e 100644 --- a/python/pyarrow/tests/test_extension_type.py +++ b/python/pyarrow/tests/test_extension_type.py @@ -1830,9 +1830,10 @@ def test_bool8_from_numpy_conversion(): ): pa.Bool8Array.from_numpy(np.array([1, 2, 3], dtype=np.float64)) + def test_bool8_scalar(): - assert pa.ExtensionScalar.from_storage(pa.bool8(), 0).as_py() == False - assert pa.ExtensionScalar.from_storage(pa.bool8(), 1).as_py() == True - assert pa.ExtensionScalar.from_storage(pa.bool8(), 2).as_py() == True - assert pa.ExtensionScalar.from_storage(pa.bool8(), -1).as_py() == True + assert not pa.ExtensionScalar.from_storage(pa.bool8(), 0).as_py() + assert pa.ExtensionScalar.from_storage(pa.bool8(), 1).as_py() + assert pa.ExtensionScalar.from_storage(pa.bool8(), 2).as_py() + assert pa.ExtensionScalar.from_storage(pa.bool8(), -1).as_py() assert pa.ExtensionScalar.from_storage(pa.bool8(), None).as_py() is None From b76013e1b8a720a0a6856f20da1391cfb069919b Mon Sep 17 00:00:00 2001 From: Joel Lubinitsky Date: Wed, 7 Aug 2024 14:44:31 -0400 Subject: [PATCH 12/21] incorporate review comments --- cpp/src/arrow/extension/bool8.cc | 3 +++ cpp/src/arrow/extension/bool8.h | 6 ++---- cpp/src/arrow/extension/bool8_test.cc | 3 +-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/cpp/src/arrow/extension/bool8.cc b/cpp/src/arrow/extension/bool8.cc index 5be6b0082fe..c081f0c2b28 100644 --- a/cpp/src/arrow/extension/bool8.cc +++ b/cpp/src/arrow/extension/bool8.cc @@ -39,6 +39,9 @@ Result> Bool8Type::Deserialize( if (storage_type->id() != Type::INT8) { return Status::Invalid("Expected INT8 storage type, got ", storage_type->ToString()); } + if (serialized_data != "") { + return Status::Invalid("Serialize data must be empty, got ", serialized_data); + } return bool8(); } diff --git a/cpp/src/arrow/extension/bool8.h b/cpp/src/arrow/extension/bool8.h index 8121d3e036c..02e629b28a8 100644 --- a/cpp/src/arrow/extension/bool8.h +++ b/cpp/src/arrow/extension/bool8.h @@ -17,8 +17,7 @@ #include "arrow/extension_type.h" -namespace arrow { -namespace extension { +namespace arrow::extension { /// \brief Bool8 is an alternate representation for boolean /// arrays using 8 bits instead of 1 bit per value. The underlying @@ -56,5 +55,4 @@ class ARROW_EXPORT Bool8Type : public ExtensionType { /// \brief Return a Bool8Type instance. ARROW_EXPORT std::shared_ptr bool8(); -} // namespace extension -} // namespace arrow +} // namespace arrow::extension diff --git a/cpp/src/arrow/extension/bool8_test.cc b/cpp/src/arrow/extension/bool8_test.cc index 2cdbf42c394..3a3d396168b 100644 --- a/cpp/src/arrow/extension/bool8_test.cc +++ b/cpp/src/arrow/extension/bool8_test.cc @@ -48,8 +48,7 @@ TEST(Bool8Type, Deserialize) { auto type = internal::checked_pointer_cast(extension::bool8()); ASSERT_OK_AND_ASSIGN(auto deserialized, type->Deserialize(type->storage_type(), "")); ASSERT_EQ(*type, *deserialized); - ASSERT_OK_AND_ASSIGN(deserialized, - type->Deserialize(type->storage_type(), "doesn't matter")); + ASSERT_NOT_OK(type->Deserialize(type->storage_type(), "must be empty")); ASSERT_EQ(*type, *deserialized); ASSERT_NOT_OK(type->Deserialize(uint8(), "")); ASSERT_EQ(*type, *deserialized); From aa36147e82ff9623bed39c37ca122aed3563216e Mon Sep 17 00:00:00 2001 From: Joel Lubinitsky Date: Wed, 7 Aug 2024 15:49:30 -0400 Subject: [PATCH 13/21] fix whitespace --- python/pyarrow/array.pxi | 2 +- python/pyarrow/types.pxi | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index dbccf819938..a10a7f4cee3 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -4557,7 +4557,7 @@ cdef class Bool8Array(ExtensionArray): if obj.ndim != 1: raise ValueError(f"Cannot convert {obj.ndim}-D array to bool8 array") - + if obj.dtype not in [np.bool_, np.int8]: raise TypeError(f"Array dtype {obj.dtype} incompatible with bool8 storage") diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index e54c1a5d040..560f518050b 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -5306,6 +5306,7 @@ def fixed_shape_tensor(DataType value_type, shape, dim_names=None, permutation=N return out + def bool8(): """ Create instance of bool8 extension type. @@ -5349,6 +5350,7 @@ def bool8(): return out + def opaque(DataType storage_type, str type_name not None, str vendor_name not None): """ Create instance of opaque extension type. From dc192f58c53dd01ad7ae056739dd9f652b8c60ea Mon Sep 17 00:00:00 2001 From: Joel Lubinitsky Date: Fri, 9 Aug 2024 13:43:34 -0400 Subject: [PATCH 14/21] register extension by default --- cpp/src/arrow/extension/bool8_test.cc | 1 - cpp/src/arrow/extension_type.cc | 8 ++++++-- python/pyarrow/tests/test_extension_type.py | 6 +++--- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/cpp/src/arrow/extension/bool8_test.cc b/cpp/src/arrow/extension/bool8_test.cc index 3a3d396168b..eabcfcf62d3 100644 --- a/cpp/src/arrow/extension/bool8_test.cc +++ b/cpp/src/arrow/extension/bool8_test.cc @@ -64,7 +64,6 @@ TEST(Bool8Type, MetadataRoundTrip) { TEST(Bool8Type, BatchRoundTrip) { auto type = internal::checked_pointer_cast(extension::bool8()); - ExtensionTypeGuard guard(type); auto storage = ArrayFromJSON(int8(), "[-1,0,1,2,null]"); auto array = ExtensionType::WrapArray(type, storage); diff --git a/cpp/src/arrow/extension_type.cc b/cpp/src/arrow/extension_type.cc index cf8dda7a85d..5b9071b8fa9 100644 --- a/cpp/src/arrow/extension_type.cc +++ b/cpp/src/arrow/extension_type.cc @@ -29,6 +29,7 @@ #include "arrow/config.h" #ifdef ARROW_JSON #include "arrow/extension/fixed_shape_tensor.h" +#include "arrow/extension/bool8.h" #endif #include "arrow/status.h" #include "arrow/type.h" @@ -146,10 +147,13 @@ static void CreateGlobalRegistry() { #ifdef ARROW_JSON // Register canonical extension types - auto ext_type = + auto fst_ext_type = checked_pointer_cast(extension::fixed_shape_tensor(int64(), {})); + ARROW_CHECK_OK(g_registry->RegisterType(fst_ext_type)); - ARROW_CHECK_OK(g_registry->RegisterType(ext_type)); + auto bool8_ext_type = + checked_pointer_cast(extension::bool8()); + ARROW_CHECK_OK(g_registry->RegisterType(bool8_ext_type)); #endif } diff --git a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py index 704d97a5b4e..53bc2d9b079 100644 --- a/python/pyarrow/tests/test_extension_type.py +++ b/python/pyarrow/tests/test_extension_type.py @@ -1730,9 +1730,9 @@ def test_bool8_type(pickle_module): arr = pa.ExtensionArray.from_storage(bool8_type, storage) assert isinstance(arr, bool8_arr_class) - with registered_extension_type(bool8_type): - buf = ipc_write_batch(pa.RecordBatch.from_arrays([arr], ["ext"])) - batch = ipc_read_batch(buf) + # extension is registered by default + buf = ipc_write_batch(pa.RecordBatch.from_arrays([arr], ["ext"])) + batch = ipc_read_batch(buf) assert batch.column(0).type.extension_name == "arrow.bool8" assert isinstance(batch.column(0), bool8_arr_class) From 735f591c3af3456427f4271fdda9a3fe9f8949eb Mon Sep 17 00:00:00 2001 From: Joel Lubinitsky Date: Fri, 9 Aug 2024 14:56:26 -0400 Subject: [PATCH 15/21] more scalar tests --- python/pyarrow/tests/test_extension_type.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py index 53bc2d9b079..c55e5a12515 100644 --- a/python/pyarrow/tests/test_extension_type.py +++ b/python/pyarrow/tests/test_extension_type.py @@ -1832,8 +1832,18 @@ def test_bool8_from_numpy_conversion(): def test_bool8_scalar(): + assert pa.ExtensionScalar.from_storage(pa.bool8(), -1).as_py() assert not pa.ExtensionScalar.from_storage(pa.bool8(), 0).as_py() assert pa.ExtensionScalar.from_storage(pa.bool8(), 1).as_py() assert pa.ExtensionScalar.from_storage(pa.bool8(), 2).as_py() - assert pa.ExtensionScalar.from_storage(pa.bool8(), -1).as_py() assert pa.ExtensionScalar.from_storage(pa.bool8(), None).as_py() is None + + arr = pa.ExtensionArray.from_storage( + pa.bool8(), + pa.array([-1, 0, 1, 2, None], pa.int8()), + ) + assert arr[0].as_py() + assert not arr[1].as_py() + assert arr[2].as_py() + assert arr[3].as_py() + assert arr[4].as_py() is None From b2f83a10253614a009c23493d5c9b557ebcc7024 Mon Sep 17 00:00:00 2001 From: Joel Lubinitsky Date: Fri, 9 Aug 2024 16:53:22 -0400 Subject: [PATCH 16/21] run precommit --- cpp/src/arrow/extension_type.cc | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/extension_type.cc b/cpp/src/arrow/extension_type.cc index 5b9071b8fa9..685018f7de7 100644 --- a/cpp/src/arrow/extension_type.cc +++ b/cpp/src/arrow/extension_type.cc @@ -28,8 +28,8 @@ #include "arrow/chunked_array.h" #include "arrow/config.h" #ifdef ARROW_JSON -#include "arrow/extension/fixed_shape_tensor.h" #include "arrow/extension/bool8.h" +#include "arrow/extension/fixed_shape_tensor.h" #endif #include "arrow/status.h" #include "arrow/type.h" @@ -151,8 +151,7 @@ static void CreateGlobalRegistry() { checked_pointer_cast(extension::fixed_shape_tensor(int64(), {})); ARROW_CHECK_OK(g_registry->RegisterType(fst_ext_type)); - auto bool8_ext_type = - checked_pointer_cast(extension::bool8()); + auto bool8_ext_type = checked_pointer_cast(extension::bool8()); ARROW_CHECK_OK(g_registry->RegisterType(bool8_ext_type)); #endif } From 8d0766981856af44f942161506d0b0e2bedd418c Mon Sep 17 00:00:00 2001 From: Joel Lubinitsky Date: Fri, 9 Aug 2024 17:12:34 -0400 Subject: [PATCH 17/21] update docs and writable --- python/pyarrow/array.pxi | 33 ++++++++++++++++++++++++++++----- 1 file changed, 28 insertions(+), 5 deletions(-) diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index a10a7f4cee3..159e29322b4 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -4503,11 +4503,34 @@ cdef class Bool8Array(ExtensionArray): """ def to_numpy(self, zero_copy_only=True, writable=False): - try: - return self.storage.to_numpy().view(np.bool_) - except ArrowInvalid as e: - if zero_copy_only: - raise e + """ + Return a NumPy bool view or copy of this array (experimental). + + By default, tries to return a view of this array. This is only + supported for arrays without any nulls. + + Parameters + ---------- + zero_copy_only : bool, default True + If True, an exception will be raised if the conversion to a numpy + array would require copying the underlying data (e.g. in presence + of nulls). + writable : bool, default False + For numpy arrays created with zero copy (view on the Arrow data), + the resulting array is not writable (Arrow data is immutable). + By setting this to True, a copy of the array is made to ensure + it is writable. + + Returns + ------- + array : numpy.ndarray + """ + if not writable: + try: + return self.storage.to_numpy().view(np.bool_) + except ArrowInvalid as e: + if zero_copy_only: + raise e return _pc().not_equal(self.storage, 0).to_numpy(zero_copy_only=zero_copy_only, writable=writable) From aef60b1f7af2e32462f8043d0cdeb8d8dfa20a4a Mon Sep 17 00:00:00 2001 From: Joel Lubinitsky Date: Mon, 12 Aug 2024 12:27:06 -0400 Subject: [PATCH 18/21] fix doctest --- python/pyarrow/types.pxi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index 560f518050b..2493eb37a44 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -5334,7 +5334,7 @@ def bool8(): pyarrow.Table unknown_col: extension ---- - unknown_col: [[True, False, True, True, null]] + unknown_col: [[-1, 0, 1, 2, null]] Returns ------- From 7084c19dcbeaa59f859de565d71774ab0d23a6ed Mon Sep 17 00:00:00 2001 From: Joel Lubinitsky Date: Mon, 12 Aug 2024 14:04:14 -0400 Subject: [PATCH 19/21] pa.scalar working for extension types --- python/pyarrow/scalar.pxi | 11 ++++++++++- python/pyarrow/tests/test_extension_type.py | 6 ++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi index 549495b957a..72ae2aee5f8 100644 --- a/python/pyarrow/scalar.pxi +++ b/python/pyarrow/scalar.pxi @@ -1211,6 +1211,11 @@ def scalar(value, type=None, *, from_pandas=None, MemoryPool memory_pool=None): type = ensure_type(type, allow_none=True) pool = maybe_unbox_memory_pool(memory_pool) + extension_type = None + if type is not None and type.id == _Type_EXTENSION: + extension_type = type + type = type.storage_type + if _is_array_like(value): value = get_values(value, &is_pandas_object) @@ -1235,4 +1240,8 @@ def scalar(value, type=None, *, from_pandas=None, MemoryPool memory_pool=None): # retrieve the scalar from the first position scalar = GetResultValue(array.get().GetScalar(0)) - return Scalar.wrap(scalar) + result = Scalar.wrap(scalar) + + if extension_type is not None: + result = ExtensionScalar.from_storage(extension_type, result) + return result diff --git a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py index c55e5a12515..2cf80ea34b8 100644 --- a/python/pyarrow/tests/test_extension_type.py +++ b/python/pyarrow/tests/test_extension_type.py @@ -1847,3 +1847,9 @@ def test_bool8_scalar(): assert arr[2].as_py() assert arr[3].as_py() assert arr[4].as_py() is None + + assert pa.scalar(-1, type=pa.bool8()).as_py() + assert not pa.scalar(0, type=pa.bool8()).as_py() + assert pa.scalar(1, type=pa.bool8()).as_py() + assert pa.scalar(2, type=pa.bool8()).as_py() + assert pa.scalar(None, type=pa.bool8()).as_py() is None From 5bb387b4d51ead27173f1357ed012bb5a780cf61 Mon Sep 17 00:00:00 2001 From: Joel Lubinitsky Date: Wed, 14 Aug 2024 17:05:27 -0400 Subject: [PATCH 20/21] fix docstring --- python/pyarrow/types.pxi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index 2493eb37a44..e22d640e0b3 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -5334,7 +5334,7 @@ def bool8(): pyarrow.Table unknown_col: extension ---- - unknown_col: [[-1, 0, 1, 2, null]] + unknown_col: [[-1,0,1,2,null]] Returns ------- From c697edadb52fd8775f5f351d43eecc3a91f753b3 Mon Sep 17 00:00:00 2001 From: Joel Lubinitsky Date: Tue, 20 Aug 2024 09:58:31 -0400 Subject: [PATCH 21/21] apply python review suggestions --- python/pyarrow/array.pxi | 4 +-- python/pyarrow/tests/test_extension_type.py | 36 ++++++++++++--------- python/pyarrow/types.pxi | 5 +-- 3 files changed, 26 insertions(+), 19 deletions(-) diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 159e29322b4..4c3eb932326 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -1581,7 +1581,7 @@ cdef class Array(_PandasConvertible): def to_numpy(self, zero_copy_only=True, writable=False): """ - Return a NumPy view or copy of this array (experimental). + Return a NumPy view or copy of this array. By default, tries to return a view of this array. This is only supported for primitive arrays with the same memory layout as NumPy @@ -4504,7 +4504,7 @@ cdef class Bool8Array(ExtensionArray): def to_numpy(self, zero_copy_only=True, writable=False): """ - Return a NumPy bool view or copy of this array (experimental). + Return a NumPy bool view or copy of this array. By default, tries to return a view of this array. This is only supported for arrays without any nulls. diff --git a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py index 2cf80ea34b8..b04ee85ec99 100644 --- a/python/pyarrow/tests/test_extension_type.py +++ b/python/pyarrow/tests/test_extension_type.py @@ -1725,17 +1725,16 @@ def test_bool8_type(pickle_module): assert result == bool8_type # IPC roundtrip - bool8_arr_class = bool8_type.__arrow_ext_class__() storage = pa.array([-1, 0, 1, 2, None], storage_type) arr = pa.ExtensionArray.from_storage(bool8_type, storage) - assert isinstance(arr, bool8_arr_class) + assert isinstance(arr, pa.Bool8Array) # extension is registered by default buf = ipc_write_batch(pa.RecordBatch.from_arrays([arr], ["ext"])) batch = ipc_read_batch(buf) assert batch.column(0).type.extension_name == "arrow.bool8" - assert isinstance(batch.column(0), bool8_arr_class) + assert isinstance(batch.column(0), pa.Bool8Array) # cast storage -> extension type result = storage.cast(bool8_type) @@ -1794,6 +1793,13 @@ def test_bool8_to_numpy_conversion(): # same underlying buffer assert arr_to_np.ctypes.data == arr_no_nulls.buffers()[1].address + # if the user requests a writable array, a copy should be performed + arr_to_np_writable = arr_no_nulls.to_numpy(zero_copy_only=False, writable=True) + assert np.array_equal(arr_to_np_writable, np_arr_no_nulls) + + # different underlying buffer + assert arr_to_np_writable.ctypes.data != arr_no_nulls.buffers()[1].address + def test_bool8_from_numpy_conversion(): np_arr_no_nulls = np.array([True, False, True, True], dtype=np.bool_) @@ -1832,24 +1838,24 @@ def test_bool8_from_numpy_conversion(): def test_bool8_scalar(): - assert pa.ExtensionScalar.from_storage(pa.bool8(), -1).as_py() - assert not pa.ExtensionScalar.from_storage(pa.bool8(), 0).as_py() - assert pa.ExtensionScalar.from_storage(pa.bool8(), 1).as_py() - assert pa.ExtensionScalar.from_storage(pa.bool8(), 2).as_py() + assert pa.ExtensionScalar.from_storage(pa.bool8(), -1).as_py() is True + assert pa.ExtensionScalar.from_storage(pa.bool8(), 0).as_py() is False + assert pa.ExtensionScalar.from_storage(pa.bool8(), 1).as_py() is True + assert pa.ExtensionScalar.from_storage(pa.bool8(), 2).as_py() is True assert pa.ExtensionScalar.from_storage(pa.bool8(), None).as_py() is None arr = pa.ExtensionArray.from_storage( pa.bool8(), pa.array([-1, 0, 1, 2, None], pa.int8()), ) - assert arr[0].as_py() - assert not arr[1].as_py() - assert arr[2].as_py() - assert arr[3].as_py() + assert arr[0].as_py() is True + assert arr[1].as_py() is False + assert arr[2].as_py() is True + assert arr[3].as_py() is True assert arr[4].as_py() is None - assert pa.scalar(-1, type=pa.bool8()).as_py() - assert not pa.scalar(0, type=pa.bool8()).as_py() - assert pa.scalar(1, type=pa.bool8()).as_py() - assert pa.scalar(2, type=pa.bool8()).as_py() + assert pa.scalar(-1, type=pa.bool8()).as_py() is True + assert pa.scalar(0, type=pa.bool8()).as_py() is False + assert pa.scalar(1, type=pa.bool8()).as_py() is True + assert pa.scalar(2, type=pa.bool8()).as_py() is True assert pa.scalar(None, type=pa.bool8()).as_py() is None diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index e22d640e0b3..945424e471a 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -1836,6 +1836,7 @@ cdef class FixedShapeTensorType(BaseExtensionType): def __arrow_ext_scalar_class__(self): return FixedShapeTensorScalar + cdef class Bool8Type(BaseExtensionType): """ Concrete class for bool8 extension type. @@ -1866,6 +1867,7 @@ cdef class Bool8Type(BaseExtensionType): def __arrow_ext_scalar_class__(self): return Bool8Scalar + cdef class OpaqueType(BaseExtensionType): """ Concrete class for opaque extension type. @@ -5343,8 +5345,7 @@ def bool8(): cdef Bool8Type out = Bool8Type.__new__(Bool8Type) - with nogil: - c_type = GetResultValue(CBool8Type.Make()) + c_type = GetResultValue(CBool8Type.Make()) out.init(c_type)