From 20e1b362e2e8544b60044cb0a6329f8c79a82ddc Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Tue, 24 Sep 2024 13:45:29 +0200 Subject: [PATCH 01/20] Stricter equality check --- cpp/src/parquet/arrow/arrow_schema_test.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/parquet/arrow/arrow_schema_test.cc b/cpp/src/parquet/arrow/arrow_schema_test.cc index d261482d89a..ff9e440bc83 100644 --- a/cpp/src/parquet/arrow/arrow_schema_test.cc +++ b/cpp/src/parquet/arrow/arrow_schema_test.cc @@ -792,7 +792,7 @@ TEST_F(TestConvertParquetSchema, ParquetSchemaArrowExtensions) { ::arrow::key_value_metadata({"foo", "bar"}, {"biz", "baz"}); auto arrow_schema = ::arrow::schema( {::arrow::field("json_1", ::arrow::extension::json(), true, field_metadata), - ::arrow::field("json_2", ::arrow::extension::json(::arrow::large_utf8()), + ::arrow::field("json_2", ::arrow::extension::json(::arrow::utf8()), true)}); std::shared_ptr metadata; @@ -810,7 +810,7 @@ TEST_F(TestConvertParquetSchema, ParquetSchemaArrowExtensions) { ::arrow::key_value_metadata({"foo", "bar"}, {"biz", "baz"}); auto arrow_schema = ::arrow::schema( {::arrow::field("json_1", ::arrow::extension::json(), true, field_metadata), - ::arrow::field("json_2", ::arrow::extension::json(::arrow::large_utf8()), + ::arrow::field("json_2", ::arrow::extension::json(::arrow::utf8()), true)}); std::shared_ptr metadata; From 399a87ab2099661171b206d3dceb198d64cc9a62 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Tue, 24 Sep 2024 14:57:59 +0200 Subject: [PATCH 02/20] Review feedback --- cpp/src/parquet/arrow/arrow_schema_test.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/parquet/arrow/arrow_schema_test.cc b/cpp/src/parquet/arrow/arrow_schema_test.cc index ff9e440bc83..d261482d89a 100644 --- a/cpp/src/parquet/arrow/arrow_schema_test.cc +++ b/cpp/src/parquet/arrow/arrow_schema_test.cc @@ -792,7 +792,7 @@ TEST_F(TestConvertParquetSchema, ParquetSchemaArrowExtensions) { ::arrow::key_value_metadata({"foo", "bar"}, {"biz", "baz"}); auto arrow_schema = ::arrow::schema( {::arrow::field("json_1", ::arrow::extension::json(), true, field_metadata), - ::arrow::field("json_2", ::arrow::extension::json(::arrow::utf8()), + ::arrow::field("json_2", ::arrow::extension::json(::arrow::large_utf8()), true)}); std::shared_ptr metadata; @@ -810,7 +810,7 @@ TEST_F(TestConvertParquetSchema, ParquetSchemaArrowExtensions) { ::arrow::key_value_metadata({"foo", "bar"}, {"biz", "baz"}); auto arrow_schema = ::arrow::schema( {::arrow::field("json_1", ::arrow::extension::json(), true, field_metadata), - ::arrow::field("json_2", ::arrow::extension::json(::arrow::utf8()), + ::arrow::field("json_2", ::arrow::extension::json(::arrow::large_utf8()), true)}); std::shared_ptr metadata; From e0c92d5c706e5a829fee3c64173cb42ac6e97a77 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Tue, 24 Sep 2024 22:07:05 +0200 Subject: [PATCH 03/20] Change how extension type storage is restored after reading parquet --- cpp/src/parquet/arrow/schema.cc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cpp/src/parquet/arrow/schema.cc b/cpp/src/parquet/arrow/schema.cc index 0d009c8d4f1..e4a26eed438 100644 --- a/cpp/src/parquet/arrow/schema.cc +++ b/cpp/src/parquet/arrow/schema.cc @@ -1028,7 +1028,9 @@ Result ApplyOriginalMetadata(const Field& origin_field, SchemaField* infer // Restore extension type, if the storage type is the same as inferred // from the Parquet type - if (ex_type.storage_type()->Equals(*inferred->field->type())) { + if (ex_type.storage_type()->Equals(*inferred->field->type()) || + (ex_type.extension_name() == "arrow.json" && + !ex_type.storage_type()->Equals(*inferred->field->type()))) { inferred->field = inferred->field->WithType(origin_type); } } From 943553a886c5a7b2a693775272870a31f115b03c Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Wed, 25 Sep 2024 11:29:47 +0200 Subject: [PATCH 04/20] Add another assertion --- cpp/src/parquet/arrow/arrow_schema_test.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cpp/src/parquet/arrow/arrow_schema_test.cc b/cpp/src/parquet/arrow/arrow_schema_test.cc index d261482d89a..b90726fa1ad 100644 --- a/cpp/src/parquet/arrow/arrow_schema_test.cc +++ b/cpp/src/parquet/arrow/arrow_schema_test.cc @@ -776,6 +776,8 @@ TEST_F(TestConvertParquetSchema, ParquetSchemaArrowExtensions) { true)}); metadata = std::shared_ptr{}; ASSERT_OK(ConvertSchema(parquet_fields, metadata, props)); + EXPECT_FALSE(result_schema_->field(1)->type()->Equals( + arrow_schema->field(1)->type())); EXPECT_TRUE(result_schema_->field(1)->type()->Equals( ::arrow::extension::json(::arrow::utf8()))); EXPECT_FALSE( From 4ab988c00df4a8f1a2c672716b63c5886b5eca9d Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Wed, 25 Sep 2024 17:23:52 +0200 Subject: [PATCH 05/20] Review feedback --- cpp/src/parquet/arrow/schema.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/cpp/src/parquet/arrow/schema.cc b/cpp/src/parquet/arrow/schema.cc index e4a26eed438..91726427e39 100644 --- a/cpp/src/parquet/arrow/schema.cc +++ b/cpp/src/parquet/arrow/schema.cc @@ -1029,8 +1029,10 @@ Result ApplyOriginalMetadata(const Field& origin_field, SchemaField* infer // Restore extension type, if the storage type is the same as inferred // from the Parquet type if (ex_type.storage_type()->Equals(*inferred->field->type()) || - (ex_type.extension_name() == "arrow.json" && - !ex_type.storage_type()->Equals(*inferred->field->type()))) { + ((ex_type.extension_name() == "arrow.json") && + (inferred->field->type()->storage_id() == ::arrow::Type::STRING || + inferred->field->type()->storage_id() == ::arrow::Type::LARGE_STRING || + inferred->field->type()->storage_id() == ::arrow::Type::STRING_VIEW))) { inferred->field = inferred->field->WithType(origin_type); } } From 64d4975056a9660fd4556f67d128f57a1178ad8d Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Thu, 26 Sep 2024 13:31:53 +0200 Subject: [PATCH 06/20] Review feedback --- cpp/src/parquet/arrow/schema.cc | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/cpp/src/parquet/arrow/schema.cc b/cpp/src/parquet/arrow/schema.cc index 91726427e39..c4430ceae29 100644 --- a/cpp/src/parquet/arrow/schema.cc +++ b/cpp/src/parquet/arrow/schema.cc @@ -1030,9 +1030,8 @@ Result ApplyOriginalMetadata(const Field& origin_field, SchemaField* infer // from the Parquet type if (ex_type.storage_type()->Equals(*inferred->field->type()) || ((ex_type.extension_name() == "arrow.json") && - (inferred->field->type()->storage_id() == ::arrow::Type::STRING || - inferred->field->type()->storage_id() == ::arrow::Type::LARGE_STRING || - inferred->field->type()->storage_id() == ::arrow::Type::STRING_VIEW))) { + ::arrow::extension::JsonExtensionType::IsSupportedStorageType( + inferred->field->type()->storage_id()))) { inferred->field = inferred->field->WithType(origin_type); } } From b74fca653f157dd944cd4e8f3c2076e57fd9beef Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Wed, 2 Oct 2024 18:38:47 +0200 Subject: [PATCH 07/20] Review feedback --- cpp/src/parquet/arrow/schema.cc | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/cpp/src/parquet/arrow/schema.cc b/cpp/src/parquet/arrow/schema.cc index c4430ceae29..b273bde913b 100644 --- a/cpp/src/parquet/arrow/schema.cc +++ b/cpp/src/parquet/arrow/schema.cc @@ -1029,9 +1029,7 @@ Result ApplyOriginalMetadata(const Field& origin_field, SchemaField* infer // Restore extension type, if the storage type is the same as inferred // from the Parquet type if (ex_type.storage_type()->Equals(*inferred->field->type()) || - ((ex_type.extension_name() == "arrow.json") && - ::arrow::extension::JsonExtensionType::IsSupportedStorageType( - inferred->field->type()->storage_id()))) { + (ex_type.extension_name() == "arrow.json")) { inferred->field = inferred->field->WithType(origin_type); } } From 33b92412c3a3028e16be4522f7c06d69e85e4a42 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Wed, 2 Oct 2024 22:14:09 +0200 Subject: [PATCH 08/20] Review feedback --- cpp/src/parquet/arrow/schema.cc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/src/parquet/arrow/schema.cc b/cpp/src/parquet/arrow/schema.cc index b273bde913b..0d009c8d4f1 100644 --- a/cpp/src/parquet/arrow/schema.cc +++ b/cpp/src/parquet/arrow/schema.cc @@ -1028,8 +1028,7 @@ Result ApplyOriginalMetadata(const Field& origin_field, SchemaField* infer // Restore extension type, if the storage type is the same as inferred // from the Parquet type - if (ex_type.storage_type()->Equals(*inferred->field->type()) || - (ex_type.extension_name() == "arrow.json")) { + if (ex_type.storage_type()->Equals(*inferred->field->type())) { inferred->field = inferred->field->WithType(origin_type); } } From 466b597454dc538c994f806cbf2bbac944bc93a5 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Wed, 11 Sep 2024 16:15:56 +0200 Subject: [PATCH 09/20] Initial commit --- cpp/src/arrow/extension/json.h | 5 ++ python/pyarrow/__init__.py | 8 +-- python/pyarrow/array.pxi | 24 +++++++ python/pyarrow/includes/libarrow.pxd | 10 +++ python/pyarrow/lib.pxd | 4 ++ python/pyarrow/public-api.pxi | 2 + python/pyarrow/scalar.pxi | 9 +++ python/pyarrow/tests/test_extension_type.py | 28 ++++++++ python/pyarrow/types.pxi | 74 +++++++++++++++++++++ 9 files changed, 160 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/extension/json.h b/cpp/src/arrow/extension/json.h index 89976c8073f..c2507bba438 100644 --- a/cpp/src/arrow/extension/json.h +++ b/cpp/src/arrow/extension/json.h @@ -27,6 +27,11 @@ namespace arrow::extension { +class ARROW_EXPORT JsonArray : public ExtensionArray { + public: + using ExtensionArray::ExtensionArray; +}; + /// \brief Concrete type class for variable-size JSON data, utf8-encoded. class ARROW_EXPORT JsonExtensionType : public ExtensionType { public: diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py index d31c93119b7..4af4712281f 100644 --- a/python/pyarrow/__init__.py +++ b/python/pyarrow/__init__.py @@ -172,7 +172,7 @@ def print_entry(label, value): union, sparse_union, dense_union, dictionary, run_end_encoded, - bool8, fixed_shape_tensor, opaque, uuid, + bool8, fixed_shape_tensor, json, opaque, uuid, field, type_for_alias, DataType, DictionaryType, StructType, @@ -183,7 +183,7 @@ def print_entry(label, value): FixedSizeBinaryType, Decimal128Type, Decimal256Type, BaseExtensionType, ExtensionType, RunEndEncodedType, Bool8Type, FixedShapeTensorType, - OpaqueType, UuidType, + JsonType, OpaqueType, UuidType, PyExtensionType, UnknownExtensionType, register_extension_type, unregister_extension_type, DictionaryMemo, @@ -218,7 +218,7 @@ def print_entry(label, value): MonthDayNanoIntervalArray, Decimal128Array, Decimal256Array, StructArray, ExtensionArray, RunEndEncodedArray, Bool8Array, FixedShapeTensorArray, - OpaqueArray, UuidArray, + JsonArray, OpaqueArray, UuidArray, scalar, NA, _NULL as NULL, Scalar, NullScalar, BooleanScalar, Int8Scalar, Int16Scalar, Int32Scalar, Int64Scalar, @@ -236,7 +236,7 @@ def print_entry(label, value): FixedSizeBinaryScalar, DictionaryScalar, MapScalar, StructScalar, UnionScalar, RunEndEncodedScalar, Bool8Scalar, ExtensionScalar, - FixedShapeTensorScalar, OpaqueScalar, UuidScalar) + FixedShapeTensorScalar, JsonScalar, OpaqueScalar, UuidScalar) # Buffers, allocation from pyarrow.lib import (DeviceAllocationType, Device, MemoryManager, diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index ae9e7fd777e..5232b76f0e2 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -4344,6 +4344,30 @@ cdef class ExtensionArray(Array): return result +class JsonArray(ExtensionArray): + """ + Concrete class for Arrow arrays of JSON data type. + + Examples + -------- + Define the extension type for JSON array + + >>> import pyarrow as pa + >>> json_type = pa.json(pa.large_utf8()) + + Create an extension array + + >>> arr = [None, '{ "id":30, "values":["a", "b"] }'] + >>> storage = pa.array(arr, pa.large_utf8()) + >>> pa.ExtensionArray.from_storage(json_type, storage) + + [ + null, + { "id":30, "values":["a", "b"] } + ] + """ + + class UuidArray(ExtensionArray): """ Concrete class for Arrow arrays of UUID data type. diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 8e6922a912a..ae156cc27e9 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -2867,6 +2867,16 @@ cdef extern from "arrow/extension_type.h" namespace "arrow": shared_ptr[CArray] storage() +cdef extern from "arrow/extension/json.h" namespace "arrow::extension" nogil: + cdef cppclass CJsonType" arrow::extension::JsonExtensionType"(CExtensionType): + + @staticmethod + CResult[shared_ptr[CDataType]] Make(shared_ptr[CDataType]& storage_type) + + cdef cppclass CJsonArray" arrow::extension::JsonArray"(CExtensionArray): + pass + + cdef extern from "arrow/extension/uuid.h" namespace "arrow::extension" nogil: cdef cppclass CUuidType" arrow::extension::UuidType"(CExtensionType): diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd index 25a7945dc3d..aba17dee479 100644 --- a/python/pyarrow/lib.pxd +++ b/python/pyarrow/lib.pxd @@ -226,6 +226,10 @@ cdef class UuidType(BaseExtensionType): cdef: const CUuidType* uuid_ext_type +cdef class JsonType(BaseExtensionType): + cdef: + const CJsonType* json_ext_type + cdef class PyExtensionType(ExtensionType): pass diff --git a/python/pyarrow/public-api.pxi b/python/pyarrow/public-api.pxi index d3e2ff2e99d..913e25e3082 100644 --- a/python/pyarrow/public-api.pxi +++ b/python/pyarrow/public-api.pxi @@ -131,6 +131,8 @@ cdef api object pyarrow_wrap_data_type( out = OpaqueType.__new__(OpaqueType) elif extension_name == b"arrow.uuid": out = UuidType.__new__(UuidType) + elif extension_name == b"arrow.json": + out = JsonType.__new__(JsonType) else: out = BaseExtensionType.__new__(BaseExtensionType) else: diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi index 68f77832c43..b7efd0cda96 100644 --- a/python/pyarrow/scalar.pxi +++ b/python/pyarrow/scalar.pxi @@ -1044,6 +1044,15 @@ cdef class ExtensionScalar(Scalar): return pyarrow_wrap_scalar( sp_scalar) +class JsonScalar(ExtensionScalar): + """ + Concrete class for JSON extension scalar. + """ + + def as_py(self): + return None if self.value is None else self.value.as_py() + + class UuidScalar(ExtensionScalar): """ Concrete class for Uuid extension scalar. diff --git a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py index b74eca75bdc..43cd367a7dc 100644 --- a/python/pyarrow/tests/test_extension_type.py +++ b/python/pyarrow/tests/test_extension_type.py @@ -1926,3 +1926,31 @@ def test_bool8_scalar(): assert pa.scalar(1, type=pa.bool8()).as_py() is True assert pa.scalar(2, type=pa.bool8()).as_py() is True assert pa.scalar(None, type=pa.bool8()).as_py() is None + + +@pytest.mark.parametrize("str_type", (pa.utf8, pa.large_utf8)) +def test_json(str_type): + storage_type = str_type() + data = ['{"a": 1}', '{"b": 2}', None] + storage = pa.array(data, type=storage_type) + json_type = pa.json(storage_type) + + assert json_type.extension_name == "arrow.json" + assert json_type.storage_type == storage_type + assert json_type.__class__ is pa.JsonType + + array = pa.ExtensionArray.from_storage(json_type, storage) + + assert array.to_pylist() == data + assert array[0].as_py() == data[0] + assert array[2].as_py() is None + + buf = ipc_write_batch(pa.RecordBatch.from_arrays([array], ["json"])) + + batch = ipc_read_batch(buf) + reconstructed_array = batch.column(0) + assert reconstructed_array.type == json_type + assert reconstructed_array == array + + assert json_type.__arrow_ext_scalar_class__() == pa.JsonScalar + assert isinstance(array[0], pa.JsonScalar) diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index 70f12e9796e..b0c5e1653fe 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -1812,6 +1812,43 @@ cdef class ExtensionType(BaseExtensionType): return ExtensionScalar +cdef class JsonType(BaseExtensionType): + """ + Concrete class for Arrow arrays of JSON data type. + + Examples + -------- + Define the extension type for JSON array + + >>> import pyarrow as pa + >>> json_type = pa.json(pa.large_utf8()) + + Create an extension array + + >>> arr = [None, '{ "id":30, "values":["a", "b"] }'] + >>> storage = pa.array(arr, pa.large_utf8()) + >>> pa.ExtensionArray.from_storage(json_type, storage) + + [ + null, + { "id":30, "values":["a", "b"] } + ] + """ + + cdef void init(self, const shared_ptr[CDataType]& type) except *: + BaseExtensionType.init(self, type) + self.json_ext_type = type.get() + + def __arrow_ext_class__(self): + return JsonArray + + def __reduce__(self): + return json, (self.value_type,) + + def __arrow_ext_scalar_class__(self): + return JsonScalar + + cdef class UuidType(BaseExtensionType): """ Concrete class for UUID extension type. @@ -5296,6 +5333,43 @@ def run_end_encoded(run_end_type, value_type): return pyarrow_wrap_data_type(ree_type) +def json(DataType storage_type): + """ + Create instance of JSON extension type. + + Parameters + ---------- + storage_type : DataType + The underlying data type. + + Returns + ------- + type : JsonType + + Examples + -------- + Create an instance of JSON extension type: + + >>> import pyarrow as pa + >>> pa.json(pa.utf8()) + JsonType(arrow.json) + + Use the JSON type to create an array: + + >>> pa.array(['{"a": 1}', '{"b": 2}'], type=pa.json(pa.utf8())) + + [ + {"a": 1}, + {"b": 2} + ] + """ + + cdef JsonType out = JsonType.__new__(JsonType) + c_json_ext_type = GetResultValue(CJsonType.Make(storage_type.sp_type)) + out.init(c_json_ext_type) + return out + + def uuid(): """ Create UuidType instance. From 16958c43c999385fc6e409ca876cbc3a22e9485f Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Thu, 12 Sep 2024 10:13:25 +0200 Subject: [PATCH 10/20] Apply suggestions from code review Co-authored-by: Joris Van den Bossche --- python/pyarrow/lib.pxd | 1 + python/pyarrow/tests/test_extension_type.py | 3 +++ 2 files changed, 4 insertions(+) diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd index aba17dee479..f3d4e1eec08 100644 --- a/python/pyarrow/lib.pxd +++ b/python/pyarrow/lib.pxd @@ -230,6 +230,7 @@ cdef class JsonType(BaseExtensionType): cdef: const CJsonType* json_ext_type + cdef class PyExtensionType(ExtensionType): pass diff --git a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py index 43cd367a7dc..e9b8ba81b49 100644 --- a/python/pyarrow/tests/test_extension_type.py +++ b/python/pyarrow/tests/test_extension_type.py @@ -1939,6 +1939,9 @@ def test_json(str_type): assert json_type.storage_type == storage_type assert json_type.__class__ is pa.JsonType + assert json_type == pa.json(storage_type) + assert json_type != storage_type + array = pa.ExtensionArray.from_storage(json_type, storage) assert array.to_pylist() == data From d26b32743aa2986e46fb1158ebf0b004948bbcc9 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Thu, 12 Sep 2024 12:12:41 +0200 Subject: [PATCH 11/20] Review feedback --- cpp/src/arrow/extension/json.h | 5 -- python/pyarrow/array.pxi | 2 + python/pyarrow/includes/libarrow.pxd | 3 - python/pyarrow/tests/test_extension_type.py | 63 ++++++++++++++++++++- python/pyarrow/tests/test_misc.py | 3 + python/pyarrow/types.pxi | 2 +- 6 files changed, 66 insertions(+), 12 deletions(-) diff --git a/cpp/src/arrow/extension/json.h b/cpp/src/arrow/extension/json.h index c2507bba438..89976c8073f 100644 --- a/cpp/src/arrow/extension/json.h +++ b/cpp/src/arrow/extension/json.h @@ -27,11 +27,6 @@ namespace arrow::extension { -class ARROW_EXPORT JsonArray : public ExtensionArray { - public: - using ExtensionArray::ExtensionArray; -}; - /// \brief Concrete type class for variable-size JSON data, utf8-encoded. class ARROW_EXPORT JsonExtensionType : public ExtensionType { public: diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 5232b76f0e2..c032084e237 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -4347,6 +4347,8 @@ cdef class ExtensionArray(Array): class JsonArray(ExtensionArray): """ Concrete class for Arrow arrays of JSON data type. + This does not guarantee that the JSON data actually + valid JSON. Examples -------- diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index ae156cc27e9..797cefed8c7 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -2873,9 +2873,6 @@ cdef extern from "arrow/extension/json.h" namespace "arrow::extension" nogil: @staticmethod CResult[shared_ptr[CDataType]] Make(shared_ptr[CDataType]& storage_type) - cdef cppclass CJsonArray" arrow::extension::JsonArray"(CExtensionArray): - pass - cdef extern from "arrow/extension/uuid.h" namespace "arrow::extension" nogil: cdef cppclass CUuidType" arrow::extension::UuidType"(CExtensionType): diff --git a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py index e9b8ba81b49..cf507fd8a6d 100644 --- a/python/pyarrow/tests/test_extension_type.py +++ b/python/pyarrow/tests/test_extension_type.py @@ -1928,12 +1928,14 @@ def test_bool8_scalar(): assert pa.scalar(None, type=pa.bool8()).as_py() is None -@pytest.mark.parametrize("str_type", (pa.utf8, pa.large_utf8)) -def test_json(str_type): +@pytest.mark.parametrize("str_type", ( + pa.utf8, pa.large_utf8, pa.string_view, pa.string, pa.large_string)) +def test_json(str_type, pickle_module): storage_type = str_type() data = ['{"a": 1}', '{"b": 2}', None] storage = pa.array(data, type=storage_type) json_type = pa.json(storage_type) + json_arr_class = json_type.__arrow_ext_class__() assert json_type.extension_name == "arrow.json" assert json_type.storage_type == storage_type @@ -1943,17 +1945,72 @@ def test_json(str_type): assert json_type != storage_type array = pa.ExtensionArray.from_storage(json_type, storage) + assert isinstance(array, pa.JsonArray) assert array.to_pylist() == data assert array[0].as_py() == data[0] assert array[2].as_py() is None - buf = ipc_write_batch(pa.RecordBatch.from_arrays([array], ["json"])) + # Pickle roundtrip + result = pickle_module.loads(pickle_module.dumps(json_type)) + assert result == json_type + # IPC roundtrip + buf = ipc_write_batch(pa.RecordBatch.from_arrays([array], ["ext"])) batch = ipc_read_batch(buf) reconstructed_array = batch.column(0) assert reconstructed_array.type == json_type assert reconstructed_array == array + assert isinstance(array, json_arr_class) assert json_type.__arrow_ext_scalar_class__() == pa.JsonScalar assert isinstance(array[0], pa.JsonScalar) + + # cast storage -> extension type + result = storage.cast(json_type) + assert result == array + + # cast extension type -> storage type + if storage_type != pa.string_view(): + inner = array.cast(storage_type) + assert inner == storage + + +@pytest.mark.parametrize("str_type", ( + pa.utf8, pa.large_utf8, pa.string, pa.large_string)) +@pytest.mark.parquet +def test_parquet_json(tmpdir, str_type): + storage_type = str_type() + data = ['{"a": 1}', '{"b": 2}', None] + storage = pa.array(data, type=storage_type) + json_type = pa.json(storage_type) + + arr = pa.ExtensionArray.from_storage(json_type, storage) + table = pa.table([arr], names=["ext"]) + + import pyarrow.parquet as pq + + filename = tmpdir / 'json_extension_type.parquet' + pq.write_table(table, filename) + + # Stored in parquet as storage type but with extension metadata saved + # in the serialized arrow schema + meta = pq.read_metadata(filename) + assert meta.schema.column(0).physical_type == "BYTE_ARRAY" + assert b"ARROW:schema" in meta.metadata + + import base64 + decoded_schema = base64.b64decode(meta.metadata[b"ARROW:schema"]) + schema = pa.ipc.read_schema(pa.BufferReader(decoded_schema)) + # Since the type could be reconstructed, the extension type metadata is + # absent. + assert schema.field("ext").metadata == {} + + # When reading in, properly create extension type if it is registered + result = pq.read_table(filename) + result.validate(full=True) + assert result.schema.field("ext").type == json_type + assert result.schema.field("ext").metadata == {} + # Get the exact array class defined by the registered type. + result_array = result.column("ext").chunk(0) + assert type(result_array) is pa.JsonArray diff --git a/python/pyarrow/tests/test_misc.py b/python/pyarrow/tests/test_misc.py index 5d3471c7c35..0b2055018f6 100644 --- a/python/pyarrow/tests/test_misc.py +++ b/python/pyarrow/tests/test_misc.py @@ -253,6 +253,9 @@ def test_set_timezone_db_path_non_windows(): pa.Bool8Array, pa.Bool8Scalar, pa.Bool8Type, + pa.JsonArray, + pa.JsonScalar, + pa.JsonType, ]) def test_extension_type_constructor_errors(klass): # ARROW-2638: prevent calling extension class constructors directly diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index b0c5e1653fe..587a00a56ea 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -1843,7 +1843,7 @@ cdef class JsonType(BaseExtensionType): return JsonArray def __reduce__(self): - return json, (self.value_type,) + return json, (self.storage_type,) def __arrow_ext_scalar_class__(self): return JsonScalar From dc5143f649819bb1f0bda7990181ba12774cf600 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Thu, 12 Sep 2024 12:42:56 +0200 Subject: [PATCH 12/20] Set pa.json(storage_type=utf8()) --- python/pyarrow/tests/test_extension_type.py | 15 +++++++-------- python/pyarrow/types.pxi | 2 +- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py index cf507fd8a6d..db90600e706 100644 --- a/python/pyarrow/tests/test_extension_type.py +++ b/python/pyarrow/tests/test_extension_type.py @@ -1928,15 +1928,15 @@ def test_bool8_scalar(): assert pa.scalar(None, type=pa.bool8()).as_py() is None -@pytest.mark.parametrize("str_type", ( - pa.utf8, pa.large_utf8, pa.string_view, pa.string, pa.large_string)) -def test_json(str_type, pickle_module): - storage_type = str_type() +@pytest.mark.parametrize("storage_type", ( + pa.utf8(), pa.large_utf8(), pa.string_view(), pa.string(), pa.large_string())) +def test_json(storage_type, pickle_module): data = ['{"a": 1}', '{"b": 2}', None] storage = pa.array(data, type=storage_type) json_type = pa.json(storage_type) json_arr_class = json_type.__arrow_ext_class__() + assert pa.json() == pa.json(pa.utf8()) assert json_type.extension_name == "arrow.json" assert json_type.storage_type == storage_type assert json_type.__class__ is pa.JsonType @@ -1976,11 +1976,10 @@ def test_json(str_type, pickle_module): assert inner == storage -@pytest.mark.parametrize("str_type", ( - pa.utf8, pa.large_utf8, pa.string, pa.large_string)) +@pytest.mark.parametrize("storage_type", ( + pa.utf8(), pa.large_utf8(), pa.string(), pa.large_string())) @pytest.mark.parquet -def test_parquet_json(tmpdir, str_type): - storage_type = str_type() +def test_parquet_json(tmpdir, storage_type): data = ['{"a": 1}', '{"b": 2}', None] storage = pa.array(data, type=storage_type) json_type = pa.json(storage_type) diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index 587a00a56ea..9961c434938 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -5333,7 +5333,7 @@ def run_end_encoded(run_end_type, value_type): return pyarrow_wrap_data_type(ree_type) -def json(DataType storage_type): +def json(DataType storage_type=utf8()): """ Create instance of JSON extension type. From f4d753cf7af546b1b02a1793747d67b51265496a Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Thu, 12 Sep 2024 16:00:05 +0200 Subject: [PATCH 13/20] Review feedback --- python/pyarrow/__init__.py | 2 +- .../pyarrow/tests/parquet/test_data_types.py | 13 ++++++ python/pyarrow/tests/test_extension_type.py | 45 ++----------------- python/pyarrow/types.pxi | 4 +- 4 files changed, 19 insertions(+), 45 deletions(-) diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py index 4af4712281f..655836df06e 100644 --- a/python/pyarrow/__init__.py +++ b/python/pyarrow/__init__.py @@ -172,7 +172,7 @@ def print_entry(label, value): union, sparse_union, dense_union, dictionary, run_end_encoded, - bool8, fixed_shape_tensor, json, opaque, uuid, + bool8, fixed_shape_tensor, json_, opaque, uuid, field, type_for_alias, DataType, DictionaryType, StructType, diff --git a/python/pyarrow/tests/parquet/test_data_types.py b/python/pyarrow/tests/parquet/test_data_types.py index 79dd9694826..dcdf10d15f4 100644 --- a/python/pyarrow/tests/parquet/test_data_types.py +++ b/python/pyarrow/tests/parquet/test_data_types.py @@ -510,3 +510,16 @@ def test_large_binary_overflow(): pa.ArrowInvalid, match="Parquet cannot store strings with size 2GB or more"): _write_table(table, writer, use_dictionary=use_dictionary) + + +@pytest.mark.parametrize("storage_type", ( + pa.utf8(), pa.large_utf8(), pa.string(), pa.large_string())) +def test_json_extension_type(storage_type): + data = ['{"a": 1}', '{"b": 2}', None] + storage = pa.array(data, type=storage_type) + json_type = pa.json_(storage_type) + + arr = pa.ExtensionArray.from_storage(json_type, storage) + table = pa.table([arr], names=["ext"]) + + _simple_table_roundtrip(table, use_dictionary=False) diff --git a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py index db90600e706..074147326aa 100644 --- a/python/pyarrow/tests/test_extension_type.py +++ b/python/pyarrow/tests/test_extension_type.py @@ -1933,15 +1933,15 @@ def test_bool8_scalar(): def test_json(storage_type, pickle_module): data = ['{"a": 1}', '{"b": 2}', None] storage = pa.array(data, type=storage_type) - json_type = pa.json(storage_type) + json_type = pa.json_(storage_type) json_arr_class = json_type.__arrow_ext_class__() - assert pa.json() == pa.json(pa.utf8()) + assert pa.json_() == pa.json_(pa.utf8()) assert json_type.extension_name == "arrow.json" assert json_type.storage_type == storage_type assert json_type.__class__ is pa.JsonType - assert json_type == pa.json(storage_type) + assert json_type == pa.json_(storage_type) assert json_type != storage_type array = pa.ExtensionArray.from_storage(json_type, storage) @@ -1974,42 +1974,3 @@ def test_json(storage_type, pickle_module): if storage_type != pa.string_view(): inner = array.cast(storage_type) assert inner == storage - - -@pytest.mark.parametrize("storage_type", ( - pa.utf8(), pa.large_utf8(), pa.string(), pa.large_string())) -@pytest.mark.parquet -def test_parquet_json(tmpdir, storage_type): - data = ['{"a": 1}', '{"b": 2}', None] - storage = pa.array(data, type=storage_type) - json_type = pa.json(storage_type) - - arr = pa.ExtensionArray.from_storage(json_type, storage) - table = pa.table([arr], names=["ext"]) - - import pyarrow.parquet as pq - - filename = tmpdir / 'json_extension_type.parquet' - pq.write_table(table, filename) - - # Stored in parquet as storage type but with extension metadata saved - # in the serialized arrow schema - meta = pq.read_metadata(filename) - assert meta.schema.column(0).physical_type == "BYTE_ARRAY" - assert b"ARROW:schema" in meta.metadata - - import base64 - decoded_schema = base64.b64decode(meta.metadata[b"ARROW:schema"]) - schema = pa.ipc.read_schema(pa.BufferReader(decoded_schema)) - # Since the type could be reconstructed, the extension type metadata is - # absent. - assert schema.field("ext").metadata == {} - - # When reading in, properly create extension type if it is registered - result = pq.read_table(filename) - result.validate(full=True) - assert result.schema.field("ext").type == json_type - assert result.schema.field("ext").metadata == {} - # Get the exact array class defined by the registered type. - result_array = result.column("ext").chunk(0) - assert type(result_array) is pa.JsonArray diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index 9961c434938..564e534da4c 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -1843,7 +1843,7 @@ cdef class JsonType(BaseExtensionType): return JsonArray def __reduce__(self): - return json, (self.storage_type,) + return json_, (self.storage_type,) def __arrow_ext_scalar_class__(self): return JsonScalar @@ -5333,7 +5333,7 @@ def run_end_encoded(run_end_type, value_type): return pyarrow_wrap_data_type(ree_type) -def json(DataType storage_type=utf8()): +def json_(DataType storage_type=utf8()): """ Create instance of JSON extension type. From 86208e32ee828ff0d83a3124bd4cad9fb69f2410 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Thu, 12 Sep 2024 16:33:43 +0200 Subject: [PATCH 14/20] doctest fix --- python/pyarrow/array.pxi | 4 ++-- python/pyarrow/types.pxi | 14 +++++++------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index c032084e237..0dcd727e7a7 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -4355,7 +4355,7 @@ class JsonArray(ExtensionArray): Define the extension type for JSON array >>> import pyarrow as pa - >>> json_type = pa.json(pa.large_utf8()) + >>> json_type = pa.json_(pa.large_utf8()) Create an extension array @@ -4365,7 +4365,7 @@ class JsonArray(ExtensionArray): [ null, - { "id":30, "values":["a", "b"] } + "{ "id":30, "values":["a", "b"] }" ] """ diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index 564e534da4c..e478c5a1761 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -1821,7 +1821,7 @@ cdef class JsonType(BaseExtensionType): Define the extension type for JSON array >>> import pyarrow as pa - >>> json_type = pa.json(pa.large_utf8()) + >>> json_type = pa.json_(pa.large_utf8()) Create an extension array @@ -1831,7 +1831,7 @@ cdef class JsonType(BaseExtensionType): [ null, - { "id":30, "values":["a", "b"] } + "{ "id":30, "values":["a", "b"] }" ] """ @@ -5351,16 +5351,16 @@ def json_(DataType storage_type=utf8()): Create an instance of JSON extension type: >>> import pyarrow as pa - >>> pa.json(pa.utf8()) - JsonType(arrow.json) + >>> pa.json_(pa.utf8()) + JsonType(extension) Use the JSON type to create an array: - >>> pa.array(['{"a": 1}', '{"b": 2}'], type=pa.json(pa.utf8())) + >>> pa.array(['{"a": 1}', '{"b": 2}'], type=pa.json_(pa.utf8())) [ - {"a": 1}, - {"b": 2} + "{"a": 1}", + "{"b": 2}" ] """ From 7afd8be02dffa7953c3e4b6d383f754bfdea22c3 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Thu, 12 Sep 2024 19:03:03 +0200 Subject: [PATCH 15/20] Rebase and enable casting for pa.string_view --- python/pyarrow/tests/test_extension_type.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py index 074147326aa..e754dbb81b0 100644 --- a/python/pyarrow/tests/test_extension_type.py +++ b/python/pyarrow/tests/test_extension_type.py @@ -1971,6 +1971,5 @@ def test_json(storage_type, pickle_module): assert result == array # cast extension type -> storage type - if storage_type != pa.string_view(): - inner = array.cast(storage_type) - assert inner == storage + inner = array.cast(storage_type) + assert inner == storage From 520f0aa9ddc4b80d865434e5ae85d1616a31488d Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Mon, 23 Sep 2024 13:52:03 +0200 Subject: [PATCH 16/20] Apply suggestions from code review Co-authored-by: Joris Van den Bossche --- python/pyarrow/array.pxi | 3 ++- python/pyarrow/tests/parquet/test_data_types.py | 2 +- python/pyarrow/tests/test_extension_type.py | 2 +- python/pyarrow/types.pxi | 2 +- 4 files changed, 5 insertions(+), 4 deletions(-) diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 0dcd727e7a7..eaedbf1e385 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -4347,8 +4347,9 @@ cdef class ExtensionArray(Array): class JsonArray(ExtensionArray): """ Concrete class for Arrow arrays of JSON data type. + This does not guarantee that the JSON data actually - valid JSON. + is valid JSON. Examples -------- diff --git a/python/pyarrow/tests/parquet/test_data_types.py b/python/pyarrow/tests/parquet/test_data_types.py index dcdf10d15f4..fc20108d331 100644 --- a/python/pyarrow/tests/parquet/test_data_types.py +++ b/python/pyarrow/tests/parquet/test_data_types.py @@ -513,7 +513,7 @@ def test_large_binary_overflow(): @pytest.mark.parametrize("storage_type", ( - pa.utf8(), pa.large_utf8(), pa.string(), pa.large_string())) + pa.string(), pa.large_string())) def test_json_extension_type(storage_type): data = ['{"a": 1}', '{"b": 2}', None] storage = pa.array(data, type=storage_type) diff --git a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py index e754dbb81b0..4d496ba119b 100644 --- a/python/pyarrow/tests/test_extension_type.py +++ b/python/pyarrow/tests/test_extension_type.py @@ -1929,7 +1929,7 @@ def test_bool8_scalar(): @pytest.mark.parametrize("storage_type", ( - pa.utf8(), pa.large_utf8(), pa.string_view(), pa.string(), pa.large_string())) + pa.string(), pa.large_string(), pa.string_view())) def test_json(storage_type, pickle_module): data = ['{"a": 1}', '{"b": 2}', None] storage = pa.array(data, type=storage_type) diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index e478c5a1761..6ee93d2f2bd 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -5339,7 +5339,7 @@ def json_(DataType storage_type=utf8()): Parameters ---------- - storage_type : DataType + storage_type : DataType, default pyarrow.string() The underlying data type. Returns From 96469441a6b100ce7b57cc6a9e3473d0800fb235 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Tue, 24 Sep 2024 13:30:12 +0200 Subject: [PATCH 17/20] Review feedback --- python/pyarrow/scalar.pxi | 3 --- python/pyarrow/tests/parquet/test_data_types.py | 8 +++++++- python/pyarrow/types.pxi | 6 ++++-- 3 files changed, 11 insertions(+), 6 deletions(-) diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi index b7efd0cda96..2bfdcddf307 100644 --- a/python/pyarrow/scalar.pxi +++ b/python/pyarrow/scalar.pxi @@ -1049,9 +1049,6 @@ class JsonScalar(ExtensionScalar): Concrete class for JSON extension scalar. """ - def as_py(self): - return None if self.value is None else self.value.as_py() - class UuidScalar(ExtensionScalar): """ diff --git a/python/pyarrow/tests/parquet/test_data_types.py b/python/pyarrow/tests/parquet/test_data_types.py index fc20108d331..6b68ac9f094 100644 --- a/python/pyarrow/tests/parquet/test_data_types.py +++ b/python/pyarrow/tests/parquet/test_data_types.py @@ -522,4 +522,10 @@ def test_json_extension_type(storage_type): arr = pa.ExtensionArray.from_storage(json_type, storage) table = pa.table([arr], names=["ext"]) - _simple_table_roundtrip(table, use_dictionary=False) + _simple_table_roundtrip(table) + + for storage_type in (pa.int32(), pa.large_binary(), pa.float32()): + with pytest.raises( + pa.ArrowInvalid, + match="Invalid storage type for JsonExtensionType: " + str(storage_type)): + pa.json_(storage_type) diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index 6ee93d2f2bd..cbad870f6fb 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -1814,7 +1814,7 @@ cdef class ExtensionType(BaseExtensionType): cdef class JsonType(BaseExtensionType): """ - Concrete class for Arrow arrays of JSON data type. + Concrete class for JSON extension type. Examples -------- @@ -5340,7 +5340,9 @@ def json_(DataType storage_type=utf8()): Parameters ---------- storage_type : DataType, default pyarrow.string() - The underlying data type. + The underlying data type. Can be on of the following types: + string, large_string, string_view. + Returns ------- From 583ba677e9358761f131b26e3cdcf6aaa508bc50 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Tue, 24 Sep 2024 15:00:00 +0200 Subject: [PATCH 18/20] Review feedback --- cpp/src/parquet/arrow/arrow_schema_test.cc | 2 -- python/pyarrow/tests/parquet/test_data_types.py | 6 ------ python/pyarrow/tests/test_extension_type.py | 7 +++++++ 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/cpp/src/parquet/arrow/arrow_schema_test.cc b/cpp/src/parquet/arrow/arrow_schema_test.cc index b90726fa1ad..d261482d89a 100644 --- a/cpp/src/parquet/arrow/arrow_schema_test.cc +++ b/cpp/src/parquet/arrow/arrow_schema_test.cc @@ -776,8 +776,6 @@ TEST_F(TestConvertParquetSchema, ParquetSchemaArrowExtensions) { true)}); metadata = std::shared_ptr{}; ASSERT_OK(ConvertSchema(parquet_fields, metadata, props)); - EXPECT_FALSE(result_schema_->field(1)->type()->Equals( - arrow_schema->field(1)->type())); EXPECT_TRUE(result_schema_->field(1)->type()->Equals( ::arrow::extension::json(::arrow::utf8()))); EXPECT_FALSE( diff --git a/python/pyarrow/tests/parquet/test_data_types.py b/python/pyarrow/tests/parquet/test_data_types.py index 6b68ac9f094..a762a5bcb4f 100644 --- a/python/pyarrow/tests/parquet/test_data_types.py +++ b/python/pyarrow/tests/parquet/test_data_types.py @@ -523,9 +523,3 @@ def test_json_extension_type(storage_type): table = pa.table([arr], names=["ext"]) _simple_table_roundtrip(table) - - for storage_type in (pa.int32(), pa.large_binary(), pa.float32()): - with pytest.raises( - pa.ArrowInvalid, - match="Invalid storage type for JsonExtensionType: " + str(storage_type)): - pa.json_(storage_type) diff --git a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py index 4d496ba119b..6b566d0e343 100644 --- a/python/pyarrow/tests/test_extension_type.py +++ b/python/pyarrow/tests/test_extension_type.py @@ -1973,3 +1973,10 @@ def test_json(storage_type, pickle_module): # cast extension type -> storage type inner = array.cast(storage_type) assert inner == storage + + for storage_type in (pa.int32(), pa.large_binary(), pa.float32()): + with pytest.raises( + pa.ArrowInvalid, + match="Invalid storage type for JsonExtensionType: " + + str(storage_type)): + pa.json_(storage_type) From 20fe633468efba49e52c4aca01e77f74d00b79d2 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Mon, 14 Oct 2024 23:23:24 +0200 Subject: [PATCH 19/20] Review feedback --- python/pyarrow/tests/test_extension_type.py | 3 +-- python/pyarrow/types.pxi | 1 - 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py index 6b566d0e343..31d4037c84b 100644 --- a/python/pyarrow/tests/test_extension_type.py +++ b/python/pyarrow/tests/test_extension_type.py @@ -1977,6 +1977,5 @@ def test_json(storage_type, pickle_module): for storage_type in (pa.int32(), pa.large_binary(), pa.float32()): with pytest.raises( pa.ArrowInvalid, - match="Invalid storage type for JsonExtensionType: " + - str(storage_type)): + match=f"Invalid storage type for JsonExtensionType: {storage_type}"): pa.json_(storage_type) diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index cbad870f6fb..c66ac5f28d3 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -5343,7 +5343,6 @@ def json_(DataType storage_type=utf8()): The underlying data type. Can be on of the following types: string, large_string, string_view. - Returns ------- type : JsonType From e8baa240a8eaa345619f836e5160c6e7cc40e5ba Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Tue, 22 Oct 2024 10:50:32 +0200 Subject: [PATCH 20/20] Use pa.array constructor for extension types --- python/pyarrow/tests/parquet/test_data_types.py | 4 +--- python/pyarrow/tests/test_extension_type.py | 4 ++-- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/python/pyarrow/tests/parquet/test_data_types.py b/python/pyarrow/tests/parquet/test_data_types.py index a762a5bcb4f..1428f802397 100644 --- a/python/pyarrow/tests/parquet/test_data_types.py +++ b/python/pyarrow/tests/parquet/test_data_types.py @@ -516,10 +516,8 @@ def test_large_binary_overflow(): pa.string(), pa.large_string())) def test_json_extension_type(storage_type): data = ['{"a": 1}', '{"b": 2}', None] - storage = pa.array(data, type=storage_type) - json_type = pa.json_(storage_type) + arr = pa.array(data, type=pa.json_(storage_type)) - arr = pa.ExtensionArray.from_storage(json_type, storage) table = pa.table([arr], names=["ext"]) _simple_table_roundtrip(table) diff --git a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py index 31d4037c84b..634d9ce2d8d 100644 --- a/python/pyarrow/tests/test_extension_type.py +++ b/python/pyarrow/tests/test_extension_type.py @@ -1932,8 +1932,9 @@ def test_bool8_scalar(): pa.string(), pa.large_string(), pa.string_view())) def test_json(storage_type, pickle_module): data = ['{"a": 1}', '{"b": 2}', None] - storage = pa.array(data, type=storage_type) json_type = pa.json_(storage_type) + storage = pa.array(data, type=storage_type) + array = pa.array(data, type=json_type) json_arr_class = json_type.__arrow_ext_class__() assert pa.json_() == pa.json_(pa.utf8()) @@ -1944,7 +1945,6 @@ def test_json(storage_type, pickle_module): assert json_type == pa.json_(storage_type) assert json_type != storage_type - array = pa.ExtensionArray.from_storage(json_type, storage) assert isinstance(array, pa.JsonArray) assert array.to_pylist() == data