-
Notifications
You must be signed in to change notification settings - Fork 4k
GH-32538: [C++][Parquet] Add JSON canonical extension type #13901
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
d091c21
f3944cc
e5d1604
9ce63da
f3ab322
e6cfa91
d749d01
c95eda4
d731a62
197ce79
b7b01d4
e1a90ee
6f8f467
e8cdb9c
7d3ec48
76628c8
1b66f11
eab70b6
51676cb
5551d7b
e9b44ad
9c09cbe
f518ebf
e2f82a8
e32805e
1ca8f1b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,61 @@ | ||
| // Licensed to the Apache Software Foundation (ASF) under one | ||
| // or more contributor license agreements. See the NOTICE file | ||
| // distributed with this work for additional information | ||
| // regarding copyright ownership. The ASF licenses this file | ||
| // to you under the Apache License, Version 2.0 (the | ||
| // "License"); you may not use this file except in compliance | ||
| // with the License. You may obtain a copy of the License at | ||
| // | ||
| // http://www.apache.org/licenses/LICENSE-2.0 | ||
| // | ||
| // Unless required by applicable law or agreed to in writing, | ||
| // software distributed under the License is distributed on an | ||
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
| // KIND, either express or implied. See the License for the | ||
| // specific language governing permissions and limitations | ||
| // under the License. | ||
|
|
||
| #include "arrow/extension/json.h" | ||
|
|
||
| #include <string> | ||
|
|
||
| #include "arrow/extension_type.h" | ||
| #include "arrow/result.h" | ||
| #include "arrow/status.h" | ||
| #include "arrow/type_fwd.h" | ||
| #include "arrow/util/logging.h" | ||
|
|
||
| namespace arrow::extension { | ||
|
|
||
| // Two JSON extension types are equal only when both the extension name and the | ||
| // storage type match. Comparing the name alone would make json(utf8()) equal to | ||
| // json(large_utf8()) even though their storage types differ. | ||
| bool JsonExtensionType::ExtensionEquals(const ExtensionType& other) const { | ||
|   return other.extension_name() == this->extension_name() && | ||
|          other.storage_type()->Equals(*this->storage_type()); | ||
| } | ||
|
Comment on lines
+30
to
+32
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This equality check does not take into account the storage type, but only the name. As a consequence, a JSON type with string storage is considered equal to one with large_string storage. From a user point of view, it certainly makes sense to have those seen as equal, but the same is true for string vs large_string itself. And in general in Arrow C++, the types are concrete types where variants of the same "logical" type (e.g. string vs large_string) are not seen as equal. So should the same logic be followed here? I assume that such type equality will, for example, be used to check whether schemas are equal, to see if a set of batches can be concatenated or written to the same IPC stream, etc., and for those cases we require exact equality?
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. No, that's certainly a bug. Sorry for not spotting this, and feel free to submit a fix :-)
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Oh, I suppose I missed that when switching from string only to it being a parametric type. I can make a fix later today if no one started on it yet.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I didn't start yet |
||
|
|
||
| // Recreates a JsonExtensionType on top of `storage_type`. The serialized payload | ||
| // is not read. Only STRING, STRING_VIEW and LARGE_STRING storage is accepted; | ||
| // any other storage type yields Status::Invalid. | ||
| Result<std::shared_ptr<DataType>> JsonExtensionType::Deserialize( | ||
|     std::shared_ptr<DataType> storage_type, const std::string& serialized) const { | ||
|   switch (storage_type->id()) { | ||
|     case Type::STRING: | ||
|     case Type::STRING_VIEW: | ||
|     case Type::LARGE_STRING: | ||
|       return std::make_shared<JsonExtensionType>(storage_type); | ||
|     default: | ||
|       return Status::Invalid("Invalid storage type for JsonExtensionType: ", | ||
|                              storage_type->ToString()); | ||
|   } | ||
| } | ||
|
|
||
| // The serialized form of this type is always the empty string. | ||
| std::string JsonExtensionType::Serialize() const { | ||
|   return std::string(); | ||
| } | ||
|
|
||
| // Wraps `data` in an ExtensionArray and returns it. | ||
| std::shared_ptr<Array> JsonExtensionType::MakeArray( | ||
| std::shared_ptr<ArrayData> data) const { | ||
| // Sanity assertions: the data must carry an extension type ... | ||
| DCHECK_EQ(data->type->id(), Type::EXTENSION); | ||
| // ... and that extension type must be named "arrow.json". | ||
| DCHECK_EQ("arrow.json", | ||
| internal::checked_cast<const ExtensionType&>(*data->type).extension_name()); | ||
| return std::make_shared<ExtensionArray>(data); | ||
| } | ||
|
|
||
| // Builds a JsonExtensionType over `storage_type`. The storage type must be one | ||
| // of STRING, STRING_VIEW or LARGE_STRING; ARROW_CHECK rejects anything else. | ||
| // The condition is a positive membership test: the previous `!=` terms joined | ||
| // with `||` were always true and therefore never rejected any storage type. | ||
| std::shared_ptr<DataType> json(const std::shared_ptr<DataType> storage_type) { | ||
|   ARROW_CHECK(storage_type->id() == Type::STRING || | ||
|               storage_type->id() == Type::STRING_VIEW || | ||
|               storage_type->id() == Type::LARGE_STRING); | ||
|
Comment on lines
+55
to
+57
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This check is not correct also. |
||
|   return std::make_shared<JsonExtensionType>(storage_type); | ||
| } | ||
|
|
||
| } // namespace arrow::extension | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,56 @@ | ||
| // Licensed to the Apache Software Foundation (ASF) under one | ||
| // or more contributor license agreements. See the NOTICE file | ||
| // distributed with this work for additional information | ||
| // regarding copyright ownership. The ASF licenses this file | ||
| // to you under the Apache License, Version 2.0 (the | ||
| // "License"); you may not use this file except in compliance | ||
| // with the License. You may obtain a copy of the License at | ||
| // | ||
| // http://www.apache.org/licenses/LICENSE-2.0 | ||
| // | ||
| // Unless required by applicable law or agreed to in writing, | ||
| // software distributed under the License is distributed on an | ||
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
| // KIND, either express or implied. See the License for the | ||
| // specific language governing permissions and limitations | ||
| // under the License. | ||
|
|
||
| #pragma once | ||
|
|
||
| #include <stdexcept> | ||
| #include <string> | ||
|
|
||
| #include "arrow/extension_type.h" | ||
| #include "arrow/result.h" | ||
| #include "arrow/type_fwd.h" | ||
| #include "arrow/util/visibility.h" | ||
|
|
||
| namespace arrow::extension { | ||
|
|
||
| /// \brief Extension type for variable-size, utf8-encoded JSON data. | ||
| /// | ||
| /// The type is identified by the extension name "arrow.json" and is built on | ||
| /// top of the storage type supplied at construction. | ||
| class ARROW_EXPORT JsonExtensionType : public ExtensionType { | ||
|  public: | ||
|   /// \brief Construct the type on top of `storage_type`. | ||
|   explicit JsonExtensionType(const std::shared_ptr<DataType>& storage_type) | ||
|       : ExtensionType(storage_type), storage_type_(storage_type) {} | ||
|   /// \brief Name identifying this extension type. | ||
|   std::string extension_name() const override { | ||
|     return "arrow.json"; | ||
|   } | ||
|   /// \brief Produce the serialized representation of this type. | ||
|   std::string Serialize() const override; | ||
|   /// \brief Recreate the type from a storage type and its serialized data. | ||
|   Result<std::shared_ptr<DataType>> Deserialize( | ||
|       std::shared_ptr<DataType> storage_type, | ||
|       const std::string& serialized_data) const override; | ||
|   /// \brief Report whether `other` is the same extension type as this one. | ||
|   bool ExtensionEquals(const ExtensionType& other) const override; | ||
|   /// \brief Wrap `data` in an array of this extension type. | ||
|   std::shared_ptr<Array> MakeArray(std::shared_ptr<ArrayData> data) const override; | ||
|  private: | ||
|   /// Storage type received by the constructor. | ||
|   std::shared_ptr<DataType> storage_type_; | ||
| }; | ||
|
|
||
| /// \brief Return a JsonExtensionType instance. | ||
| /// | ||
| /// \param[in] storage_type storage type of the returned extension type; | ||
| /// defaults to utf8() | ||
| ARROW_EXPORT std::shared_ptr<DataType> json( | ||
| std::shared_ptr<DataType> storage_type = utf8()); | ||
|
|
||
| } // namespace arrow::extension |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,83 @@ | ||
| // Licensed to the Apache Software Foundation (ASF) under one | ||
| // or more contributor license agreements. See the NOTICE file | ||
| // distributed with this work for additional information | ||
| // regarding copyright ownership. The ASF licenses this file | ||
| // to you under the Apache License, Version 2.0 (the | ||
| // "License"); you may not use this file except in compliance | ||
| // with the License. You may obtain a copy of the License at | ||
| // | ||
| // http://www.apache.org/licenses/LICENSE-2.0 | ||
| // | ||
| // Unless required by applicable law or agreed to in writing, | ||
| // software distributed under the License is distributed on an | ||
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
| // KIND, either express or implied. See the License for the | ||
| // specific language governing permissions and limitations | ||
| // under the License. | ||
|
|
||
| #include "arrow/extension/json.h" | ||
|
|
||
| #include "arrow/array/validate.h" | ||
| #include "arrow/ipc/test_common.h" | ||
| #include "arrow/record_batch.h" | ||
| #include "arrow/testing/gtest_util.h" | ||
| #include "parquet/exception.h" | ||
|
|
||
| namespace arrow { | ||
|
|
||
| using arrow::ipc::test::RoundtripBatch; | ||
| using extension::json; | ||
|
|
||
| // Test fixture for the JSON extension type tests; it adds no members of its own. | ||
| class TestJsonExtensionType : public ::testing::Test {}; | ||
|
|
||
| // Builds a sample JSON extension array over `storage_type`. The storage array | ||
| // holds eight JSON documents: null, an integer, a decimal, both booleans, a | ||
| // string, an array and an object. | ||
| std::shared_ptr<Array> ExampleJson(const std::shared_ptr<DataType>& storage_type) { | ||
|   const std::shared_ptr<Array> storage_values = ArrayFromJSON(storage_type, R"([ | ||
| "null", | ||
| "1234", | ||
| "3.14159", | ||
| "true", | ||
| "false", | ||
| "\"a json string\"", | ||
| "[\"a\", \"json\", \"array\"]", | ||
| "{\"obj\": \"a simple json object\"}" | ||
| ])"); | ||
|   const auto json_type = arrow::extension::json(storage_type); | ||
|   return ExtensionType::WrapArray(json_type, storage_values); | ||
| } | ||
|
|
||
| // For each supported storage type, a batch holding the sample JSON array must | ||
| // come back from RoundtripBatch equal to the original (metadata included) and | ||
| // must still pass full and UTF-8 validation. | ||
| TEST_F(TestJsonExtensionType, JsonRoundtrip) { | ||
|   for (const auto& storage_type : {utf8(), large_utf8(), utf8_view()}) { | ||
|     const std::shared_ptr<Array> json_values = ExampleJson(storage_type); | ||
|     const auto json_field = field("f0", json(storage_type)); | ||
|     const auto original = RecordBatch::Make(schema({json_field}), 8, {json_values}); | ||
|     std::shared_ptr<RecordBatch> roundtripped; | ||
|     ASSERT_OK(RoundtripBatch(original, &roundtripped)); | ||
|     ASSERT_OK(roundtripped->ValidateFull()); | ||
|     CompareBatch(*original, *roundtripped, /*compare_metadata*/ true); | ||
|     const auto roundtripped_column = roundtripped->column(0); | ||
|     ASSERT_OK(internal::ValidateUTF8(*roundtripped_column)); | ||
|     ASSERT_OK(roundtripped_column->ValidateFull()); | ||
|   } | ||
| } | ||
|
|
||
| // Storage holding malformed UTF-8 must fail both ValidateFull and ValidateUTF8 | ||
| // with the same message, while RoundtripBatch on such a batch still succeeds. | ||
| TEST_F(TestJsonExtensionType, InvalidUTF8) { | ||
|   const std::string expected_error = "Invalid: Invalid UTF8 sequence at string index 0"; | ||
|   for (const auto& storage_type : {utf8(), large_utf8(), utf8_view()}) { | ||
|     auto ext_type = json(storage_type); | ||
|     auto bad_storage = ArrayFromJSON(storage_type, "[\"Ⱥa\xFFⱭ\", \"Ɽ\xe1\xbdⱤaA\"]"); | ||
|     auto bad_ext_arr = ExtensionType::WrapArray(ext_type, bad_storage); | ||
|     ASSERT_RAISES_WITH_MESSAGE(Invalid, expected_error, bad_ext_arr->ValidateFull()); | ||
|     ASSERT_RAISES_WITH_MESSAGE(Invalid, expected_error, | ||
|                                arrow::internal::ValidateUTF8(*bad_ext_arr)); | ||
|     auto original = RecordBatch::Make(schema({field("f0", ext_type)}), 2, {bad_ext_arr}); | ||
|     std::shared_ptr<RecordBatch> roundtripped; | ||
|     ASSERT_OK(RoundtripBatch(original, &roundtripped)); | ||
|   } | ||
| } | ||
rok marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
|
|
||
| } // namespace arrow | ||
Uh oh!
There was an error while loading. Please reload this page.