From 5c43465997dbfd0671c22047e090d6d35cac0cf6 Mon Sep 17 00:00:00 2001 From: mwish Date: Fri, 9 May 2025 18:34:32 +0800 Subject: [PATCH 01/31] Variant tools --- cpp/src/parquet/CMakeLists.txt | 3 +- cpp/src/parquet/test_util.cc | 9 +++ cpp/src/parquet/test_util.h | 1 + cpp/src/parquet/variant.h | 99 ++++++++++++++++++++++++++++++++ cpp/src/parquet/variant_test.cpp | 46 +++++++++++++++ 5 files changed, 157 insertions(+), 1 deletion(-) create mode 100644 cpp/src/parquet/variant.h create mode 100644 cpp/src/parquet/variant_test.cpp diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index 735653d677b0..47977285c677 100644 --- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ -381,7 +381,8 @@ add_parquet_test(internals-test public_api_test.cc size_statistics_test.cc statistics_test.cc - types_test.cc) + types_test.cc + variant_test.cpp) set_source_files_properties(public_api_test.cc PROPERTIES SKIP_UNITY_BUILD_INCLUSION ON) diff --git a/cpp/src/parquet/test_util.cc b/cpp/src/parquet/test_util.cc index 9d35413d36f9..ecd95500cab7 100644 --- a/cpp/src/parquet/test_util.cc +++ b/cpp/src/parquet/test_util.cc @@ -53,6 +53,15 @@ std::string get_bad_data_dir() { return ss.str(); } +std::string get_variant_dir() { + // PARQUET_TEST_DATA should point to ARROW_HOME/cpp/submodules/parquet-testing/data + // so need to reach one folder up to access the "variant" folder. + std::string data_dir(get_data_dir()); + std::stringstream ss; + ss << data_dir << "/../variant"; + return ss.str(); +} + std::string get_data_file(const std::string& filename, bool is_good) { std::stringstream ss; diff --git a/cpp/src/parquet/test_util.h b/cpp/src/parquet/test_util.h index 3ed9a1a007b9..6233844f552e 100644 --- a/cpp/src/parquet/test_util.h +++ b/cpp/src/parquet/test_util.h @@ -71,6 +71,7 @@ class ParquetTestException : public parquet::ParquetException { const char* get_data_dir(); std::string get_bad_data_dir(); +std::string get_variant_dir(); std::string get_data_file(const std::string& filename, bool is_good = true); diff --git a/cpp/src/parquet/variant.h b/cpp/src/parquet/variant.h new file mode 100644 index 000000000000..3a443be50554 --- /dev/null +++ b/cpp/src/parquet/variant.h @@ -0,0 +1,99 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include + +namespace parquet::variant { + +enum class VariantBasicType { + /// One of the primitive types + Primitive = 0, + /// A string with a length less than 64 bytes + ShortString = 1, + /// A collection of (string-key, variant-value) pairs + Object = 2, + /// An ordered sequence of variant values + Array = 3 +}; + +enum class VariantPrimitiveType { + /// Equivalent Parquet Type: UNKNOWN + NullType = 0, + /// Equivalent Parquet Type: BOOLEAN + BooleanTrue = 1, + /// Equivalent Parquet Type: BOOLEAN + BooleanFalse = 2, + /// Equivalent Parquet Type: INT(8, signed) + Int8 = 3, + /// Equivalent Parquet Type: INT(16, signed) + Int16 = 4, + /// Equivalent Parquet Type: INT(32, signed) + Int32 = 5, + /// Equivalent Parquet Type: INT(64, signed) + Int64 = 6, + /// Equivalent Parquet Type: DOUBLE + Double = 7, + /// Equivalent Parquet Type: DECIMAL(precision, scale) + Decimal4 = 8, + /// Equivalent Parquet Type: DECIMAL(precision, scale) + Decimal8 = 9, + /// Equivalent Parquet Type: DECIMAL(precision, scale) + Decimal16 = 10, + /// Equivalent Parquet Type: DATE + Date = 11, + /// Equivalent Parquet Type: TIMESTAMP(isAdjustedToUTC=true, MICROS) + Timestamp = 12, + /// Equivalent Parquet Type: TIMESTAMP(isAdjustedToUTC=false, MICROS) + TimestampNTZ = 13, + /// Equivalent Parquet Type: FLOAT + Float = 14, + /// Equivalent Parquet Type: BINARY + Binary = 15, + /// Equivalent Parquet Type: STRING + String = 16, + /// Equivalent Parquet Type: TIME(isAdjustedToUTC=false, MICROS) + TimeNTZ = 17, + /// Equivalent Parquet Type: TIMESTAMP(isAdjustedToUTC=true, NANOS) + TimestampTZ = 18, // Assuming TZ stands for TimeZone, and follows the document's + // 'timestamp with time zone' + /// Equivalent Parquet Type: TIMESTAMP(isAdjustedToUTC=false, NANOS) + TimestampNTZNanos = 19, // Differentiating from TimestampNTZ (MICROS) + /// Equivalent Parquet Type: UUID + Uuid = 20 +}; + +// TODO(mwish): should I use ByteArray as interface here? +struct VariantMetadata { + int8_t offset_size() const; + bool sorted_strings() const; + int8_t version(std::string_view metadata) const; + int32_t dictionary_size(std::string_view metadata) const; + int32_t offset(std::string_view metadata, int32_t offset_idx) const; + std::string_view dictionary_bytes(std::string_view metadata) const; + + std::string_view metadata; +}; + +// TODO(mwish): Adding interface here. +struct VariantValue { + std::string_view value; +}; + +} // namespace parquet::variant \ No newline at end of file diff --git a/cpp/src/parquet/variant_test.cpp b/cpp/src/parquet/variant_test.cpp new file mode 100644 index 000000000000..85715b3a3c35 --- /dev/null +++ b/cpp/src/parquet/variant_test.cpp @@ -0,0 +1,46 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include + +#include + +#include "parquet/test_util.h" +#include "parquet/variant.h" + +#include +#include + +namespace parquet::variant { + +TEST(ParquetVariant, MetadataBase) { + std::string test_file = {"primitive_boolean_true.metadata"}; + auto file_system = std::make_shared<::arrow::fs::LocalFileSystem>(); + { + std::string dir_string(parquet::test::get_variant_dir()); + std::string path = dir_string + "/" + test_file; + ASSERT_OK_AND_ASSIGN(auto file, file_system->OpenInputFile(path)); + ASSERT_OK_AND_ASSIGN(auto file_size, file->GetSize()); + ASSERT_OK_AND_ASSIGN(auto buf, file->Read(file_size)); + + VariantMetadata metadata; + metadata.metadata = std::string_view{*buf}; + std::cout << "file_size:" << buf->size() << std::endl; + } +} + +} // namespace parquet::variant \ No newline at end of file From a4599d84273d5792e054ff08149cf7b9635d12af Mon Sep 17 00:00:00 2001 From: mwish Date: Mon, 12 May 2025 17:53:39 +0800 Subject: [PATCH 02/31] metadata logic impl --- cpp/src/parquet/CMakeLists.txt | 3 +- cpp/src/parquet/variant.cc | 87 ++++++++++++++++++++++++++++++++ cpp/src/parquet/variant.h | 29 +++++++---- cpp/src/parquet/variant_test.cpp | 41 ++++++++++++--- 4 files changed, 143 insertions(+), 17 deletions(-) create mode 100644 cpp/src/parquet/variant.cc diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index 47977285c677..9177870679a3 100644 --- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ -188,7 +188,8 @@ set(PARQUET_SRCS statistics.cc stream_reader.cc stream_writer.cc - types.cc) + types.cc + variant.cc) if(ARROW_HAVE_RUNTIME_AVX2) # AVX2 is used as a proxy for BMI2. diff --git a/cpp/src/parquet/variant.cc b/cpp/src/parquet/variant.cc new file mode 100644 index 000000000000..8b3415b32c49 --- /dev/null +++ b/cpp/src/parquet/variant.cc @@ -0,0 +1,87 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "parquet/variant.h" + +#include +#include + +#include "arrow/util/endian.h" +#include "parquet/exception.h" + +namespace parquet::variant { + +VariantMetadata::VariantMetadata(std::string_view metadata) : metadata_(metadata) { + if (metadata.size() < 2) { + throw ParquetException("Invalid Variant metadata: too short: " + + std::to_string(metadata.size())); + } +} + +int8_t VariantMetadata::version() const { + return static_cast(metadata_[0]) & 0x0F; +} + +bool VariantMetadata::sortedStrings() const { return (metadata_[0] & 0b10000) != 0; } + +uint8_t VariantMetadata::offsetSize() const { return ((metadata_[0] >> 6) & 0x3) + 1; } + +uint32_t VariantMetadata::dictionarySize() const { + uint8_t length = offsetSize(); + if (length > 4) { + throw ParquetException("Invalid offset size: " + std::to_string(length)); + } + if (length + 1 > metadata_.size()) { + throw ParquetException("Invalid Variant metadata: too short for dictionary size"); + } + uint32_t dict_size = 0; + memcpy(&dict_size, metadata_.data() + 1, length); + dict_size = arrow::bit_util::FromLittleEndian(dict_size); + return dict_size; +} + +std::string_view VariantMetadata::getMetadataKey(int32_t variantId) const { + uint32_t offset_size = offsetSize(); + uint32_t dict_size = dictionarySize(); + + if (variantId < 0 || variantId >= static_cast(dict_size)) { + throw ParquetException("Invalid Variant metadata: variantId out of range"); + } + + if ((dict_size + 1) * offset_size > metadata_.size()) { + throw ParquetException("Invalid Variant metadata: offset out of range"); + } + + size_t offset_start_pos = 1 + offset_size + (variantId * offset_size); + + uint32_t variant_offset = 0; + uint32_t variant_next_offset = 0; + memcpy(&variant_offset, metadata_.data() + offset_start_pos, offset_size); + variant_offset = arrow::bit_util::FromLittleEndian(variant_offset); + memcpy(&variant_next_offset, metadata_.data() + offset_start_pos + offset_size, + offset_size); + variant_next_offset = arrow::bit_util::FromLittleEndian(variant_next_offset); + + uint32_t key_size = variant_next_offset - variant_offset; + + size_t string_start = 1 + offset_size * (dict_size + 2) + variant_offset; + if (string_start + key_size > metadata_.size()) { + throw ParquetException("Invalid Variant metadata: string data out of range"); + } + return std::string_view(metadata_.data() + string_start, key_size); +} +} // namespace parquet::variant diff --git a/cpp/src/parquet/variant.h b/cpp/src/parquet/variant.h index 3a443be50554..90c67e1842ac 100644 --- a/cpp/src/parquet/variant.h +++ b/cpp/src/parquet/variant.h @@ -19,9 +19,13 @@ #include #include +#include namespace parquet::variant { +// TODO(mwish): Should I use parquet::ByteArray rather than +// std::string_view? + enum class VariantBasicType { /// One of the primitive types Primitive = 0, @@ -79,20 +83,25 @@ enum class VariantPrimitiveType { Uuid = 20 }; -// TODO(mwish): should I use ByteArray as interface here? -struct VariantMetadata { - int8_t offset_size() const; - bool sorted_strings() const; - int8_t version(std::string_view metadata) const; - int32_t dictionary_size(std::string_view metadata) const; - int32_t offset(std::string_view metadata, int32_t offset_idx) const; - std::string_view dictionary_bytes(std::string_view metadata) const; +class VariantMetadata { + public: + explicit VariantMetadata(std::string_view metadata); + /// \brief Get the variant metadata version. Currently, always 1. + int8_t version() const; + /// \brief Get the metadata key for a given variant field id. + std::string_view getMetadataKey(int32_t variantId) const; + + private: + bool sortedStrings() const; + uint8_t offsetSize() const; + uint32_t dictionarySize() const; - std::string_view metadata; + private: + std::string_view metadata_; }; -// TODO(mwish): Adding interface here. struct VariantValue { + VariantMetadata metadata; std::string_view value; }; diff --git a/cpp/src/parquet/variant_test.cpp b/cpp/src/parquet/variant_test.cpp index 85715b3a3c35..3e267d29e86b 100644 --- a/cpp/src/parquet/variant_test.cpp +++ b/cpp/src/parquet/variant_test.cpp @@ -19,6 +19,7 @@ #include +#include "parquet/exception.h" #include "parquet/test_util.h" #include "parquet/variant.h" @@ -28,18 +29,46 @@ namespace parquet::variant { TEST(ParquetVariant, MetadataBase) { - std::string test_file = {"primitive_boolean_true.metadata"}; + std::string dir_string(parquet::test::get_variant_dir()); auto file_system = std::make_shared<::arrow::fs::LocalFileSystem>(); - { - std::string dir_string(parquet::test::get_variant_dir()); + std::vector primitive_metadatas = { + // "primitive_null.metadata", + "primitive_boolean_true.metadata", "primitive_boolean_true.metadata", + "primitive_date.metadata", "primitive_decimal4.metadata", + "primitive_decimal8.metadata", "primitive_decimal16.metadata", + "primitive_float.metadata", "primitive_double.metadata", + "primitive_int8.metadata", "primitive_int16.metadata", + "primitive_int32.metadata", "primitive_int64.metadata", + "primitive_binary.metadata", "primitive_string.metadata", + }; + for (auto& test_file : primitive_metadatas) { + ARROW_SCOPED_TRACE("Testing file: " + test_file); std::string path = dir_string + "/" + test_file; ASSERT_OK_AND_ASSIGN(auto file, file_system->OpenInputFile(path)); ASSERT_OK_AND_ASSIGN(auto file_size, file->GetSize()); ASSERT_OK_AND_ASSIGN(auto buf, file->Read(file_size)); - VariantMetadata metadata; - metadata.metadata = std::string_view{*buf}; - std::cout << "file_size:" << buf->size() << std::endl; + VariantMetadata metadata(std::string_view{*buf}); + EXPECT_EQ(1, metadata.version()); + EXPECT_THROW(metadata.getMetadataKey(0), ParquetException); + } + + { + std::string object_metadata = {"object_primitive.metadata"}; + ARROW_SCOPED_TRACE("Testing file: " + object_metadata); + std::string path = dir_string + "/" + object_metadata; + ASSERT_OK_AND_ASSIGN(auto file, file_system->OpenInputFile(path)); + ASSERT_OK_AND_ASSIGN(auto file_size, file->GetSize()); + ASSERT_OK_AND_ASSIGN(auto buf, file->Read(file_size)); + + VariantMetadata metadata(std::string_view{*buf}); + EXPECT_EQ("int_field", metadata.getMetadataKey(0)); + EXPECT_EQ("double_field", metadata.getMetadataKey(1)); + EXPECT_EQ("boolean_true_field", metadata.getMetadataKey(2)); + EXPECT_EQ("boolean_false_field", metadata.getMetadataKey(3)); + EXPECT_EQ("string_field", metadata.getMetadataKey(4)); + EXPECT_EQ("null_field", metadata.getMetadataKey(5)); + EXPECT_EQ("timestamp_field", metadata.getMetadataKey(6)); } } From 34c1d2c0c7d8e4897e2b22e4f13bebb2b4b89b24 Mon Sep 17 00:00:00 2001 From: mwish Date: Mon, 12 May 2025 21:14:06 +0800 Subject: [PATCH 03/31] add some value interfaces --- cpp/src/parquet/variant.h | 71 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 70 insertions(+), 1 deletion(-) diff --git a/cpp/src/parquet/variant.h b/cpp/src/parquet/variant.h index 90c67e1842ac..c72b537d6163 100644 --- a/cpp/src/parquet/variant.h +++ b/cpp/src/parquet/variant.h @@ -21,6 +21,8 @@ #include #include +#include + namespace parquet::variant { // TODO(mwish): Should I use parquet::ByteArray rather than @@ -83,6 +85,32 @@ enum class VariantPrimitiveType { Uuid = 20 }; +/// VariantType is from basic type and primitive type. +enum class VariantType { + OBJECT, + ARRAY, + VARIANT_NULL, + BOOLEAN, + BYTE, + SHORT, + INT, + LONG, + STRING, + DOUBLE, + DECIMAL4, + DECIMAL8, + DECIMAL16, + DATE, + TIMESTAMP_TZ, + TIMESTAMP_NTZ, + FLOAT, + BINARY, + TIME, + TIMESTAMP_NANOS_TZ, + TIMESTAMP_NANOS_NTZ, + UUID +}; + class VariantMetadata { public: explicit VariantMetadata(std::string_view metadata); @@ -103,6 +131,47 @@ class VariantMetadata { struct VariantValue { VariantMetadata metadata; std::string_view value; + + VariantBasicType getBasicType() const; + VariantType getType() const; + std::string typeDebugString() const; + + // Note: Null doesn't need visitor. + bool getBool() const; + int8_t getInt8() const; + int16_t getInt16() const; + int32_t getInt32() const; + int64_t getInt64() const; + std::string_view getString() const; + std::string_view getBinary() const; + float getFloat() const; + double getDouble() const; + // ::arrow::Decimal32 getDecimal32() const; + + struct ObjectInfo { + uint32_t num_elements; + uint32_t id_size; + uint32_t offset_size; + uint32_t id_start_offset; + uint32_t offset_start_offset; + uint32_t data_start_offset; + }; + ObjectInfo getObjectInfo() const; + std::optional getObjectValueByKey(std::string_view key) const; + + struct ArrayInfo { + uint32_t num_elements; + uint32_t offset_size; + uint32_t offset_start_offset; + uint32_t data_start_offset; + }; + ArrayInfo getArrayInfo() const; + + static constexpr uint8_t BASIC_TYPE_MASK = 0b00000011; + static constexpr uint8_t PRIMITIVE_TYPE_MASK = 0b00111111; + /** The inclusive maximum value of the type info value. It is the size limit of + * `SHORT_STR`. */ + static constexpr uint8_t MAX_SHORT_STR_SIZE_MASK = 0b00111111; }; -} // namespace parquet::variant \ No newline at end of file +} // namespace parquet::variant From ad585f28cf47dfd8cfc57606a5c1f63e48fb6abf Mon Sep 17 00:00:00 2001 From: mwish Date: Tue, 13 May 2025 16:28:14 +0800 Subject: [PATCH 04/31] Add basic tests (will failed) --- cpp/src/arrow/util/decimal.h | 4 +- cpp/src/parquet/CMakeLists.txt | 2 +- cpp/src/parquet/variant.cc | 620 +++++++++++++++++++++++++++++++ cpp/src/parquet/variant.h | 28 +- cpp/src/parquet/variant_test.cc | 508 +++++++++++++++++++++++++ cpp/src/parquet/variant_test.cpp | 75 ---- cpp/submodules/parquet-testing | 2 +- 7 files changed, 1159 insertions(+), 80 deletions(-) create mode 100644 cpp/src/parquet/variant_test.cc delete mode 100644 cpp/src/parquet/variant_test.cpp diff --git a/cpp/src/arrow/util/decimal.h b/cpp/src/arrow/util/decimal.h index 640dc9aec157..00328668928e 100644 --- a/cpp/src/arrow/util/decimal.h +++ b/cpp/src/arrow/util/decimal.h @@ -108,7 +108,7 @@ class ARROW_EXPORT Decimal32 : public BasicDecimal32 { /// \brief Convert from a big-endian byte representation. The length must be /// between 1 and 4 - /// \return error statis if the length is an invalid value + /// \return error status if the length is an invalid value static Result FromBigEndian(const uint8_t* data, int32_t length); /// \brief Convert Decimal32 from one scale to another @@ -220,7 +220,7 @@ class ARROW_EXPORT Decimal64 : public BasicDecimal64 { /// \brief Convert from a big-endian byte representation. The length must be /// between 1 and 4 - /// \return error statis if the length is an invalid value + /// \return error status if the length is an invalid value static Result FromBigEndian(const uint8_t* data, int32_t length); /// \brief Convert Decimal64 from one scale to another diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index 9177870679a3..edcec8f4ee73 100644 --- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ -383,7 +383,7 @@ add_parquet_test(internals-test size_statistics_test.cc statistics_test.cc types_test.cc - variant_test.cpp) + variant_test.cc) set_source_files_properties(public_api_test.cc PROPERTIES SKIP_UNITY_BUILD_INCLUSION ON) diff --git a/cpp/src/parquet/variant.cc b/cpp/src/parquet/variant.cc index 8b3415b32c49..10c7651c43e7 100644 --- a/cpp/src/parquet/variant.cc +++ b/cpp/src/parquet/variant.cc @@ -18,6 +18,7 @@ #include "parquet/variant.h" #include +#include #include #include "arrow/util/endian.h" @@ -84,4 +85,623 @@ std::string_view VariantMetadata::getMetadataKey(int32_t variantId) const { } return std::string_view(metadata_.data() + string_start, key_size); } + +VariantBasicType VariantValue::getBasicType() const { + if (value.empty()) { + throw ParquetException("Empty variant value"); + } + return static_cast(value[0] & BASIC_TYPE_MASK); +} + +VariantType VariantValue::getType() const { + VariantBasicType basic_type = getBasicType(); + + switch (basic_type) { + case VariantBasicType::Primitive: { + auto primitive_type = static_cast(value[0] >> 2); + switch (primitive_type) { + case VariantPrimitiveType::NullType: + return VariantType::VARIANT_NULL; + case VariantPrimitiveType::BooleanTrue: + case VariantPrimitiveType::BooleanFalse: + return VariantType::BOOLEAN; + case VariantPrimitiveType::Int8: + return VariantType::BYTE; + case VariantPrimitiveType::Int16: + return VariantType::SHORT; + case VariantPrimitiveType::Int32: + return VariantType::INT; + case VariantPrimitiveType::Int64: + return VariantType::LONG; + case VariantPrimitiveType::Double: + return VariantType::DOUBLE; + case VariantPrimitiveType::Decimal4: + return VariantType::DECIMAL4; + case VariantPrimitiveType::Decimal8: + return VariantType::DECIMAL8; + case VariantPrimitiveType::Decimal16: + return VariantType::DECIMAL16; + case VariantPrimitiveType::Date: + return VariantType::DATE; + case VariantPrimitiveType::Timestamp: + return VariantType::TIMESTAMP_TZ; + case VariantPrimitiveType::TimestampNTZ: + return VariantType::TIMESTAMP_NTZ; + case VariantPrimitiveType::Float: + return VariantType::FLOAT; + case VariantPrimitiveType::Binary: + return VariantType::BINARY; + case VariantPrimitiveType::String: + return VariantType::STRING; + case VariantPrimitiveType::TimeNTZ: + return VariantType::TIME; + case VariantPrimitiveType::TimestampTZ: + return VariantType::TIMESTAMP_NANOS_TZ; + case VariantPrimitiveType::TimestampNTZNanos: + return VariantType::TIMESTAMP_NANOS_NTZ; + case VariantPrimitiveType::Uuid: + return VariantType::UUID; + default: + throw ParquetException("Unknown primitive type: " + + std::to_string(static_cast(primitive_type))); + } + } + case VariantBasicType::ShortString: + return VariantType::STRING; + case VariantBasicType::Object: + return VariantType::OBJECT; + case VariantBasicType::Array: + return VariantType::ARRAY; + default: + throw ParquetException("Unknown basic type: " + + std::to_string(static_cast(basic_type))); + } +} + +std::string VariantValue::typeDebugString() const { + VariantType type = getType(); + switch (type) { + case VariantType::OBJECT: + return "OBJECT"; + case VariantType::ARRAY: + return "ARRAY"; + case VariantType::VARIANT_NULL: + return "NULL"; + case VariantType::BOOLEAN: + return "BOOLEAN"; + case VariantType::BYTE: + return "BYTE"; + case VariantType::SHORT: + return "SHORT"; + case VariantType::INT: + return "INT"; + case VariantType::LONG: + return "LONG"; + case VariantType::STRING: + return "STRING"; + case VariantType::DOUBLE: + return "DOUBLE"; + case VariantType::DECIMAL4: + return "DECIMAL4"; + case VariantType::DECIMAL8: + return "DECIMAL8"; + case VariantType::DECIMAL16: + return "DECIMAL16"; + case VariantType::DATE: + return "DATE"; + case VariantType::TIMESTAMP_TZ: + return "TIMESTAMP_TZ"; + case VariantType::TIMESTAMP_NTZ: + return "TIMESTAMP_NTZ"; + case VariantType::FLOAT: + return "FLOAT"; + case VariantType::BINARY: + return "BINARY"; + case VariantType::TIME: + return "TIME"; + case VariantType::TIMESTAMP_NANOS_TZ: + return "TIMESTAMP_NANOS_TZ"; + case VariantType::TIMESTAMP_NANOS_NTZ: + return "TIMESTAMP_NANOS_NTZ"; + case VariantType::UUID: + return "UUID"; + default: + return "UNKNOWN"; + } +} + +bool VariantValue::getBool() const { + if (getBasicType() != VariantBasicType::Primitive) { + throw ParquetException("Not a primitive type"); + } + + auto primitive_type = static_cast(value[0] >> 2); + if (primitive_type == VariantPrimitiveType::BooleanTrue) { + return true; + } else if (primitive_type == VariantPrimitiveType::BooleanFalse) { + return false; + } + + throw ParquetException("Not a boolean type"); +} + +int8_t VariantValue::getInt8() const { + if (getBasicType() != VariantBasicType::Primitive) { + throw ParquetException("Not a primitive type"); + } + + auto primitive_type = static_cast(value[0] >> 2); + if (primitive_type != VariantPrimitiveType::Int8) { + throw ParquetException("Not an Int8 type"); + } + + if (value.size() < 2) { + throw ParquetException("Invalid Int8 value: too short"); + } + + return static_cast(value[1]); +} + +int16_t VariantValue::getInt16() const { + if (getBasicType() != VariantBasicType::Primitive) { + throw ParquetException("Not a primitive type"); + } + + auto primitive_type = static_cast(value[0] >> 2); + if (primitive_type != VariantPrimitiveType::Int16) { + throw ParquetException("Not an Int16 type"); + } + + if (value.size() < 3) { + throw ParquetException("Invalid Int16 value: too short"); + } + + int16_t result; + memcpy(&result, value.data() + 1, sizeof(int16_t)); + return arrow::bit_util::FromLittleEndian(result); +} + +int32_t VariantValue::getInt32() const { + if (getBasicType() != VariantBasicType::Primitive) { + throw ParquetException("Not a primitive type"); + } + + auto primitive_type = static_cast(value[0] >> 2); + if (primitive_type != VariantPrimitiveType::Int32) { + throw ParquetException("Not an Int32 type"); + } + + if (value.size() < 5) { + throw ParquetException("Invalid Int32 value: too short"); + } + + int32_t result; + memcpy(&result, value.data() + 1, sizeof(int32_t)); + return arrow::bit_util::FromLittleEndian(result); +} + +int64_t VariantValue::getInt64() const { + if (getBasicType() != VariantBasicType::Primitive) { + throw ParquetException("Not a primitive type"); + } + + auto primitive_type = static_cast(value[0] >> 2); + if (primitive_type != VariantPrimitiveType::Int64) { + throw ParquetException("Not an Int64 type"); + } + + if (value.size() < 9) { + throw ParquetException("Invalid Int64 value: too short"); + } + + int64_t result; + memcpy(&result, value.data() + 1, sizeof(int64_t)); + return arrow::bit_util::FromLittleEndian(result); +} + +std::string_view VariantValue::getString() const { + VariantBasicType basic_type = getBasicType(); + + if (basic_type == VariantBasicType::ShortString) { + uint8_t length = (value[0] >> 2) & MAX_SHORT_STR_SIZE_MASK; + if (value.size() < length + 1) { + throw ParquetException("Invalid short string: too short"); + } + return std::string_view(value.data() + 1, length); + } else if (basic_type == VariantBasicType::Primitive) { + auto primitive_type = static_cast(value[0] >> 2); + if (primitive_type != VariantPrimitiveType::String) { + throw ParquetException("Not a string type"); + } + + if (value.size() < 5) { + throw ParquetException("Invalid string value: too short"); + } + + uint32_t length; + memcpy(&length, value.data() + 1, sizeof(uint32_t)); + length = arrow::bit_util::FromLittleEndian(length); + + if (value.size() < length + 5) { + throw ParquetException("Invalid string value: too short for specified length"); + } + + return std::string_view(value.data() + 5, length); + } + + throw ParquetException("Not a string type"); +} + +std::string_view VariantValue::getBinary() const { + if (getBasicType() != VariantBasicType::Primitive) { + throw ParquetException("Not a primitive type"); + } + + auto primitive_type = static_cast(value[0] >> 2); + if (primitive_type != VariantPrimitiveType::Binary) { + throw ParquetException("Not a binary type"); + } + + if (value.size() < 5) { + throw ParquetException("Invalid binary value: too short"); + } + + uint32_t length; + memcpy(&length, value.data() + 1, sizeof(uint32_t)); + length = arrow::bit_util::FromLittleEndian(length); + + if (value.size() < length + 5) { + throw ParquetException("Invalid binary value: too short for specified length"); + } + + return std::string_view(value.data() + 5, length); +} + +float VariantValue::getFloat() const { + if (getBasicType() != VariantBasicType::Primitive) { + throw ParquetException("Not a primitive type"); + } + + auto primitive_type = static_cast(value[0] >> 2); + if (primitive_type != VariantPrimitiveType::Float) { + throw ParquetException("Not a float type"); + } + + if (value.size() < 5) { + throw ParquetException("Invalid float value: too short"); + } + + float result; + memcpy(&result, value.data() + 1, sizeof(float)); + return arrow::bit_util::FromLittleEndian(result); +} + +double VariantValue::getDouble() const { + if (getBasicType() != VariantBasicType::Primitive) { + throw ParquetException("Not a primitive type"); + } + + auto primitive_type = static_cast(value[0] >> 2); + if (primitive_type != VariantPrimitiveType::Double) { + throw ParquetException("Not a double type"); + } + + if (value.size() < 9) { + throw ParquetException("Invalid double value: too short"); + } + + double result; + memcpy(&result, value.data() + 1, sizeof(double)); + return arrow::bit_util::FromLittleEndian(result); +} + +DecimalValue<::arrow::Decimal32> VariantValue::getDecimal4() const { + if (getBasicType() != VariantBasicType::Primitive) { + throw ParquetException("Not a primitive type"); + } + + auto primitive_type = static_cast(value[0] >> 2); + if (primitive_type != VariantPrimitiveType::Decimal4) { + throw ParquetException("Not a decimal4 type"); + } + + if (value.size() < 6) { + throw ParquetException("Invalid decimal4 value: too short"); + } + + uint8_t scale = value[1]; + int32_t decimal_value; + memcpy(&decimal_value, value.data() + 2, sizeof(int32_t)); + decimal_value = arrow::bit_util::FromLittleEndian(decimal_value); + + return {scale, ::arrow::Decimal32(decimal_value)}; +} + +DecimalValue<::arrow::Decimal64> VariantValue::getDecimal8() const { + if (getBasicType() != VariantBasicType::Primitive) { + throw ParquetException("Not a primitive type"); + } + + auto primitive_type = static_cast(value[0] >> 2); + if (primitive_type != VariantPrimitiveType::Decimal8) { + throw ParquetException("Not a decimal8 type"); + } + + if (value.size() < 10) { + throw ParquetException("Invalid decimal8 value: too short"); + } + + uint8_t scale = value[1]; + int64_t decimal_value; + memcpy(&decimal_value, value.data() + 2, sizeof(int64_t)); + decimal_value = arrow::bit_util::FromLittleEndian(decimal_value); + + return {scale, ::arrow::Decimal64(decimal_value)}; +} + +DecimalValue<::arrow::Decimal128> VariantValue::getDecimal16() const { + if (getBasicType() != VariantBasicType::Primitive) { + throw ParquetException("Not a primitive type"); + } + + auto primitive_type = static_cast(value[0] >> 2); + if (primitive_type != VariantPrimitiveType::Decimal16) { + throw ParquetException("Not a decimal16 type"); + } + + if (value.size() < 18) { + throw ParquetException("Invalid decimal16 value: too short"); + } + + uint8_t scale = value[1]; + + // Decimal128 is stored as two int64_t values (low bits, high bits) + int64_t low_bits, high_bits; + memcpy(&low_bits, value.data() + 2, sizeof(int64_t)); + memcpy(&high_bits, value.data() + 10, sizeof(int64_t)); + low_bits = arrow::bit_util::FromLittleEndian(low_bits); + high_bits = arrow::bit_util::FromLittleEndian(high_bits); + + return {scale, ::arrow::Decimal128(high_bits, low_bits)}; +} + +int64_t VariantValue::timeNTZ() const { + if (getBasicType() != VariantBasicType::Primitive) { + throw ParquetException("Not a primitive type"); + } + + auto primitive_type = static_cast(value[0] >> 2); + if (primitive_type != VariantPrimitiveType::TimeNTZ) { + throw ParquetException("Not a timeNTZ type"); + } + + if (value.size() < 9) { + throw ParquetException("Invalid timeNTZ value: too short"); + } + + int64_t result; + memcpy(&result, value.data() + 1, sizeof(int64_t)); + return arrow::bit_util::FromLittleEndian(result); +} + +int64_t VariantValue::getTimestamp() const { + if (getBasicType() != VariantBasicType::Primitive) { + throw ParquetException("Not a primitive type"); + } + + auto primitive_type = static_cast(value[0] >> 2); + if (primitive_type != VariantPrimitiveType::Timestamp) { + throw ParquetException("Not a timestamp type"); + } + + if (value.size() < 9) { + throw ParquetException("Invalid timestamp value: too short"); + } + + int64_t result; + memcpy(&result, value.data() + 1, sizeof(int64_t)); + return arrow::bit_util::FromLittleEndian(result); +} + +int64_t VariantValue::getTimestampNTZ() const { + if (getBasicType() != VariantBasicType::Primitive) { + throw ParquetException("Not a primitive type"); + } + + auto primitive_type = static_cast(value[0] >> 2); + if (primitive_type != VariantPrimitiveType::TimestampNTZ) { + throw ParquetException("Not a timestampNTZ type"); + } + + if (value.size() < 9) { + throw ParquetException("Invalid timestampNTZ value: too short"); + } + + int64_t result; + memcpy(&result, value.data() + 1, sizeof(int64_t)); + return arrow::bit_util::FromLittleEndian(result); +} + +const uint8_t* VariantValue::getUuid() const { + if (getBasicType() != VariantBasicType::Primitive) { + throw ParquetException("Not a primitive type"); + } + + auto primitive_type = static_cast(value[0] >> 2); + if (primitive_type != VariantPrimitiveType::Uuid) { + throw ParquetException("Not a UUID type"); + } + + if (value.size() < 17) { + throw ParquetException("Invalid UUID value: too short"); + } + + return reinterpret_cast(value.data() + 1); +} + +VariantValue::ObjectInfo VariantValue::getObjectInfo() const { + if (getBasicType() != VariantBasicType::Object) { + throw ParquetException("Not an object type"); + } + + if (value.size() < 5) { + throw ParquetException("Invalid object value: too short"); + } + + uint32_t num_elements; + memcpy(&num_elements, value.data() + 1, sizeof(uint32_t)); + num_elements = arrow::bit_util::FromLittleEndian(num_elements); + + if (value.size() < 6) { + throw ParquetException("Invalid object value: too short for id_size"); + } + + uint8_t id_size = value[5]; + + if (value.size() < 7) { + throw ParquetException("Invalid object value: too short for offset_size"); + } + + uint8_t offset_size = value[6]; + + if (offset_size < 1 || offset_size > 4 || id_size < 1 || id_size > 4) { + throw ParquetException("Invalid object value: invalid id_size or offset_size"); + } + + uint32_t id_start_offset = 7; + uint32_t offset_start_offset = id_start_offset + num_elements * id_size; + uint32_t data_start_offset = offset_start_offset + (num_elements + 1) * offset_size; + + return {num_elements, id_size, offset_size, id_start_offset, + offset_start_offset, data_start_offset}; +} + +std::optional VariantValue::getObjectValueByKey( + std::string_view key) const { + if (getBasicType() != VariantBasicType::Object) { + throw ParquetException("Not an object type"); + } + + ObjectInfo info = getObjectInfo(); + + for (uint32_t i = 0; i < info.num_elements; ++i) { + std::string_view field_key; + std::optional field_value = getObjectFieldByFieldId(i, &field_key); + + if (field_key == key) { + return field_value; + } + } + + return std::nullopt; +} + +std::optional VariantValue::getObjectFieldByFieldId( + uint32_t variantId, std::string_view* key) const { + ObjectInfo info = getObjectInfo(); + + if (variantId >= info.num_elements) { + throw ParquetException("Field ID out of range"); + } + + // Read the field ID + uint32_t field_id = 0; + memcpy(&field_id, value.data() + info.id_start_offset + variantId * info.id_size, + info.id_size); + field_id = arrow::bit_util::FromLittleEndian(field_id); + + // Get the key from metadata + if (key != nullptr) { + *key = metadata.getMetadataKey(field_id); + } + + // Read the offset and next offset + uint32_t offset = 0, next_offset = 0; + memcpy(&offset, value.data() + info.offset_start_offset + variantId * info.offset_size, + info.offset_size); + memcpy(&next_offset, + value.data() + info.offset_start_offset + (variantId + 1) * info.offset_size, + info.offset_size); + offset = arrow::bit_util::FromLittleEndian(offset); + next_offset = arrow::bit_util::FromLittleEndian(next_offset); + + if (offset == next_offset) { + // Field is not present (null) + return std::nullopt; + } + + if (info.data_start_offset + offset >= value.size() || + info.data_start_offset + next_offset > value.size() || offset > next_offset) { + throw ParquetException("Invalid object field offsets"); + } + + // Create a VariantValue for the field + VariantValue field_value{ + .metadata = metadata, + .value = std::string_view(value.data() + info.data_start_offset + offset, + next_offset - offset)}; + + return field_value; +} + +VariantValue::ArrayInfo VariantValue::getArrayInfo() const { + if (getBasicType() != VariantBasicType::Array) { + throw ParquetException("Not an array type"); + } + + if (value.size() < 6) { + throw ParquetException("Invalid array value: too short"); + } + + uint32_t num_elements; + memcpy(&num_elements, value.data() + 1, sizeof(uint32_t)); + num_elements = arrow::bit_util::FromLittleEndian(num_elements); + + if (value.size() < 6) { + throw ParquetException("Invalid array value: too short for offset_size"); + } + + uint8_t offset_size = value[5]; + + if (offset_size < 1 || offset_size > 4) { + throw ParquetException("Invalid array value: invalid offset_size"); + } + + uint32_t offset_start_offset = 6; + uint32_t data_start_offset = offset_start_offset + (num_elements + 1) * offset_size; + + return {num_elements, offset_size, offset_start_offset, data_start_offset}; +} + +VariantValue VariantValue::getArrayValueByIndex(uint32_t index) const { + ArrayInfo info = getArrayInfo(); + + if (index >= info.num_elements) { + throw ParquetException("Array index out of range"); + } + + // Read the offset and next offset + uint32_t offset = 0, next_offset = 0; + memcpy(&offset, value.data() + info.offset_start_offset + index * info.offset_size, + info.offset_size); + memcpy(&next_offset, + value.data() + info.offset_start_offset + (index + 1) * info.offset_size, + info.offset_size); + offset = arrow::bit_util::FromLittleEndian(offset); + next_offset = arrow::bit_util::FromLittleEndian(next_offset); + + if (info.data_start_offset + offset >= value.size() || + info.data_start_offset + next_offset > value.size() || offset > next_offset) { + throw ParquetException("Invalid array element offsets"); + } + + // Create a VariantValue for the element + VariantValue element_value{ + .metadata = metadata, + .value = std::string_view(value.data() + info.data_start_offset + offset, + next_offset - offset)}; + + return element_value; +} + } // namespace parquet::variant diff --git a/cpp/src/parquet/variant.h b/cpp/src/parquet/variant.h index c72b537d6163..7154eaa6351d 100644 --- a/cpp/src/parquet/variant.h +++ b/cpp/src/parquet/variant.h @@ -128,6 +128,12 @@ class VariantMetadata { std::string_view metadata_; }; +template +struct DecimalValue { + uint8_t scale; + DecimalType value; +}; + struct VariantValue { VariantMetadata metadata; std::string_view value; @@ -136,17 +142,33 @@ struct VariantValue { VariantType getType() const; std::string typeDebugString() const; + /// \defgroup ValueAccessors + /// @{ + // Note: Null doesn't need visitor. bool getBool() const; int8_t getInt8() const; int16_t getInt16() const; int32_t getInt32() const; int64_t getInt64() const; + /// Include short_string optimization and primitive string type std::string_view getString() const; std::string_view getBinary() const; float getFloat() const; double getDouble() const; - // ::arrow::Decimal32 getDecimal32() const; + + DecimalValue<::arrow::Decimal32> getDecimal4() const; + DecimalValue<::arrow::Decimal64> getDecimal8() const; + DecimalValue<::arrow::Decimal128> getDecimal16() const; + + int64_t timeNTZ() const; + // timestamp with adjusted to UTC + int64_t getTimestamp() const; + int64_t getTimestampNTZ() const; + // 16 bytes UUID + const uint8_t* getUuid() const; + + /// }@ struct ObjectInfo { uint32_t num_elements; @@ -158,6 +180,8 @@ struct VariantValue { }; ObjectInfo getObjectInfo() const; std::optional getObjectValueByKey(std::string_view key) const; + std::optional getObjectFieldByFieldId(uint32_t variantId, + std::string_view* key) const; struct ArrayInfo { uint32_t num_elements; @@ -166,6 +190,8 @@ struct VariantValue { uint32_t data_start_offset; }; ArrayInfo getArrayInfo() const; + // Would throw ParquetException if index is out of range. + VariantValue getArrayValueByIndex(uint32_t index) const; static constexpr uint8_t BASIC_TYPE_MASK = 0b00000011; static constexpr uint8_t PRIMITIVE_TYPE_MASK = 0b00111111; diff --git a/cpp/src/parquet/variant_test.cc b/cpp/src/parquet/variant_test.cc new file mode 100644 index 000000000000..d260e0311bd4 --- /dev/null +++ b/cpp/src/parquet/variant_test.cc @@ -0,0 +1,508 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include + +#include + +#include "parquet/exception.h" +#include "parquet/test_util.h" +#include "parquet/variant.h" + +#include +#include + +namespace parquet::variant { + +TEST(ParquetVariant, MetadataBase) { + std::string dir_string(parquet::test::get_variant_dir()); + auto file_system = std::make_shared<::arrow::fs::LocalFileSystem>(); + std::vector primitive_metadatas = { + // "primitive_null.metadata", + "primitive_boolean_true.metadata", "primitive_boolean_false.metadata", + "primitive_date.metadata", "primitive_decimal4.metadata", + "primitive_decimal8.metadata", "primitive_decimal16.metadata", + "primitive_float.metadata", "primitive_double.metadata", + "primitive_int8.metadata", "primitive_int16.metadata", + "primitive_int32.metadata", "primitive_int64.metadata", + "primitive_binary.metadata", "primitive_string.metadata", + }; + for (auto& test_file : primitive_metadatas) { + ARROW_SCOPED_TRACE("Testing file: " + test_file); + std::string path = dir_string + "/" + test_file; + ASSERT_OK_AND_ASSIGN(auto file, file_system->OpenInputFile(path)); + ASSERT_OK_AND_ASSIGN(auto file_size, file->GetSize()); + ASSERT_OK_AND_ASSIGN(auto buf, file->Read(file_size)); + + VariantMetadata metadata(std::string_view{*buf}); + EXPECT_EQ(1, metadata.version()); + EXPECT_THROW(metadata.getMetadataKey(0), ParquetException); + } + + { + std::string object_metadata = {"object_primitive.metadata"}; + ARROW_SCOPED_TRACE("Testing file: " + object_metadata); + std::string path = dir_string + "/" + object_metadata; + ASSERT_OK_AND_ASSIGN(auto file, file_system->OpenInputFile(path)); + ASSERT_OK_AND_ASSIGN(auto file_size, file->GetSize()); + ASSERT_OK_AND_ASSIGN(auto buf, file->Read(file_size)); + + VariantMetadata metadata(std::string_view{*buf}); + EXPECT_EQ("int_field", metadata.getMetadataKey(0)); + EXPECT_EQ("double_field", metadata.getMetadataKey(1)); + EXPECT_EQ("boolean_true_field", metadata.getMetadataKey(2)); + EXPECT_EQ("boolean_false_field", metadata.getMetadataKey(3)); + EXPECT_EQ("string_field", metadata.getMetadataKey(4)); + EXPECT_EQ("null_field", metadata.getMetadataKey(5)); + EXPECT_EQ("timestamp_field", metadata.getMetadataKey(6)); + } +} + +std::string metadata_test_file_name(std::string_view test_name) { + return std::string(test_name) + ".metadata"; +} + +std::string value_test_file_name(std::string_view test_name) { + return std::string(test_name) + ".value"; +} + +// 修改后的辅助函数,用于从文件加载 VariantValue,并保持 Buffer 生命周期 +VariantValue LoadVariantValue(const std::string& test_name, + std::shared_ptr<::arrow::Buffer>* metadata_buf_out, + std::shared_ptr<::arrow::Buffer>* value_buf_out) { + std::string dir_string(parquet::test::get_variant_dir()); + auto file_system = std::make_shared<::arrow::fs::LocalFileSystem>(); + + std::string metadata_path = dir_string + "/" + metadata_test_file_name(test_name); + EXPECT_OK_AND_ASSIGN(auto metadata_file, file_system->OpenInputFile(metadata_path)); + EXPECT_OK_AND_ASSIGN(auto metadata_size, metadata_file->GetSize()); + EXPECT_OK_AND_ASSIGN(auto metadata_buf, metadata_file->Read(metadata_size)); + *metadata_buf_out = metadata_buf; + + std::string value_path = dir_string + "/" + value_test_file_name(test_name); + EXPECT_OK_AND_ASSIGN(auto value_file, file_system->OpenInputFile(value_path)); + EXPECT_OK_AND_ASSIGN(auto value_size, value_file->GetSize()); + EXPECT_OK_AND_ASSIGN(auto value_buf, value_file->Read(value_size)); + *value_buf_out = value_buf; + + VariantMetadata metadata(std::string_view{**metadata_buf_out}); + return VariantValue{metadata, std::string_view{**value_buf_out}}; +} + +TEST(ParquetVariant, BooleanValue) { + // test true + { + std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; + auto variant = LoadVariantValue("primitive_boolean_true", &metadata_buf, &value_buf); + std::cout << variant.typeDebugString() << '\n'; + EXPECT_EQ(VariantType::BOOLEAN, variant.getType()); + EXPECT_EQ("BOOLEAN", variant.typeDebugString()); + EXPECT_EQ(true, variant.getBool()); + } + + // test false + { + std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; + auto variant = LoadVariantValue("primitive_boolean_false", &metadata_buf, &value_buf); + EXPECT_EQ(VariantType::BOOLEAN, variant.getType()); + EXPECT_EQ(false, variant.getBool()); + } + + { + std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; + auto variant = LoadVariantValue("primitive_int32", &metadata_buf, &value_buf); + EXPECT_THROW(variant.getBool(), ParquetException); + } +} + +TEST(ParquetVariant, NumericValues) { + // 测试 Int8 值 + { + std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; + auto variant = LoadVariantValue("primitive_int8", &metadata_buf, &value_buf); + EXPECT_EQ(VariantType::BYTE, variant.getType()); + EXPECT_EQ("BYTE", variant.typeDebugString()); + EXPECT_EQ(42, variant.getInt8()); + } + + // 测试 Int16 值 + { + std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; + auto variant = LoadVariantValue("primitive_int16", &metadata_buf, &value_buf); + EXPECT_EQ(VariantType::SHORT, variant.getType()); + EXPECT_EQ("SHORT", variant.typeDebugString()); + EXPECT_EQ(12345, variant.getInt16()); + } + + // 测试 Int32 值 + { + std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; + auto variant = LoadVariantValue("primitive_int32", &metadata_buf, &value_buf); + EXPECT_EQ(VariantType::INT, variant.getType()); + EXPECT_EQ("INT", variant.typeDebugString()); + EXPECT_EQ(1234567890, variant.getInt32()); + } + + // 测试 Int64 值 + { + std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; + auto variant = LoadVariantValue("primitive_int64", &metadata_buf, &value_buf); + EXPECT_EQ(VariantType::LONG, variant.getType()); + EXPECT_EQ("LONG", variant.typeDebugString()); + EXPECT_EQ(1234567890123456789LL, variant.getInt64()); + } + + // 测试 Float 值 + { + std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; + auto variant = LoadVariantValue("primitive_float", &metadata_buf, &value_buf); + EXPECT_EQ(VariantType::FLOAT, variant.getType()); + EXPECT_EQ("FLOAT", variant.typeDebugString()); + EXPECT_FLOAT_EQ(3.14159f, variant.getFloat()); + } + + // 测试 Double 值 + { + std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; + auto variant = LoadVariantValue("primitive_double", &metadata_buf, &value_buf); + EXPECT_EQ(VariantType::DOUBLE, variant.getType()); + EXPECT_EQ("DOUBLE", variant.typeDebugString()); + EXPECT_DOUBLE_EQ(2.71828182845904523536, variant.getDouble()); + } + + // 测试类型不匹配的异常 + { + std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; + auto variant = LoadVariantValue("primitive_int32", &metadata_buf, &value_buf); + EXPECT_THROW(variant.getInt64(), ParquetException); + EXPECT_THROW(variant.getFloat(), ParquetException); + EXPECT_THROW(variant.getDouble(), ParquetException); + } +} + +TEST(ParquetVariant, StringValues) { + // 测试普通字符串 + { + std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; + auto variant = LoadVariantValue("primitive_string", &metadata_buf, &value_buf); + EXPECT_EQ(VariantType::STRING, variant.getType()); + EXPECT_EQ("STRING", variant.typeDebugString()); + EXPECT_EQ("Hello, World!", variant.getString()); + } + + // 测试短字符串(使用 ShortString 优化) + { + std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; + auto variant = LoadVariantValue("short_string", &metadata_buf, &value_buf); + EXPECT_EQ(VariantType::STRING, variant.getType()); + EXPECT_EQ(VariantBasicType::ShortString, variant.getBasicType()); + EXPECT_EQ("Short", variant.getString()); + } + + // 测试二进制数据 + { + std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; + auto variant = LoadVariantValue("primitive_binary", &metadata_buf, &value_buf); + EXPECT_EQ(VariantType::BINARY, variant.getType()); + EXPECT_EQ("BINARY", variant.typeDebugString()); + auto binary_data = variant.getBinary(); + EXPECT_EQ(10, binary_data.size()); + } + + // 测试类型不匹配的异常 + { + std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; + auto variant = LoadVariantValue("primitive_int32", &metadata_buf, &value_buf); + EXPECT_THROW(variant.getString(), ParquetException); + EXPECT_THROW(variant.getBinary(), ParquetException); + } +} + +TEST(ParquetVariant, NullValue) { + std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; + auto variant = LoadVariantValue("primitive_null", &metadata_buf, &value_buf); + EXPECT_EQ(VariantType::VARIANT_NULL, variant.getType()); + EXPECT_EQ("NULL", variant.typeDebugString()); +} + +TEST(ParquetVariant, ObjectValues) { + std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; + auto variant = LoadVariantValue("object_primitive", &metadata_buf, &value_buf); + EXPECT_EQ(VariantType::OBJECT, variant.getType()); + EXPECT_EQ("OBJECT", variant.typeDebugString()); + + // 获取对象信息 + auto obj_info = variant.getObjectInfo(); + EXPECT_EQ(7, obj_info.num_elements); + + // 通过键获取值 + auto int_field = variant.getObjectValueByKey("int_field"); + ASSERT_TRUE(int_field.has_value()); + EXPECT_EQ(VariantType::INT, int_field->getType()); + EXPECT_EQ(42, int_field->getInt32()); + + auto double_field = variant.getObjectValueByKey("double_field"); + ASSERT_TRUE(double_field.has_value()); + EXPECT_EQ(VariantType::DOUBLE, double_field->getType()); + EXPECT_DOUBLE_EQ(3.14159, double_field->getDouble()); + + auto boolean_true_field = variant.getObjectValueByKey("boolean_true_field"); + ASSERT_TRUE(boolean_true_field.has_value()); + EXPECT_EQ(VariantType::BOOLEAN, boolean_true_field->getType()); + EXPECT_TRUE(boolean_true_field->getBool()); + + auto boolean_false_field = variant.getObjectValueByKey("boolean_false_field"); + ASSERT_TRUE(boolean_false_field.has_value()); + EXPECT_EQ(VariantType::BOOLEAN, boolean_false_field->getType()); + EXPECT_FALSE(boolean_false_field->getBool()); + + auto string_field = variant.getObjectValueByKey("string_field"); + ASSERT_TRUE(string_field.has_value()); + EXPECT_EQ(VariantType::STRING, string_field->getType()); + EXPECT_EQ("Hello, World!", string_field->getString()); + + auto null_field = variant.getObjectValueByKey("null_field"); + ASSERT_TRUE(null_field.has_value()); + EXPECT_EQ(VariantType::VARIANT_NULL, null_field->getType()); + + // 测试不存在的键 + auto non_existent = variant.getObjectValueByKey("non_existent"); + EXPECT_FALSE(non_existent.has_value()); + + // 通过字段ID获取值 + std::string_view key; + auto field_by_id = variant.getObjectFieldByFieldId(0, &key); + ASSERT_TRUE(field_by_id.has_value()); + EXPECT_EQ("int_field", key); + EXPECT_EQ(VariantType::INT, field_by_id->getType()); + EXPECT_EQ(42, field_by_id->getInt32()); +} + +TEST(ParquetVariant, ArrayValues) { + std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; + auto variant = LoadVariantValue("array_primitive", &metadata_buf, &value_buf); + EXPECT_EQ(VariantType::ARRAY, variant.getType()); + EXPECT_EQ("ARRAY", variant.typeDebugString()); + + // 获取数组信息 + auto array_info = variant.getArrayInfo(); + EXPECT_EQ(5, array_info.num_elements); + + // 通过索引获取值 + auto element0 = variant.getArrayValueByIndex(0); + EXPECT_EQ(VariantType::INT, element0.getType()); + EXPECT_EQ(1, element0.getInt32()); + + auto element1 = variant.getArrayValueByIndex(1); + EXPECT_EQ(VariantType::INT, element1.getType()); + EXPECT_EQ(2, element1.getInt32()); + + auto element2 = variant.getArrayValueByIndex(2); + EXPECT_EQ(VariantType::INT, element2.getType()); + EXPECT_EQ(3, element2.getInt32()); + + auto element3 = variant.getArrayValueByIndex(3); + EXPECT_EQ(VariantType::INT, element3.getType()); + EXPECT_EQ(4, element3.getInt32()); + + auto element4 = variant.getArrayValueByIndex(4); + EXPECT_EQ(VariantType::INT, element4.getType()); + EXPECT_EQ(5, element4.getInt32()); + + // 测试越界索引 + EXPECT_THROW(variant.getArrayValueByIndex(5), ParquetException); + EXPECT_THROW(variant.getArrayValueByIndex(100), ParquetException); +} + +TEST(ParquetVariant, DecimalValues) { + // 测试 Decimal4 值 + { + std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; + auto variant = LoadVariantValue("primitive_decimal4", &metadata_buf, &value_buf); + EXPECT_EQ(VariantType::DECIMAL4, variant.getType()); + EXPECT_EQ("DECIMAL4", variant.typeDebugString()); + auto decimal = variant.getDecimal4(); + EXPECT_EQ(2, decimal.scale); + EXPECT_EQ("123.45", decimal.value.ToString(decimal.scale)); + } + + // 测试 Decimal8 值 + { + std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; + auto variant = LoadVariantValue("primitive_decimal8", &metadata_buf, &value_buf); + EXPECT_EQ(VariantType::DECIMAL8, variant.getType()); + EXPECT_EQ("DECIMAL8", variant.typeDebugString()); + auto decimal = variant.getDecimal8(); + EXPECT_EQ(4, decimal.scale); + EXPECT_EQ("12345.6789", decimal.value.ToString(decimal.scale)); + } + + // 测试 Decimal16 值 + { + std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; + auto variant = LoadVariantValue("primitive_decimal16", &metadata_buf, &value_buf); + EXPECT_EQ(VariantType::DECIMAL16, variant.getType()); + EXPECT_EQ("DECIMAL16", variant.typeDebugString()); + auto decimal = variant.getDecimal16(); + EXPECT_EQ(8, decimal.scale); + EXPECT_EQ("123456789.12345678", decimal.value.ToString(decimal.scale)); + } +} + +TEST(ParquetVariant, DateTimeValues) { + // 测试日期值 + { + std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; + auto variant = LoadVariantValue("primitive_date", &metadata_buf, &value_buf); + EXPECT_EQ(VariantType::DATE, variant.getType()); + EXPECT_EQ("DATE", variant.typeDebugString()); + // 日期值表示为自 Unix 纪元以来的天数 + EXPECT_EQ(18262, variant.getInt32()); // 2020-01-01 + } + + // 测试时间值 + { + std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; + auto variant = LoadVariantValue("primitive_time", &metadata_buf, &value_buf); + EXPECT_EQ(VariantType::TIME, variant.getType()); + EXPECT_EQ("TIME", variant.typeDebugString()); + // 时间值表示为自午夜以来的微秒数 + EXPECT_EQ(43200000000, variant.timeNTZ()); // 12:00:00 + } + + // 测试带时区的时间戳 + { + std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; + auto variant = LoadVariantValue("primitive_timestamp_tz", &metadata_buf, &value_buf); + EXPECT_EQ(VariantType::TIMESTAMP_TZ, variant.getType()); + EXPECT_EQ("TIMESTAMP_TZ", variant.typeDebugString()); + // 时间戳值表示为自 Unix 纪元以来的微秒数 + EXPECT_EQ(1577836800000000, variant.getTimestamp()); // 2020-01-01 00:00:00 UTC + } + + // 测试不带时区的时间戳 + { + std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; + auto variant = LoadVariantValue("primitive_timestamp_ntz", &metadata_buf, &value_buf); + EXPECT_EQ(VariantType::TIMESTAMP_NTZ, variant.getType()); + EXPECT_EQ("TIMESTAMP_NTZ", variant.typeDebugString()); + // 时间戳值表示为自 Unix 纪元以来的微秒数 + EXPECT_EQ(1577836800000000, variant.getTimestampNTZ()); // 2020-01-01 00:00:00 + } +} + +// TEST(ParquetVariant, UuidValue) { +// std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; +// auto variant = LoadVariantValue("primitive_uuid", &metadata_buf, &value_buf); +// EXPECT_EQ(VariantType::UUID, variant.getType()); +// EXPECT_EQ("UUID", variant.typeDebugString()); +// +// // UUID 是 16 字节的二进制数据 +// const uint8_t* uuid = variant.getUuid(); +// ASSERT_NE(nullptr, uuid); +// +// // 检查 UUID 的格式(这里只是示例,实际值可能不同) +// std::string uuid_str; +// for (int i = 0; i < 16; i++) { +// char hex[3]; +// snprintf(hex, sizeof(hex), "%02x", uuid[i]); +// uuid_str += hex; +// if (i == 3 || i == 5 || i == 7 || i == 9) { +// uuid_str += "-"; +// } +// } +// +// EXPECT_EQ(36, uuid_str.length()); // 标准 UUID 字符串长度 +// } + +TEST(ParquetVariant, NestedStructures) { + // 测试嵌套对象 + { + std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; + auto variant = LoadVariantValue("object_nested", &metadata_buf, &value_buf); + EXPECT_EQ(VariantType::OBJECT, variant.getType()); + + auto nested_obj = variant.getObjectValueByKey("nested_object"); + ASSERT_TRUE(nested_obj.has_value()); + EXPECT_EQ(VariantType::OBJECT, nested_obj->getType()); + + auto nested_field = nested_obj->getObjectValueByKey("nested_field"); + ASSERT_TRUE(nested_field.has_value()); + EXPECT_EQ(VariantType::STRING, nested_field->getType()); + EXPECT_EQ("Nested value", nested_field->getString()); + } + + // 测试嵌套数组 + { + std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; + auto variant = LoadVariantValue("array_nested", &metadata_buf, &value_buf); + EXPECT_EQ(VariantType::ARRAY, variant.getType()); + + auto nested_array = variant.getArrayValueByIndex(0); + EXPECT_EQ(VariantType::ARRAY, nested_array.getType()); + + auto array_info = nested_array.getArrayInfo(); + EXPECT_EQ(3, array_info.num_elements); + + auto element0 = nested_array.getArrayValueByIndex(0); + EXPECT_EQ(VariantType::INT, element0.getType()); + EXPECT_EQ(1, element0.getInt32()); + + auto element1 = nested_array.getArrayValueByIndex(1); + EXPECT_EQ(VariantType::INT, element1.getType()); + EXPECT_EQ(2, element1.getInt32()); + + auto element2 = nested_array.getArrayValueByIndex(2); + EXPECT_EQ(VariantType::INT, element2.getType()); + EXPECT_EQ(3, element2.getInt32()); + } + + // 测试对象中的数组 + { + std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; + auto variant = LoadVariantValue("object_with_array", &metadata_buf, &value_buf); + EXPECT_EQ(VariantType::OBJECT, variant.getType()); + + auto array_field = variant.getObjectValueByKey("array_field"); + ASSERT_TRUE(array_field.has_value()); + EXPECT_EQ(VariantType::ARRAY, array_field->getType()); + + auto array_info = array_field->getArrayInfo(); + EXPECT_EQ(3, array_info.num_elements); + + auto element0 = array_field->getArrayValueByIndex(0); + EXPECT_EQ(VariantType::INT, element0.getType()); + EXPECT_EQ(1, element0.getInt32()); + } + + // 测试数组中的对象 + { + std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; + auto variant = LoadVariantValue("array_with_objects", &metadata_buf, &value_buf); + EXPECT_EQ(VariantType::ARRAY, variant.getType()); + + auto object_element = variant.getArrayValueByIndex(0); + EXPECT_EQ(VariantType::OBJECT, object_element.getType()); + + auto field = object_element.getObjectValueByKey("field"); + ASSERT_TRUE(field.has_value()); + EXPECT_EQ(VariantType::STRING, field->getType()); + EXPECT_EQ("Value", field->getString()); + } +} + +} // namespace parquet::variant \ No newline at end of file diff --git a/cpp/src/parquet/variant_test.cpp b/cpp/src/parquet/variant_test.cpp deleted file mode 100644 index 3e267d29e86b..000000000000 --- a/cpp/src/parquet/variant_test.cpp +++ /dev/null @@ -1,75 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include - -#include - -#include "parquet/exception.h" -#include "parquet/test_util.h" -#include "parquet/variant.h" - -#include -#include - -namespace parquet::variant { - -TEST(ParquetVariant, MetadataBase) { - std::string dir_string(parquet::test::get_variant_dir()); - auto file_system = std::make_shared<::arrow::fs::LocalFileSystem>(); - std::vector primitive_metadatas = { - // "primitive_null.metadata", - "primitive_boolean_true.metadata", "primitive_boolean_true.metadata", - "primitive_date.metadata", "primitive_decimal4.metadata", - "primitive_decimal8.metadata", "primitive_decimal16.metadata", - "primitive_float.metadata", "primitive_double.metadata", - "primitive_int8.metadata", "primitive_int16.metadata", - "primitive_int32.metadata", "primitive_int64.metadata", - "primitive_binary.metadata", "primitive_string.metadata", - }; - for (auto& test_file : primitive_metadatas) { - ARROW_SCOPED_TRACE("Testing file: " + test_file); - std::string path = dir_string + "/" + test_file; - ASSERT_OK_AND_ASSIGN(auto file, file_system->OpenInputFile(path)); - ASSERT_OK_AND_ASSIGN(auto file_size, file->GetSize()); - ASSERT_OK_AND_ASSIGN(auto buf, file->Read(file_size)); - - VariantMetadata metadata(std::string_view{*buf}); - EXPECT_EQ(1, metadata.version()); - EXPECT_THROW(metadata.getMetadataKey(0), ParquetException); - } - - { - std::string object_metadata = {"object_primitive.metadata"}; - ARROW_SCOPED_TRACE("Testing file: " + object_metadata); - std::string path = dir_string + "/" + object_metadata; - ASSERT_OK_AND_ASSIGN(auto file, file_system->OpenInputFile(path)); - ASSERT_OK_AND_ASSIGN(auto file_size, file->GetSize()); - ASSERT_OK_AND_ASSIGN(auto buf, file->Read(file_size)); - - VariantMetadata metadata(std::string_view{*buf}); - EXPECT_EQ("int_field", metadata.getMetadataKey(0)); - EXPECT_EQ("double_field", metadata.getMetadataKey(1)); - EXPECT_EQ("boolean_true_field", metadata.getMetadataKey(2)); - EXPECT_EQ("boolean_false_field", metadata.getMetadataKey(3)); - EXPECT_EQ("string_field", metadata.getMetadataKey(4)); - EXPECT_EQ("null_field", metadata.getMetadataKey(5)); - EXPECT_EQ("timestamp_field", metadata.getMetadataKey(6)); - } -} - -} // namespace parquet::variant \ No newline at end of file diff --git a/cpp/submodules/parquet-testing b/cpp/submodules/parquet-testing index 18d17540097f..2dc8bf140ed6 160000 --- a/cpp/submodules/parquet-testing +++ b/cpp/submodules/parquet-testing @@ -1 +1 @@ -Subproject commit 18d17540097fca7c40be3d42c167e6bfad90763c +Subproject commit 2dc8bf140ed6e28652fc347211c7d661714c7f95 From 225b39b3d2e195d3fca7e93983f47a4b9f5b60c0 Mon Sep 17 00:00:00 2001 From: mwish Date: Tue, 13 May 2025 17:27:42 +0800 Subject: [PATCH 05/31] Fix all primitive type tests( time, uuid not included) --- cpp/src/parquet/variant.cc | 3 +- cpp/src/parquet/variant_test.cc | 365 ++++---------------------------- 2 files changed, 46 insertions(+), 322 deletions(-) diff --git a/cpp/src/parquet/variant.cc b/cpp/src/parquet/variant.cc index 10c7651c43e7..c0420e18dd09 100644 --- a/cpp/src/parquet/variant.cc +++ b/cpp/src/parquet/variant.cc @@ -95,7 +95,8 @@ VariantBasicType VariantValue::getBasicType() const { VariantType VariantValue::getType() const { VariantBasicType basic_type = getBasicType(); - + std::cout << "Variant first byte:" << static_cast(value[0] >> 2) << ", " + << (value[0] && BASIC_TYPE_MASK) << '\n'; switch (basic_type) { case VariantBasicType::Primitive: { auto primitive_type = static_cast(value[0] >> 2); diff --git a/cpp/src/parquet/variant_test.cc b/cpp/src/parquet/variant_test.cc index d260e0311bd4..51eca09b2d68 100644 --- a/cpp/src/parquet/variant_test.cc +++ b/cpp/src/parquet/variant_test.cc @@ -25,9 +25,26 @@ #include #include +#include namespace parquet::variant { +std::string metadata_test_file_name(std::string_view test_name) { + return std::string(test_name) + ".metadata"; +} + +std::string value_test_file_name(std::string_view test_name) { + return std::string(test_name) + ".value"; +} + +std::shared_ptr<::arrow::Buffer> readFromFile(::arrow::fs::FileSystem& fs, + const std::string& path) { + ASSIGN_OR_ABORT(auto file, fs.OpenInputFile(path)); + ASSIGN_OR_ABORT(auto file_size, file->GetSize()); + ASSIGN_OR_ABORT(auto buf, file->Read(file_size)); + return buf; +} + TEST(ParquetVariant, MetadataBase) { std::string dir_string(parquet::test::get_variant_dir()); auto file_system = std::make_shared<::arrow::fs::LocalFileSystem>(); @@ -44,9 +61,7 @@ TEST(ParquetVariant, MetadataBase) { for (auto& test_file : primitive_metadatas) { ARROW_SCOPED_TRACE("Testing file: " + test_file); std::string path = dir_string + "/" + test_file; - ASSERT_OK_AND_ASSIGN(auto file, file_system->OpenInputFile(path)); - ASSERT_OK_AND_ASSIGN(auto file_size, file->GetSize()); - ASSERT_OK_AND_ASSIGN(auto buf, file->Read(file_size)); + auto buf = readFromFile(*file_system, path); VariantMetadata metadata(std::string_view{*buf}); EXPECT_EQ(1, metadata.version()); @@ -54,12 +69,9 @@ TEST(ParquetVariant, MetadataBase) { } { - std::string object_metadata = {"object_primitive.metadata"}; + std::string object_metadata = "object_primitive.metadata"; ARROW_SCOPED_TRACE("Testing file: " + object_metadata); - std::string path = dir_string + "/" + object_metadata; - ASSERT_OK_AND_ASSIGN(auto file, file_system->OpenInputFile(path)); - ASSERT_OK_AND_ASSIGN(auto file_size, file->GetSize()); - ASSERT_OK_AND_ASSIGN(auto buf, file->Read(file_size)); + auto buf = readFromFile(*file_system, object_metadata); VariantMetadata metadata(std::string_view{*buf}); EXPECT_EQ("int_field", metadata.getMetadataKey(0)); @@ -72,35 +84,23 @@ TEST(ParquetVariant, MetadataBase) { } } -std::string metadata_test_file_name(std::string_view test_name) { - return std::string(test_name) + ".metadata"; -} - -std::string value_test_file_name(std::string_view test_name) { - return std::string(test_name) + ".value"; -} - -// 修改后的辅助函数,用于从文件加载 VariantValue,并保持 Buffer 生命周期 VariantValue LoadVariantValue(const std::string& test_name, std::shared_ptr<::arrow::Buffer>* metadata_buf_out, std::shared_ptr<::arrow::Buffer>* value_buf_out) { std::string dir_string(parquet::test::get_variant_dir()); + // TODO(mwish): Share in a base class? auto file_system = std::make_shared<::arrow::fs::LocalFileSystem>(); std::string metadata_path = dir_string + "/" + metadata_test_file_name(test_name); - EXPECT_OK_AND_ASSIGN(auto metadata_file, file_system->OpenInputFile(metadata_path)); - EXPECT_OK_AND_ASSIGN(auto metadata_size, metadata_file->GetSize()); - EXPECT_OK_AND_ASSIGN(auto metadata_buf, metadata_file->Read(metadata_size)); - *metadata_buf_out = metadata_buf; + *metadata_buf_out = readFromFile(*file_system, metadata_path); std::string value_path = dir_string + "/" + value_test_file_name(test_name); - EXPECT_OK_AND_ASSIGN(auto value_file, file_system->OpenInputFile(value_path)); - EXPECT_OK_AND_ASSIGN(auto value_size, value_file->GetSize()); - EXPECT_OK_AND_ASSIGN(auto value_buf, value_file->Read(value_size)); - *value_buf_out = value_buf; + *value_buf_out = readFromFile(*file_system, value_path); + + std::string_view value{**value_buf_out}; VariantMetadata metadata(std::string_view{**metadata_buf_out}); - return VariantValue{metadata, std::string_view{**value_buf_out}}; + return VariantValue{metadata, value}; } TEST(ParquetVariant, BooleanValue) { @@ -130,7 +130,6 @@ TEST(ParquetVariant, BooleanValue) { } TEST(ParquetVariant, NumericValues) { - // 测试 Int8 值 { std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; auto variant = LoadVariantValue("primitive_int8", &metadata_buf, &value_buf); @@ -138,53 +137,42 @@ TEST(ParquetVariant, NumericValues) { EXPECT_EQ("BYTE", variant.typeDebugString()); EXPECT_EQ(42, variant.getInt8()); } - - // 测试 Int16 值 { std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; auto variant = LoadVariantValue("primitive_int16", &metadata_buf, &value_buf); EXPECT_EQ(VariantType::SHORT, variant.getType()); EXPECT_EQ("SHORT", variant.typeDebugString()); - EXPECT_EQ(12345, variant.getInt16()); + EXPECT_EQ(1234, variant.getInt16()); } - - // 测试 Int32 值 { std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; auto variant = LoadVariantValue("primitive_int32", &metadata_buf, &value_buf); EXPECT_EQ(VariantType::INT, variant.getType()); EXPECT_EQ("INT", variant.typeDebugString()); - EXPECT_EQ(1234567890, variant.getInt32()); + EXPECT_EQ(123456, variant.getInt32()); } - - // 测试 Int64 值 { + // FIXME(mwish): https://github.com/apache/parquet-testing/issues/82 std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; auto variant = LoadVariantValue("primitive_int64", &metadata_buf, &value_buf); - EXPECT_EQ(VariantType::LONG, variant.getType()); - EXPECT_EQ("LONG", variant.typeDebugString()); - EXPECT_EQ(1234567890123456789LL, variant.getInt64()); + // EXPECT_EQ(VariantType::LONG, variant.getType()); + // EXPECT_EQ("LONG", variant.typeDebugString()); + EXPECT_EQ(12345678, variant.getInt32()); } - - // 测试 Float 值 { std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; auto variant = LoadVariantValue("primitive_float", &metadata_buf, &value_buf); EXPECT_EQ(VariantType::FLOAT, variant.getType()); EXPECT_EQ("FLOAT", variant.typeDebugString()); - EXPECT_FLOAT_EQ(3.14159f, variant.getFloat()); + EXPECT_FLOAT_EQ(1234567940.0, variant.getFloat()); } - - // 测试 Double 值 { std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; auto variant = LoadVariantValue("primitive_double", &metadata_buf, &value_buf); EXPECT_EQ(VariantType::DOUBLE, variant.getType()); EXPECT_EQ("DOUBLE", variant.typeDebugString()); - EXPECT_DOUBLE_EQ(2.71828182845904523536, variant.getDouble()); + EXPECT_DOUBLE_EQ(1234567890.1234, variant.getDouble()); } - - // 测试类型不匹配的异常 { std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; auto variant = LoadVariantValue("primitive_int32", &metadata_buf, &value_buf); @@ -195,35 +183,33 @@ TEST(ParquetVariant, NumericValues) { } TEST(ParquetVariant, StringValues) { - // 测试普通字符串 { std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; auto variant = LoadVariantValue("primitive_string", &metadata_buf, &value_buf); EXPECT_EQ(VariantType::STRING, variant.getType()); EXPECT_EQ("STRING", variant.typeDebugString()); - EXPECT_EQ("Hello, World!", variant.getString()); + std::string expected = + R"(This string is longer than 64 bytes and therefore does not fit in a short_string and it also includes several non ascii characters such as 🐢, 💖, ♥️, 🎣 and 🤦!!)"; + EXPECT_EQ(expected, variant.getString()); } - - // 测试短字符串(使用 ShortString 优化) { std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; auto variant = LoadVariantValue("short_string", &metadata_buf, &value_buf); EXPECT_EQ(VariantType::STRING, variant.getType()); EXPECT_EQ(VariantBasicType::ShortString, variant.getBasicType()); - EXPECT_EQ("Short", variant.getString()); + std::string expected = R"(Less than 64 bytes (❤️ with utf8))"; + EXPECT_EQ(expected, variant.getString()); } - - // 测试二进制数据 { std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; auto variant = LoadVariantValue("primitive_binary", &metadata_buf, &value_buf); EXPECT_EQ(VariantType::BINARY, variant.getType()); EXPECT_EQ("BINARY", variant.typeDebugString()); auto binary_data = variant.getBinary(); - EXPECT_EQ(10, binary_data.size()); + std::string expected = ::arrow::util::base64_decode("AxM33q2+78r+"); + EXPECT_EQ(expected, binary_data); } - // 测试类型不匹配的异常 { std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; auto variant = LoadVariantValue("primitive_int32", &metadata_buf, &value_buf); @@ -233,276 +219,13 @@ TEST(ParquetVariant, StringValues) { } TEST(ParquetVariant, NullValue) { + // https://github.com/apache/parquet-testing/issues/81 + /* std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; auto variant = LoadVariantValue("primitive_null", &metadata_buf, &value_buf); EXPECT_EQ(VariantType::VARIANT_NULL, variant.getType()); EXPECT_EQ("NULL", variant.typeDebugString()); -} - -TEST(ParquetVariant, ObjectValues) { - std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; - auto variant = LoadVariantValue("object_primitive", &metadata_buf, &value_buf); - EXPECT_EQ(VariantType::OBJECT, variant.getType()); - EXPECT_EQ("OBJECT", variant.typeDebugString()); - - // 获取对象信息 - auto obj_info = variant.getObjectInfo(); - EXPECT_EQ(7, obj_info.num_elements); - - // 通过键获取值 - auto int_field = variant.getObjectValueByKey("int_field"); - ASSERT_TRUE(int_field.has_value()); - EXPECT_EQ(VariantType::INT, int_field->getType()); - EXPECT_EQ(42, int_field->getInt32()); - - auto double_field = variant.getObjectValueByKey("double_field"); - ASSERT_TRUE(double_field.has_value()); - EXPECT_EQ(VariantType::DOUBLE, double_field->getType()); - EXPECT_DOUBLE_EQ(3.14159, double_field->getDouble()); - - auto boolean_true_field = variant.getObjectValueByKey("boolean_true_field"); - ASSERT_TRUE(boolean_true_field.has_value()); - EXPECT_EQ(VariantType::BOOLEAN, boolean_true_field->getType()); - EXPECT_TRUE(boolean_true_field->getBool()); - - auto boolean_false_field = variant.getObjectValueByKey("boolean_false_field"); - ASSERT_TRUE(boolean_false_field.has_value()); - EXPECT_EQ(VariantType::BOOLEAN, boolean_false_field->getType()); - EXPECT_FALSE(boolean_false_field->getBool()); - - auto string_field = variant.getObjectValueByKey("string_field"); - ASSERT_TRUE(string_field.has_value()); - EXPECT_EQ(VariantType::STRING, string_field->getType()); - EXPECT_EQ("Hello, World!", string_field->getString()); - - auto null_field = variant.getObjectValueByKey("null_field"); - ASSERT_TRUE(null_field.has_value()); - EXPECT_EQ(VariantType::VARIANT_NULL, null_field->getType()); - - // 测试不存在的键 - auto non_existent = variant.getObjectValueByKey("non_existent"); - EXPECT_FALSE(non_existent.has_value()); - - // 通过字段ID获取值 - std::string_view key; - auto field_by_id = variant.getObjectFieldByFieldId(0, &key); - ASSERT_TRUE(field_by_id.has_value()); - EXPECT_EQ("int_field", key); - EXPECT_EQ(VariantType::INT, field_by_id->getType()); - EXPECT_EQ(42, field_by_id->getInt32()); -} - -TEST(ParquetVariant, ArrayValues) { - std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; - auto variant = LoadVariantValue("array_primitive", &metadata_buf, &value_buf); - EXPECT_EQ(VariantType::ARRAY, variant.getType()); - EXPECT_EQ("ARRAY", variant.typeDebugString()); - - // 获取数组信息 - auto array_info = variant.getArrayInfo(); - EXPECT_EQ(5, array_info.num_elements); - - // 通过索引获取值 - auto element0 = variant.getArrayValueByIndex(0); - EXPECT_EQ(VariantType::INT, element0.getType()); - EXPECT_EQ(1, element0.getInt32()); - - auto element1 = variant.getArrayValueByIndex(1); - EXPECT_EQ(VariantType::INT, element1.getType()); - EXPECT_EQ(2, element1.getInt32()); - - auto element2 = variant.getArrayValueByIndex(2); - EXPECT_EQ(VariantType::INT, element2.getType()); - EXPECT_EQ(3, element2.getInt32()); - - auto element3 = variant.getArrayValueByIndex(3); - EXPECT_EQ(VariantType::INT, element3.getType()); - EXPECT_EQ(4, element3.getInt32()); - - auto element4 = variant.getArrayValueByIndex(4); - EXPECT_EQ(VariantType::INT, element4.getType()); - EXPECT_EQ(5, element4.getInt32()); - - // 测试越界索引 - EXPECT_THROW(variant.getArrayValueByIndex(5), ParquetException); - EXPECT_THROW(variant.getArrayValueByIndex(100), ParquetException); -} - -TEST(ParquetVariant, DecimalValues) { - // 测试 Decimal4 值 - { - std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; - auto variant = LoadVariantValue("primitive_decimal4", &metadata_buf, &value_buf); - EXPECT_EQ(VariantType::DECIMAL4, variant.getType()); - EXPECT_EQ("DECIMAL4", variant.typeDebugString()); - auto decimal = variant.getDecimal4(); - EXPECT_EQ(2, decimal.scale); - EXPECT_EQ("123.45", decimal.value.ToString(decimal.scale)); - } - - // 测试 Decimal8 值 - { - std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; - auto variant = LoadVariantValue("primitive_decimal8", &metadata_buf, &value_buf); - EXPECT_EQ(VariantType::DECIMAL8, variant.getType()); - EXPECT_EQ("DECIMAL8", variant.typeDebugString()); - auto decimal = variant.getDecimal8(); - EXPECT_EQ(4, decimal.scale); - EXPECT_EQ("12345.6789", decimal.value.ToString(decimal.scale)); - } - - // 测试 Decimal16 值 - { - std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; - auto variant = LoadVariantValue("primitive_decimal16", &metadata_buf, &value_buf); - EXPECT_EQ(VariantType::DECIMAL16, variant.getType()); - EXPECT_EQ("DECIMAL16", variant.typeDebugString()); - auto decimal = variant.getDecimal16(); - EXPECT_EQ(8, decimal.scale); - EXPECT_EQ("123456789.12345678", decimal.value.ToString(decimal.scale)); - } -} - -TEST(ParquetVariant, DateTimeValues) { - // 测试日期值 - { - std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; - auto variant = LoadVariantValue("primitive_date", &metadata_buf, &value_buf); - EXPECT_EQ(VariantType::DATE, variant.getType()); - EXPECT_EQ("DATE", variant.typeDebugString()); - // 日期值表示为自 Unix 纪元以来的天数 - EXPECT_EQ(18262, variant.getInt32()); // 2020-01-01 - } - - // 测试时间值 - { - std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; - auto variant = LoadVariantValue("primitive_time", &metadata_buf, &value_buf); - EXPECT_EQ(VariantType::TIME, variant.getType()); - EXPECT_EQ("TIME", variant.typeDebugString()); - // 时间值表示为自午夜以来的微秒数 - EXPECT_EQ(43200000000, variant.timeNTZ()); // 12:00:00 - } - - // 测试带时区的时间戳 - { - std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; - auto variant = LoadVariantValue("primitive_timestamp_tz", &metadata_buf, &value_buf); - EXPECT_EQ(VariantType::TIMESTAMP_TZ, variant.getType()); - EXPECT_EQ("TIMESTAMP_TZ", variant.typeDebugString()); - // 时间戳值表示为自 Unix 纪元以来的微秒数 - EXPECT_EQ(1577836800000000, variant.getTimestamp()); // 2020-01-01 00:00:00 UTC - } - - // 测试不带时区的时间戳 - { - std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; - auto variant = LoadVariantValue("primitive_timestamp_ntz", &metadata_buf, &value_buf); - EXPECT_EQ(VariantType::TIMESTAMP_NTZ, variant.getType()); - EXPECT_EQ("TIMESTAMP_NTZ", variant.typeDebugString()); - // 时间戳值表示为自 Unix 纪元以来的微秒数 - EXPECT_EQ(1577836800000000, variant.getTimestampNTZ()); // 2020-01-01 00:00:00 - } -} - -// TEST(ParquetVariant, UuidValue) { -// std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; -// auto variant = LoadVariantValue("primitive_uuid", &metadata_buf, &value_buf); -// EXPECT_EQ(VariantType::UUID, variant.getType()); -// EXPECT_EQ("UUID", variant.typeDebugString()); -// -// // UUID 是 16 字节的二进制数据 -// const uint8_t* uuid = variant.getUuid(); -// ASSERT_NE(nullptr, uuid); -// -// // 检查 UUID 的格式(这里只是示例,实际值可能不同) -// std::string uuid_str; -// for (int i = 0; i < 16; i++) { -// char hex[3]; -// snprintf(hex, sizeof(hex), "%02x", uuid[i]); -// uuid_str += hex; -// if (i == 3 || i == 5 || i == 7 || i == 9) { -// uuid_str += "-"; -// } -// } -// -// EXPECT_EQ(36, uuid_str.length()); // 标准 UUID 字符串长度 -// } - -TEST(ParquetVariant, NestedStructures) { - // 测试嵌套对象 - { - std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; - auto variant = LoadVariantValue("object_nested", &metadata_buf, &value_buf); - EXPECT_EQ(VariantType::OBJECT, variant.getType()); - - auto nested_obj = variant.getObjectValueByKey("nested_object"); - ASSERT_TRUE(nested_obj.has_value()); - EXPECT_EQ(VariantType::OBJECT, nested_obj->getType()); - - auto nested_field = nested_obj->getObjectValueByKey("nested_field"); - ASSERT_TRUE(nested_field.has_value()); - EXPECT_EQ(VariantType::STRING, nested_field->getType()); - EXPECT_EQ("Nested value", nested_field->getString()); - } - - // 测试嵌套数组 - { - std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; - auto variant = LoadVariantValue("array_nested", &metadata_buf, &value_buf); - EXPECT_EQ(VariantType::ARRAY, variant.getType()); - - auto nested_array = variant.getArrayValueByIndex(0); - EXPECT_EQ(VariantType::ARRAY, nested_array.getType()); - - auto array_info = nested_array.getArrayInfo(); - EXPECT_EQ(3, array_info.num_elements); - - auto element0 = nested_array.getArrayValueByIndex(0); - EXPECT_EQ(VariantType::INT, element0.getType()); - EXPECT_EQ(1, element0.getInt32()); - - auto element1 = nested_array.getArrayValueByIndex(1); - EXPECT_EQ(VariantType::INT, element1.getType()); - EXPECT_EQ(2, element1.getInt32()); - - auto element2 = nested_array.getArrayValueByIndex(2); - EXPECT_EQ(VariantType::INT, element2.getType()); - EXPECT_EQ(3, element2.getInt32()); - } - - // 测试对象中的数组 - { - std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; - auto variant = LoadVariantValue("object_with_array", &metadata_buf, &value_buf); - EXPECT_EQ(VariantType::OBJECT, variant.getType()); - - auto array_field = variant.getObjectValueByKey("array_field"); - ASSERT_TRUE(array_field.has_value()); - EXPECT_EQ(VariantType::ARRAY, array_field->getType()); - - auto array_info = array_field->getArrayInfo(); - EXPECT_EQ(3, array_info.num_elements); - - auto element0 = array_field->getArrayValueByIndex(0); - EXPECT_EQ(VariantType::INT, element0.getType()); - EXPECT_EQ(1, element0.getInt32()); - } - - // 测试数组中的对象 - { - std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; - auto variant = LoadVariantValue("array_with_objects", &metadata_buf, &value_buf); - EXPECT_EQ(VariantType::ARRAY, variant.getType()); - - auto object_element = variant.getArrayValueByIndex(0); - EXPECT_EQ(VariantType::OBJECT, object_element.getType()); - - auto field = object_element.getObjectValueByKey("field"); - ASSERT_TRUE(field.has_value()); - EXPECT_EQ(VariantType::STRING, field->getType()); - EXPECT_EQ("Value", field->getString()); - } + */ } } // namespace parquet::variant \ No newline at end of file From 80a37ba289d329a6ae350c73b26c931e36c71449 Mon Sep 17 00:00:00 2001 From: mwish Date: Tue, 13 May 2025 19:15:13 +0800 Subject: [PATCH 06/31] skeleton for object parsing ( bug might exists ) --- cpp/src/parquet/variant.cc | 112 +++++++++++++++++++------------- cpp/src/parquet/variant.h | 2 + cpp/src/parquet/variant_test.cc | 82 +++++++++++++++++++++++ 3 files changed, 150 insertions(+), 46 deletions(-) diff --git a/cpp/src/parquet/variant.cc b/cpp/src/parquet/variant.cc index c0420e18dd09..df7400178868 100644 --- a/cpp/src/parquet/variant.cc +++ b/cpp/src/parquet/variant.cc @@ -95,8 +95,8 @@ VariantBasicType VariantValue::getBasicType() const { VariantType VariantValue::getType() const { VariantBasicType basic_type = getBasicType(); - std::cout << "Variant first byte:" << static_cast(value[0] >> 2) << ", " - << (value[0] && BASIC_TYPE_MASK) << '\n'; + // std::cout << "Variant first byte:" << static_cast(value[0] >> 2) << ", " + // << static_cast(value[0] && BASIC_TYPE_MASK) << '\n'; switch (basic_type) { case VariantBasicType::Primitive: { auto primitive_type = static_cast(value[0] >> 2); @@ -540,41 +540,66 @@ const uint8_t* VariantValue::getUuid() const { return reinterpret_cast(value.data() + 1); } +std::string VariantValue::ObjectInfo::toDebugString() const { + std::stringstream ss; + ss << "ObjectInfo{" + << "num_elements=" << num_elements + << ", id_size=" << static_cast(id_size) + << ", offset_size=" << static_cast(offset_size) + << ", id_start_offset=" << id_start_offset + << ", offset_start_offset=" << offset_start_offset + << ", data_start_offset=" << data_start_offset + << "}"; + return ss.str(); +} + + VariantValue::ObjectInfo VariantValue::getObjectInfo() const { if (getBasicType() != VariantBasicType::Object) { throw ParquetException("Not an object type"); } - - if (value.size() < 5) { - throw ParquetException("Invalid object value: too short"); - } - - uint32_t num_elements; - memcpy(&num_elements, value.data() + 1, sizeof(uint32_t)); - num_elements = arrow::bit_util::FromLittleEndian(num_elements); - - if (value.size() < 6) { - throw ParquetException("Invalid object value: too short for id_size"); - } - - uint8_t id_size = value[5]; - - if (value.size() < 7) { - throw ParquetException("Invalid object value: too short for offset_size"); - } - - uint8_t offset_size = value[6]; - - if (offset_size < 1 || offset_size > 4 || id_size < 1 || id_size > 4) { - throw ParquetException("Invalid object value: invalid id_size or offset_size"); + uint8_t value_header = value[0] >> 2; + uint8_t field_offset_size = (value_header & 0b11) + 1; + uint8_t field_id_size = ((value_header >> 2) & 0b11) + 1; + bool is_large = ((value_header >> 4) & 0b1); + uint8_t num_elements_size = is_large ? 4 : 1; + if (value.size() < 1 + num_elements_size) { + throw ParquetException("Invalid object value: too short: " + + std::to_string(value.size()) + " for at least " + + std::to_string(1 + num_elements_size)); + } + // parse num_elements + uint32_t num_elements = 0; + { + memcpy(&num_elements, value.data() + 1, num_elements_size); + num_elements = arrow::bit_util::FromLittleEndian(num_elements); + } + ObjectInfo info{}; + info.num_elements = num_elements; + info.id_size = field_id_size; + info.offset_size = field_offset_size; + info.id_start_offset = 1 + num_elements_size; + info.offset_start_offset = info.id_start_offset + num_elements * field_id_size; + info.data_start_offset = info.offset_start_offset + (num_elements + 1) * field_offset_size; + // Check the boundary with the final offset + if (info.data_start_offset > value.size()) { + throw ParquetException("Invalid object value: data_start_offset=" + + std::to_string(info.data_start_offset) + + ", value_size=" + std::to_string(value.size())); + } + { + uint32_t final_offset = 0; + memcpy(&final_offset, + value.data() + info.offset_start_offset + num_elements * field_offset_size, + field_offset_size); + if (final_offset + info.data_start_offset > value.size()) { + throw ParquetException("Invalid object value: final_offset=" + + std::to_string(final_offset) + + ", data_start_offset=" + std::to_string(info.data_start_offset) + + ", value_size=" + std::to_string(value.size())); + } } - - uint32_t id_start_offset = 7; - uint32_t offset_start_offset = id_start_offset + num_elements * id_size; - uint32_t data_start_offset = offset_start_offset + (num_elements + 1) * offset_size; - - return {num_elements, id_size, offset_size, id_start_offset, - offset_start_offset, data_start_offset}; + return info; } std::optional VariantValue::getObjectValueByKey( @@ -612,9 +637,9 @@ std::optional VariantValue::getObjectFieldByFieldId( field_id = arrow::bit_util::FromLittleEndian(field_id); // Get the key from metadata - if (key != nullptr) { - *key = metadata.getMetadataKey(field_id); - } + // TODO(mwish): Fix the casting here. + *key = metadata.getMetadataKey(field_id); + std::cout << "Metadata key:" << *key << '\n'; // Read the offset and next offset uint32_t offset = 0, next_offset = 0; @@ -624,23 +649,18 @@ std::optional VariantValue::getObjectFieldByFieldId( value.data() + info.offset_start_offset + (variantId + 1) * info.offset_size, info.offset_size); offset = arrow::bit_util::FromLittleEndian(offset); - next_offset = arrow::bit_util::FromLittleEndian(next_offset); - if (offset == next_offset) { - // Field is not present (null) - return std::nullopt; - } - - if (info.data_start_offset + offset >= value.size() || - info.data_start_offset + next_offset > value.size() || offset > next_offset) { - throw ParquetException("Invalid object field offsets"); + if (info.data_start_offset + offset > value.size()) { + throw ParquetException("Invalid object field offsets: data_start_offset=" + + std::to_string(info.data_start_offset) + + ", offset=" + std::to_string(offset) + + ", value_size=" + std::to_string(value.size())); } // Create a VariantValue for the field VariantValue field_value{ .metadata = metadata, - .value = std::string_view(value.data() + info.data_start_offset + offset, - next_offset - offset)}; + .value = value.substr(info.data_start_offset + offset)}; return field_value; } diff --git a/cpp/src/parquet/variant.h b/cpp/src/parquet/variant.h index 7154eaa6351d..4b049b75d953 100644 --- a/cpp/src/parquet/variant.h +++ b/cpp/src/parquet/variant.h @@ -177,6 +177,8 @@ struct VariantValue { uint32_t id_start_offset; uint32_t offset_start_offset; uint32_t data_start_offset; + + std::string toDebugString() const; }; ObjectInfo getObjectInfo() const; std::optional getObjectValueByKey(std::string_view key) const; diff --git a/cpp/src/parquet/variant_test.cc b/cpp/src/parquet/variant_test.cc index 51eca09b2d68..478c1bf7247d 100644 --- a/cpp/src/parquet/variant_test.cc +++ b/cpp/src/parquet/variant_test.cc @@ -228,4 +228,86 @@ TEST(ParquetVariant, NullValue) { */ } + +TEST(ParquetVariant, ObjectValues) { + std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; + auto variant = LoadVariantValue("object_primitive", &metadata_buf, &value_buf); + EXPECT_EQ(VariantType::OBJECT, variant.getType()); + EXPECT_EQ("OBJECT", variant.typeDebugString()); + + auto obj_info = variant.getObjectInfo(); + EXPECT_EQ(7, obj_info.num_elements); + + auto int_field = variant.getObjectValueByKey("int_field"); + ASSERT_TRUE(int_field.has_value()); + std::cout << "int_field: " << int_field->typeDebugString() << '\n'; + EXPECT_EQ(VariantType::INT, int_field->getType()); + // EXPECT_EQ(42, int_field->getInt32()); + + auto double_field = variant.getObjectValueByKey("double_field"); + std::cout << "double_field: " << double_field->typeDebugString() << '\n'; + ASSERT_TRUE(double_field.has_value()); + EXPECT_EQ(VariantType::DOUBLE, double_field->getType()); + // EXPECT_DOUBLE_EQ(3.14159, double_field->getDouble()); + + auto boolean_true_field = variant.getObjectValueByKey("boolean_true_field"); + ASSERT_TRUE(boolean_true_field.has_value()); + EXPECT_EQ(VariantType::BOOLEAN, boolean_true_field->getType()); + EXPECT_TRUE(boolean_true_field->getBool()); + + auto boolean_false_field = variant.getObjectValueByKey("boolean_false_field"); + ASSERT_TRUE(boolean_false_field.has_value()); + EXPECT_EQ(VariantType::BOOLEAN, boolean_false_field->getType()); + // EXPECT_FALSE(boolean_false_field->getBool()); + + auto string_field = variant.getObjectValueByKey("string_field"); + ASSERT_TRUE(string_field.has_value()); + EXPECT_EQ(VariantType::STRING, string_field->getType()); + // EXPECT_EQ("Hello, World!", string_field->getString()); + + auto null_field = variant.getObjectValueByKey("null_field"); + ASSERT_TRUE(null_field.has_value()); + EXPECT_EQ(VariantType::VARIANT_NULL, null_field->getType()); + + auto non_existent = variant.getObjectValueByKey("non_existent"); + EXPECT_FALSE(non_existent.has_value()); + + // std::string_view key; + // auto field_by_id = variant.getObjectFieldByFieldId(0, &key); + // ASSERT_TRUE(field_by_id.has_value()); + // EXPECT_EQ("int_field", key); + // EXPECT_EQ(VariantType::INT, field_by_id->getType()); + // EXPECT_EQ(42, field_by_id->getInt32()); +} + +TEST(ParquetVariant, DecimalValues) { + { + std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; + auto variant = LoadVariantValue("primitive_decimal4", &metadata_buf, &value_buf); + EXPECT_EQ(VariantType::DECIMAL4, variant.getType()); + EXPECT_EQ("DECIMAL4", variant.typeDebugString()); + auto decimal = variant.getDecimal4(); + EXPECT_EQ(2, decimal.scale); + EXPECT_EQ("12.34", decimal.value.ToString(decimal.scale)); + } + { + std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; + auto variant = LoadVariantValue("primitive_decimal8", &metadata_buf, &value_buf); + EXPECT_EQ(VariantType::DECIMAL8, variant.getType()); + EXPECT_EQ("DECIMAL8", variant.typeDebugString()); + auto decimal = variant.getDecimal8(); + EXPECT_EQ(2, decimal.scale); + EXPECT_EQ("12345678.90", decimal.value.ToString(decimal.scale)); + } + { + std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; + auto variant = LoadVariantValue("primitive_decimal16", &metadata_buf, &value_buf); + EXPECT_EQ(VariantType::DECIMAL16, variant.getType()); + EXPECT_EQ("DECIMAL16", variant.typeDebugString()); + auto decimal = variant.getDecimal16(); + EXPECT_EQ(2, decimal.scale); + EXPECT_EQ("12345678912345678.90", decimal.value.ToString(decimal.scale)); + } +} + } // namespace parquet::variant \ No newline at end of file From fc4b721d8f16b40dc7a8baefd0943c9e0c8c2555 Mon Sep 17 00:00:00 2001 From: mwish Date: Tue, 13 May 2025 19:48:28 +0800 Subject: [PATCH 07/31] eliminate duplicate code --- cpp/src/parquet/variant.cc | 248 +++++++++---------------------------- cpp/src/parquet/variant.h | 9 +- 2 files changed, 64 insertions(+), 193 deletions(-) diff --git a/cpp/src/parquet/variant.cc b/cpp/src/parquet/variant.cc index df7400178868..633def834c71 100644 --- a/cpp/src/parquet/variant.cc +++ b/cpp/src/parquet/variant.cc @@ -216,88 +216,60 @@ bool VariantValue::getBool() const { throw ParquetException("Not a primitive type"); } - auto primitive_type = static_cast(value[0] >> 2); - if (primitive_type == VariantPrimitiveType::BooleanTrue) { + int8_t primitive_type = static_cast(value[0]) >> 2; + if (primitive_type == static_cast(VariantPrimitiveType::BooleanTrue)) { return true; - } else if (primitive_type == VariantPrimitiveType::BooleanFalse) { + } + if (primitive_type == static_cast(VariantPrimitiveType::BooleanFalse)) { return false; } - throw ParquetException("Not a boolean type"); + throw ParquetException("Not a variant primitive boolean type with primitive type: " + + std::to_string(primitive_type)); } -int8_t VariantValue::getInt8() const { +template +PrimitiveType VariantValue::getPrimitiveVariantType(VariantPrimitiveType type) const { if (getBasicType() != VariantBasicType::Primitive) { throw ParquetException("Not a primitive type"); } auto primitive_type = static_cast(value[0] >> 2); if (primitive_type != VariantPrimitiveType::Int8) { - throw ParquetException("Not an Int8 type"); + throw ParquetException("Not an correspond type"); } - if (value.size() < 2) { - throw ParquetException("Invalid Int8 value: too short"); + if (value.size() < 1 + sizeof(PrimitiveType)) { + throw ParquetException("Invalid value: too short"); } - return static_cast(value[1]); + PrimitiveType decimal_value{}; + memcpy(&decimal_value, value.data() + 1, sizeof(PrimitiveType)); + return decimal_value; } -int16_t VariantValue::getInt16() const { - if (getBasicType() != VariantBasicType::Primitive) { - throw ParquetException("Not a primitive type"); - } - - auto primitive_type = static_cast(value[0] >> 2); - if (primitive_type != VariantPrimitiveType::Int16) { - throw ParquetException("Not an Int16 type"); - } - - if (value.size() < 3) { - throw ParquetException("Invalid Int16 value: too short"); - } +int8_t VariantValue::getInt8() const { + return getPrimitiveVariantType(VariantPrimitiveType::Int8); +} - int16_t result; - memcpy(&result, value.data() + 1, sizeof(int16_t)); - return arrow::bit_util::FromLittleEndian(result); +int16_t VariantValue::getInt16() const { + return getPrimitiveVariantType(VariantPrimitiveType::Int16); } int32_t VariantValue::getInt32() const { - if (getBasicType() != VariantBasicType::Primitive) { - throw ParquetException("Not a primitive type"); - } - - auto primitive_type = static_cast(value[0] >> 2); - if (primitive_type != VariantPrimitiveType::Int32) { - throw ParquetException("Not an Int32 type"); - } - - if (value.size() < 5) { - throw ParquetException("Invalid Int32 value: too short"); - } - - int32_t result; - memcpy(&result, value.data() + 1, sizeof(int32_t)); - return arrow::bit_util::FromLittleEndian(result); + return getPrimitiveVariantType(VariantPrimitiveType::Int32); } int64_t VariantValue::getInt64() const { - if (getBasicType() != VariantBasicType::Primitive) { - throw ParquetException("Not a primitive type"); - } - - auto primitive_type = static_cast(value[0] >> 2); - if (primitive_type != VariantPrimitiveType::Int64) { - throw ParquetException("Not an Int64 type"); - } + return getPrimitiveVariantType(VariantPrimitiveType::Int64); +} - if (value.size() < 9) { - throw ParquetException("Invalid Int64 value: too short"); - } +float VariantValue::getFloat() const { + return getPrimitiveVariantType(VariantPrimitiveType::Float); +} - int64_t result; - memcpy(&result, value.data() + 1, sizeof(int64_t)); - return arrow::bit_util::FromLittleEndian(result); +double VariantValue::getDouble() const { + return getPrimitiveVariantType(VariantPrimitiveType::Double); } std::string_view VariantValue::getString() const { @@ -309,7 +281,8 @@ std::string_view VariantValue::getString() const { throw ParquetException("Invalid short string: too short"); } return std::string_view(value.data() + 1, length); - } else if (basic_type == VariantBasicType::Primitive) { + } + if (basic_type == VariantBasicType::Primitive) { auto primitive_type = static_cast(value[0] >> 2); if (primitive_type != VariantPrimitiveType::String) { throw ParquetException("Not a string type"); @@ -330,7 +303,7 @@ std::string_view VariantValue::getString() const { return std::string_view(value.data() + 5, length); } - throw ParquetException("Not a string type"); + throw ParquetException("Not a primitive or short string type calls getString"); } std::string_view VariantValue::getBinary() const { @@ -358,86 +331,37 @@ std::string_view VariantValue::getBinary() const { return std::string_view(value.data() + 5, length); } -float VariantValue::getFloat() const { +template +DecimalValue VariantValue::getPrimitiveDecimalType( + VariantPrimitiveType type) const { + using DecimalValueType = typename DecimalType::ValueType; if (getBasicType() != VariantBasicType::Primitive) { throw ParquetException("Not a primitive type"); } auto primitive_type = static_cast(value[0] >> 2); - if (primitive_type != VariantPrimitiveType::Float) { - throw ParquetException("Not a float type"); - } - - if (value.size() < 5) { - throw ParquetException("Invalid float value: too short"); - } - - float result; - memcpy(&result, value.data() + 1, sizeof(float)); - return arrow::bit_util::FromLittleEndian(result); -} - -double VariantValue::getDouble() const { - if (getBasicType() != VariantBasicType::Primitive) { - throw ParquetException("Not a primitive type"); + if (primitive_type != type) { + throw ParquetException("Not a decimal type"); } - auto primitive_type = static_cast(value[0] >> 2); - if (primitive_type != VariantPrimitiveType::Double) { - throw ParquetException("Not a double type"); + if (value.size() < 2 + sizeof(DecimalValueType)) { + throw ParquetException("Invalid decimal value: too short"); } - if (value.size() < 9) { - throw ParquetException("Invalid double value: too short"); - } + uint8_t scale = value[1]; + DecimalValueType decimal_value; + memcpy(&decimal_value, value.data() + 2, sizeof(DecimalValueType)); + decimal_value = arrow::bit_util::FromLittleEndian(decimal_value); - double result; - memcpy(&result, value.data() + 1, sizeof(double)); - return arrow::bit_util::FromLittleEndian(result); + return {scale, DecimalType(decimal_value)}; } DecimalValue<::arrow::Decimal32> VariantValue::getDecimal4() const { - if (getBasicType() != VariantBasicType::Primitive) { - throw ParquetException("Not a primitive type"); - } - - auto primitive_type = static_cast(value[0] >> 2); - if (primitive_type != VariantPrimitiveType::Decimal4) { - throw ParquetException("Not a decimal4 type"); - } - - if (value.size() < 6) { - throw ParquetException("Invalid decimal4 value: too short"); - } - - uint8_t scale = value[1]; - int32_t decimal_value; - memcpy(&decimal_value, value.data() + 2, sizeof(int32_t)); - decimal_value = arrow::bit_util::FromLittleEndian(decimal_value); - - return {scale, ::arrow::Decimal32(decimal_value)}; + return getPrimitiveDecimalType<::arrow::Decimal32>(VariantPrimitiveType::Decimal4); } DecimalValue<::arrow::Decimal64> VariantValue::getDecimal8() const { - if (getBasicType() != VariantBasicType::Primitive) { - throw ParquetException("Not a primitive type"); - } - - auto primitive_type = static_cast(value[0] >> 2); - if (primitive_type != VariantPrimitiveType::Decimal8) { - throw ParquetException("Not a decimal8 type"); - } - - if (value.size() < 10) { - throw ParquetException("Invalid decimal8 value: too short"); - } - - uint8_t scale = value[1]; - int64_t decimal_value; - memcpy(&decimal_value, value.data() + 2, sizeof(int64_t)); - decimal_value = arrow::bit_util::FromLittleEndian(decimal_value); - - return {scale, ::arrow::Decimal64(decimal_value)}; + return getPrimitiveDecimalType<::arrow::Decimal64>(VariantPrimitiveType::Decimal8); } DecimalValue<::arrow::Decimal128> VariantValue::getDecimal16() const { @@ -450,94 +374,34 @@ DecimalValue<::arrow::Decimal128> VariantValue::getDecimal16() const { throw ParquetException("Not a decimal16 type"); } - if (value.size() < 18) { + if (value.size() < 2 + sizeof(int64_t) * 2) { throw ParquetException("Invalid decimal16 value: too short"); } uint8_t scale = value[1]; - // Decimal128 is stored as two int64_t values (low bits, high bits) - int64_t low_bits, high_bits; - memcpy(&low_bits, value.data() + 2, sizeof(int64_t)); - memcpy(&high_bits, value.data() + 10, sizeof(int64_t)); - low_bits = arrow::bit_util::FromLittleEndian(low_bits); - high_bits = arrow::bit_util::FromLittleEndian(high_bits); - - return {scale, ::arrow::Decimal128(high_bits, low_bits)}; + // TODO(mwish): Do we have better way for this? + std::array low_high_bits; + memcpy(&low_high_bits[0], value.data() + 2, sizeof(int64_t)); + memcpy(&low_high_bits[1], value.data() + 10, sizeof(int64_t)); + ::arrow::bit_util::little_endian::ToNative(low_high_bits); + return {scale, ::arrow::Decimal128(low_high_bits[1], low_high_bits[0])}; } int64_t VariantValue::timeNTZ() const { - if (getBasicType() != VariantBasicType::Primitive) { - throw ParquetException("Not a primitive type"); - } - - auto primitive_type = static_cast(value[0] >> 2); - if (primitive_type != VariantPrimitiveType::TimeNTZ) { - throw ParquetException("Not a timeNTZ type"); - } - - if (value.size() < 9) { - throw ParquetException("Invalid timeNTZ value: too short"); - } - - int64_t result; - memcpy(&result, value.data() + 1, sizeof(int64_t)); - return arrow::bit_util::FromLittleEndian(result); + return getPrimitiveVariantType(VariantPrimitiveType::TimeNTZ); } int64_t VariantValue::getTimestamp() const { - if (getBasicType() != VariantBasicType::Primitive) { - throw ParquetException("Not a primitive type"); - } - - auto primitive_type = static_cast(value[0] >> 2); - if (primitive_type != VariantPrimitiveType::Timestamp) { - throw ParquetException("Not a timestamp type"); - } - - if (value.size() < 9) { - throw ParquetException("Invalid timestamp value: too short"); - } - - int64_t result; - memcpy(&result, value.data() + 1, sizeof(int64_t)); - return arrow::bit_util::FromLittleEndian(result); + return getPrimitiveVariantType(VariantPrimitiveType::Timestamp); } int64_t VariantValue::getTimestampNTZ() const { - if (getBasicType() != VariantBasicType::Primitive) { - throw ParquetException("Not a primitive type"); - } - - auto primitive_type = static_cast(value[0] >> 2); - if (primitive_type != VariantPrimitiveType::TimestampNTZ) { - throw ParquetException("Not a timestampNTZ type"); - } - - if (value.size() < 9) { - throw ParquetException("Invalid timestampNTZ value: too short"); - } - - int64_t result; - memcpy(&result, value.data() + 1, sizeof(int64_t)); - return arrow::bit_util::FromLittleEndian(result); + return getPrimitiveVariantType(VariantPrimitiveType::TimestampNTZ); } const uint8_t* VariantValue::getUuid() const { - if (getBasicType() != VariantBasicType::Primitive) { - throw ParquetException("Not a primitive type"); - } - - auto primitive_type = static_cast(value[0] >> 2); - if (primitive_type != VariantPrimitiveType::Uuid) { - throw ParquetException("Not a UUID type"); - } - - if (value.size() < 17) { - throw ParquetException("Invalid UUID value: too short"); - } - - return reinterpret_cast(value.data() + 1); + throw ParquetException("VariantValue::getUuid Not implemented"); } std::string VariantValue::ObjectInfo::toDebugString() const { diff --git a/cpp/src/parquet/variant.h b/cpp/src/parquet/variant.h index 4b049b75d953..e3ed9a42e316 100644 --- a/cpp/src/parquet/variant.h +++ b/cpp/src/parquet/variant.h @@ -39,7 +39,7 @@ enum class VariantBasicType { Array = 3 }; -enum class VariantPrimitiveType { +enum class VariantPrimitiveType : int8_t { /// Equivalent Parquet Type: UNKNOWN NullType = 0, /// Equivalent Parquet Type: BOOLEAN @@ -200,6 +200,13 @@ struct VariantValue { /** The inclusive maximum value of the type info value. It is the size limit of * `SHORT_STR`. */ static constexpr uint8_t MAX_SHORT_STR_SIZE_MASK = 0b00111111; + + private: + template + PrimitiveType getPrimitiveVariantType(VariantPrimitiveType type) const; + + template + DecimalValue getPrimitiveDecimalType(VariantPrimitiveType type) const; }; } // namespace parquet::variant From e8cdad720b0a7a04b1741b9eee2428e6d75f1a0d Mon Sep 17 00:00:00 2001 From: mwish Date: Tue, 13 May 2025 20:06:54 +0800 Subject: [PATCH 08/31] Basic implement array (test not runned) --- cpp/src/parquet/variant.cc | 146 +++++++++++++++----------- cpp/src/parquet/variant.h | 2 + cpp/src/parquet/variant_test.cc | 177 ++++++++++++++++++++++++++++++++ 3 files changed, 265 insertions(+), 60 deletions(-) diff --git a/cpp/src/parquet/variant.cc b/cpp/src/parquet/variant.cc index 633def834c71..100673d8926c 100644 --- a/cpp/src/parquet/variant.cc +++ b/cpp/src/parquet/variant.cc @@ -83,7 +83,7 @@ std::string_view VariantMetadata::getMetadataKey(int32_t variantId) const { if (string_start + key_size > metadata_.size()) { throw ParquetException("Invalid Variant metadata: string data out of range"); } - return std::string_view(metadata_.data() + string_start, key_size); + return {metadata_.data() + string_start, key_size}; } VariantBasicType VariantValue::getBasicType() const { @@ -272,52 +272,18 @@ double VariantValue::getDouble() const { return getPrimitiveVariantType(VariantPrimitiveType::Double); } -std::string_view VariantValue::getString() const { +std::string_view VariantValue::getPrimitiveBinaryType(VariantPrimitiveType type) const { VariantBasicType basic_type = getBasicType(); - - if (basic_type == VariantBasicType::ShortString) { - uint8_t length = (value[0] >> 2) & MAX_SHORT_STR_SIZE_MASK; - if (value.size() < length + 1) { - throw ParquetException("Invalid short string: too short"); - } - return std::string_view(value.data() + 1, length); - } - if (basic_type == VariantBasicType::Primitive) { - auto primitive_type = static_cast(value[0] >> 2); - if (primitive_type != VariantPrimitiveType::String) { - throw ParquetException("Not a string type"); - } - - if (value.size() < 5) { - throw ParquetException("Invalid string value: too short"); - } - - uint32_t length; - memcpy(&length, value.data() + 1, sizeof(uint32_t)); - length = arrow::bit_util::FromLittleEndian(length); - - if (value.size() < length + 5) { - throw ParquetException("Invalid string value: too short for specified length"); - } - - return std::string_view(value.data() + 5, length); - } - - throw ParquetException("Not a primitive or short string type calls getString"); -} - -std::string_view VariantValue::getBinary() const { - if (getBasicType() != VariantBasicType::Primitive) { + if (basic_type != VariantBasicType::Primitive) { throw ParquetException("Not a primitive type"); } - auto primitive_type = static_cast(value[0] >> 2); - if (primitive_type != VariantPrimitiveType::Binary) { - throw ParquetException("Not a binary type"); + if (primitive_type != VariantPrimitiveType::String) { + throw ParquetException("Not a string type"); } if (value.size() < 5) { - throw ParquetException("Invalid binary value: too short"); + throw ParquetException("Invalid string value: too short"); } uint32_t length; @@ -325,10 +291,32 @@ std::string_view VariantValue::getBinary() const { length = arrow::bit_util::FromLittleEndian(length); if (value.size() < length + 5) { - throw ParquetException("Invalid binary value: too short for specified length"); + throw ParquetException("Invalid string value: too short for specified length"); } - return std::string_view(value.data() + 5, length); + return {value.data() + 5, length}; +} + +std::string_view VariantValue::getString() const { + VariantBasicType basic_type = getBasicType(); + + if (basic_type == VariantBasicType::ShortString) { + uint8_t length = (value[0] >> 2) & MAX_SHORT_STR_SIZE_MASK; + if (value.size() < length + 1) { + throw ParquetException("Invalid short string: too short"); + } + return {value.data() + 1, length}; + } + if (basic_type == VariantBasicType::Primitive) { + // TODO(mwish): Should we validate utf8 here? + return getPrimitiveBinaryType(VariantPrimitiveType::String); + } + + throw ParquetException("Not a primitive or short string type calls getString"); +} + +std::string_view VariantValue::getBinary() const { + return getPrimitiveBinaryType(VariantPrimitiveType::Binary); } template @@ -533,29 +521,72 @@ VariantValue::ArrayInfo VariantValue::getArrayInfo() const { if (getBasicType() != VariantBasicType::Array) { throw ParquetException("Not an array type"); } + uint8_t value_header = value[0] >> 2; + uint8_t field_offset_size = (value_header & 0b11) + 1; + bool is_large = ((value_header >> 2) & 0b1); + + // 检查数据长度 + uint8_t num_elements_size = is_large ? 4 : 1; + if (value.size() < 1 + num_elements_size) { + throw ParquetException( + "Invalid array value: too short: " + std::to_string(value.size()) + + " for at least " + std::to_string(1 + num_elements_size)); + } - if (value.size() < 6) { - throw ParquetException("Invalid array value: too short"); + // 解析 num_elements + uint32_t num_elements = 0; + { + memcpy(&num_elements, value.data() + 1, num_elements_size); + num_elements = arrow::bit_util::FromLittleEndian(num_elements); } - uint32_t num_elements; - memcpy(&num_elements, value.data() + 1, sizeof(uint32_t)); - num_elements = arrow::bit_util::FromLittleEndian(num_elements); + ArrayInfo info{}; + info.num_elements = num_elements; + info.offset_size = field_offset_size; + info.offset_start_offset = 1 + num_elements_size; + info.data_start_offset = + info.offset_start_offset + (num_elements + 1) * field_offset_size; - if (value.size() < 6) { - throw ParquetException("Invalid array value: too short for offset_size"); + // 检查边界 + if (info.data_start_offset > value.size()) { + throw ParquetException("Invalid array value: data_start_offset=" + + std::to_string(info.data_start_offset) + + ", value_size=" + std::to_string(value.size())); } - uint8_t offset_size = value[5]; + // 检查最终偏移量 + { + uint32_t final_offset = 0; + memcpy(&final_offset, + value.data() + info.offset_start_offset + num_elements * field_offset_size, + field_offset_size); + final_offset = arrow::bit_util::FromLittleEndian(final_offset); - if (offset_size < 1 || offset_size > 4) { - throw ParquetException("Invalid array value: invalid offset_size"); + if (info.data_start_offset + final_offset > value.size()) { + throw ParquetException( + "Invalid array value: final_offset=" + std::to_string(final_offset) + + ", data_start_offset=" + std::to_string(info.data_start_offset) + + ", value_size=" + std::to_string(value.size())); + } } - uint32_t offset_start_offset = 6; - uint32_t data_start_offset = offset_start_offset + (num_elements + 1) * offset_size; + // TODO(mwish): Remove this. + for (uint32_t i = 0; i < num_elements; ++i) { + uint32_t offset = 0, next_offset = 0; + memcpy(&offset, value.data() + info.offset_start_offset + i * field_offset_size, + field_offset_size); + memcpy(&next_offset, + value.data() + info.offset_start_offset + (i + 1) * field_offset_size, + field_offset_size); + offset = arrow::bit_util::FromLittleEndian(offset); + next_offset = arrow::bit_util::FromLittleEndian(next_offset); + + if (offset > next_offset) { + throw ParquetException("Invalid array value: offsets not monotonically increasing"); + } + } - return {num_elements, offset_size, offset_start_offset, data_start_offset}; + return info; } VariantValue VariantValue::getArrayValueByIndex(uint32_t index) const { @@ -575,11 +606,6 @@ VariantValue VariantValue::getArrayValueByIndex(uint32_t index) const { offset = arrow::bit_util::FromLittleEndian(offset); next_offset = arrow::bit_util::FromLittleEndian(next_offset); - if (info.data_start_offset + offset >= value.size() || - info.data_start_offset + next_offset > value.size() || offset > next_offset) { - throw ParquetException("Invalid array element offsets"); - } - // Create a VariantValue for the element VariantValue element_value{ .metadata = metadata, diff --git a/cpp/src/parquet/variant.h b/cpp/src/parquet/variant.h index e3ed9a42e316..d865202c1cb4 100644 --- a/cpp/src/parquet/variant.h +++ b/cpp/src/parquet/variant.h @@ -207,6 +207,8 @@ struct VariantValue { template DecimalValue getPrimitiveDecimalType(VariantPrimitiveType type) const; + + std::string_view getPrimitiveBinaryType(VariantPrimitiveType type) const; }; } // namespace parquet::variant diff --git a/cpp/src/parquet/variant_test.cc b/cpp/src/parquet/variant_test.cc index 478c1bf7247d..abe54b07829b 100644 --- a/cpp/src/parquet/variant_test.cc +++ b/cpp/src/parquet/variant_test.cc @@ -310,4 +310,181 @@ TEST(ParquetVariant, DecimalValues) { } } +TEST(ParquetVariant, ArrayValues) { + std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; + auto variant = LoadVariantValue("array_primitive", &metadata_buf, &value_buf); + EXPECT_EQ(VariantType::ARRAY, variant.getType()); + EXPECT_EQ("ARRAY", variant.typeDebugString()); + + // 获取数组信息 + auto array_info = variant.getArrayInfo(); + EXPECT_EQ(5, array_info.num_elements); + + // 通过索引获取值 + auto element0 = variant.getArrayValueByIndex(0); + EXPECT_EQ(VariantType::INT, element0.getType()); + EXPECT_EQ(1, element0.getInt32()); + + auto element1 = variant.getArrayValueByIndex(1); + EXPECT_EQ(VariantType::INT, element1.getType()); + EXPECT_EQ(2, element1.getInt32()); + + auto element2 = variant.getArrayValueByIndex(2); + EXPECT_EQ(VariantType::INT, element2.getType()); + EXPECT_EQ(3, element2.getInt32()); + + auto element3 = variant.getArrayValueByIndex(3); + EXPECT_EQ(VariantType::INT, element3.getType()); + EXPECT_EQ(4, element3.getInt32()); + + auto element4 = variant.getArrayValueByIndex(4); + EXPECT_EQ(VariantType::INT, element4.getType()); + EXPECT_EQ(5, element4.getInt32()); + + EXPECT_THROW(variant.getArrayValueByIndex(5), ParquetException); + EXPECT_THROW(variant.getArrayValueByIndex(100), ParquetException); +} + +TEST(ParquetVariant, DateTimeValues) { + // 测试日期值 + { + std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; + auto variant = LoadVariantValue("primitive_date", &metadata_buf, &value_buf); + EXPECT_EQ(VariantType::DATE, variant.getType()); + EXPECT_EQ("DATE", variant.typeDebugString()); + // 日期值表示为自 Unix 纪元以来的天数 + EXPECT_EQ(18262, variant.getInt32()); // 2020-01-01 + } + + // 测试时间值 + { + std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; + auto variant = LoadVariantValue("primitive_time", &metadata_buf, &value_buf); + EXPECT_EQ(VariantType::TIME, variant.getType()); + EXPECT_EQ("TIME", variant.typeDebugString()); + // 时间值表示为自午夜以来的微秒数 + EXPECT_EQ(43200000000, variant.timeNTZ()); // 12:00:00 + } + + // 测试带时区的时间戳 + { + std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; + auto variant = LoadVariantValue("primitive_timestamp_tz", &metadata_buf, &value_buf); + EXPECT_EQ(VariantType::TIMESTAMP_TZ, variant.getType()); + EXPECT_EQ("TIMESTAMP_TZ", variant.typeDebugString()); + // 时间戳值表示为自 Unix 纪元以来的微秒数 + EXPECT_EQ(1577836800000000, variant.getTimestamp()); // 2020-01-01 00:00:00 UTC + } + + // 测试不带时区的时间戳 + { + std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; + auto variant = LoadVariantValue("primitive_timestamp_ntz", &metadata_buf, &value_buf); + EXPECT_EQ(VariantType::TIMESTAMP_NTZ, variant.getType()); + EXPECT_EQ("TIMESTAMP_NTZ", variant.typeDebugString()); + // 时间戳值表示为自 Unix 纪元以来的微秒数 + EXPECT_EQ(1577836800000000, variant.getTimestampNTZ()); // 2020-01-01 00:00:00 + } +} + +// TEST(ParquetVariant, UuidValue) { +// std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; +// auto variant = LoadVariantValue("primitive_uuid", &metadata_buf, &value_buf); +// EXPECT_EQ(VariantType::UUID, variant.getType()); +// EXPECT_EQ("UUID", variant.typeDebugString()); +// +// // UUID 是 16 字节的二进制数据 +// const uint8_t* uuid = variant.getUuid(); +// ASSERT_NE(nullptr, uuid); +// +// // 检查 UUID 的格式(这里只是示例,实际值可能不同) +// std::string uuid_str; +// for (int i = 0; i < 16; i++) { +// char hex[3]; +// snprintf(hex, sizeof(hex), "%02x", uuid[i]); +// uuid_str += hex; +// if (i == 3 || i == 5 || i == 7 || i == 9) { +// uuid_str += "-"; +// } +// } +// +// EXPECT_EQ(36, uuid_str.length()); // 标准 UUID 字符串长度 +// } + +TEST(ParquetVariant, NestedStructures) { + // 测试嵌套对象 + { + std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; + auto variant = LoadVariantValue("object_nested", &metadata_buf, &value_buf); + EXPECT_EQ(VariantType::OBJECT, variant.getType()); + + auto nested_obj = variant.getObjectValueByKey("nested_object"); + ASSERT_TRUE(nested_obj.has_value()); + EXPECT_EQ(VariantType::OBJECT, nested_obj->getType()); + + auto nested_field = nested_obj->getObjectValueByKey("nested_field"); + ASSERT_TRUE(nested_field.has_value()); + EXPECT_EQ(VariantType::STRING, nested_field->getType()); + EXPECT_EQ("Nested value", nested_field->getString()); + } + + // 测试嵌套数组 + { + std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; + auto variant = LoadVariantValue("array_nested", &metadata_buf, &value_buf); + EXPECT_EQ(VariantType::ARRAY, variant.getType()); + + auto nested_array = variant.getArrayValueByIndex(0); + EXPECT_EQ(VariantType::ARRAY, nested_array.getType()); + + auto array_info = nested_array.getArrayInfo(); + EXPECT_EQ(3, array_info.num_elements); + + auto element0 = nested_array.getArrayValueByIndex(0); + EXPECT_EQ(VariantType::INT, element0.getType()); + EXPECT_EQ(1, element0.getInt32()); + + auto element1 = nested_array.getArrayValueByIndex(1); + EXPECT_EQ(VariantType::INT, element1.getType()); + EXPECT_EQ(2, element1.getInt32()); + + auto element2 = nested_array.getArrayValueByIndex(2); + EXPECT_EQ(VariantType::INT, element2.getType()); + EXPECT_EQ(3, element2.getInt32()); + } + + // 测试对象中的数组 + { + std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; + auto variant = LoadVariantValue("object_with_array", &metadata_buf, &value_buf); + EXPECT_EQ(VariantType::OBJECT, variant.getType()); + + auto array_field = variant.getObjectValueByKey("array_field"); + ASSERT_TRUE(array_field.has_value()); + EXPECT_EQ(VariantType::ARRAY, array_field->getType()); + + auto array_info = array_field->getArrayInfo(); + EXPECT_EQ(3, array_info.num_elements); + + auto element0 = array_field->getArrayValueByIndex(0); + EXPECT_EQ(VariantType::INT, element0.getType()); + EXPECT_EQ(1, element0.getInt32()); + } + + // 测试数组中的对象 + { + std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; + auto variant = LoadVariantValue("array_with_objects", &metadata_buf, &value_buf); + EXPECT_EQ(VariantType::ARRAY, variant.getType()); + + auto object_element = variant.getArrayValueByIndex(0); + EXPECT_EQ(VariantType::OBJECT, object_element.getType()); + + auto field = object_element.getObjectValueByKey("field"); + ASSERT_TRUE(field.has_value()); + EXPECT_EQ(VariantType::STRING, field->getType()); + EXPECT_EQ("Value", field->getString()); + } +} + } // namespace parquet::variant \ No newline at end of file From 34f546e6355c7c86dc67b4a3d5a093235b8c5dce Mon Sep 17 00:00:00 2001 From: mwish Date: Wed, 14 May 2025 14:44:45 +0800 Subject: [PATCH 09/31] Finish code of uuid, complete some tests --- cpp/src/parquet/variant.cc | 221 ++++++++++++++++++++++++-------- cpp/src/parquet/variant.h | 14 +- cpp/src/parquet/variant_test.cc | 149 ++++++++++++--------- 3 files changed, 271 insertions(+), 113 deletions(-) diff --git a/cpp/src/parquet/variant.cc b/cpp/src/parquet/variant.cc index 100673d8926c..1107ba2ab2c0 100644 --- a/cpp/src/parquet/variant.cc +++ b/cpp/src/parquet/variant.cc @@ -26,6 +26,121 @@ namespace parquet::variant { +std::string variantBasicTypeToString(VariantBasicType type) { + switch (type) { + case VariantBasicType::Primitive: + return "Primitive"; + case VariantBasicType::ShortString: + return "ShortString"; + case VariantBasicType::Object: + return "Object"; + case VariantBasicType::Array: + return "Array"; + default: + return "Unknown"; + } +} + +std::string variantPrimitiveTypeToString(VariantPrimitiveType type) { + switch (type) { + case VariantPrimitiveType::NullType: + return "NullType"; + case VariantPrimitiveType::BooleanTrue: + return "BooleanTrue"; + case VariantPrimitiveType::BooleanFalse: + return "BooleanFalse"; + case VariantPrimitiveType::Int8: + return "Int8"; + case VariantPrimitiveType::Int16: + return "Int16"; + case VariantPrimitiveType::Int32: + return "Int32"; + case VariantPrimitiveType::Int64: + return "Int64"; + case VariantPrimitiveType::Double: + return "Double"; + case VariantPrimitiveType::Decimal4: + return "Decimal4"; + case VariantPrimitiveType::Decimal8: + return "Decimal8"; + case VariantPrimitiveType::Decimal16: + return "Decimal16"; + case VariantPrimitiveType::Date: + return "Date"; + case VariantPrimitiveType::Timestamp: + return "Timestamp"; + case VariantPrimitiveType::TimestampNTZ: + return "TimestampNTZ"; + case VariantPrimitiveType::Float: + return "Float"; + case VariantPrimitiveType::Binary: + return "Binary"; + case VariantPrimitiveType::String: + return "String"; + case VariantPrimitiveType::TimeNTZ: + return "TimeNTZ"; + case VariantPrimitiveType::TimestampTZ: + return "TimestampTZ"; + case VariantPrimitiveType::TimestampNTZNanos: + return "TimestampNTZNanos"; + case VariantPrimitiveType::Uuid: + return "Uuid"; + default: + return "Unknown"; + } +} + +std::string variantTypeToString(VariantType type) { + switch (type) { + case VariantType::OBJECT: + return "OBJECT"; + case VariantType::ARRAY: + return "ARRAY"; + case VariantType::VARIANT_NULL: + return "NULL"; + case VariantType::BOOLEAN: + return "BOOLEAN"; + case VariantType::BYTE: + return "BYTE"; + case VariantType::SHORT: + return "SHORT"; + case VariantType::INT: + return "INT"; + case VariantType::LONG: + return "LONG"; + case VariantType::STRING: + return "STRING"; + case VariantType::DOUBLE: + return "DOUBLE"; + case VariantType::DECIMAL4: + return "DECIMAL4"; + case VariantType::DECIMAL8: + return "DECIMAL8"; + case VariantType::DECIMAL16: + return "DECIMAL16"; + case VariantType::DATE: + return "DATE"; + case VariantType::TIMESTAMP_TZ: + return "TIMESTAMP_TZ"; + case VariantType::TIMESTAMP_NTZ: + return "TIMESTAMP_NTZ"; + case VariantType::FLOAT: + return "FLOAT"; + case VariantType::BINARY: + return "BINARY"; + case VariantType::TIME: + return "TIME"; + case VariantType::TIMESTAMP_NANOS_TZ: + return "TIMESTAMP_NANOS_TZ"; + case VariantType::TIMESTAMP_NANOS_NTZ: + return "TIMESTAMP_NANOS_NTZ"; + case VariantType::UUID: + return "UUID"; + default: + return "UNKNOWN"; + } +} + VariantMetadata::VariantMetadata(std::string_view metadata) : metadata_(metadata) { if (metadata.size() < 2) { throw ParquetException("Invalid Variant metadata: too short: " + @@ -95,8 +210,6 @@ VariantBasicType VariantValue::getBasicType() const { VariantType VariantValue::getType() const { VariantBasicType basic_type = getBasicType(); - // std::cout << "Variant first byte:" << static_cast(value[0] >> 2) << ", " - // << static_cast(value[0] && BASIC_TYPE_MASK) << '\n'; switch (basic_type) { case VariantBasicType::Primitive: { auto primitive_type = static_cast(value[0] >> 2); @@ -228,48 +341,60 @@ bool VariantValue::getBool() const { std::to_string(primitive_type)); } -template -PrimitiveType VariantValue::getPrimitiveVariantType(VariantPrimitiveType type) const { +void VariantValue::checkPrimitiveType(VariantPrimitiveType type, + size_t size_required) const { if (getBasicType() != VariantBasicType::Primitive) { throw ParquetException("Not a primitive type"); } auto primitive_type = static_cast(value[0] >> 2); - if (primitive_type != VariantPrimitiveType::Int8) { - throw ParquetException("Not an correspond type"); + if (primitive_type != type) { + throw ParquetException( + "Expected primitive type: " + variantPrimitiveTypeToString(type) + + ", but got: " + variantPrimitiveTypeToString(primitive_type)); } - if (value.size() < 1 + sizeof(PrimitiveType)) { - throw ParquetException("Invalid value: too short"); + if (value.size() < 1 + size_required) { + throw ParquetException("Invalid value: too short, expected at least " + + std::to_string(1 + size_required) + " bytes for type " + + variantPrimitiveTypeToString(type) + + ", but got: " + std::to_string(value.size()) + " bytes"); } +} + +template +PrimitiveType VariantValue::getPrimitiveType(VariantPrimitiveType type) const { + checkPrimitiveType(type, sizeof(PrimitiveType)); - PrimitiveType decimal_value{}; - memcpy(&decimal_value, value.data() + 1, sizeof(PrimitiveType)); - return decimal_value; + PrimitiveType primitive_value{}; + memcpy(&primitive_value, value.data() + 1, sizeof(PrimitiveType)); + // Here we should cast from Little endian. + primitive_value = ::arrow::bit_util::FromLittleEndian(primitive_value); + return primitive_value; } int8_t VariantValue::getInt8() const { - return getPrimitiveVariantType(VariantPrimitiveType::Int8); + return getPrimitiveType(VariantPrimitiveType::Int8); } int16_t VariantValue::getInt16() const { - return getPrimitiveVariantType(VariantPrimitiveType::Int16); + return getPrimitiveType(VariantPrimitiveType::Int16); } int32_t VariantValue::getInt32() const { - return getPrimitiveVariantType(VariantPrimitiveType::Int32); + return getPrimitiveType(VariantPrimitiveType::Int32); } int64_t VariantValue::getInt64() const { - return getPrimitiveVariantType(VariantPrimitiveType::Int64); + return getPrimitiveType(VariantPrimitiveType::Int64); } float VariantValue::getFloat() const { - return getPrimitiveVariantType(VariantPrimitiveType::Float); + return getPrimitiveType(VariantPrimitiveType::Float); } double VariantValue::getDouble() const { - return getPrimitiveVariantType(VariantPrimitiveType::Double); + return getPrimitiveType(VariantPrimitiveType::Double); } std::string_view VariantValue::getPrimitiveBinaryType(VariantPrimitiveType type) const { @@ -323,18 +448,7 @@ template DecimalValue VariantValue::getPrimitiveDecimalType( VariantPrimitiveType type) const { using DecimalValueType = typename DecimalType::ValueType; - if (getBasicType() != VariantBasicType::Primitive) { - throw ParquetException("Not a primitive type"); - } - - auto primitive_type = static_cast(value[0] >> 2); - if (primitive_type != type) { - throw ParquetException("Not a decimal type"); - } - - if (value.size() < 2 + sizeof(DecimalValueType)) { - throw ParquetException("Invalid decimal value: too short"); - } + checkPrimitiveType(type, sizeof(DecimalValueType) + 2); uint8_t scale = value[1]; DecimalValueType decimal_value; @@ -353,18 +467,8 @@ DecimalValue<::arrow::Decimal64> VariantValue::getDecimal8() const { } DecimalValue<::arrow::Decimal128> VariantValue::getDecimal16() const { - if (getBasicType() != VariantBasicType::Primitive) { - throw ParquetException("Not a primitive type"); - } - - auto primitive_type = static_cast(value[0] >> 2); - if (primitive_type != VariantPrimitiveType::Decimal16) { - throw ParquetException("Not a decimal16 type"); - } - - if (value.size() < 2 + sizeof(int64_t) * 2) { - throw ParquetException("Invalid decimal16 value: too short"); - } + checkPrimitiveType(VariantPrimitiveType::Decimal16, + /*size_required=*/sizeof(int64_t) * 2); uint8_t scale = value[1]; @@ -376,20 +480,33 @@ DecimalValue<::arrow::Decimal128> VariantValue::getDecimal16() const { return {scale, ::arrow::Decimal128(low_high_bits[1], low_high_bits[0])}; } -int64_t VariantValue::timeNTZ() const { - return getPrimitiveVariantType(VariantPrimitiveType::TimeNTZ); +int32_t VariantValue::getDate() const { + return getPrimitiveType(VariantPrimitiveType::Date); +} + +int64_t VariantValue::getTimeNTZ() const { + return getPrimitiveType(VariantPrimitiveType::TimeNTZ); } int64_t VariantValue::getTimestamp() const { - return getPrimitiveVariantType(VariantPrimitiveType::Timestamp); + return getPrimitiveType(VariantPrimitiveType::Timestamp); } int64_t VariantValue::getTimestampNTZ() const { - return getPrimitiveVariantType(VariantPrimitiveType::TimestampNTZ); + return getPrimitiveType(VariantPrimitiveType::TimestampNTZ); } -const uint8_t* VariantValue::getUuid() const { - throw ParquetException("VariantValue::getUuid Not implemented"); +std::array VariantValue::getUuid() const { + checkPrimitiveType(VariantPrimitiveType::Uuid, /*size_required=*/17); + std::array uuid_value; + memcpy(uuid_value.data(), value.data() + 1, sizeof(uuid_value)); +#if ARROW_LITTLE_ENDIAN + std::array uuid_value_le; + ::arrow::bit_util::ByteSwap(uuid_value_le.data(), uuid_value.data(), uuid_value.size()); + return uuid_value_le; +#else + return uuid_value; +#endif } std::string VariantValue::ObjectInfo::toDebugString() const { @@ -489,9 +606,7 @@ std::optional VariantValue::getObjectFieldByFieldId( field_id = arrow::bit_util::FromLittleEndian(field_id); // Get the key from metadata - // TODO(mwish): Fix the casting here. *key = metadata.getMetadataKey(field_id); - std::cout << "Metadata key:" << *key << '\n'; // Read the offset and next offset uint32_t offset = 0, next_offset = 0; @@ -519,13 +634,14 @@ std::optional VariantValue::getObjectFieldByFieldId( VariantValue::ArrayInfo VariantValue::getArrayInfo() const { if (getBasicType() != VariantBasicType::Array) { - throw ParquetException("Not an array type"); + throw ParquetException("Expected array type, but got: " + + variantBasicTypeToString(getBasicType())); } uint8_t value_header = value[0] >> 2; uint8_t field_offset_size = (value_header & 0b11) + 1; bool is_large = ((value_header >> 2) & 0b1); - // 检查数据长度 + // check the array header uint8_t num_elements_size = is_large ? 4 : 1; if (value.size() < 1 + num_elements_size) { throw ParquetException( @@ -570,7 +686,8 @@ VariantValue::ArrayInfo VariantValue::getArrayInfo() const { } } - // TODO(mwish): Remove this. + // checking the element is incremental. + // TODO(mwish): Remove this or encapsulate this range check to function for (uint32_t i = 0; i < num_elements; ++i) { uint32_t offset = 0, next_offset = 0; memcpy(&offset, value.data() + info.offset_start_offset + i * field_offset_size, diff --git a/cpp/src/parquet/variant.h b/cpp/src/parquet/variant.h index d865202c1cb4..d96cb5351861 100644 --- a/cpp/src/parquet/variant.h +++ b/cpp/src/parquet/variant.h @@ -39,6 +39,8 @@ enum class VariantBasicType { Array = 3 }; +std::string variantBasicTypeToString(VariantBasicType type); + enum class VariantPrimitiveType : int8_t { /// Equivalent Parquet Type: UNKNOWN NullType = 0, @@ -85,6 +87,8 @@ enum class VariantPrimitiveType : int8_t { Uuid = 20 }; +std::string variantPrimitiveTypeToString(VariantPrimitiveType type); + /// VariantType is from basic type and primitive type. enum class VariantType { OBJECT, @@ -111,6 +115,8 @@ enum class VariantType { UUID }; +std::string variantTypeToString(VariantType type); + class VariantMetadata { public: explicit VariantMetadata(std::string_view metadata); @@ -161,12 +167,13 @@ struct VariantValue { DecimalValue<::arrow::Decimal64> getDecimal8() const; DecimalValue<::arrow::Decimal128> getDecimal16() const; - int64_t timeNTZ() const; + int32_t getDate() const; + int64_t getTimeNTZ() const; // timestamp with adjusted to UTC int64_t getTimestamp() const; int64_t getTimestampNTZ() const; // 16 bytes UUID - const uint8_t* getUuid() const; + std::array getUuid() const; /// }@ @@ -203,12 +210,13 @@ struct VariantValue { private: template - PrimitiveType getPrimitiveVariantType(VariantPrimitiveType type) const; + PrimitiveType getPrimitiveType(VariantPrimitiveType type) const; template DecimalValue getPrimitiveDecimalType(VariantPrimitiveType type) const; std::string_view getPrimitiveBinaryType(VariantPrimitiveType type) const; + void checkPrimitiveType(VariantPrimitiveType type, size_t size_required) const; }; } // namespace parquet::variant diff --git a/cpp/src/parquet/variant_test.cc b/cpp/src/parquet/variant_test.cc index abe54b07829b..b2bba4591f3f 100644 --- a/cpp/src/parquet/variant_test.cc +++ b/cpp/src/parquet/variant_test.cc @@ -310,80 +310,35 @@ TEST(ParquetVariant, DecimalValues) { } } -TEST(ParquetVariant, ArrayValues) { - std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; - auto variant = LoadVariantValue("array_primitive", &metadata_buf, &value_buf); - EXPECT_EQ(VariantType::ARRAY, variant.getType()); - EXPECT_EQ("ARRAY", variant.typeDebugString()); - - // 获取数组信息 - auto array_info = variant.getArrayInfo(); - EXPECT_EQ(5, array_info.num_elements); - - // 通过索引获取值 - auto element0 = variant.getArrayValueByIndex(0); - EXPECT_EQ(VariantType::INT, element0.getType()); - EXPECT_EQ(1, element0.getInt32()); - - auto element1 = variant.getArrayValueByIndex(1); - EXPECT_EQ(VariantType::INT, element1.getType()); - EXPECT_EQ(2, element1.getInt32()); - - auto element2 = variant.getArrayValueByIndex(2); - EXPECT_EQ(VariantType::INT, element2.getType()); - EXPECT_EQ(3, element2.getInt32()); - - auto element3 = variant.getArrayValueByIndex(3); - EXPECT_EQ(VariantType::INT, element3.getType()); - EXPECT_EQ(4, element3.getInt32()); - - auto element4 = variant.getArrayValueByIndex(4); - EXPECT_EQ(VariantType::INT, element4.getType()); - EXPECT_EQ(5, element4.getInt32()); - - EXPECT_THROW(variant.getArrayValueByIndex(5), ParquetException); - EXPECT_THROW(variant.getArrayValueByIndex(100), ParquetException); -} - TEST(ParquetVariant, DateTimeValues) { - // 测试日期值 { std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; auto variant = LoadVariantValue("primitive_date", &metadata_buf, &value_buf); EXPECT_EQ(VariantType::DATE, variant.getType()); EXPECT_EQ("DATE", variant.typeDebugString()); - // 日期值表示为自 Unix 纪元以来的天数 - EXPECT_EQ(18262, variant.getInt32()); // 2020-01-01 - } - - // 测试时间值 - { - std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; - auto variant = LoadVariantValue("primitive_time", &metadata_buf, &value_buf); - EXPECT_EQ(VariantType::TIME, variant.getType()); - EXPECT_EQ("TIME", variant.typeDebugString()); - // 时间值表示为自午夜以来的微秒数 - EXPECT_EQ(43200000000, variant.timeNTZ()); // 12:00:00 + // 2025-04-16 + EXPECT_EQ(20194, variant.getDate()); } - - // 测试带时区的时间戳 + // { + // std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; + // auto variant = LoadVariantValue("primitive_time", &metadata_buf, &value_buf); + // EXPECT_EQ(VariantType::TIME, variant.getType()); + // EXPECT_EQ("TIME", variant.typeDebugString()); + // EXPECT_EQ(43200000000, variant.getTimeNTZ()); + // } { std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; - auto variant = LoadVariantValue("primitive_timestamp_tz", &metadata_buf, &value_buf); + auto variant = LoadVariantValue("primitive_timestamp", &metadata_buf, &value_buf); EXPECT_EQ(VariantType::TIMESTAMP_TZ, variant.getType()); EXPECT_EQ("TIMESTAMP_TZ", variant.typeDebugString()); - // 时间戳值表示为自 Unix 纪元以来的微秒数 - EXPECT_EQ(1577836800000000, variant.getTimestamp()); // 2020-01-01 00:00:00 UTC + EXPECT_EQ(1744821296780000, variant.getTimestamp()); } - - // 测试不带时区的时间戳 { std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; - auto variant = LoadVariantValue("primitive_timestamp_ntz", &metadata_buf, &value_buf); + auto variant = LoadVariantValue("primitive_timestampntz", &metadata_buf, &value_buf); EXPECT_EQ(VariantType::TIMESTAMP_NTZ, variant.getType()); EXPECT_EQ("TIMESTAMP_NTZ", variant.typeDebugString()); - // 时间戳值表示为自 Unix 纪元以来的微秒数 - EXPECT_EQ(1577836800000000, variant.getTimestampNTZ()); // 2020-01-01 00:00:00 + EXPECT_EQ(1744806896780000, variant.getTimestampNTZ()); } } @@ -411,6 +366,84 @@ TEST(ParquetVariant, DateTimeValues) { // EXPECT_EQ(36, uuid_str.length()); // 标准 UUID 字符串长度 // } +TEST(ParquetVariant, ArrayValues) { + { + std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; + auto variant = LoadVariantValue("array_primitive", &metadata_buf, &value_buf); + EXPECT_EQ(VariantType::ARRAY, variant.getType()); + EXPECT_EQ("ARRAY", variant.typeDebugString()); + + auto array_info = variant.getArrayInfo(); + EXPECT_EQ(4, array_info.num_elements); + + // 通过索引获取值 + auto element0 = variant.getArrayValueByIndex(0); + EXPECT_EQ(VariantType::BYTE, element0.getType()); + EXPECT_EQ(2, element0.getInt32()); + + auto element1 = variant.getArrayValueByIndex(1); + EXPECT_EQ(VariantType::BYTE, element1.getType()); + EXPECT_EQ(1, element1.getInt32()); + + auto element2 = variant.getArrayValueByIndex(2); + EXPECT_EQ(VariantType::BYTE, element2.getType()); + EXPECT_EQ(5, element2.getInt8()); + + auto element3 = variant.getArrayValueByIndex(3); + EXPECT_EQ(VariantType::BYTE, element3.getType()); + EXPECT_EQ(9, element3.getInt8()); + + EXPECT_THROW(variant.getArrayValueByIndex(4), ParquetException); + EXPECT_THROW(variant.getArrayValueByIndex(100), ParquetException); + EXPECT_THROW(variant.getObjectInfo(), ParquetException); + } + { + // array_empty + std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; + auto variant = LoadVariantValue("array_empty", &metadata_buf, &value_buf); + EXPECT_EQ(VariantType::ARRAY, variant.getType()); + EXPECT_EQ("ARRAY", variant.typeDebugString()); + auto array_info = variant.getArrayInfo(); + EXPECT_EQ(0, array_info.num_elements); + + EXPECT_THROW(variant.getArrayValueByIndex(0), ParquetException); + EXPECT_THROW(variant.getObjectInfo(), ParquetException); + } +} + +TEST(ParquetVariant, ArrayValuesNested) { + std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; + auto variant = LoadVariantValue("array_nested", &metadata_buf, &value_buf); + EXPECT_EQ(VariantType::ARRAY, variant.getType()); + EXPECT_EQ("ARRAY", variant.typeDebugString()); + auto object_info = variant.getArrayInfo(); + EXPECT_EQ(3, object_info.num_elements); + { + auto first_element = variant.getArrayValueByIndex(0); + EXPECT_EQ(VariantType::OBJECT, first_element.getType()); + auto first_element_info = first_element.getObjectInfo(); + EXPECT_EQ(2, first_element_info.num_elements); + auto id = first_element.getObjectValueByKey("id"); + ASSERT_TRUE(id.has_value()); + EXPECT_EQ(VariantType::BYTE, id->getType()); + EXPECT_EQ(1, id->getInt8()); + } + { + auto second_element = variant.getArrayValueByIndex(1); + EXPECT_EQ(VariantType::VARIANT_NULL, second_element.getType()); + } + { + auto third_element = variant.getArrayValueByIndex(2); + EXPECT_EQ(VariantType::OBJECT, third_element.getType()); + auto third_element_info = third_element.getObjectInfo(); + EXPECT_EQ(3, third_element_info.num_elements); + auto id = third_element.getObjectValueByKey("id"); + ASSERT_TRUE(id.has_value()); + EXPECT_EQ(VariantType::BYTE, id->getType()); + EXPECT_EQ(2, id->getInt8()); + } +} + TEST(ParquetVariant, NestedStructures) { // 测试嵌套对象 { From 3566dfde905025840d0abb8426d1e84071bc4a31 Mon Sep 17 00:00:00 2001 From: mwish Date: Wed, 14 May 2025 15:52:48 +0800 Subject: [PATCH 10/31] Cleanup for review --- cpp/src/parquet/variant.cc | 69 +++++----- cpp/src/parquet/variant.h | 5 +- cpp/src/parquet/variant_test.cc | 236 ++++++++++---------------------- 3 files changed, 104 insertions(+), 206 deletions(-) diff --git a/cpp/src/parquet/variant.cc b/cpp/src/parquet/variant.cc index 1107ba2ab2c0..b503508b50b8 100644 --- a/cpp/src/parquet/variant.cc +++ b/cpp/src/parquet/variant.cc @@ -326,7 +326,8 @@ std::string VariantValue::typeDebugString() const { bool VariantValue::getBool() const { if (getBasicType() != VariantBasicType::Primitive) { - throw ParquetException("Not a primitive type"); + throw ParquetException("Expected primitive type, but got: " + + variantBasicTypeToString(getBasicType())); } int8_t primitive_type = static_cast(value[0]) >> 2; @@ -341,11 +342,16 @@ bool VariantValue::getBool() const { std::to_string(primitive_type)); } +void VariantValue::checkBasicType(VariantBasicType type) const { + if (getBasicType() != type) { + throw ParquetException("Expected basic type: " + variantBasicTypeToString(type) + + ", but got: " + variantBasicTypeToString(getBasicType())); + } +} + void VariantValue::checkPrimitiveType(VariantPrimitiveType type, size_t size_required) const { - if (getBasicType() != VariantBasicType::Primitive) { - throw ParquetException("Not a primitive type"); - } + checkBasicType(VariantBasicType::Primitive); auto primitive_type = static_cast(value[0] >> 2); if (primitive_type != type) { @@ -354,9 +360,9 @@ void VariantValue::checkPrimitiveType(VariantPrimitiveType type, ", but got: " + variantPrimitiveTypeToString(primitive_type)); } - if (value.size() < 1 + size_required) { + if (value.size() < size_required) { throw ParquetException("Invalid value: too short, expected at least " + - std::to_string(1 + size_required) + " bytes for type " + + std::to_string(size_required) + " bytes for type " + variantPrimitiveTypeToString(type) + ", but got: " + std::to_string(value.size()) + " bytes"); } @@ -364,7 +370,7 @@ void VariantValue::checkPrimitiveType(VariantPrimitiveType type, template PrimitiveType VariantValue::getPrimitiveType(VariantPrimitiveType type) const { - checkPrimitiveType(type, sizeof(PrimitiveType)); + checkPrimitiveType(type, sizeof(PrimitiveType) + 1); PrimitiveType primitive_value{}; memcpy(&primitive_value, value.data() + 1, sizeof(PrimitiveType)); @@ -378,15 +384,15 @@ int8_t VariantValue::getInt8() const { } int16_t VariantValue::getInt16() const { - return getPrimitiveType(VariantPrimitiveType::Int16); + return getPrimitiveType(VariantPrimitiveType::Int16); } int32_t VariantValue::getInt32() const { - return getPrimitiveType(VariantPrimitiveType::Int32); + return getPrimitiveType(VariantPrimitiveType::Int32); } int64_t VariantValue::getInt64() const { - return getPrimitiveType(VariantPrimitiveType::Int64); + return getPrimitiveType(VariantPrimitiveType::Int64); } float VariantValue::getFloat() const { @@ -394,22 +400,11 @@ float VariantValue::getFloat() const { } double VariantValue::getDouble() const { - return getPrimitiveType(VariantPrimitiveType::Double); + return getPrimitiveType(VariantPrimitiveType::Double); } std::string_view VariantValue::getPrimitiveBinaryType(VariantPrimitiveType type) const { - VariantBasicType basic_type = getBasicType(); - if (basic_type != VariantBasicType::Primitive) { - throw ParquetException("Not a primitive type"); - } - auto primitive_type = static_cast(value[0] >> 2); - if (primitive_type != VariantPrimitiveType::String) { - throw ParquetException("Not a string type"); - } - - if (value.size() < 5) { - throw ParquetException("Invalid string value: too short"); - } + checkPrimitiveType(type, /*size_required=*/5); uint32_t length; memcpy(&length, value.data() + 1, sizeof(uint32_t)); @@ -468,7 +463,7 @@ DecimalValue<::arrow::Decimal64> VariantValue::getDecimal8() const { DecimalValue<::arrow::Decimal128> VariantValue::getDecimal16() const { checkPrimitiveType(VariantPrimitiveType::Decimal16, - /*size_required=*/sizeof(int64_t) * 2); + /*size_required=*/sizeof(int64_t) * 2 + 2); uint8_t scale = value[1]; @@ -524,9 +519,7 @@ std::string VariantValue::ObjectInfo::toDebugString() const { VariantValue::ObjectInfo VariantValue::getObjectInfo() const { - if (getBasicType() != VariantBasicType::Object) { - throw ParquetException("Not an object type"); - } + checkBasicType(VariantBasicType::Object); uint8_t value_header = value[0] >> 2; uint8_t field_offset_size = (value_header & 0b11) + 1; uint8_t field_id_size = ((value_header >> 2) & 0b11) + 1; @@ -561,6 +554,7 @@ VariantValue::ObjectInfo VariantValue::getObjectInfo() const { memcpy(&final_offset, value.data() + info.offset_start_offset + num_elements * field_offset_size, field_offset_size); + // It could be less than value size since it could be a sub-object. if (final_offset + info.data_start_offset > value.size()) { throw ParquetException("Invalid object value: final_offset=" + std::to_string(final_offset) + @@ -591,12 +585,13 @@ std::optional VariantValue::getObjectValueByKey( return std::nullopt; } -std::optional VariantValue::getObjectFieldByFieldId( - uint32_t variantId, std::string_view* key) const { +VariantValue VariantValue::getObjectFieldByFieldId(uint32_t variantId, + std::string_view* key) const { ObjectInfo info = getObjectInfo(); if (variantId >= info.num_elements) { - throw ParquetException("Field ID out of range"); + throw ParquetException("Field ID out of range: " + std::to_string(variantId) + + " >= " + std::to_string(info.num_elements)); } // Read the field ID @@ -606,7 +601,7 @@ std::optional VariantValue::getObjectFieldByFieldId( field_id = arrow::bit_util::FromLittleEndian(field_id); // Get the key from metadata - *key = metadata.getMetadataKey(field_id); + *key = metadata.getMetadataKey(static_cast(field_id)); // Read the offset and next offset uint32_t offset = 0, next_offset = 0; @@ -633,10 +628,7 @@ std::optional VariantValue::getObjectFieldByFieldId( } VariantValue::ArrayInfo VariantValue::getArrayInfo() const { - if (getBasicType() != VariantBasicType::Array) { - throw ParquetException("Expected array type, but got: " + - variantBasicTypeToString(getBasicType())); - } + checkBasicType(VariantBasicType::Array); uint8_t value_header = value[0] >> 2; uint8_t field_offset_size = (value_header & 0b11) + 1; bool is_large = ((value_header >> 2) & 0b1); @@ -649,7 +641,7 @@ VariantValue::ArrayInfo VariantValue::getArrayInfo() const { " for at least " + std::to_string(1 + num_elements_size)); } - // 解析 num_elements + // parse num_elements uint32_t num_elements = 0; { memcpy(&num_elements, value.data() + 1, num_elements_size); @@ -663,14 +655,15 @@ VariantValue::ArrayInfo VariantValue::getArrayInfo() const { info.data_start_offset = info.offset_start_offset + (num_elements + 1) * field_offset_size; - // 检查边界 + // Boundary check if (info.data_start_offset > value.size()) { throw ParquetException("Invalid array value: data_start_offset=" + std::to_string(info.data_start_offset) + ", value_size=" + std::to_string(value.size())); } - // 检查最终偏移量 + // Validate final offset is equal to the size of the value, + // it would work since even empty array would have an offset of 0. { uint32_t final_offset = 0; memcpy(&final_offset, diff --git a/cpp/src/parquet/variant.h b/cpp/src/parquet/variant.h index d96cb5351861..2500f6dc0113 100644 --- a/cpp/src/parquet/variant.h +++ b/cpp/src/parquet/variant.h @@ -189,8 +189,7 @@ struct VariantValue { }; ObjectInfo getObjectInfo() const; std::optional getObjectValueByKey(std::string_view key) const; - std::optional getObjectFieldByFieldId(uint32_t variantId, - std::string_view* key) const; + VariantValue getObjectFieldByFieldId(uint32_t variantId, std::string_view* key) const; struct ArrayInfo { uint32_t num_elements; @@ -202,6 +201,7 @@ struct VariantValue { // Would throw ParquetException if index is out of range. VariantValue getArrayValueByIndex(uint32_t index) const; + private: static constexpr uint8_t BASIC_TYPE_MASK = 0b00000011; static constexpr uint8_t PRIMITIVE_TYPE_MASK = 0b00111111; /** The inclusive maximum value of the type info value. It is the size limit of @@ -216,6 +216,7 @@ struct VariantValue { DecimalValue getPrimitiveDecimalType(VariantPrimitiveType type) const; std::string_view getPrimitiveBinaryType(VariantPrimitiveType type) const; + void checkBasicType(VariantBasicType type) const; void checkPrimitiveType(VariantPrimitiveType type, size_t size_required) const; }; diff --git a/cpp/src/parquet/variant_test.cc b/cpp/src/parquet/variant_test.cc index b2bba4591f3f..15bbe1ed70aa 100644 --- a/cpp/src/parquet/variant_test.cc +++ b/cpp/src/parquet/variant_test.cc @@ -67,11 +67,11 @@ TEST(ParquetVariant, MetadataBase) { EXPECT_EQ(1, metadata.version()); EXPECT_THROW(metadata.getMetadataKey(0), ParquetException); } - { std::string object_metadata = "object_primitive.metadata"; ARROW_SCOPED_TRACE("Testing file: " + object_metadata); - auto buf = readFromFile(*file_system, object_metadata); + std::string path = dir_string + "/" + object_metadata; + auto buf = readFromFile(*file_system, path); VariantMetadata metadata(std::string_view{*buf}); EXPECT_EQ("int_field", metadata.getMetadataKey(0)); @@ -108,12 +108,10 @@ TEST(ParquetVariant, BooleanValue) { { std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; auto variant = LoadVariantValue("primitive_boolean_true", &metadata_buf, &value_buf); - std::cout << variant.typeDebugString() << '\n'; EXPECT_EQ(VariantType::BOOLEAN, variant.getType()); EXPECT_EQ("BOOLEAN", variant.typeDebugString()); EXPECT_EQ(true, variant.getBool()); } - // test false { std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; @@ -121,7 +119,6 @@ TEST(ParquetVariant, BooleanValue) { EXPECT_EQ(VariantType::BOOLEAN, variant.getType()); EXPECT_EQ(false, variant.getBool()); } - { std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; auto variant = LoadVariantValue("primitive_int32", &metadata_buf, &value_buf); @@ -218,17 +215,6 @@ TEST(ParquetVariant, StringValues) { } } -TEST(ParquetVariant, NullValue) { - // https://github.com/apache/parquet-testing/issues/81 - /* - std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; - auto variant = LoadVariantValue("primitive_null", &metadata_buf, &value_buf); - EXPECT_EQ(VariantType::VARIANT_NULL, variant.getType()); - EXPECT_EQ("NULL", variant.typeDebugString()); - */ -} - - TEST(ParquetVariant, ObjectValues) { std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; auto variant = LoadVariantValue("object_primitive", &metadata_buf, &value_buf); @@ -237,47 +223,73 @@ TEST(ParquetVariant, ObjectValues) { auto obj_info = variant.getObjectInfo(); EXPECT_EQ(7, obj_info.num_elements); + auto handle_int_field = [](const std::optional& value) { + EXPECT_TRUE(value.has_value()); + EXPECT_EQ(VariantType::BYTE, value->getType()); + EXPECT_EQ(1, value->getInt8()); + }; + auto handle_double_field = [](const std::optional& value) { + EXPECT_TRUE(value.has_value()); + EXPECT_EQ(VariantType::DECIMAL4, value->getType()); + auto decimal_value = value->getDecimal4(); + EXPECT_EQ("1.23456789", decimal_value.value.ToString(decimal_value.scale)); + }; + auto handle_boolean_true_field = [](const std::optional& value) { + EXPECT_TRUE(value.has_value()); + EXPECT_EQ(VariantType::BOOLEAN, value->getType()); + EXPECT_TRUE(value->getBool()); + }; + auto handle_boolean_false_field = [](const std::optional& value) { + EXPECT_TRUE(value.has_value()); + EXPECT_EQ(VariantType::BOOLEAN, value->getType()); + EXPECT_FALSE(value->getBool()); + }; + auto handle_string_field = [](const std::optional& value) { + EXPECT_TRUE(value.has_value()); + EXPECT_EQ(VariantType::STRING, value->getType()); + EXPECT_EQ("Apache Parquet", value->getString()); + }; + auto handle_null_field = [](const std::optional& value) { + EXPECT_TRUE(value.has_value()); + EXPECT_EQ(VariantType::VARIANT_NULL, value->getType()); + }; + auto handle_timestamp_field = [](const std::optional& value) { + EXPECT_TRUE(value.has_value()); + EXPECT_EQ(VariantType::STRING, value->getType()); + EXPECT_EQ("2025-04-16T12:34:56.78", value->getString()); + }; - auto int_field = variant.getObjectValueByKey("int_field"); - ASSERT_TRUE(int_field.has_value()); - std::cout << "int_field: " << int_field->typeDebugString() << '\n'; - EXPECT_EQ(VariantType::INT, int_field->getType()); - // EXPECT_EQ(42, int_field->getInt32()); - - auto double_field = variant.getObjectValueByKey("double_field"); - std::cout << "double_field: " << double_field->typeDebugString() << '\n'; - ASSERT_TRUE(double_field.has_value()); - EXPECT_EQ(VariantType::DOUBLE, double_field->getType()); - // EXPECT_DOUBLE_EQ(3.14159, double_field->getDouble()); - - auto boolean_true_field = variant.getObjectValueByKey("boolean_true_field"); - ASSERT_TRUE(boolean_true_field.has_value()); - EXPECT_EQ(VariantType::BOOLEAN, boolean_true_field->getType()); - EXPECT_TRUE(boolean_true_field->getBool()); - - auto boolean_false_field = variant.getObjectValueByKey("boolean_false_field"); - ASSERT_TRUE(boolean_false_field.has_value()); - EXPECT_EQ(VariantType::BOOLEAN, boolean_false_field->getType()); - // EXPECT_FALSE(boolean_false_field->getBool()); - - auto string_field = variant.getObjectValueByKey("string_field"); - ASSERT_TRUE(string_field.has_value()); - EXPECT_EQ(VariantType::STRING, string_field->getType()); - // EXPECT_EQ("Hello, World!", string_field->getString()); - - auto null_field = variant.getObjectValueByKey("null_field"); - ASSERT_TRUE(null_field.has_value()); - EXPECT_EQ(VariantType::VARIANT_NULL, null_field->getType()); - - auto non_existent = variant.getObjectValueByKey("non_existent"); - EXPECT_FALSE(non_existent.has_value()); - - // std::string_view key; - // auto field_by_id = variant.getObjectFieldByFieldId(0, &key); - // ASSERT_TRUE(field_by_id.has_value()); - // EXPECT_EQ("int_field", key); - // EXPECT_EQ(VariantType::INT, field_by_id->getType()); - // EXPECT_EQ(42, field_by_id->getInt32()); + std::map& value)>> + key_handler = {{"int_field", handle_int_field}, + {"double_field", handle_double_field}, + {"boolean_true_field", handle_boolean_true_field}, + {"boolean_false_field", handle_boolean_false_field}, + {"string_field", handle_string_field}, + {"null_field", handle_null_field}, + {"timestamp_field", handle_timestamp_field}}; + // Test getObjectValueByKey with existing keys + for (auto& [key, handler] : key_handler) { + auto value = variant.getObjectValueByKey(key); + handler(value); + } + // Test non-existing key + { + auto ne = variant.getObjectValueByKey("non_exists"); + EXPECT_FALSE(ne.has_value()); + } + // Test get by index + for (uint32_t i = 0; i < obj_info.num_elements; ++i) { + std::string_view key; + auto value = variant.getObjectFieldByFieldId(i, &key); + auto iter = key_handler.find(std::string(key)); + ASSERT_TRUE(iter != key_handler.end()); + auto handler = iter->second; + handler(value); + } + { + std::string_view key; + EXPECT_THROW(variant.getObjectFieldByFieldId(100, &key), ParquetException); + } } TEST(ParquetVariant, DecimalValues) { @@ -319,13 +331,6 @@ TEST(ParquetVariant, DateTimeValues) { // 2025-04-16 EXPECT_EQ(20194, variant.getDate()); } - // { - // std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; - // auto variant = LoadVariantValue("primitive_time", &metadata_buf, &value_buf); - // EXPECT_EQ(VariantType::TIME, variant.getType()); - // EXPECT_EQ("TIME", variant.typeDebugString()); - // EXPECT_EQ(43200000000, variant.getTimeNTZ()); - // } { std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; auto variant = LoadVariantValue("primitive_timestamp", &metadata_buf, &value_buf); @@ -342,30 +347,6 @@ TEST(ParquetVariant, DateTimeValues) { } } -// TEST(ParquetVariant, UuidValue) { -// std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; -// auto variant = LoadVariantValue("primitive_uuid", &metadata_buf, &value_buf); -// EXPECT_EQ(VariantType::UUID, variant.getType()); -// EXPECT_EQ("UUID", variant.typeDebugString()); -// -// // UUID 是 16 字节的二进制数据 -// const uint8_t* uuid = variant.getUuid(); -// ASSERT_NE(nullptr, uuid); -// -// // 检查 UUID 的格式(这里只是示例,实际值可能不同) -// std::string uuid_str; -// for (int i = 0; i < 16; i++) { -// char hex[3]; -// snprintf(hex, sizeof(hex), "%02x", uuid[i]); -// uuid_str += hex; -// if (i == 3 || i == 5 || i == 7 || i == 9) { -// uuid_str += "-"; -// } -// } -// -// EXPECT_EQ(36, uuid_str.length()); // 标准 UUID 字符串长度 -// } - TEST(ParquetVariant, ArrayValues) { { std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; @@ -376,14 +357,13 @@ TEST(ParquetVariant, ArrayValues) { auto array_info = variant.getArrayInfo(); EXPECT_EQ(4, array_info.num_elements); - // 通过索引获取值 auto element0 = variant.getArrayValueByIndex(0); EXPECT_EQ(VariantType::BYTE, element0.getType()); - EXPECT_EQ(2, element0.getInt32()); + EXPECT_EQ(2, element0.getInt8()); auto element1 = variant.getArrayValueByIndex(1); EXPECT_EQ(VariantType::BYTE, element1.getType()); - EXPECT_EQ(1, element1.getInt32()); + EXPECT_EQ(1, element1.getInt8()); auto element2 = variant.getArrayValueByIndex(2); EXPECT_EQ(VariantType::BYTE, element2.getType()); @@ -444,80 +424,4 @@ TEST(ParquetVariant, ArrayValuesNested) { } } -TEST(ParquetVariant, NestedStructures) { - // 测试嵌套对象 - { - std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; - auto variant = LoadVariantValue("object_nested", &metadata_buf, &value_buf); - EXPECT_EQ(VariantType::OBJECT, variant.getType()); - - auto nested_obj = variant.getObjectValueByKey("nested_object"); - ASSERT_TRUE(nested_obj.has_value()); - EXPECT_EQ(VariantType::OBJECT, nested_obj->getType()); - - auto nested_field = nested_obj->getObjectValueByKey("nested_field"); - ASSERT_TRUE(nested_field.has_value()); - EXPECT_EQ(VariantType::STRING, nested_field->getType()); - EXPECT_EQ("Nested value", nested_field->getString()); - } - - // 测试嵌套数组 - { - std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; - auto variant = LoadVariantValue("array_nested", &metadata_buf, &value_buf); - EXPECT_EQ(VariantType::ARRAY, variant.getType()); - - auto nested_array = variant.getArrayValueByIndex(0); - EXPECT_EQ(VariantType::ARRAY, nested_array.getType()); - - auto array_info = nested_array.getArrayInfo(); - EXPECT_EQ(3, array_info.num_elements); - - auto element0 = nested_array.getArrayValueByIndex(0); - EXPECT_EQ(VariantType::INT, element0.getType()); - EXPECT_EQ(1, element0.getInt32()); - - auto element1 = nested_array.getArrayValueByIndex(1); - EXPECT_EQ(VariantType::INT, element1.getType()); - EXPECT_EQ(2, element1.getInt32()); - - auto element2 = nested_array.getArrayValueByIndex(2); - EXPECT_EQ(VariantType::INT, element2.getType()); - EXPECT_EQ(3, element2.getInt32()); - } - - // 测试对象中的数组 - { - std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; - auto variant = LoadVariantValue("object_with_array", &metadata_buf, &value_buf); - EXPECT_EQ(VariantType::OBJECT, variant.getType()); - - auto array_field = variant.getObjectValueByKey("array_field"); - ASSERT_TRUE(array_field.has_value()); - EXPECT_EQ(VariantType::ARRAY, array_field->getType()); - - auto array_info = array_field->getArrayInfo(); - EXPECT_EQ(3, array_info.num_elements); - - auto element0 = array_field->getArrayValueByIndex(0); - EXPECT_EQ(VariantType::INT, element0.getType()); - EXPECT_EQ(1, element0.getInt32()); - } - - // 测试数组中的对象 - { - std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; - auto variant = LoadVariantValue("array_with_objects", &metadata_buf, &value_buf); - EXPECT_EQ(VariantType::ARRAY, variant.getType()); - - auto object_element = variant.getArrayValueByIndex(0); - EXPECT_EQ(VariantType::OBJECT, object_element.getType()); - - auto field = object_element.getObjectValueByKey("field"); - ASSERT_TRUE(field.has_value()); - EXPECT_EQ(VariantType::STRING, field->getType()); - EXPECT_EQ("Value", field->getString()); - } -} - } // namespace parquet::variant \ No newline at end of file From 54681c41a94ebe5389a1f987269dab3fc201806d Mon Sep 17 00:00:00 2001 From: mwish Date: Wed, 14 May 2025 16:04:26 +0800 Subject: [PATCH 11/31] continue cleanup --- cpp/src/parquet/variant.cc | 34 ++++++++++++++++++++++++--------- cpp/src/parquet/variant.h | 6 ++++++ cpp/src/parquet/variant_test.cc | 2 +- 3 files changed, 32 insertions(+), 10 deletions(-) diff --git a/cpp/src/parquet/variant.cc b/cpp/src/parquet/variant.cc index b503508b50b8..d63466e4a7e0 100644 --- a/cpp/src/parquet/variant.cc +++ b/cpp/src/parquet/variant.cc @@ -567,12 +567,21 @@ VariantValue::ObjectInfo VariantValue::getObjectInfo() const { std::optional VariantValue::getObjectValueByKey( std::string_view key) const { - if (getBasicType() != VariantBasicType::Object) { - throw ParquetException("Not an object type"); - } - ObjectInfo info = getObjectInfo(); + return getObjectValueByKey(key, info); +} + +std::optional VariantValue::getObjectValueByKey( + std::string_view key, const VariantValue::ObjectInfo& info) const { + // TODO(mwish): Currently we just linear search here. The best way here is: + // 1. check the num_elements + // 2.1. If the element number is less than 8(or other magic number), we can keep + // current method. + // 2.2. If the element number is larger than 8, and metadata.sorted_strings is true, + // we can first apply binary search on the metadata, and then binary search the + // field id. + for (uint32_t i = 0; i < info.num_elements; ++i) { std::string_view field_key; std::optional field_value = getObjectFieldByFieldId(i, &field_key); @@ -692,18 +701,20 @@ VariantValue::ArrayInfo VariantValue::getArrayInfo() const { next_offset = arrow::bit_util::FromLittleEndian(next_offset); if (offset > next_offset) { - throw ParquetException("Invalid array value: offsets not monotonically increasing"); + throw ParquetException( + "Invalid array value: offsets not monotonically increasing: " + + std::to_string(offset) + " > " + std::to_string(next_offset)); } } return info; } -VariantValue VariantValue::getArrayValueByIndex(uint32_t index) const { - ArrayInfo info = getArrayInfo(); - +VariantValue VariantValue::getArrayValueByIndex(uint32_t index, + const ArrayInfo& info) const { if (index >= info.num_elements) { - throw ParquetException("Array index out of range"); + throw ParquetException("Array index out of range: " + std::to_string(index) + + " >= " + std::to_string(info.num_elements)); } // Read the offset and next offset @@ -725,4 +736,9 @@ VariantValue VariantValue::getArrayValueByIndex(uint32_t index) const { return element_value; } +VariantValue VariantValue::getArrayValueByIndex(uint32_t index) const { + ArrayInfo info = getArrayInfo(); + return getArrayValueByIndex(index, info); +} + } // namespace parquet::variant diff --git a/cpp/src/parquet/variant.h b/cpp/src/parquet/variant.h index 2500f6dc0113..643f7905b9eb 100644 --- a/cpp/src/parquet/variant.h +++ b/cpp/src/parquet/variant.h @@ -18,6 +18,7 @@ #pragma once #include +#include #include #include @@ -189,6 +190,8 @@ struct VariantValue { }; ObjectInfo getObjectInfo() const; std::optional getObjectValueByKey(std::string_view key) const; + std::optional getObjectValueByKey(std::string_view key, + const ObjectInfo& info) const; VariantValue getObjectFieldByFieldId(uint32_t variantId, std::string_view* key) const; struct ArrayInfo { @@ -200,6 +203,7 @@ struct VariantValue { ArrayInfo getArrayInfo() const; // Would throw ParquetException if index is out of range. VariantValue getArrayValueByIndex(uint32_t index) const; + VariantValue getArrayValueByIndex(uint32_t index, const ArrayInfo& info) const; private: static constexpr uint8_t BASIC_TYPE_MASK = 0b00000011; @@ -212,9 +216,11 @@ struct VariantValue { template PrimitiveType getPrimitiveType(VariantPrimitiveType type) const; + // An extra function because decimal uses 1 byte for scale. template DecimalValue getPrimitiveDecimalType(VariantPrimitiveType type) const; + // An extra function because binary/string uses 4 bytes for length. std::string_view getPrimitiveBinaryType(VariantPrimitiveType type) const; void checkBasicType(VariantBasicType type) const; void checkPrimitiveType(VariantPrimitiveType type, size_t size_required) const; diff --git a/cpp/src/parquet/variant_test.cc b/cpp/src/parquet/variant_test.cc index 15bbe1ed70aa..570605450079 100644 --- a/cpp/src/parquet/variant_test.cc +++ b/cpp/src/parquet/variant_test.cc @@ -424,4 +424,4 @@ TEST(ParquetVariant, ArrayValuesNested) { } } -} // namespace parquet::variant \ No newline at end of file +} // namespace parquet::variant From 7759b03a6198614d4fd084b2e4790fc2d6249417 Mon Sep 17 00:00:00 2001 From: mwish Date: Wed, 14 May 2025 16:48:33 +0800 Subject: [PATCH 12/31] Try to fix lint --- cpp/src/parquet/variant.cc | 45 +++++++++++++++++++------------------- 1 file changed, 22 insertions(+), 23 deletions(-) diff --git a/cpp/src/parquet/variant.cc b/cpp/src/parquet/variant.cc index d63466e4a7e0..716ed79e2326 100644 --- a/cpp/src/parquet/variant.cc +++ b/cpp/src/parquet/variant.cc @@ -161,7 +161,7 @@ uint32_t VariantMetadata::dictionarySize() const { if (length > 4) { throw ParquetException("Invalid offset size: " + std::to_string(length)); } - if (length + 1 > metadata_.size()) { + if (static_cast(length + 1) > metadata_.size()) { throw ParquetException("Invalid Variant metadata: too short for dictionary size"); } uint32_t dict_size = 0; @@ -422,8 +422,10 @@ std::string_view VariantValue::getString() const { if (basic_type == VariantBasicType::ShortString) { uint8_t length = (value[0] >> 2) & MAX_SHORT_STR_SIZE_MASK; - if (value.size() < length + 1) { - throw ParquetException("Invalid short string: too short"); + if (value.size() < static_cast(length + 1)) { + throw ParquetException( + "Invalid short string: too short: " + std::to_string(value.size()) + + " for at least " + std::to_string(length + 1)); } return {value.data() + 1, length}; } @@ -507,17 +509,14 @@ std::array VariantValue::getUuid() const { std::string VariantValue::ObjectInfo::toDebugString() const { std::stringstream ss; ss << "ObjectInfo{" - << "num_elements=" << num_elements - << ", id_size=" << static_cast(id_size) + << "num_elements=" << num_elements << ", id_size=" << static_cast(id_size) << ", offset_size=" << static_cast(offset_size) << ", id_start_offset=" << id_start_offset << ", offset_start_offset=" << offset_start_offset - << ", data_start_offset=" << data_start_offset - << "}"; + << ", data_start_offset=" << data_start_offset << "}"; return ss.str(); } - VariantValue::ObjectInfo VariantValue::getObjectInfo() const { checkBasicType(VariantBasicType::Object); uint8_t value_header = value[0] >> 2; @@ -525,10 +524,10 @@ VariantValue::ObjectInfo VariantValue::getObjectInfo() const { uint8_t field_id_size = ((value_header >> 2) & 0b11) + 1; bool is_large = ((value_header >> 4) & 0b1); uint8_t num_elements_size = is_large ? 4 : 1; - if (value.size() < 1 + num_elements_size) { - throw ParquetException("Invalid object value: too short: " + - std::to_string(value.size()) + " for at least " + - std::to_string(1 + num_elements_size)); + if (value.size() < static_cast(1 + num_elements_size)) { + throw ParquetException( + "Invalid object value: too short: " + std::to_string(value.size()) + + " for at least " + std::to_string(1 + num_elements_size)); } // parse num_elements uint32_t num_elements = 0; @@ -542,12 +541,13 @@ VariantValue::ObjectInfo VariantValue::getObjectInfo() const { info.offset_size = field_offset_size; info.id_start_offset = 1 + num_elements_size; info.offset_start_offset = info.id_start_offset + num_elements * field_id_size; - info.data_start_offset = info.offset_start_offset + (num_elements + 1) * field_offset_size; + info.data_start_offset = + info.offset_start_offset + (num_elements + 1) * field_offset_size; // Check the boundary with the final offset if (info.data_start_offset > value.size()) { throw ParquetException("Invalid object value: data_start_offset=" + - std::to_string(info.data_start_offset) + - ", value_size=" + std::to_string(value.size())); + std::to_string(info.data_start_offset) + + ", value_size=" + std::to_string(value.size())); } { uint32_t final_offset = 0; @@ -556,10 +556,10 @@ VariantValue::ObjectInfo VariantValue::getObjectInfo() const { field_offset_size); // It could be less than value size since it could be a sub-object. if (final_offset + info.data_start_offset > value.size()) { - throw ParquetException("Invalid object value: final_offset=" + - std::to_string(final_offset) + - ", data_start_offset=" + std::to_string(info.data_start_offset) + - ", value_size=" + std::to_string(value.size())); + throw ParquetException( + "Invalid object value: final_offset=" + std::to_string(final_offset) + + ", data_start_offset=" + std::to_string(info.data_start_offset) + + ", value_size=" + std::to_string(value.size())); } } return info; @@ -629,9 +629,8 @@ VariantValue VariantValue::getObjectFieldByFieldId(uint32_t variantId, } // Create a VariantValue for the field - VariantValue field_value{ - .metadata = metadata, - .value = value.substr(info.data_start_offset + offset)}; + VariantValue field_value{.metadata = metadata, + .value = value.substr(info.data_start_offset + offset)}; return field_value; } @@ -644,7 +643,7 @@ VariantValue::ArrayInfo VariantValue::getArrayInfo() const { // check the array header uint8_t num_elements_size = is_large ? 4 : 1; - if (value.size() < 1 + num_elements_size) { + if (value.size() < static_cast(1 + num_elements_size)) { throw ParquetException( "Invalid array value: too short: " + std::to_string(value.size()) + " for at least " + std::to_string(1 + num_elements_size)); From 31fc7dfd271f7dd315c11d44aed75ca0b0b8d2ad Mon Sep 17 00:00:00 2001 From: mwish Date: Wed, 14 May 2025 18:41:11 +0800 Subject: [PATCH 13/31] Address some comments, fix and test NestedObject problem --- cpp/src/parquet/variant.cc | 24 ++++++++++--- cpp/src/parquet/variant.h | 3 ++ cpp/src/parquet/variant_test.cc | 64 +++++++++++++++++++++++++++++++++ 3 files changed, 86 insertions(+), 5 deletions(-) diff --git a/cpp/src/parquet/variant.cc b/cpp/src/parquet/variant.cc index 716ed79e2326..abe118aeacf1 100644 --- a/cpp/src/parquet/variant.cc +++ b/cpp/src/parquet/variant.cc @@ -21,7 +21,8 @@ #include #include -#include "arrow/util/endian.h" +#include + #include "parquet/exception.h" namespace parquet::variant { @@ -146,13 +147,20 @@ VariantMetadata::VariantMetadata(std::string_view metadata) : metadata_(metadata throw ParquetException("Invalid Variant metadata: too short: " + std::to_string(metadata.size())); } + if (version() != 1) { + // Currently we only supports version 1. + throw ParquetException("Unsupported Variant metadata version: " + + std::to_string(version())); + } } int8_t VariantMetadata::version() const { - return static_cast(metadata_[0]) & 0x0F; + return static_cast(metadata_[0]) & VERSION_MASK; } -bool VariantMetadata::sortedStrings() const { return (metadata_[0] & 0b10000) != 0; } +bool VariantMetadata::sortedStrings() const { + return (metadata_[0] & SORTED_STRING_MASK) != 0; +} uint8_t VariantMetadata::offsetSize() const { return ((metadata_[0] >> 6) & 0x3) + 1; } @@ -187,10 +195,10 @@ std::string_view VariantMetadata::getMetadataKey(int32_t variantId) const { uint32_t variant_offset = 0; uint32_t variant_next_offset = 0; memcpy(&variant_offset, metadata_.data() + offset_start_pos, offset_size); - variant_offset = arrow::bit_util::FromLittleEndian(variant_offset); + variant_offset = ::arrow::bit_util::FromLittleEndian(variant_offset); memcpy(&variant_next_offset, metadata_.data() + offset_start_pos + offset_size, offset_size); - variant_next_offset = arrow::bit_util::FromLittleEndian(variant_next_offset); + variant_next_offset = ::arrow::bit_util::FromLittleEndian(variant_next_offset); uint32_t key_size = variant_next_offset - variant_offset; @@ -586,6 +594,12 @@ std::optional VariantValue::getObjectValueByKey( std::string_view field_key; std::optional field_value = getObjectFieldByFieldId(i, &field_key); + if (!field_value.has_value()) { + // The field might not belong to the current object, + // just skip it. + continue; + } + if (field_key == key) { return field_value; } diff --git a/cpp/src/parquet/variant.h b/cpp/src/parquet/variant.h index 643f7905b9eb..9c6294e690a4 100644 --- a/cpp/src/parquet/variant.h +++ b/cpp/src/parquet/variant.h @@ -131,6 +131,9 @@ class VariantMetadata { uint8_t offsetSize() const; uint32_t dictionarySize() const; + static constexpr uint8_t VERSION_MASK = 0xF; + static constexpr uint8_t SORTED_STRING_MASK = 0b10000; + private: std::string_view metadata_; }; diff --git a/cpp/src/parquet/variant_test.cc b/cpp/src/parquet/variant_test.cc index 570605450079..79f03cf5f87f 100644 --- a/cpp/src/parquet/variant_test.cc +++ b/cpp/src/parquet/variant_test.cc @@ -49,6 +49,8 @@ TEST(ParquetVariant, MetadataBase) { std::string dir_string(parquet::test::get_variant_dir()); auto file_system = std::make_shared<::arrow::fs::LocalFileSystem>(); std::vector primitive_metadatas = { + // FIXME(mwish): null metadata is corrupt, see + // https://github.com/apache/parquet-testing/issues/81 // "primitive_null.metadata", "primitive_boolean_true.metadata", "primitive_boolean_false.metadata", "primitive_date.metadata", "primitive_decimal4.metadata", @@ -292,6 +294,68 @@ TEST(ParquetVariant, ObjectValues) { } } +TEST(ParquetVariant, NestedObjectValues) { + std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; + auto variant = LoadVariantValue("object_nested", &metadata_buf, &value_buf); + EXPECT_EQ(VariantType::OBJECT, variant.getType()); + EXPECT_EQ("OBJECT", variant.typeDebugString()); + auto info = variant.getObjectInfo(); + EXPECT_EQ(3, info.num_elements); + + // Trying to get the exists key + auto value = variant.getObjectValueByKey("id", info); + ASSERT_TRUE(value.has_value()); + EXPECT_EQ(VariantType::BYTE, value->getType()); + EXPECT_EQ(1, value->getInt8()); + + auto observation = value->getObjectValueByKey("observation", info); + ASSERT_TRUE(observation.has_value()); + EXPECT_EQ(VariantType::OBJECT, observation->getType()); + + auto species = observation->getObjectValueByKey("species", info); + ASSERT_TRUE(species.has_value()); + EXPECT_EQ(VariantType::OBJECT, species->getType()); + + // Inner object works well + { + auto species_object_info = species->getObjectInfo(); + EXPECT_EQ(2, species_object_info.num_elements); + auto name = species->getObjectValueByKey("name"); + ASSERT_TRUE(name.has_value()); + EXPECT_EQ(VariantType::STRING, name->getType()); + EXPECT_EQ("name", name->getString()); + + auto population = species->getObjectValueByKey("population"); + ASSERT_TRUE(population.has_value()); + EXPECT_EQ(VariantType::SHORT, name->getType()); + EXPECT_EQ(6789, name->getInt16()); + } + + // Get inner key outside will fail + { + std::vector observation_keys = {"location", "time", "value"}; + for (auto& key : observation_keys) { + // Only observation would get it successfully. + auto inner_value = observation->getObjectValueByKey(key); + ASSERT_TRUE(value.has_value()); + + inner_value = value->getObjectValueByKey(key, info); + ASSERT_FALSE(value.has_value()); + + inner_value = species->getObjectValueByKey(key); + ASSERT_FALSE(value.has_value()); + } + } + // Get outside keys in inner object + { + auto inner_value = observation->getObjectValueByKey("id", info); + ASSERT_FALSE(inner_value.has_value()); + + inner_value = species->getObjectValueByKey("id", info); + ASSERT_FALSE(inner_value.has_value()); + } +} + TEST(ParquetVariant, DecimalValues) { { std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; From 15b12f2a62ab2686ef1498be592a39a4f102bea8 Mon Sep 17 00:00:00 2001 From: mwish Date: Wed, 14 May 2025 19:39:37 +0800 Subject: [PATCH 14/31] Fix object handling, and refactor methods ( needs rethink it ) --- cpp/src/parquet/variant.cc | 161 +++++++++++++++++++------------- cpp/src/parquet/variant.h | 25 +++-- cpp/src/parquet/variant_test.cc | 72 +++++++------- 3 files changed, 151 insertions(+), 107 deletions(-) diff --git a/cpp/src/parquet/variant.cc b/cpp/src/parquet/variant.cc index abe118aeacf1..38b1214e98db 100644 --- a/cpp/src/parquet/variant.cc +++ b/cpp/src/parquet/variant.cc @@ -22,6 +22,7 @@ #include #include +#include #include "parquet/exception.h" @@ -101,14 +102,14 @@ std::string variantTypeToString(VariantType type) { return "NULL"; case VariantType::BOOLEAN: return "BOOLEAN"; - case VariantType::BYTE: - return "BYTE"; - case VariantType::SHORT: - return "SHORT"; - case VariantType::INT: - return "INT"; - case VariantType::LONG: - return "LONG"; + case VariantType::INT8: + return "INT8"; + case VariantType::INT16: + return "INT16"; + case VariantType::INT32: + return "INT32"; + case VariantType::INT64: + return "INT64"; case VariantType::STRING: return "STRING"; case VariantType::DOUBLE: @@ -178,19 +179,19 @@ uint32_t VariantMetadata::dictionarySize() const { return dict_size; } -std::string_view VariantMetadata::getMetadataKey(int32_t variantId) const { +std::string_view VariantMetadata::getMetadataKey(int32_t variant_id) const { uint32_t offset_size = offsetSize(); uint32_t dict_size = dictionarySize(); - if (variantId < 0 || variantId >= static_cast(dict_size)) { - throw ParquetException("Invalid Variant metadata: variantId out of range"); + if (variant_id < 0 || variant_id >= static_cast(dict_size)) { + throw ParquetException("Invalid Variant metadata: variant_id out of range"); } if ((dict_size + 1) * offset_size > metadata_.size()) { throw ParquetException("Invalid Variant metadata: offset out of range"); } - size_t offset_start_pos = 1 + offset_size + (variantId * offset_size); + size_t offset_start_pos = 1 + offset_size + (variant_id * offset_size); uint32_t variant_offset = 0; uint32_t variant_next_offset = 0; @@ -209,6 +210,40 @@ std::string_view VariantMetadata::getMetadataKey(int32_t variantId) const { return {metadata_.data() + string_start, key_size}; } +arrow::internal::SmallVector VariantMetadata::getMetadataId( + std::string_view key) const { + uint32_t offset_size = offsetSize(); + uint32_t dict_size = dictionarySize(); + + if ((dict_size + 1) * offset_size > metadata_.size()) { + throw ParquetException("Invalid Variant metadata: offset out of range"); + } + ::arrow::internal::SmallVector vector; + for (uint32_t i = 0; i < dict_size; ++i) { + size_t offset_start_pos = 1 + offset_size + (i * offset_size); + uint32_t variant_offset = 0; + memcpy(&variant_offset, metadata_.data() + offset_start_pos, offset_size); + variant_offset = ::arrow::bit_util::FromLittleEndian(variant_offset); + + uint32_t variant_next_offset = 0; + memcpy(&variant_next_offset, metadata_.data() + offset_start_pos + offset_size, + offset_size); + variant_next_offset = ::arrow::bit_util::FromLittleEndian(variant_next_offset); + + uint32_t key_size = variant_next_offset - variant_offset; + + size_t string_start = 1 + offset_size * (dict_size + 2) + variant_offset; + if (string_start + key_size > metadata_.size()) { + throw ParquetException("Invalid Variant metadata: string data out of range"); + } + std::string_view current_key{metadata_.data() + string_start, key_size}; + if (current_key == key) { + vector.push_back(i); + } + } + return vector; +} + VariantBasicType VariantValue::getBasicType() const { if (value.empty()) { throw ParquetException("Empty variant value"); @@ -228,13 +263,13 @@ VariantType VariantValue::getType() const { case VariantPrimitiveType::BooleanFalse: return VariantType::BOOLEAN; case VariantPrimitiveType::Int8: - return VariantType::BYTE; + return VariantType::INT8; case VariantPrimitiveType::Int16: - return VariantType::SHORT; + return VariantType::INT16; case VariantPrimitiveType::Int32: - return VariantType::INT; + return VariantType::INT32; case VariantPrimitiveType::Int64: - return VariantType::LONG; + return VariantType::INT64; case VariantPrimitiveType::Double: return VariantType::DOUBLE; case VariantPrimitiveType::Decimal4: @@ -291,14 +326,14 @@ std::string VariantValue::typeDebugString() const { return "NULL"; case VariantType::BOOLEAN: return "BOOLEAN"; - case VariantType::BYTE: - return "BYTE"; - case VariantType::SHORT: - return "SHORT"; - case VariantType::INT: - return "INT"; - case VariantType::LONG: - return "LONG"; + case VariantType::INT8: + return "INT8"; + case VariantType::INT16: + return "INT16"; + case VariantType::INT32: + return "INT32"; + case VariantType::INT64: + return "INT64"; case VariantType::STRING: return "STRING"; case VariantType::DOUBLE: @@ -582,56 +617,45 @@ std::optional VariantValue::getObjectValueByKey( std::optional VariantValue::getObjectValueByKey( std::string_view key, const VariantValue::ObjectInfo& info) const { - // TODO(mwish): Currently we just linear search here. The best way here is: - // 1. check the num_elements - // 2.1. If the element number is less than 8(or other magic number), we can keep - // current method. - // 2.2. If the element number is larger than 8, and metadata.sorted_strings is true, - // we can first apply binary search on the metadata, and then binary search the - // field id. - - for (uint32_t i = 0; i < info.num_elements; ++i) { - std::string_view field_key; - std::optional field_value = getObjectFieldByFieldId(i, &field_key); - - if (!field_value.has_value()) { - // The field might not belong to the current object, - // just skip it. - continue; - } - - if (field_key == key) { - return field_value; + ARROW_DCHECK_EQ(getObjectInfo(), info); + auto metadata_ids = metadata.getMetadataId(key); + if (metadata_ids.empty()) { + return std::nullopt; + } + for (uint32_t variant_id : metadata_ids) { + auto variant_value = getObjectFieldByFieldId(variant_id, info); + if (variant_value.has_value()) { + return variant_value; } } - return std::nullopt; } -VariantValue VariantValue::getObjectFieldByFieldId(uint32_t variantId, - std::string_view* key) const { - ObjectInfo info = getObjectInfo(); +std::optional VariantValue::getObjectFieldByFieldId( + uint32_t variant_id, const ObjectInfo& info) const { + ARROW_DCHECK_EQ(getObjectInfo(), info); - if (variantId >= info.num_elements) { - throw ParquetException("Field ID out of range: " + std::to_string(variantId) + - " >= " + std::to_string(info.num_elements)); + uint32_t field_offset = std::numeric_limits::max(); + // Get the field offset + // TODO(mwish): Using binary search to optimize it. + for (uint32_t i = 0; i < info.num_elements; ++i) { + uint32_t variant_field_id = 0; + memcpy(&variant_field_id, value.data() + info.id_start_offset + i * info.id_size, + info.id_size); + variant_field_id = arrow::bit_util::FromLittleEndian(variant_field_id); + if (variant_field_id == variant_id) { + field_offset = i; + break; + } + } + if (field_offset == std::numeric_limits::max()) { + return std::nullopt; } - - // Read the field ID - uint32_t field_id = 0; - memcpy(&field_id, value.data() + info.id_start_offset + variantId * info.id_size, - info.id_size); - field_id = arrow::bit_util::FromLittleEndian(field_id); - - // Get the key from metadata - *key = metadata.getMetadataKey(static_cast(field_id)); // Read the offset and next offset - uint32_t offset = 0, next_offset = 0; - memcpy(&offset, value.data() + info.offset_start_offset + variantId * info.offset_size, - info.offset_size); - memcpy(&next_offset, - value.data() + info.offset_start_offset + (variantId + 1) * info.offset_size, + uint32_t offset = 0; + memcpy(&offset, + value.data() + info.offset_start_offset + field_offset * info.offset_size, info.offset_size); offset = arrow::bit_util::FromLittleEndian(offset); @@ -649,6 +673,13 @@ VariantValue VariantValue::getObjectFieldByFieldId(uint32_t variantId, return field_value; } +std::optional VariantValue::getObjectFieldByFieldId( + uint32_t variant_id) const { + ObjectInfo info = getObjectInfo(); + + return getObjectFieldByFieldId(variant_id, info); +} + VariantValue::ArrayInfo VariantValue::getArrayInfo() const { checkBasicType(VariantBasicType::Array); uint8_t value_header = value[0] >> 2; diff --git a/cpp/src/parquet/variant.h b/cpp/src/parquet/variant.h index 9c6294e690a4..a62bbede9250 100644 --- a/cpp/src/parquet/variant.h +++ b/cpp/src/parquet/variant.h @@ -23,6 +23,7 @@ #include #include +#include namespace parquet::variant { @@ -96,10 +97,10 @@ enum class VariantType { ARRAY, VARIANT_NULL, BOOLEAN, - BYTE, - SHORT, - INT, - LONG, + INT8, + INT16, + INT32, + INT64, STRING, DOUBLE, DECIMAL4, @@ -124,13 +125,14 @@ class VariantMetadata { /// \brief Get the variant metadata version. Currently, always 1. int8_t version() const; /// \brief Get the metadata key for a given variant field id. - std::string_view getMetadataKey(int32_t variantId) const; + std::string_view getMetadataKey(int32_t variant_id) const; + ::arrow::internal::SmallVector getMetadataId(std::string_view key) const; - private: bool sortedStrings() const; uint8_t offsetSize() const; uint32_t dictionarySize() const; + private: static constexpr uint8_t VERSION_MASK = 0xF; static constexpr uint8_t SORTED_STRING_MASK = 0b10000; @@ -189,13 +191,22 @@ struct VariantValue { uint32_t offset_start_offset; uint32_t data_start_offset; + bool operator==(const ObjectInfo& info) const { + return num_elements == info.num_elements && id_size == info.id_size && + offset_size == info.offset_size && id_start_offset == info.id_start_offset && + offset_start_offset == info.offset_start_offset && + data_start_offset == info.data_start_offset; + } + std::string toDebugString() const; }; ObjectInfo getObjectInfo() const; std::optional getObjectValueByKey(std::string_view key) const; std::optional getObjectValueByKey(std::string_view key, const ObjectInfo& info) const; - VariantValue getObjectFieldByFieldId(uint32_t variantId, std::string_view* key) const; + std::optional getObjectFieldByFieldId(uint32_t variant_id) const; + std::optional getObjectFieldByFieldId(uint32_t variant_id, + const ObjectInfo& info) const; struct ArrayInfo { uint32_t num_elements; diff --git a/cpp/src/parquet/variant_test.cc b/cpp/src/parquet/variant_test.cc index 79f03cf5f87f..8cbe867bf321 100644 --- a/cpp/src/parquet/variant_test.cc +++ b/cpp/src/parquet/variant_test.cc @@ -132,30 +132,30 @@ TEST(ParquetVariant, NumericValues) { { std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; auto variant = LoadVariantValue("primitive_int8", &metadata_buf, &value_buf); - EXPECT_EQ(VariantType::BYTE, variant.getType()); - EXPECT_EQ("BYTE", variant.typeDebugString()); + EXPECT_EQ(VariantType::INT8, variant.getType()); + EXPECT_EQ("INT8", variant.typeDebugString()); EXPECT_EQ(42, variant.getInt8()); } { std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; auto variant = LoadVariantValue("primitive_int16", &metadata_buf, &value_buf); - EXPECT_EQ(VariantType::SHORT, variant.getType()); - EXPECT_EQ("SHORT", variant.typeDebugString()); + EXPECT_EQ(VariantType::INT16, variant.getType()); + EXPECT_EQ("INT16", variant.typeDebugString()); EXPECT_EQ(1234, variant.getInt16()); } { std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; auto variant = LoadVariantValue("primitive_int32", &metadata_buf, &value_buf); - EXPECT_EQ(VariantType::INT, variant.getType()); - EXPECT_EQ("INT", variant.typeDebugString()); + EXPECT_EQ(VariantType::INT32, variant.getType()); + EXPECT_EQ("INT32", variant.typeDebugString()); EXPECT_EQ(123456, variant.getInt32()); } { // FIXME(mwish): https://github.com/apache/parquet-testing/issues/82 std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; auto variant = LoadVariantValue("primitive_int64", &metadata_buf, &value_buf); - // EXPECT_EQ(VariantType::LONG, variant.getType()); - // EXPECT_EQ("LONG", variant.typeDebugString()); + EXPECT_EQ(VariantType::INT32, variant.getType()); + EXPECT_EQ("INT32", variant.typeDebugString()); EXPECT_EQ(12345678, variant.getInt32()); } { @@ -227,7 +227,7 @@ TEST(ParquetVariant, ObjectValues) { EXPECT_EQ(7, obj_info.num_elements); auto handle_int_field = [](const std::optional& value) { EXPECT_TRUE(value.has_value()); - EXPECT_EQ(VariantType::BYTE, value->getType()); + EXPECT_EQ(VariantType::INT8, value->getType()); EXPECT_EQ(1, value->getInt8()); }; auto handle_double_field = [](const std::optional& value) { @@ -281,8 +281,8 @@ TEST(ParquetVariant, ObjectValues) { } // Test get by index for (uint32_t i = 0; i < obj_info.num_elements; ++i) { - std::string_view key; - auto value = variant.getObjectFieldByFieldId(i, &key); + auto value = variant.getObjectFieldByFieldId(i); + auto key = variant.metadata.getMetadataKey(i); auto iter = key_handler.find(std::string(key)); ASSERT_TRUE(iter != key_handler.end()); auto handler = iter->second; @@ -290,7 +290,7 @@ TEST(ParquetVariant, ObjectValues) { } { std::string_view key; - EXPECT_THROW(variant.getObjectFieldByFieldId(100, &key), ParquetException); + EXPECT_FALSE(variant.getObjectFieldByFieldId(100).has_value()); } } @@ -303,18 +303,20 @@ TEST(ParquetVariant, NestedObjectValues) { EXPECT_EQ(3, info.num_elements); // Trying to get the exists key - auto value = variant.getObjectValueByKey("id", info); - ASSERT_TRUE(value.has_value()); - EXPECT_EQ(VariantType::BYTE, value->getType()); - EXPECT_EQ(1, value->getInt8()); + auto id = variant.getObjectValueByKey("id", info); + ASSERT_TRUE(id.has_value()); + EXPECT_EQ(VariantType::INT8, id->getType()); + EXPECT_EQ(1, id->getInt8()); - auto observation = value->getObjectValueByKey("observation", info); + auto observation = variant.getObjectValueByKey("observation", info); ASSERT_TRUE(observation.has_value()); EXPECT_EQ(VariantType::OBJECT, observation->getType()); - auto species = observation->getObjectValueByKey("species", info); + auto species = variant.getObjectValueByKey("species", info); ASSERT_TRUE(species.has_value()); EXPECT_EQ(VariantType::OBJECT, species->getType()); + auto species_info = species->getObjectInfo(); + EXPECT_EQ(2, species_info.num_elements); // Inner object works well { @@ -323,12 +325,12 @@ TEST(ParquetVariant, NestedObjectValues) { auto name = species->getObjectValueByKey("name"); ASSERT_TRUE(name.has_value()); EXPECT_EQ(VariantType::STRING, name->getType()); - EXPECT_EQ("name", name->getString()); + EXPECT_EQ("lava monster", name->getString()); auto population = species->getObjectValueByKey("population"); ASSERT_TRUE(population.has_value()); - EXPECT_EQ(VariantType::SHORT, name->getType()); - EXPECT_EQ(6789, name->getInt16()); + EXPECT_EQ(VariantType::INT16, population->getType()); + EXPECT_EQ(6789, population->getInt16()); } // Get inner key outside will fail @@ -337,22 +339,22 @@ TEST(ParquetVariant, NestedObjectValues) { for (auto& key : observation_keys) { // Only observation would get it successfully. auto inner_value = observation->getObjectValueByKey(key); - ASSERT_TRUE(value.has_value()); + ASSERT_TRUE(inner_value.has_value()); - inner_value = value->getObjectValueByKey(key, info); - ASSERT_FALSE(value.has_value()); + inner_value = variant.getObjectValueByKey(key); + ASSERT_FALSE(inner_value.has_value()); inner_value = species->getObjectValueByKey(key); - ASSERT_FALSE(value.has_value()); + ASSERT_FALSE(inner_value.has_value()); } } // Get outside keys in inner object { - auto inner_value = observation->getObjectValueByKey("id", info); - ASSERT_FALSE(inner_value.has_value()); + auto inner_value = observation->getObjectValueByKey("id"); + EXPECT_FALSE(inner_value.has_value()); - inner_value = species->getObjectValueByKey("id", info); - ASSERT_FALSE(inner_value.has_value()); + inner_value = species->getObjectValueByKey("id"); + EXPECT_FALSE(inner_value.has_value()); } } @@ -422,19 +424,19 @@ TEST(ParquetVariant, ArrayValues) { EXPECT_EQ(4, array_info.num_elements); auto element0 = variant.getArrayValueByIndex(0); - EXPECT_EQ(VariantType::BYTE, element0.getType()); + EXPECT_EQ(VariantType::INT8, element0.getType()); EXPECT_EQ(2, element0.getInt8()); auto element1 = variant.getArrayValueByIndex(1); - EXPECT_EQ(VariantType::BYTE, element1.getType()); + EXPECT_EQ(VariantType::INT8, element1.getType()); EXPECT_EQ(1, element1.getInt8()); auto element2 = variant.getArrayValueByIndex(2); - EXPECT_EQ(VariantType::BYTE, element2.getType()); + EXPECT_EQ(VariantType::INT8, element2.getType()); EXPECT_EQ(5, element2.getInt8()); auto element3 = variant.getArrayValueByIndex(3); - EXPECT_EQ(VariantType::BYTE, element3.getType()); + EXPECT_EQ(VariantType::INT8, element3.getType()); EXPECT_EQ(9, element3.getInt8()); EXPECT_THROW(variant.getArrayValueByIndex(4), ParquetException); @@ -469,7 +471,7 @@ TEST(ParquetVariant, ArrayValuesNested) { EXPECT_EQ(2, first_element_info.num_elements); auto id = first_element.getObjectValueByKey("id"); ASSERT_TRUE(id.has_value()); - EXPECT_EQ(VariantType::BYTE, id->getType()); + EXPECT_EQ(VariantType::INT8, id->getType()); EXPECT_EQ(1, id->getInt8()); } { @@ -483,7 +485,7 @@ TEST(ParquetVariant, ArrayValuesNested) { EXPECT_EQ(3, third_element_info.num_elements); auto id = third_element.getObjectValueByKey("id"); ASSERT_TRUE(id.has_value()); - EXPECT_EQ(VariantType::BYTE, id->getType()); + EXPECT_EQ(VariantType::INT8, id->getType()); EXPECT_EQ(2, id->getInt8()); } } From bbce69c4fec45d3d857540405a6f1a1c77ac760d Mon Sep 17 00:00:00 2001 From: mwish Date: Wed, 14 May 2025 19:48:51 +0800 Subject: [PATCH 15/31] Minor comments --- cpp/src/parquet/variant.cc | 9 +++++---- cpp/src/parquet/variant.h | 5 +++++ cpp/src/parquet/variant_test.cc | 5 +---- 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/cpp/src/parquet/variant.cc b/cpp/src/parquet/variant.cc index 38b1214e98db..9536ed5553d6 100644 --- a/cpp/src/parquet/variant.cc +++ b/cpp/src/parquet/variant.cc @@ -218,6 +218,7 @@ arrow::internal::SmallVector VariantMetadata::getMetadataId( if ((dict_size + 1) * offset_size > metadata_.size()) { throw ParquetException("Invalid Variant metadata: offset out of range"); } + // TODO(mwish): This can be optimized by using binary search if the metadata is sorted. ::arrow::internal::SmallVector vector; for (uint32_t i = 0; i < dict_size; ++i) { size_t offset_start_pos = 1 + offset_size + (i * offset_size); @@ -635,7 +636,7 @@ std::optional VariantValue::getObjectFieldByFieldId( uint32_t variant_id, const ObjectInfo& info) const { ARROW_DCHECK_EQ(getObjectInfo(), info); - uint32_t field_offset = std::numeric_limits::max(); + std::optional field_offset_opt; // Get the field offset // TODO(mwish): Using binary search to optimize it. for (uint32_t i = 0; i < info.num_elements; ++i) { @@ -644,14 +645,14 @@ std::optional VariantValue::getObjectFieldByFieldId( info.id_size); variant_field_id = arrow::bit_util::FromLittleEndian(variant_field_id); if (variant_field_id == variant_id) { - field_offset = i; + field_offset_opt = i; break; } } - if (field_offset == std::numeric_limits::max()) { + if (!field_offset_opt.has_value()) { return std::nullopt; } - + uint32_t field_offset = field_offset_opt.value(); // Read the offset and next offset uint32_t offset = 0; memcpy(&offset, diff --git a/cpp/src/parquet/variant.h b/cpp/src/parquet/variant.h index a62bbede9250..61b673f8b2d4 100644 --- a/cpp/src/parquet/variant.h +++ b/cpp/src/parquet/variant.h @@ -126,6 +126,11 @@ class VariantMetadata { int8_t version() const; /// \brief Get the metadata key for a given variant field id. std::string_view getMetadataKey(int32_t variant_id) const; + /// \brief Get the metadata id for a given key. + /// From the discussion in ML: + /// https://lists.apache.org/thread/b68tjmrjmy64mbv9dknpmqs28vnzjj96 if + /// !sortedStrings(), the metadata key is not guaranteed to be unique, so we use a + /// vector to store all the metadata ids. ::arrow::internal::SmallVector getMetadataId(std::string_view key) const; bool sortedStrings() const; diff --git a/cpp/src/parquet/variant_test.cc b/cpp/src/parquet/variant_test.cc index 8cbe867bf321..4c6c3eb5e9a0 100644 --- a/cpp/src/parquet/variant_test.cc +++ b/cpp/src/parquet/variant_test.cc @@ -288,10 +288,7 @@ TEST(ParquetVariant, ObjectValues) { auto handler = iter->second; handler(value); } - { - std::string_view key; - EXPECT_FALSE(variant.getObjectFieldByFieldId(100).has_value()); - } + EXPECT_FALSE(variant.getObjectFieldByFieldId(100).has_value()); } TEST(ParquetVariant, NestedObjectValues) { From 6454bb7941eaaf051a033dc44b1cbcb403dd854f Mon Sep 17 00:00:00 2001 From: mwish Date: Wed, 14 May 2025 21:26:13 +0800 Subject: [PATCH 16/31] Trying to using scope resolution operator to fix ci --- cpp/src/parquet/variant.cc | 35 +++++++++++++++++------------------ 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/cpp/src/parquet/variant.cc b/cpp/src/parquet/variant.cc index 9536ed5553d6..02839923bfe9 100644 --- a/cpp/src/parquet/variant.cc +++ b/cpp/src/parquet/variant.cc @@ -175,7 +175,7 @@ uint32_t VariantMetadata::dictionarySize() const { } uint32_t dict_size = 0; memcpy(&dict_size, metadata_.data() + 1, length); - dict_size = arrow::bit_util::FromLittleEndian(dict_size); + dict_size = ::arrow::bit_util::FromLittleEndian(dict_size); return dict_size; } @@ -210,7 +210,7 @@ std::string_view VariantMetadata::getMetadataKey(int32_t variant_id) const { return {metadata_.data() + string_start, key_size}; } -arrow::internal::SmallVector VariantMetadata::getMetadataId( +::arrow::internal::SmallVector VariantMetadata::getMetadataId( std::string_view key) const { uint32_t offset_size = offsetSize(); uint32_t dict_size = dictionarySize(); @@ -452,7 +452,7 @@ std::string_view VariantValue::getPrimitiveBinaryType(VariantPrimitiveType type) uint32_t length; memcpy(&length, value.data() + 1, sizeof(uint32_t)); - length = arrow::bit_util::FromLittleEndian(length); + length = ::arrow::bit_util::FromLittleEndian(length); if (value.size() < length + 5) { throw ParquetException("Invalid string value: too short for specified length"); @@ -494,7 +494,7 @@ DecimalValue VariantValue::getPrimitiveDecimalType( uint8_t scale = value[1]; DecimalValueType decimal_value; memcpy(&decimal_value, value.data() + 2, sizeof(DecimalValueType)); - decimal_value = arrow::bit_util::FromLittleEndian(decimal_value); + decimal_value = ::arrow::bit_util::FromLittleEndian(decimal_value); return {scale, DecimalType(decimal_value)}; } @@ -554,6 +554,7 @@ std::string VariantValue::ObjectInfo::toDebugString() const { std::stringstream ss; ss << "ObjectInfo{" << "num_elements=" << num_elements << ", id_size=" << static_cast(id_size) + << ", id_size=" << static_cast(id_size) << ", offset_size=" << static_cast(offset_size) << ", id_start_offset=" << id_start_offset << ", offset_start_offset=" << offset_start_offset @@ -577,7 +578,7 @@ VariantValue::ObjectInfo VariantValue::getObjectInfo() const { uint32_t num_elements = 0; { memcpy(&num_elements, value.data() + 1, num_elements_size); - num_elements = arrow::bit_util::FromLittleEndian(num_elements); + num_elements = ::arrow::bit_util::FromLittleEndian(num_elements); } ObjectInfo info{}; info.num_elements = num_elements; @@ -643,7 +644,7 @@ std::optional VariantValue::getObjectFieldByFieldId( uint32_t variant_field_id = 0; memcpy(&variant_field_id, value.data() + info.id_start_offset + i * info.id_size, info.id_size); - variant_field_id = arrow::bit_util::FromLittleEndian(variant_field_id); + variant_field_id = ::arrow::bit_util::FromLittleEndian(variant_field_id); if (variant_field_id == variant_id) { field_offset_opt = i; break; @@ -658,7 +659,7 @@ std::optional VariantValue::getObjectFieldByFieldId( memcpy(&offset, value.data() + info.offset_start_offset + field_offset * info.offset_size, info.offset_size); - offset = arrow::bit_util::FromLittleEndian(offset); + offset = ::arrow::bit_util::FromLittleEndian(offset); if (info.data_start_offset + offset > value.size()) { throw ParquetException("Invalid object field offsets: data_start_offset=" + @@ -668,8 +669,7 @@ std::optional VariantValue::getObjectFieldByFieldId( } // Create a VariantValue for the field - VariantValue field_value{.metadata = metadata, - .value = value.substr(info.data_start_offset + offset)}; + VariantValue field_value{metadata, value.substr(info.data_start_offset + offset)}; return field_value; } @@ -699,7 +699,7 @@ VariantValue::ArrayInfo VariantValue::getArrayInfo() const { uint32_t num_elements = 0; { memcpy(&num_elements, value.data() + 1, num_elements_size); - num_elements = arrow::bit_util::FromLittleEndian(num_elements); + num_elements = ::arrow::bit_util::FromLittleEndian(num_elements); } ArrayInfo info{}; @@ -723,7 +723,7 @@ VariantValue::ArrayInfo VariantValue::getArrayInfo() const { memcpy(&final_offset, value.data() + info.offset_start_offset + num_elements * field_offset_size, field_offset_size); - final_offset = arrow::bit_util::FromLittleEndian(final_offset); + final_offset = ::arrow::bit_util::FromLittleEndian(final_offset); if (info.data_start_offset + final_offset > value.size()) { throw ParquetException( @@ -742,8 +742,8 @@ VariantValue::ArrayInfo VariantValue::getArrayInfo() const { memcpy(&next_offset, value.data() + info.offset_start_offset + (i + 1) * field_offset_size, field_offset_size); - offset = arrow::bit_util::FromLittleEndian(offset); - next_offset = arrow::bit_util::FromLittleEndian(next_offset); + offset = ::arrow::bit_util::FromLittleEndian(offset); + next_offset = ::arrow::bit_util::FromLittleEndian(next_offset); if (offset > next_offset) { throw ParquetException( @@ -769,14 +769,13 @@ VariantValue VariantValue::getArrayValueByIndex(uint32_t index, memcpy(&next_offset, value.data() + info.offset_start_offset + (index + 1) * info.offset_size, info.offset_size); - offset = arrow::bit_util::FromLittleEndian(offset); - next_offset = arrow::bit_util::FromLittleEndian(next_offset); + offset = ::arrow::bit_util::FromLittleEndian(offset); + next_offset = ::arrow::bit_util::FromLittleEndian(next_offset); // Create a VariantValue for the element VariantValue element_value{ - .metadata = metadata, - .value = std::string_view(value.data() + info.data_start_offset + offset, - next_offset - offset)}; + metadata, std::string_view(value.data() + info.data_start_offset + offset, + next_offset - offset)}; return element_value; } From 95600239c4ee99a9c03dc9a32a7cb8404add6a3d Mon Sep 17 00:00:00 2001 From: mwish Date: Thu, 15 May 2025 00:00:30 +0800 Subject: [PATCH 17/31] Try to fix CI --- cpp/src/parquet/CMakeLists.txt | 4 ++-- cpp/src/parquet/{variant.cc => variant_value.cc} | 2 +- cpp/src/parquet/{variant.h => variant_value.h} | 0 cpp/src/parquet/{variant_test.cc => variant_value_test.cc} | 4 ++-- 4 files changed, 5 insertions(+), 5 deletions(-) rename cpp/src/parquet/{variant.cc => variant_value.cc} (99%) rename cpp/src/parquet/{variant.h => variant_value.h} (100%) rename cpp/src/parquet/{variant_test.cc => variant_value_test.cc} (99%) diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index 9f4328fc3b27..c68a8cbca716 100644 --- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ -190,7 +190,7 @@ set(PARQUET_SRCS stream_reader.cc stream_writer.cc types.cc - variant.cc) + variant_value.cc) if(ARROW_HAVE_RUNTIME_AVX2) # AVX2 is used as a proxy for BMI2. @@ -384,7 +384,7 @@ add_parquet_test(internals-test size_statistics_test.cc statistics_test.cc types_test.cc - variant_test.cc) + variant_value_test.cc) set_source_files_properties(public_api_test.cc PROPERTIES SKIP_UNITY_BUILD_INCLUSION ON) diff --git a/cpp/src/parquet/variant.cc b/cpp/src/parquet/variant_value.cc similarity index 99% rename from cpp/src/parquet/variant.cc rename to cpp/src/parquet/variant_value.cc index 02839923bfe9..9910aad74873 100644 --- a/cpp/src/parquet/variant.cc +++ b/cpp/src/parquet/variant_value.cc @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -#include "parquet/variant.h" +#include "parquet/variant_value.h" #include #include diff --git a/cpp/src/parquet/variant.h b/cpp/src/parquet/variant_value.h similarity index 100% rename from cpp/src/parquet/variant.h rename to cpp/src/parquet/variant_value.h diff --git a/cpp/src/parquet/variant_test.cc b/cpp/src/parquet/variant_value_test.cc similarity index 99% rename from cpp/src/parquet/variant_test.cc rename to cpp/src/parquet/variant_value_test.cc index 4c6c3eb5e9a0..c71f0a431dec 100644 --- a/cpp/src/parquet/variant_test.cc +++ b/cpp/src/parquet/variant_value_test.cc @@ -21,7 +21,7 @@ #include "parquet/exception.h" #include "parquet/test_util.h" -#include "parquet/variant.h" +#include "parquet/variant_value.h" #include #include @@ -163,7 +163,7 @@ TEST(ParquetVariant, NumericValues) { auto variant = LoadVariantValue("primitive_float", &metadata_buf, &value_buf); EXPECT_EQ(VariantType::FLOAT, variant.getType()); EXPECT_EQ("FLOAT", variant.typeDebugString()); - EXPECT_FLOAT_EQ(1234567940.0, variant.getFloat()); + EXPECT_FLOAT_EQ(1234567940.0f, variant.getFloat()); } { std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; From 365329f3677756583227b3aa107b10053977f4f8 Mon Sep 17 00:00:00 2001 From: mwish Date: Thu, 15 May 2025 16:14:31 +0800 Subject: [PATCH 18/31] Revert stupid file name change --- cpp/src/parquet/{variant_value.cc => variant.cc} | 2 +- cpp/src/parquet/{variant_value.h => variant.h} | 0 cpp/src/parquet/{variant_value_test.cc => variant_test.cc} | 2 +- 3 files changed, 2 insertions(+), 2 deletions(-) rename cpp/src/parquet/{variant_value.cc => variant.cc} (99%) rename cpp/src/parquet/{variant_value.h => variant.h} (100%) rename cpp/src/parquet/{variant_value_test.cc => variant_test.cc} (99%) diff --git a/cpp/src/parquet/variant_value.cc b/cpp/src/parquet/variant.cc similarity index 99% rename from cpp/src/parquet/variant_value.cc rename to cpp/src/parquet/variant.cc index 9910aad74873..02839923bfe9 100644 --- a/cpp/src/parquet/variant_value.cc +++ b/cpp/src/parquet/variant.cc @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -#include "parquet/variant_value.h" +#include "parquet/variant.h" #include #include diff --git a/cpp/src/parquet/variant_value.h b/cpp/src/parquet/variant.h similarity index 100% rename from cpp/src/parquet/variant_value.h rename to cpp/src/parquet/variant.h diff --git a/cpp/src/parquet/variant_value_test.cc b/cpp/src/parquet/variant_test.cc similarity index 99% rename from cpp/src/parquet/variant_value_test.cc rename to cpp/src/parquet/variant_test.cc index c71f0a431dec..1cd4fa0bde38 100644 --- a/cpp/src/parquet/variant_value_test.cc +++ b/cpp/src/parquet/variant_test.cc @@ -21,7 +21,7 @@ #include "parquet/exception.h" #include "parquet/test_util.h" -#include "parquet/variant_value.h" +#include "parquet/variant.h" #include #include From cf9c4ad955751581d262c3335c0ddd838a682fcf Mon Sep 17 00:00:00 2001 From: mwish Date: Thu, 15 May 2025 17:34:37 +0800 Subject: [PATCH 19/31] refactor metadata and read u32le (bug exists) --- cpp/src/parquet/CMakeLists.txt | 4 +- cpp/src/parquet/variant.cc | 172 ++++++++++++++++----------------- cpp/src/parquet/variant.h | 17 +++- 3 files changed, 101 insertions(+), 92 deletions(-) diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index c68a8cbca716..9f4328fc3b27 100644 --- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ -190,7 +190,7 @@ set(PARQUET_SRCS stream_reader.cc stream_writer.cc types.cc - variant_value.cc) + variant.cc) if(ARROW_HAVE_RUNTIME_AVX2) # AVX2 is used as a proxy for BMI2. @@ -384,7 +384,7 @@ add_parquet_test(internals-test size_statistics_test.cc statistics_test.cc types_test.cc - variant_value_test.cc) + variant_test.cc) set_source_files_properties(public_api_test.cc PROPERTIES SKIP_UNITY_BUILD_INCLUSION ON) diff --git a/cpp/src/parquet/variant.cc b/cpp/src/parquet/variant.cc index 02839923bfe9..55a6f998196d 100644 --- a/cpp/src/parquet/variant.cc +++ b/cpp/src/parquet/variant.cc @@ -143,69 +143,91 @@ std::string variantTypeToString(VariantType type) { } } +inline uint32_t readLittleEndianU32(const void* from, uint8_t size) { + ARROW_DCHECK_LE(size, 4); + ARROW_DCHECK_GE(size, 1); + + uint32_t result = 0; + memcpy(&result, from, size); + return ::arrow::bit_util::FromLittleEndian(result); +} + VariantMetadata::VariantMetadata(std::string_view metadata) : metadata_(metadata) { - if (metadata.size() < 2) { - throw ParquetException("Invalid Variant metadata: too short: " + + if (metadata.size() < HEADER_SIZE_BYTES + MINIMAL_OFFSET_SIZE_BYTES * 2) { + // Empty metadata is at least 3 bytes: version, dictionarySize and + // at least one offset. + throw ParquetException("Invalid Variant metadata: too short: size=" + std::to_string(metadata.size())); } - if (version() != 1) { + if (version() != SUPPORTED_VERSION) { // Currently we only supports version 1. throw ParquetException("Unsupported Variant metadata version: " + std::to_string(version())); } + uint8_t offset_size = offsetSize(); + if (offset_size < MINIMAL_OFFSET_SIZE_BYTES || + offset_size > MAXIMUM_OFFSET_SIZE_BYTES) { + throw ParquetException("Invalid Variant metadata: invalid offset size: " + + std::to_string(offset_size)); + } + dictionary_size_ = loadDictionarySize(metadata, offset_size); + if (HEADER_SIZE_BYTES + (dictionary_size_ + 1) * offset_size > metadata_.size()) { + throw ParquetException( + "Invalid Variant metadata: offset out of range: " + + std::to_string((dictionary_size_ + HEADER_SIZE_BYTES) * offset_size) + " > " + + std::to_string(metadata_.size())); + } } -int8_t VariantMetadata::version() const { - return static_cast(metadata_[0]) & VERSION_MASK; +uint8_t VariantMetadata::version() const { + return static_cast(metadata_[0]) & VERSION_MASK; } bool VariantMetadata::sortedStrings() const { return (metadata_[0] & SORTED_STRING_MASK) != 0; } -uint8_t VariantMetadata::offsetSize() const { return ((metadata_[0] >> 6) & 0x3) + 1; } +uint8_t VariantMetadata::offsetSize() const { + // Since it stores offsetSize - 1, we add 1 here. + return ((metadata_[0] >> OFFSET_SIZE_BIT_SHIFT) & OFFSET_SIZE_MASK) + 1; +} -uint32_t VariantMetadata::dictionarySize() const { - uint8_t length = offsetSize(); - if (length > 4) { - throw ParquetException("Invalid offset size: " + std::to_string(length)); - } - if (static_cast(length + 1) > metadata_.size()) { +uint32_t VariantMetadata::loadDictionarySize(std::string_view metadata, + uint8_t offset_size) { + if (static_cast(offset_size + HEADER_SIZE_BYTES) > metadata.size()) { throw ParquetException("Invalid Variant metadata: too short for dictionary size"); } - uint32_t dict_size = 0; - memcpy(&dict_size, metadata_.data() + 1, length); - dict_size = ::arrow::bit_util::FromLittleEndian(dict_size); - return dict_size; + return readLittleEndianU32(metadata.data() + HEADER_SIZE_BYTES, offset_size); } +uint32_t VariantMetadata::dictionarySize() const { return dictionary_size_; } + std::string_view VariantMetadata::getMetadataKey(int32_t variant_id) const { uint32_t offset_size = offsetSize(); uint32_t dict_size = dictionarySize(); if (variant_id < 0 || variant_id >= static_cast(dict_size)) { - throw ParquetException("Invalid Variant metadata: variant_id out of range"); - } - - if ((dict_size + 1) * offset_size > metadata_.size()) { - throw ParquetException("Invalid Variant metadata: offset out of range"); + throw ParquetException("Invalid Variant metadata: variant_id out of range: " + + std::to_string(variant_id) + + " >= " + std::to_string(dict_size)); } - size_t offset_start_pos = 1 + offset_size + (variant_id * offset_size); - - uint32_t variant_offset = 0; - uint32_t variant_next_offset = 0; - memcpy(&variant_offset, metadata_.data() + offset_start_pos, offset_size); - variant_offset = ::arrow::bit_util::FromLittleEndian(variant_offset); - memcpy(&variant_next_offset, metadata_.data() + offset_start_pos + offset_size, - offset_size); - variant_next_offset = ::arrow::bit_util::FromLittleEndian(variant_next_offset); + size_t offset_start_pos = HEADER_SIZE_BYTES + offset_size + (variant_id * offset_size); + // Index range of offsets are already checked in ctor, so no need to check again. + uint32_t variant_offset = + readLittleEndianU32(metadata_.data() + offset_start_pos, offset_size); + uint32_t variant_next_offset = + readLittleEndianU32(metadata_.data() + offset_start_pos + offset_size, offset_size); uint32_t key_size = variant_next_offset - variant_offset; - size_t string_start = 1 + offset_size * (dict_size + 2) + variant_offset; + size_t string_start = + HEADER_SIZE_BYTES + offset_size * (dict_size + 2) + variant_offset; if (string_start + key_size > metadata_.size()) { - throw ParquetException("Invalid Variant metadata: string data out of range"); + throw ParquetException("Invalid Variant metadata: string data out of range: " + + std::to_string(string_start) + " + " + + std::to_string(key_size) + " > " + + std::to_string(metadata_.size())); } return {metadata_.data() + string_start, key_size}; } @@ -215,22 +237,17 @@ ::arrow::internal::SmallVector VariantMetadata::getMetadataId( uint32_t offset_size = offsetSize(); uint32_t dict_size = dictionarySize(); - if ((dict_size + 1) * offset_size > metadata_.size()) { + if ((dict_size + HEADER_SIZE_BYTES) * offset_size > metadata_.size()) { throw ParquetException("Invalid Variant metadata: offset out of range"); } // TODO(mwish): This can be optimized by using binary search if the metadata is sorted. ::arrow::internal::SmallVector vector; for (uint32_t i = 0; i < dict_size; ++i) { size_t offset_start_pos = 1 + offset_size + (i * offset_size); - uint32_t variant_offset = 0; - memcpy(&variant_offset, metadata_.data() + offset_start_pos, offset_size); - variant_offset = ::arrow::bit_util::FromLittleEndian(variant_offset); - - uint32_t variant_next_offset = 0; - memcpy(&variant_next_offset, metadata_.data() + offset_start_pos + offset_size, - offset_size); - variant_next_offset = ::arrow::bit_util::FromLittleEndian(variant_next_offset); - + uint32_t variant_offset = + readLittleEndianU32(metadata_.data() + offset_start_pos, offset_size); + uint32_t variant_next_offset = readLittleEndianU32( + metadata_.data() + offset_start_pos + offset_size, offset_size); uint32_t key_size = variant_next_offset - variant_offset; size_t string_start = 1 + offset_size * (dict_size + 2) + variant_offset; @@ -575,11 +592,7 @@ VariantValue::ObjectInfo VariantValue::getObjectInfo() const { " for at least " + std::to_string(1 + num_elements_size)); } // parse num_elements - uint32_t num_elements = 0; - { - memcpy(&num_elements, value.data() + 1, num_elements_size); - num_elements = ::arrow::bit_util::FromLittleEndian(num_elements); - } + uint32_t num_elements = readLittleEndianU32(value.data() + 1, num_elements_size); ObjectInfo info{}; info.num_elements = num_elements; info.id_size = field_id_size; @@ -595,10 +608,9 @@ VariantValue::ObjectInfo VariantValue::getObjectInfo() const { ", value_size=" + std::to_string(value.size())); } { - uint32_t final_offset = 0; - memcpy(&final_offset, - value.data() + info.offset_start_offset + num_elements * field_offset_size, - field_offset_size); + uint32_t final_offset = readLittleEndianU32( + value.data() + info.offset_start_offset + num_elements * field_offset_size, + field_offset_size); // It could be less than value size since it could be a sub-object. if (final_offset + info.data_start_offset > value.size()) { throw ParquetException( @@ -641,9 +653,8 @@ std::optional VariantValue::getObjectFieldByFieldId( // Get the field offset // TODO(mwish): Using binary search to optimize it. for (uint32_t i = 0; i < info.num_elements; ++i) { - uint32_t variant_field_id = 0; - memcpy(&variant_field_id, value.data() + info.id_start_offset + i * info.id_size, - info.id_size); + uint32_t variant_field_id = readLittleEndianU32( + value.data() + info.offset_start_offset + i * info.offset_size, info.offset_size); variant_field_id = ::arrow::bit_util::FromLittleEndian(variant_field_id); if (variant_field_id == variant_id) { field_offset_opt = i; @@ -655,11 +666,9 @@ std::optional VariantValue::getObjectFieldByFieldId( } uint32_t field_offset = field_offset_opt.value(); // Read the offset and next offset - uint32_t offset = 0; - memcpy(&offset, - value.data() + info.offset_start_offset + field_offset * info.offset_size, - info.offset_size); - offset = ::arrow::bit_util::FromLittleEndian(offset); + uint32_t offset = readLittleEndianU32( + value.data() + info.offset_start_offset + field_offset * info.offset_size, + info.offset_size); if (info.data_start_offset + offset > value.size()) { throw ParquetException("Invalid object field offsets: data_start_offset=" + @@ -695,13 +704,7 @@ VariantValue::ArrayInfo VariantValue::getArrayInfo() const { " for at least " + std::to_string(1 + num_elements_size)); } - // parse num_elements - uint32_t num_elements = 0; - { - memcpy(&num_elements, value.data() + 1, num_elements_size); - num_elements = ::arrow::bit_util::FromLittleEndian(num_elements); - } - + uint32_t num_elements = readLittleEndianU32(value.data() + 1, num_elements_size); ArrayInfo info{}; info.num_elements = num_elements; info.offset_size = field_offset_size; @@ -719,11 +722,9 @@ VariantValue::ArrayInfo VariantValue::getArrayInfo() const { // Validate final offset is equal to the size of the value, // it would work since even empty array would have an offset of 0. { - uint32_t final_offset = 0; - memcpy(&final_offset, - value.data() + info.offset_start_offset + num_elements * field_offset_size, - field_offset_size); - final_offset = ::arrow::bit_util::FromLittleEndian(final_offset); + uint32_t final_offset = readLittleEndianU32( + value.data() + info.offset_start_offset + num_elements * field_offset_size, + field_offset_size); if (info.data_start_offset + final_offset > value.size()) { throw ParquetException( @@ -736,15 +737,12 @@ VariantValue::ArrayInfo VariantValue::getArrayInfo() const { // checking the element is incremental. // TODO(mwish): Remove this or encapsulate this range check to function for (uint32_t i = 0; i < num_elements; ++i) { - uint32_t offset = 0, next_offset = 0; - memcpy(&offset, value.data() + info.offset_start_offset + i * field_offset_size, - field_offset_size); - memcpy(&next_offset, - value.data() + info.offset_start_offset + (i + 1) * field_offset_size, - field_offset_size); - offset = ::arrow::bit_util::FromLittleEndian(offset); - next_offset = ::arrow::bit_util::FromLittleEndian(next_offset); - + uint32_t offset = readLittleEndianU32( + value.data() + info.offset_start_offset + i * field_offset_size, + field_offset_size); + uint32_t next_offset = readLittleEndianU32( + value.data() + info.offset_start_offset + (i + 1) * field_offset_size, + field_offset_size); if (offset > next_offset) { throw ParquetException( "Invalid array value: offsets not monotonically increasing: " + @@ -763,14 +761,12 @@ VariantValue VariantValue::getArrayValueByIndex(uint32_t index, } // Read the offset and next offset - uint32_t offset = 0, next_offset = 0; - memcpy(&offset, value.data() + info.offset_start_offset + index * info.offset_size, - info.offset_size); - memcpy(&next_offset, - value.data() + info.offset_start_offset + (index + 1) * info.offset_size, - info.offset_size); - offset = ::arrow::bit_util::FromLittleEndian(offset); - next_offset = ::arrow::bit_util::FromLittleEndian(next_offset); + uint32_t offset = readLittleEndianU32( + value.data() + info.offset_start_offset + index * info.offset_size, + info.offset_size); + uint32_t next_offset = readLittleEndianU32( + value.data() + info.offset_start_offset + (index + 1) * info.offset_size, + info.offset_size); // Create a VariantValue for the element VariantValue element_value{ diff --git a/cpp/src/parquet/variant.h b/cpp/src/parquet/variant.h index 61b673f8b2d4..7fb895bb52cb 100644 --- a/cpp/src/parquet/variant.h +++ b/cpp/src/parquet/variant.h @@ -123,8 +123,10 @@ class VariantMetadata { public: explicit VariantMetadata(std::string_view metadata); /// \brief Get the variant metadata version. Currently, always 1. - int8_t version() const; + uint8_t version() const; /// \brief Get the metadata key for a given variant field id. + /// \throw ParquetException if the variant_id is out of range(larger than + /// dictionary size). std::string_view getMetadataKey(int32_t variant_id) const; /// \brief Get the metadata id for a given key. /// From the discussion in ML: @@ -138,11 +140,22 @@ class VariantMetadata { uint32_t dictionarySize() const; private: - static constexpr uint8_t VERSION_MASK = 0xF; + static uint32_t loadDictionarySize(std::string_view metadata, uint8_t offset_size); + + private: + static constexpr uint8_t VERSION_MASK = 0b1111; static constexpr uint8_t SORTED_STRING_MASK = 0b10000; + static constexpr size_t HEADER_SIZE_BYTES = 1; + static constexpr size_t MINIMAL_OFFSET_SIZE_BYTES = 1; + static constexpr size_t MAXIMUM_OFFSET_SIZE_BYTES = 4; + // mask is applied after shift, it's like 0b11000000 before shift. + static constexpr uint8_t OFFSET_SIZE_MASK = 0b11; + static constexpr uint8_t OFFSET_SIZE_BIT_SHIFT = 6; + static constexpr uint8_t SUPPORTED_VERSION = 1; private: std::string_view metadata_; + uint32_t dictionary_size_{0}; }; template From 9b52d0bacbb3f4c2dae1557bea92ea5b53f6b709 Mon Sep 17 00:00:00 2001 From: mwish Date: Thu, 15 May 2025 17:55:57 +0800 Subject: [PATCH 20/31] VariantMetadata use u32 rather than i32; minor bug fix --- cpp/src/parquet/variant.cc | 18 +++++++++--------- cpp/src/parquet/variant.h | 4 ++-- cpp/src/parquet/variant_test.cc | 26 ++++++++++++++++---------- 3 files changed, 27 insertions(+), 21 deletions(-) diff --git a/cpp/src/parquet/variant.cc b/cpp/src/parquet/variant.cc index 55a6f998196d..503bd1261690 100644 --- a/cpp/src/parquet/variant.cc +++ b/cpp/src/parquet/variant.cc @@ -202,11 +202,11 @@ uint32_t VariantMetadata::loadDictionarySize(std::string_view metadata, uint32_t VariantMetadata::dictionarySize() const { return dictionary_size_; } -std::string_view VariantMetadata::getMetadataKey(int32_t variant_id) const { +std::string_view VariantMetadata::getMetadataKey(uint32_t variant_id) const { uint32_t offset_size = offsetSize(); uint32_t dict_size = dictionarySize(); - if (variant_id < 0 || variant_id >= static_cast(dict_size)) { + if (variant_id >= dict_size) { throw ParquetException("Invalid Variant metadata: variant_id out of range: " + std::to_string(variant_id) + " >= " + std::to_string(dict_size)); @@ -232,7 +232,7 @@ std::string_view VariantMetadata::getMetadataKey(int32_t variant_id) const { return {metadata_.data() + string_start, key_size}; } -::arrow::internal::SmallVector VariantMetadata::getMetadataId( +::arrow::internal::SmallVector VariantMetadata::getMetadataId( std::string_view key) const { uint32_t offset_size = offsetSize(); uint32_t dict_size = dictionarySize(); @@ -241,12 +241,13 @@ ::arrow::internal::SmallVector VariantMetadata::getMetadataId( throw ParquetException("Invalid Variant metadata: offset out of range"); } // TODO(mwish): This can be optimized by using binary search if the metadata is sorted. - ::arrow::internal::SmallVector vector; + ::arrow::internal::SmallVector vector; + uint32_t variant_offset = 0; + uint32_t variant_next_offset = 0; for (uint32_t i = 0; i < dict_size; ++i) { size_t offset_start_pos = 1 + offset_size + (i * offset_size); - uint32_t variant_offset = - readLittleEndianU32(metadata_.data() + offset_start_pos, offset_size); - uint32_t variant_next_offset = readLittleEndianU32( + variant_offset = variant_next_offset; + variant_next_offset = readLittleEndianU32( metadata_.data() + offset_start_pos + offset_size, offset_size); uint32_t key_size = variant_next_offset - variant_offset; @@ -654,8 +655,7 @@ std::optional VariantValue::getObjectFieldByFieldId( // TODO(mwish): Using binary search to optimize it. for (uint32_t i = 0; i < info.num_elements; ++i) { uint32_t variant_field_id = readLittleEndianU32( - value.data() + info.offset_start_offset + i * info.offset_size, info.offset_size); - variant_field_id = ::arrow::bit_util::FromLittleEndian(variant_field_id); + value.data() + info.id_start_offset + i * info.id_size, info.id_size); if (variant_field_id == variant_id) { field_offset_opt = i; break; diff --git a/cpp/src/parquet/variant.h b/cpp/src/parquet/variant.h index 7fb895bb52cb..c68e4216cc15 100644 --- a/cpp/src/parquet/variant.h +++ b/cpp/src/parquet/variant.h @@ -127,13 +127,13 @@ class VariantMetadata { /// \brief Get the metadata key for a given variant field id. /// \throw ParquetException if the variant_id is out of range(larger than /// dictionary size). - std::string_view getMetadataKey(int32_t variant_id) const; + std::string_view getMetadataKey(uint32_t variant_id) const; /// \brief Get the metadata id for a given key. /// From the discussion in ML: /// https://lists.apache.org/thread/b68tjmrjmy64mbv9dknpmqs28vnzjj96 if /// !sortedStrings(), the metadata key is not guaranteed to be unique, so we use a /// vector to store all the metadata ids. - ::arrow::internal::SmallVector getMetadataId(std::string_view key) const; + ::arrow::internal::SmallVector getMetadataId(std::string_view key) const; bool sortedStrings() const; uint8_t offsetSize() const; diff --git a/cpp/src/parquet/variant_test.cc b/cpp/src/parquet/variant_test.cc index 1cd4fa0bde38..aa9612f75546 100644 --- a/cpp/src/parquet/variant_test.cc +++ b/cpp/src/parquet/variant_test.cc @@ -270,9 +270,12 @@ TEST(ParquetVariant, ObjectValues) { {"null_field", handle_null_field}, {"timestamp_field", handle_timestamp_field}}; // Test getObjectValueByKey with existing keys - for (auto& [key, handler] : key_handler) { - auto value = variant.getObjectValueByKey(key); - handler(value); + { + ARROW_SCOPED_TRACE("Test getObjectValueByKey with existing keys"); + for (auto& [key, handler] : key_handler) { + auto value = variant.getObjectValueByKey(key); + handler(value); + } } // Test non-existing key { @@ -280,13 +283,16 @@ TEST(ParquetVariant, ObjectValues) { EXPECT_FALSE(ne.has_value()); } // Test get by index - for (uint32_t i = 0; i < obj_info.num_elements; ++i) { - auto value = variant.getObjectFieldByFieldId(i); - auto key = variant.metadata.getMetadataKey(i); - auto iter = key_handler.find(std::string(key)); - ASSERT_TRUE(iter != key_handler.end()); - auto handler = iter->second; - handler(value); + { + ARROW_SCOPED_TRACE("Test getObjectFieldByFieldId with existing indexes"); + for (uint32_t i = 0; i < obj_info.num_elements; ++i) { + auto value = variant.getObjectFieldByFieldId(i); + auto key = variant.metadata.getMetadataKey(i); + auto iter = key_handler.find(std::string(key)); + ASSERT_TRUE(iter != key_handler.end()); + auto handler = iter->second; + handler(value); + } } EXPECT_FALSE(variant.getObjectFieldByFieldId(100).has_value()); } From 6c9c66dd856cff9ffdd9aa78a2ef35623fae2106 Mon Sep 17 00:00:00 2001 From: mwish Date: Thu, 15 May 2025 19:18:30 +0800 Subject: [PATCH 21/31] optimize the case when metadata key is unique --- cpp/src/parquet/variant.cc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cpp/src/parquet/variant.cc b/cpp/src/parquet/variant.cc index 503bd1261690..5301776efb60 100644 --- a/cpp/src/parquet/variant.cc +++ b/cpp/src/parquet/variant.cc @@ -241,6 +241,7 @@ ::arrow::internal::SmallVector VariantMetadata::getMetadataId( throw ParquetException("Invalid Variant metadata: offset out of range"); } // TODO(mwish): This can be optimized by using binary search if the metadata is sorted. + bool sort_and_unique = sortedStrings(); ::arrow::internal::SmallVector vector; uint32_t variant_offset = 0; uint32_t variant_next_offset = 0; @@ -258,6 +259,9 @@ ::arrow::internal::SmallVector VariantMetadata::getMetadataId( std::string_view current_key{metadata_.data() + string_start, key_size}; if (current_key == key) { vector.push_back(i); + if (sort_and_unique) { + break; + } } } return vector; From 53615001f8d3a8e58c37bb5f225a0a4cd25415ca Mon Sep 17 00:00:00 2001 From: mwish Date: Sat, 17 May 2025 03:23:33 +0800 Subject: [PATCH 22/31] Apply cr suggestions --- cpp/src/arrow/util/decimal.h | 4 +- cpp/src/parquet/variant.cc | 310 ++++++++++++++++---------------- cpp/src/parquet/variant.h | 104 +++++------ cpp/src/parquet/variant_test.cc | 106 +++++------ 4 files changed, 262 insertions(+), 262 deletions(-) diff --git a/cpp/src/arrow/util/decimal.h b/cpp/src/arrow/util/decimal.h index 00328668928e..640dc9aec157 100644 --- a/cpp/src/arrow/util/decimal.h +++ b/cpp/src/arrow/util/decimal.h @@ -108,7 +108,7 @@ class ARROW_EXPORT Decimal32 : public BasicDecimal32 { /// \brief Convert from a big-endian byte representation. The length must be /// between 1 and 4 - /// \return error status if the length is an invalid value + /// \return error statis if the length is an invalid value static Result FromBigEndian(const uint8_t* data, int32_t length); /// \brief Convert Decimal32 from one scale to another @@ -220,7 +220,7 @@ class ARROW_EXPORT Decimal64 : public BasicDecimal64 { /// \brief Convert from a big-endian byte representation. The length must be /// between 1 and 4 - /// \return error status if the length is an invalid value + /// \return error statis if the length is an invalid value static Result FromBigEndian(const uint8_t* data, int32_t length); /// \brief Convert Decimal64 from one scale to another diff --git a/cpp/src/parquet/variant.cc b/cpp/src/parquet/variant.cc index 5301776efb60..fb1833576318 100644 --- a/cpp/src/parquet/variant.cc +++ b/cpp/src/parquet/variant.cc @@ -28,7 +28,7 @@ namespace parquet::variant { -std::string variantBasicTypeToString(VariantBasicType type) { +std::string VariantBasicTypeToString(VariantBasicType type) { switch (type) { case VariantBasicType::Primitive: return "Primitive"; @@ -43,7 +43,7 @@ std::string variantBasicTypeToString(VariantBasicType type) { } } -std::string variantPrimitiveTypeToString(VariantPrimitiveType type) { +std::string VariantPrimitiveTypeToString(VariantPrimitiveType type) { switch (type) { case VariantPrimitiveType::NullType: return "NullType"; @@ -71,20 +71,20 @@ std::string variantPrimitiveTypeToString(VariantPrimitiveType type) { return "Date"; case VariantPrimitiveType::Timestamp: return "Timestamp"; - case VariantPrimitiveType::TimestampNTZ: - return "TimestampNTZ"; + case VariantPrimitiveType::TimestampNtz: + return "TimestampNtz"; case VariantPrimitiveType::Float: return "Float"; case VariantPrimitiveType::Binary: return "Binary"; case VariantPrimitiveType::String: return "String"; - case VariantPrimitiveType::TimeNTZ: - return "TimeNTZ"; - case VariantPrimitiveType::TimestampTZ: + case VariantPrimitiveType::TimeNtz: + return "TimeNtz"; + case VariantPrimitiveType::TimestampTz: return "TimestampTZ"; - case VariantPrimitiveType::TimestampNTZNanos: - return "TimestampNTZNanos"; + case VariantPrimitiveType::TimestampNtzNanos: + return "TimestampNtzNanos"; case VariantPrimitiveType::Uuid: return "Uuid"; default: @@ -92,51 +92,51 @@ std::string variantPrimitiveTypeToString(VariantPrimitiveType type) { } } -std::string variantTypeToString(VariantType type) { +std::string VariantTypeToString(VariantType type) { switch (type) { - case VariantType::OBJECT: + case VariantType::Object: return "OBJECT"; - case VariantType::ARRAY: + case VariantType::Array: return "ARRAY"; - case VariantType::VARIANT_NULL: + case VariantType::Null: return "NULL"; - case VariantType::BOOLEAN: + case VariantType::Boolean: return "BOOLEAN"; - case VariantType::INT8: + case VariantType::Int8: return "INT8"; - case VariantType::INT16: + case VariantType::Int16: return "INT16"; - case VariantType::INT32: + case VariantType::Int32: return "INT32"; - case VariantType::INT64: + case VariantType::Int64: return "INT64"; - case VariantType::STRING: + case VariantType::String: return "STRING"; - case VariantType::DOUBLE: + case VariantType::Double: return "DOUBLE"; - case VariantType::DECIMAL4: + case VariantType::Decimal4: return "DECIMAL4"; - case VariantType::DECIMAL8: + case VariantType::Decimal8: return "DECIMAL8"; - case VariantType::DECIMAL16: + case VariantType::Decimal16: return "DECIMAL16"; - case VariantType::DATE: + case VariantType::Date: return "DATE"; - case VariantType::TIMESTAMP_TZ: + case VariantType::TimestampTz: return "TIMESTAMP_TZ"; - case VariantType::TIMESTAMP_NTZ: - return "TIMESTAMP_NTZ"; - case VariantType::FLOAT: + case VariantType::TimestampNtz: + return "TIMESTAMP_Ntz"; + case VariantType::Float: return "FLOAT"; - case VariantType::BINARY: + case VariantType::Binary: return "BINARY"; - case VariantType::TIME: + case VariantType::Time: return "TIME"; - case VariantType::TIMESTAMP_NANOS_TZ: + case VariantType::TimestampNanosTz: return "TIMESTAMP_NANOS_TZ"; - case VariantType::TIMESTAMP_NANOS_NTZ: - return "TIMESTAMP_NANOS_NTZ"; - case VariantType::UUID: + case VariantType::TimestampNanosNtz: + return "TIMESTAMP_NANOS_Ntz"; + case VariantType::Uuid: return "UUID"; default: return "UNKNOWN"; @@ -153,76 +153,75 @@ inline uint32_t readLittleEndianU32(const void* from, uint8_t size) { } VariantMetadata::VariantMetadata(std::string_view metadata) : metadata_(metadata) { - if (metadata.size() < HEADER_SIZE_BYTES + MINIMAL_OFFSET_SIZE_BYTES * 2) { + if (metadata.size() < kHeaderSizeBytes + kMinimalOffsetSizeBytes * 2) { // Empty metadata is at least 3 bytes: version, dictionarySize and // at least one offset. throw ParquetException("Invalid Variant metadata: too short: size=" + std::to_string(metadata.size())); } - if (version() != SUPPORTED_VERSION) { + if (version() != kSupportedVersion) { // Currently we only supports version 1. throw ParquetException("Unsupported Variant metadata version: " + std::to_string(version())); } - uint8_t offset_size = offsetSize(); - if (offset_size < MINIMAL_OFFSET_SIZE_BYTES || - offset_size > MAXIMUM_OFFSET_SIZE_BYTES) { + uint8_t offset_sz = offset_size(); + if (offset_sz < kMinimalOffsetSizeBytes || offset_sz > kMaximumOffsetSizeBytes) { throw ParquetException("Invalid Variant metadata: invalid offset size: " + - std::to_string(offset_size)); + std::to_string(offset_sz)); } - dictionary_size_ = loadDictionarySize(metadata, offset_size); - if (HEADER_SIZE_BYTES + (dictionary_size_ + 1) * offset_size > metadata_.size()) { + dictionary_size_ = loadDictionarySize(metadata, offset_sz); + if (kHeaderSizeBytes + (dictionary_size_ + 1) * offset_sz > metadata_.size()) { throw ParquetException( "Invalid Variant metadata: offset out of range: " + - std::to_string((dictionary_size_ + HEADER_SIZE_BYTES) * offset_size) + " > " + + std::to_string((dictionary_size_ + kHeaderSizeBytes) * offset_sz) + " > " + std::to_string(metadata_.size())); } } uint8_t VariantMetadata::version() const { - return static_cast(metadata_[0]) & VERSION_MASK; + return static_cast(metadata_[0]) & kVersionMask; } -bool VariantMetadata::sortedStrings() const { - return (metadata_[0] & SORTED_STRING_MASK) != 0; +bool VariantMetadata::sorted_and_unique() const { + return (metadata_[0] & kSortedStringMask) != 0; } -uint8_t VariantMetadata::offsetSize() const { +uint8_t VariantMetadata::offset_size() const { // Since it stores offsetSize - 1, we add 1 here. - return ((metadata_[0] >> OFFSET_SIZE_BIT_SHIFT) & OFFSET_SIZE_MASK) + 1; + return ((metadata_[0] >> kOffsetSizeBitShift) & kOffsetSizeMask) + 1; } uint32_t VariantMetadata::loadDictionarySize(std::string_view metadata, uint8_t offset_size) { - if (static_cast(offset_size + HEADER_SIZE_BYTES) > metadata.size()) { + if (static_cast(offset_size + kHeaderSizeBytes) > metadata.size()) { throw ParquetException("Invalid Variant metadata: too short for dictionary size"); } - return readLittleEndianU32(metadata.data() + HEADER_SIZE_BYTES, offset_size); + return readLittleEndianU32(metadata.data() + kHeaderSizeBytes, offset_size); } -uint32_t VariantMetadata::dictionarySize() const { return dictionary_size_; } +uint32_t VariantMetadata::dictionary_size() const { return dictionary_size_; } -std::string_view VariantMetadata::getMetadataKey(uint32_t variant_id) const { - uint32_t offset_size = offsetSize(); - uint32_t dict_size = dictionarySize(); +std::string_view VariantMetadata::GetMetadataKey(uint32_t variant_id) const { + uint32_t offset_bytes = offset_size(); + uint32_t dictionary_bytes = dictionary_size(); - if (variant_id >= dict_size) { + if (variant_id >= dictionary_bytes) { throw ParquetException("Invalid Variant metadata: variant_id out of range: " + std::to_string(variant_id) + - " >= " + std::to_string(dict_size)); + " >= " + std::to_string(dictionary_bytes)); } - size_t offset_start_pos = HEADER_SIZE_BYTES + offset_size + (variant_id * offset_size); + size_t offset_start_pos = kHeaderSizeBytes + offset_bytes + (variant_id * offset_bytes); // Index range of offsets are already checked in ctor, so no need to check again. uint32_t variant_offset = - readLittleEndianU32(metadata_.data() + offset_start_pos, offset_size); - uint32_t variant_next_offset = - readLittleEndianU32(metadata_.data() + offset_start_pos + offset_size, offset_size); + readLittleEndianU32(metadata_.data() + offset_start_pos, offset_bytes); + uint32_t variant_next_offset = readLittleEndianU32( + metadata_.data() + offset_start_pos + offset_bytes, offset_bytes); uint32_t key_size = variant_next_offset - variant_offset; size_t string_start = - HEADER_SIZE_BYTES + offset_size * (dict_size + 2) + variant_offset; + kHeaderSizeBytes + offset_bytes * (dictionary_bytes + 2) + variant_offset; if (string_start + key_size > metadata_.size()) { throw ParquetException("Invalid Variant metadata: string data out of range: " + std::to_string(string_start) + " + " + @@ -232,27 +231,27 @@ std::string_view VariantMetadata::getMetadataKey(uint32_t variant_id) const { return {metadata_.data() + string_start, key_size}; } -::arrow::internal::SmallVector VariantMetadata::getMetadataId( +::arrow::internal::SmallVector VariantMetadata::GetMetadataId( std::string_view key) const { - uint32_t offset_size = offsetSize(); - uint32_t dict_size = dictionarySize(); + uint32_t offset_bytes = offset_size(); + uint32_t dictionary_bytes = dictionary_size(); - if ((dict_size + HEADER_SIZE_BYTES) * offset_size > metadata_.size()) { + if ((dictionary_bytes + kHeaderSizeBytes) * offset_bytes > metadata_.size()) { throw ParquetException("Invalid Variant metadata: offset out of range"); } + const bool sort_and_unique = sorted_and_unique(); // TODO(mwish): This can be optimized by using binary search if the metadata is sorted. - bool sort_and_unique = sortedStrings(); ::arrow::internal::SmallVector vector; uint32_t variant_offset = 0; uint32_t variant_next_offset = 0; - for (uint32_t i = 0; i < dict_size; ++i) { - size_t offset_start_pos = 1 + offset_size + (i * offset_size); + for (uint32_t i = 0; i < dictionary_bytes; ++i) { + size_t offset_start_pos = 1 + offset_bytes + (i * offset_bytes); variant_offset = variant_next_offset; variant_next_offset = readLittleEndianU32( - metadata_.data() + offset_start_pos + offset_size, offset_size); + metadata_.data() + offset_start_pos + offset_bytes, offset_bytes); uint32_t key_size = variant_next_offset - variant_offset; - size_t string_start = 1 + offset_size * (dict_size + 2) + variant_offset; + size_t string_start = 1 + offset_bytes * (dictionary_bytes + 2) + variant_offset; if (string_start + key_size > metadata_.size()) { throw ParquetException("Invalid Variant metadata: string data out of range"); } @@ -271,9 +270,8 @@ VariantBasicType VariantValue::getBasicType() const { if (value.empty()) { throw ParquetException("Empty variant value"); } - return static_cast(value[0] & BASIC_TYPE_MASK); + return static_cast(value[0] & kBasicTypeMask); } - VariantType VariantValue::getType() const { VariantBasicType basic_type = getBasicType(); switch (basic_type) { @@ -281,57 +279,57 @@ VariantType VariantValue::getType() const { auto primitive_type = static_cast(value[0] >> 2); switch (primitive_type) { case VariantPrimitiveType::NullType: - return VariantType::VARIANT_NULL; + return VariantType::Null; case VariantPrimitiveType::BooleanTrue: case VariantPrimitiveType::BooleanFalse: - return VariantType::BOOLEAN; + return VariantType::Boolean; case VariantPrimitiveType::Int8: - return VariantType::INT8; + return VariantType::Int8; case VariantPrimitiveType::Int16: - return VariantType::INT16; + return VariantType::Int16; case VariantPrimitiveType::Int32: - return VariantType::INT32; + return VariantType::Int32; case VariantPrimitiveType::Int64: - return VariantType::INT64; + return VariantType::Int64; case VariantPrimitiveType::Double: - return VariantType::DOUBLE; + return VariantType::Double; case VariantPrimitiveType::Decimal4: - return VariantType::DECIMAL4; + return VariantType::Decimal4; case VariantPrimitiveType::Decimal8: - return VariantType::DECIMAL8; + return VariantType::Decimal8; case VariantPrimitiveType::Decimal16: - return VariantType::DECIMAL16; + return VariantType::Decimal16; case VariantPrimitiveType::Date: - return VariantType::DATE; + return VariantType::Date; case VariantPrimitiveType::Timestamp: - return VariantType::TIMESTAMP_TZ; - case VariantPrimitiveType::TimestampNTZ: - return VariantType::TIMESTAMP_NTZ; + return VariantType::TimestampTz; + case VariantPrimitiveType::TimestampNtz: + return VariantType::TimestampNtz; case VariantPrimitiveType::Float: - return VariantType::FLOAT; + return VariantType::Float; case VariantPrimitiveType::Binary: - return VariantType::BINARY; + return VariantType::Binary; case VariantPrimitiveType::String: - return VariantType::STRING; - case VariantPrimitiveType::TimeNTZ: - return VariantType::TIME; - case VariantPrimitiveType::TimestampTZ: - return VariantType::TIMESTAMP_NANOS_TZ; - case VariantPrimitiveType::TimestampNTZNanos: - return VariantType::TIMESTAMP_NANOS_NTZ; + return VariantType::String; + case VariantPrimitiveType::TimeNtz: + return VariantType::Time; + case VariantPrimitiveType::TimestampTz: + return VariantType::TimestampNanosTz; + case VariantPrimitiveType::TimestampNtzNanos: + return VariantType::TimestampNanosNtz; case VariantPrimitiveType::Uuid: - return VariantType::UUID; + return VariantType::Uuid; default: throw ParquetException("Unknown primitive type: " + std::to_string(static_cast(primitive_type))); } } case VariantBasicType::ShortString: - return VariantType::STRING; + return VariantType::String; case VariantBasicType::Object: - return VariantType::OBJECT; + return VariantType::Object; case VariantBasicType::Array: - return VariantType::ARRAY; + return VariantType::Array; default: throw ParquetException("Unknown basic type: " + std::to_string(static_cast(basic_type))); @@ -341,59 +339,59 @@ VariantType VariantValue::getType() const { std::string VariantValue::typeDebugString() const { VariantType type = getType(); switch (type) { - case VariantType::OBJECT: - return "OBJECT"; - case VariantType::ARRAY: - return "ARRAY"; - case VariantType::VARIANT_NULL: - return "NULL"; - case VariantType::BOOLEAN: - return "BOOLEAN"; - case VariantType::INT8: - return "INT8"; - case VariantType::INT16: - return "INT16"; - case VariantType::INT32: - return "INT32"; - case VariantType::INT64: - return "INT64"; - case VariantType::STRING: - return "STRING"; - case VariantType::DOUBLE: - return "DOUBLE"; - case VariantType::DECIMAL4: - return "DECIMAL4"; - case VariantType::DECIMAL8: - return "DECIMAL8"; - case VariantType::DECIMAL16: - return "DECIMAL16"; - case VariantType::DATE: - return "DATE"; - case VariantType::TIMESTAMP_TZ: - return "TIMESTAMP_TZ"; - case VariantType::TIMESTAMP_NTZ: - return "TIMESTAMP_NTZ"; - case VariantType::FLOAT: - return "FLOAT"; - case VariantType::BINARY: - return "BINARY"; - case VariantType::TIME: - return "TIME"; - case VariantType::TIMESTAMP_NANOS_TZ: - return "TIMESTAMP_NANOS_TZ"; - case VariantType::TIMESTAMP_NANOS_NTZ: - return "TIMESTAMP_NANOS_NTZ"; - case VariantType::UUID: - return "UUID"; + case VariantType::Object: + return "Object"; + case VariantType::Array: + return "Array"; + case VariantType::Null: + return "Null"; + case VariantType::Boolean: + return "Boolean"; + case VariantType::Int8: + return "Int8"; + case VariantType::Int16: + return "Int16"; + case VariantType::Int32: + return "Int32"; + case VariantType::Int64: + return "Int64"; + case VariantType::String: + return "String"; + case VariantType::Double: + return "Double"; + case VariantType::Decimal4: + return "Decimal4"; + case VariantType::Decimal8: + return "Decimal8"; + case VariantType::Decimal16: + return "Decimal16"; + case VariantType::Date: + return "Date"; + case VariantType::TimestampTz: + return "TimestampTz"; + case VariantType::TimestampNtz: + return "TimestampNtz"; + case VariantType::Float: + return "Float"; + case VariantType::Binary: + return "Binary"; + case VariantType::Time: + return "Time"; + case VariantType::TimestampNanosTz: + return "TimestampNanosTz"; + case VariantType::TimestampNanosNtz: + return "TimestampNanosNtz"; + case VariantType::Uuid: + return "Uuid"; default: - return "UNKNOWN"; + return "Unknown"; } } bool VariantValue::getBool() const { if (getBasicType() != VariantBasicType::Primitive) { throw ParquetException("Expected primitive type, but got: " + - variantBasicTypeToString(getBasicType())); + VariantBasicTypeToString(getBasicType())); } int8_t primitive_type = static_cast(value[0]) >> 2; @@ -410,8 +408,8 @@ bool VariantValue::getBool() const { void VariantValue::checkBasicType(VariantBasicType type) const { if (getBasicType() != type) { - throw ParquetException("Expected basic type: " + variantBasicTypeToString(type) + - ", but got: " + variantBasicTypeToString(getBasicType())); + throw ParquetException("Expected basic type: " + VariantBasicTypeToString(type) + + ", but got: " + VariantBasicTypeToString(getBasicType())); } } @@ -422,14 +420,14 @@ void VariantValue::checkPrimitiveType(VariantPrimitiveType type, auto primitive_type = static_cast(value[0] >> 2); if (primitive_type != type) { throw ParquetException( - "Expected primitive type: " + variantPrimitiveTypeToString(type) + - ", but got: " + variantPrimitiveTypeToString(primitive_type)); + "Expected primitive type: " + VariantPrimitiveTypeToString(type) + + ", but got: " + VariantPrimitiveTypeToString(primitive_type)); } if (value.size() < size_required) { throw ParquetException("Invalid value: too short, expected at least " + std::to_string(size_required) + " bytes for type " + - variantPrimitiveTypeToString(type) + + VariantPrimitiveTypeToString(type) + ", but got: " + std::to_string(value.size()) + " bytes"); } } @@ -487,7 +485,7 @@ std::string_view VariantValue::getString() const { VariantBasicType basic_type = getBasicType(); if (basic_type == VariantBasicType::ShortString) { - uint8_t length = (value[0] >> 2) & MAX_SHORT_STR_SIZE_MASK; + uint8_t length = (value[0] >> 2) & kMaxShortStrSizeMask; if (value.size() < static_cast(length + 1)) { throw ParquetException( "Invalid short string: too short: " + std::to_string(value.size()) + @@ -547,16 +545,16 @@ int32_t VariantValue::getDate() const { return getPrimitiveType(VariantPrimitiveType::Date); } -int64_t VariantValue::getTimeNTZ() const { - return getPrimitiveType(VariantPrimitiveType::TimeNTZ); +int64_t VariantValue::getTimeNtz() const { + return getPrimitiveType(VariantPrimitiveType::TimeNtz); } int64_t VariantValue::getTimestamp() const { return getPrimitiveType(VariantPrimitiveType::Timestamp); } -int64_t VariantValue::getTimestampNTZ() const { - return getPrimitiveType(VariantPrimitiveType::TimestampNTZ); +int64_t VariantValue::getTimestampNtz() const { + return getPrimitiveType(VariantPrimitiveType::TimestampNtz); } std::array VariantValue::getUuid() const { @@ -637,7 +635,7 @@ std::optional VariantValue::getObjectValueByKey( std::optional VariantValue::getObjectValueByKey( std::string_view key, const VariantValue::ObjectInfo& info) const { ARROW_DCHECK_EQ(getObjectInfo(), info); - auto metadata_ids = metadata.getMetadataId(key); + auto metadata_ids = metadata.GetMetadataId(key); if (metadata_ids.empty()) { return std::nullopt; } diff --git a/cpp/src/parquet/variant.h b/cpp/src/parquet/variant.h index c68e4216cc15..0d66969de4a5 100644 --- a/cpp/src/parquet/variant.h +++ b/cpp/src/parquet/variant.h @@ -25,6 +25,8 @@ #include #include +#include "parquet/platform.h" + namespace parquet::variant { // TODO(mwish): Should I use parquet::ByteArray rather than @@ -41,7 +43,7 @@ enum class VariantBasicType { Array = 3 }; -std::string variantBasicTypeToString(VariantBasicType type); +PARQUET_EXPORT std::string VariantBasicTypeToString(VariantBasicType type); enum class VariantPrimitiveType : int8_t { /// Equivalent Parquet Type: UNKNOWN @@ -71,7 +73,7 @@ enum class VariantPrimitiveType : int8_t { /// Equivalent Parquet Type: TIMESTAMP(isAdjustedToUTC=true, MICROS) Timestamp = 12, /// Equivalent Parquet Type: TIMESTAMP(isAdjustedToUTC=false, MICROS) - TimestampNTZ = 13, + TimestampNtz = 13, /// Equivalent Parquet Type: FLOAT Float = 14, /// Equivalent Parquet Type: BINARY @@ -79,47 +81,47 @@ enum class VariantPrimitiveType : int8_t { /// Equivalent Parquet Type: STRING String = 16, /// Equivalent Parquet Type: TIME(isAdjustedToUTC=false, MICROS) - TimeNTZ = 17, + TimeNtz = 17, /// Equivalent Parquet Type: TIMESTAMP(isAdjustedToUTC=true, NANOS) - TimestampTZ = 18, // Assuming TZ stands for TimeZone, and follows the document's + TimestampTz = 18, // Assuming TZ stands for TimeZone, and follows the document's // 'timestamp with time zone' /// Equivalent Parquet Type: TIMESTAMP(isAdjustedToUTC=false, NANOS) - TimestampNTZNanos = 19, // Differentiating from TimestampNTZ (MICROS) + TimestampNtzNanos = 19, // Differentiating from TimestampNtz (MICROS) /// Equivalent Parquet Type: UUID Uuid = 20 }; -std::string variantPrimitiveTypeToString(VariantPrimitiveType type); +PARQUET_EXPORT std::string VariantPrimitiveTypeToString(VariantPrimitiveType type); /// VariantType is from basic type and primitive type. enum class VariantType { - OBJECT, - ARRAY, - VARIANT_NULL, - BOOLEAN, - INT8, - INT16, - INT32, - INT64, - STRING, - DOUBLE, - DECIMAL4, - DECIMAL8, - DECIMAL16, - DATE, - TIMESTAMP_TZ, - TIMESTAMP_NTZ, - FLOAT, - BINARY, - TIME, - TIMESTAMP_NANOS_TZ, - TIMESTAMP_NANOS_NTZ, - UUID + Object, + Array, + Null, + Boolean, + Int8, + Int16, + Int32, + Int64, + String, + Double, + Decimal4, + Decimal8, + Decimal16, + Date, + TimestampTz, + TimestampNtz, + Float, + Binary, + Time, + TimestampNanosTz, + TimestampNanosNtz, + Uuid }; -std::string variantTypeToString(VariantType type); +PARQUET_EXPORT std::string VariantTypeToString(VariantType type); -class VariantMetadata { +class PARQUET_EXPORT VariantMetadata { public: explicit VariantMetadata(std::string_view metadata); /// \brief Get the variant metadata version. Currently, always 1. @@ -127,31 +129,31 @@ class VariantMetadata { /// \brief Get the metadata key for a given variant field id. /// \throw ParquetException if the variant_id is out of range(larger than /// dictionary size). - std::string_view getMetadataKey(uint32_t variant_id) const; + std::string_view GetMetadataKey(uint32_t variant_id) const; /// \brief Get the metadata id for a given key. /// From the discussion in ML: /// https://lists.apache.org/thread/b68tjmrjmy64mbv9dknpmqs28vnzjj96 if - /// !sortedStrings(), the metadata key is not guaranteed to be unique, so we use a + /// !sorted_and_unique(), the metadata key is not guaranteed to be unique, so we use a /// vector to store all the metadata ids. - ::arrow::internal::SmallVector getMetadataId(std::string_view key) const; + ::arrow::internal::SmallVector GetMetadataId(std::string_view key) const; - bool sortedStrings() const; - uint8_t offsetSize() const; - uint32_t dictionarySize() const; + bool sorted_and_unique() const; + uint8_t offset_size() const; + uint32_t dictionary_size() const; private: static uint32_t loadDictionarySize(std::string_view metadata, uint8_t offset_size); private: - static constexpr uint8_t VERSION_MASK = 0b1111; - static constexpr uint8_t SORTED_STRING_MASK = 0b10000; - static constexpr size_t HEADER_SIZE_BYTES = 1; - static constexpr size_t MINIMAL_OFFSET_SIZE_BYTES = 1; - static constexpr size_t MAXIMUM_OFFSET_SIZE_BYTES = 4; + static constexpr uint8_t kVersionMask = 0b1111; + static constexpr uint8_t kSortedStringMask = 0b10000; + static constexpr size_t kHeaderSizeBytes = 1; + static constexpr size_t kMinimalOffsetSizeBytes = 1; + static constexpr size_t kMaximumOffsetSizeBytes = 4; // mask is applied after shift, it's like 0b11000000 before shift. - static constexpr uint8_t OFFSET_SIZE_MASK = 0b11; - static constexpr uint8_t OFFSET_SIZE_BIT_SHIFT = 6; - static constexpr uint8_t SUPPORTED_VERSION = 1; + static constexpr uint8_t kOffsetSizeMask = 0b11; + static constexpr uint8_t kOffsetSizeBitShift = 6; + static constexpr uint8_t kSupportedVersion = 1; private: std::string_view metadata_; @@ -159,12 +161,12 @@ class VariantMetadata { }; template -struct DecimalValue { +struct PARQUET_EXPORT DecimalValue { uint8_t scale; DecimalType value; }; -struct VariantValue { +struct PARQUET_EXPORT VariantValue { VariantMetadata metadata; std::string_view value; @@ -192,10 +194,10 @@ struct VariantValue { DecimalValue<::arrow::Decimal128> getDecimal16() const; int32_t getDate() const; - int64_t getTimeNTZ() const; + int64_t getTimeNtz() const; // timestamp with adjusted to UTC int64_t getTimestamp() const; - int64_t getTimestampNTZ() const; + int64_t getTimestampNtz() const; // 16 bytes UUID std::array getUuid() const; @@ -238,11 +240,11 @@ struct VariantValue { VariantValue getArrayValueByIndex(uint32_t index, const ArrayInfo& info) const; private: - static constexpr uint8_t BASIC_TYPE_MASK = 0b00000011; - static constexpr uint8_t PRIMITIVE_TYPE_MASK = 0b00111111; + static constexpr uint8_t kBasicTypeMask = 0b00000011; + static constexpr uint8_t kPrimitiveTypeMask = 0b00111111; /** The inclusive maximum value of the type info value. It is the size limit of * `SHORT_STR`. */ - static constexpr uint8_t MAX_SHORT_STR_SIZE_MASK = 0b00111111; + static constexpr uint8_t kMaxShortStrSizeMask = 0b00111111; private: template diff --git a/cpp/src/parquet/variant_test.cc b/cpp/src/parquet/variant_test.cc index aa9612f75546..6d6e21c4780a 100644 --- a/cpp/src/parquet/variant_test.cc +++ b/cpp/src/parquet/variant_test.cc @@ -67,7 +67,7 @@ TEST(ParquetVariant, MetadataBase) { VariantMetadata metadata(std::string_view{*buf}); EXPECT_EQ(1, metadata.version()); - EXPECT_THROW(metadata.getMetadataKey(0), ParquetException); + EXPECT_THROW(metadata.GetMetadataKey(0), ParquetException); } { std::string object_metadata = "object_primitive.metadata"; @@ -76,13 +76,13 @@ TEST(ParquetVariant, MetadataBase) { auto buf = readFromFile(*file_system, path); VariantMetadata metadata(std::string_view{*buf}); - EXPECT_EQ("int_field", metadata.getMetadataKey(0)); - EXPECT_EQ("double_field", metadata.getMetadataKey(1)); - EXPECT_EQ("boolean_true_field", metadata.getMetadataKey(2)); - EXPECT_EQ("boolean_false_field", metadata.getMetadataKey(3)); - EXPECT_EQ("string_field", metadata.getMetadataKey(4)); - EXPECT_EQ("null_field", metadata.getMetadataKey(5)); - EXPECT_EQ("timestamp_field", metadata.getMetadataKey(6)); + EXPECT_EQ("int_field", metadata.GetMetadataKey(0)); + EXPECT_EQ("double_field", metadata.GetMetadataKey(1)); + EXPECT_EQ("boolean_true_field", metadata.GetMetadataKey(2)); + EXPECT_EQ("boolean_false_field", metadata.GetMetadataKey(3)); + EXPECT_EQ("string_field", metadata.GetMetadataKey(4)); + EXPECT_EQ("null_field", metadata.GetMetadataKey(5)); + EXPECT_EQ("timestamp_field", metadata.GetMetadataKey(6)); } } @@ -110,7 +110,7 @@ TEST(ParquetVariant, BooleanValue) { { std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; auto variant = LoadVariantValue("primitive_boolean_true", &metadata_buf, &value_buf); - EXPECT_EQ(VariantType::BOOLEAN, variant.getType()); + EXPECT_EQ(VariantType::Boolean, variant.getType()); EXPECT_EQ("BOOLEAN", variant.typeDebugString()); EXPECT_EQ(true, variant.getBool()); } @@ -118,7 +118,7 @@ TEST(ParquetVariant, BooleanValue) { { std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; auto variant = LoadVariantValue("primitive_boolean_false", &metadata_buf, &value_buf); - EXPECT_EQ(VariantType::BOOLEAN, variant.getType()); + EXPECT_EQ(VariantType::Boolean, variant.getType()); EXPECT_EQ(false, variant.getBool()); } { @@ -132,21 +132,21 @@ TEST(ParquetVariant, NumericValues) { { std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; auto variant = LoadVariantValue("primitive_int8", &metadata_buf, &value_buf); - EXPECT_EQ(VariantType::INT8, variant.getType()); + EXPECT_EQ(VariantType::Int8, variant.getType()); EXPECT_EQ("INT8", variant.typeDebugString()); EXPECT_EQ(42, variant.getInt8()); } { std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; auto variant = LoadVariantValue("primitive_int16", &metadata_buf, &value_buf); - EXPECT_EQ(VariantType::INT16, variant.getType()); + EXPECT_EQ(VariantType::Int16, variant.getType()); EXPECT_EQ("INT16", variant.typeDebugString()); EXPECT_EQ(1234, variant.getInt16()); } { std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; auto variant = LoadVariantValue("primitive_int32", &metadata_buf, &value_buf); - EXPECT_EQ(VariantType::INT32, variant.getType()); + EXPECT_EQ(VariantType::Int32, variant.getType()); EXPECT_EQ("INT32", variant.typeDebugString()); EXPECT_EQ(123456, variant.getInt32()); } @@ -154,21 +154,21 @@ TEST(ParquetVariant, NumericValues) { // FIXME(mwish): https://github.com/apache/parquet-testing/issues/82 std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; auto variant = LoadVariantValue("primitive_int64", &metadata_buf, &value_buf); - EXPECT_EQ(VariantType::INT32, variant.getType()); + EXPECT_EQ(VariantType::Int32, variant.getType()); EXPECT_EQ("INT32", variant.typeDebugString()); EXPECT_EQ(12345678, variant.getInt32()); } { std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; auto variant = LoadVariantValue("primitive_float", &metadata_buf, &value_buf); - EXPECT_EQ(VariantType::FLOAT, variant.getType()); + EXPECT_EQ(VariantType::Float, variant.getType()); EXPECT_EQ("FLOAT", variant.typeDebugString()); EXPECT_FLOAT_EQ(1234567940.0f, variant.getFloat()); } { std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; auto variant = LoadVariantValue("primitive_double", &metadata_buf, &value_buf); - EXPECT_EQ(VariantType::DOUBLE, variant.getType()); + EXPECT_EQ(VariantType::Double, variant.getType()); EXPECT_EQ("DOUBLE", variant.typeDebugString()); EXPECT_DOUBLE_EQ(1234567890.1234, variant.getDouble()); } @@ -185,7 +185,7 @@ TEST(ParquetVariant, StringValues) { { std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; auto variant = LoadVariantValue("primitive_string", &metadata_buf, &value_buf); - EXPECT_EQ(VariantType::STRING, variant.getType()); + EXPECT_EQ(VariantType::String, variant.getType()); EXPECT_EQ("STRING", variant.typeDebugString()); std::string expected = R"(This string is longer than 64 bytes and therefore does not fit in a short_string and it also includes several non ascii characters such as 🐢, 💖, ♥️, 🎣 and 🤦!!)"; @@ -194,7 +194,7 @@ TEST(ParquetVariant, StringValues) { { std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; auto variant = LoadVariantValue("short_string", &metadata_buf, &value_buf); - EXPECT_EQ(VariantType::STRING, variant.getType()); + EXPECT_EQ(VariantType::String, variant.getType()); EXPECT_EQ(VariantBasicType::ShortString, variant.getBasicType()); std::string expected = R"(Less than 64 bytes (❤️ with utf8))"; EXPECT_EQ(expected, variant.getString()); @@ -202,7 +202,7 @@ TEST(ParquetVariant, StringValues) { { std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; auto variant = LoadVariantValue("primitive_binary", &metadata_buf, &value_buf); - EXPECT_EQ(VariantType::BINARY, variant.getType()); + EXPECT_EQ(VariantType::Binary, variant.getType()); EXPECT_EQ("BINARY", variant.typeDebugString()); auto binary_data = variant.getBinary(); std::string expected = ::arrow::util::base64_decode("AxM33q2+78r+"); @@ -220,44 +220,44 @@ TEST(ParquetVariant, StringValues) { TEST(ParquetVariant, ObjectValues) { std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; auto variant = LoadVariantValue("object_primitive", &metadata_buf, &value_buf); - EXPECT_EQ(VariantType::OBJECT, variant.getType()); + EXPECT_EQ(VariantType::Object, variant.getType()); EXPECT_EQ("OBJECT", variant.typeDebugString()); auto obj_info = variant.getObjectInfo(); EXPECT_EQ(7, obj_info.num_elements); auto handle_int_field = [](const std::optional& value) { EXPECT_TRUE(value.has_value()); - EXPECT_EQ(VariantType::INT8, value->getType()); + EXPECT_EQ(VariantType::Int8, value->getType()); EXPECT_EQ(1, value->getInt8()); }; auto handle_double_field = [](const std::optional& value) { EXPECT_TRUE(value.has_value()); - EXPECT_EQ(VariantType::DECIMAL4, value->getType()); + EXPECT_EQ(VariantType::Decimal4, value->getType()); auto decimal_value = value->getDecimal4(); EXPECT_EQ("1.23456789", decimal_value.value.ToString(decimal_value.scale)); }; auto handle_boolean_true_field = [](const std::optional& value) { EXPECT_TRUE(value.has_value()); - EXPECT_EQ(VariantType::BOOLEAN, value->getType()); + EXPECT_EQ(VariantType::Boolean, value->getType()); EXPECT_TRUE(value->getBool()); }; auto handle_boolean_false_field = [](const std::optional& value) { EXPECT_TRUE(value.has_value()); - EXPECT_EQ(VariantType::BOOLEAN, value->getType()); + EXPECT_EQ(VariantType::Boolean, value->getType()); EXPECT_FALSE(value->getBool()); }; auto handle_string_field = [](const std::optional& value) { EXPECT_TRUE(value.has_value()); - EXPECT_EQ(VariantType::STRING, value->getType()); + EXPECT_EQ(VariantType::String, value->getType()); EXPECT_EQ("Apache Parquet", value->getString()); }; auto handle_null_field = [](const std::optional& value) { EXPECT_TRUE(value.has_value()); - EXPECT_EQ(VariantType::VARIANT_NULL, value->getType()); + EXPECT_EQ(VariantType::Null, value->getType()); }; auto handle_timestamp_field = [](const std::optional& value) { EXPECT_TRUE(value.has_value()); - EXPECT_EQ(VariantType::STRING, value->getType()); + EXPECT_EQ(VariantType::String, value->getType()); EXPECT_EQ("2025-04-16T12:34:56.78", value->getString()); }; @@ -287,7 +287,7 @@ TEST(ParquetVariant, ObjectValues) { ARROW_SCOPED_TRACE("Test getObjectFieldByFieldId with existing indexes"); for (uint32_t i = 0; i < obj_info.num_elements; ++i) { auto value = variant.getObjectFieldByFieldId(i); - auto key = variant.metadata.getMetadataKey(i); + auto key = variant.metadata.GetMetadataKey(i); auto iter = key_handler.find(std::string(key)); ASSERT_TRUE(iter != key_handler.end()); auto handler = iter->second; @@ -300,7 +300,7 @@ TEST(ParquetVariant, ObjectValues) { TEST(ParquetVariant, NestedObjectValues) { std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; auto variant = LoadVariantValue("object_nested", &metadata_buf, &value_buf); - EXPECT_EQ(VariantType::OBJECT, variant.getType()); + EXPECT_EQ(VariantType::Object, variant.getType()); EXPECT_EQ("OBJECT", variant.typeDebugString()); auto info = variant.getObjectInfo(); EXPECT_EQ(3, info.num_elements); @@ -308,16 +308,16 @@ TEST(ParquetVariant, NestedObjectValues) { // Trying to get the exists key auto id = variant.getObjectValueByKey("id", info); ASSERT_TRUE(id.has_value()); - EXPECT_EQ(VariantType::INT8, id->getType()); + EXPECT_EQ(VariantType::Int8, id->getType()); EXPECT_EQ(1, id->getInt8()); auto observation = variant.getObjectValueByKey("observation", info); ASSERT_TRUE(observation.has_value()); - EXPECT_EQ(VariantType::OBJECT, observation->getType()); + EXPECT_EQ(VariantType::Object, observation->getType()); auto species = variant.getObjectValueByKey("species", info); ASSERT_TRUE(species.has_value()); - EXPECT_EQ(VariantType::OBJECT, species->getType()); + EXPECT_EQ(VariantType::Object, species->getType()); auto species_info = species->getObjectInfo(); EXPECT_EQ(2, species_info.num_elements); @@ -327,12 +327,12 @@ TEST(ParquetVariant, NestedObjectValues) { EXPECT_EQ(2, species_object_info.num_elements); auto name = species->getObjectValueByKey("name"); ASSERT_TRUE(name.has_value()); - EXPECT_EQ(VariantType::STRING, name->getType()); + EXPECT_EQ(VariantType::String, name->getType()); EXPECT_EQ("lava monster", name->getString()); auto population = species->getObjectValueByKey("population"); ASSERT_TRUE(population.has_value()); - EXPECT_EQ(VariantType::INT16, population->getType()); + EXPECT_EQ(VariantType::Int16, population->getType()); EXPECT_EQ(6789, population->getInt16()); } @@ -365,7 +365,7 @@ TEST(ParquetVariant, DecimalValues) { { std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; auto variant = LoadVariantValue("primitive_decimal4", &metadata_buf, &value_buf); - EXPECT_EQ(VariantType::DECIMAL4, variant.getType()); + EXPECT_EQ(VariantType::Decimal4, variant.getType()); EXPECT_EQ("DECIMAL4", variant.typeDebugString()); auto decimal = variant.getDecimal4(); EXPECT_EQ(2, decimal.scale); @@ -374,7 +374,7 @@ TEST(ParquetVariant, DecimalValues) { { std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; auto variant = LoadVariantValue("primitive_decimal8", &metadata_buf, &value_buf); - EXPECT_EQ(VariantType::DECIMAL8, variant.getType()); + EXPECT_EQ(VariantType::Decimal8, variant.getType()); EXPECT_EQ("DECIMAL8", variant.typeDebugString()); auto decimal = variant.getDecimal8(); EXPECT_EQ(2, decimal.scale); @@ -383,7 +383,7 @@ TEST(ParquetVariant, DecimalValues) { { std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; auto variant = LoadVariantValue("primitive_decimal16", &metadata_buf, &value_buf); - EXPECT_EQ(VariantType::DECIMAL16, variant.getType()); + EXPECT_EQ(VariantType::Decimal16, variant.getType()); EXPECT_EQ("DECIMAL16", variant.typeDebugString()); auto decimal = variant.getDecimal16(); EXPECT_EQ(2, decimal.scale); @@ -395,7 +395,7 @@ TEST(ParquetVariant, DateTimeValues) { { std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; auto variant = LoadVariantValue("primitive_date", &metadata_buf, &value_buf); - EXPECT_EQ(VariantType::DATE, variant.getType()); + EXPECT_EQ(VariantType::Date, variant.getType()); EXPECT_EQ("DATE", variant.typeDebugString()); // 2025-04-16 EXPECT_EQ(20194, variant.getDate()); @@ -403,16 +403,16 @@ TEST(ParquetVariant, DateTimeValues) { { std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; auto variant = LoadVariantValue("primitive_timestamp", &metadata_buf, &value_buf); - EXPECT_EQ(VariantType::TIMESTAMP_TZ, variant.getType()); + EXPECT_EQ(VariantType::TimestampTz, variant.getType()); EXPECT_EQ("TIMESTAMP_TZ", variant.typeDebugString()); EXPECT_EQ(1744821296780000, variant.getTimestamp()); } { std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; auto variant = LoadVariantValue("primitive_timestampntz", &metadata_buf, &value_buf); - EXPECT_EQ(VariantType::TIMESTAMP_NTZ, variant.getType()); + EXPECT_EQ(VariantType::TimestampNtz, variant.getType()); EXPECT_EQ("TIMESTAMP_NTZ", variant.typeDebugString()); - EXPECT_EQ(1744806896780000, variant.getTimestampNTZ()); + EXPECT_EQ(1744806896780000, variant.getTimestampNtz()); } } @@ -420,26 +420,26 @@ TEST(ParquetVariant, ArrayValues) { { std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; auto variant = LoadVariantValue("array_primitive", &metadata_buf, &value_buf); - EXPECT_EQ(VariantType::ARRAY, variant.getType()); + EXPECT_EQ(VariantType::Array, variant.getType()); EXPECT_EQ("ARRAY", variant.typeDebugString()); auto array_info = variant.getArrayInfo(); EXPECT_EQ(4, array_info.num_elements); auto element0 = variant.getArrayValueByIndex(0); - EXPECT_EQ(VariantType::INT8, element0.getType()); + EXPECT_EQ(VariantType::Int8, element0.getType()); EXPECT_EQ(2, element0.getInt8()); auto element1 = variant.getArrayValueByIndex(1); - EXPECT_EQ(VariantType::INT8, element1.getType()); + EXPECT_EQ(VariantType::Int8, element1.getType()); EXPECT_EQ(1, element1.getInt8()); auto element2 = variant.getArrayValueByIndex(2); - EXPECT_EQ(VariantType::INT8, element2.getType()); + EXPECT_EQ(VariantType::Int8, element2.getType()); EXPECT_EQ(5, element2.getInt8()); auto element3 = variant.getArrayValueByIndex(3); - EXPECT_EQ(VariantType::INT8, element3.getType()); + EXPECT_EQ(VariantType::Int8, element3.getType()); EXPECT_EQ(9, element3.getInt8()); EXPECT_THROW(variant.getArrayValueByIndex(4), ParquetException); @@ -450,7 +450,7 @@ TEST(ParquetVariant, ArrayValues) { // array_empty std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; auto variant = LoadVariantValue("array_empty", &metadata_buf, &value_buf); - EXPECT_EQ(VariantType::ARRAY, variant.getType()); + EXPECT_EQ(VariantType::Array, variant.getType()); EXPECT_EQ("ARRAY", variant.typeDebugString()); auto array_info = variant.getArrayInfo(); EXPECT_EQ(0, array_info.num_elements); @@ -463,32 +463,32 @@ TEST(ParquetVariant, ArrayValues) { TEST(ParquetVariant, ArrayValuesNested) { std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; auto variant = LoadVariantValue("array_nested", &metadata_buf, &value_buf); - EXPECT_EQ(VariantType::ARRAY, variant.getType()); + EXPECT_EQ(VariantType::Array, variant.getType()); EXPECT_EQ("ARRAY", variant.typeDebugString()); auto object_info = variant.getArrayInfo(); EXPECT_EQ(3, object_info.num_elements); { auto first_element = variant.getArrayValueByIndex(0); - EXPECT_EQ(VariantType::OBJECT, first_element.getType()); + EXPECT_EQ(VariantType::Object, first_element.getType()); auto first_element_info = first_element.getObjectInfo(); EXPECT_EQ(2, first_element_info.num_elements); auto id = first_element.getObjectValueByKey("id"); ASSERT_TRUE(id.has_value()); - EXPECT_EQ(VariantType::INT8, id->getType()); + EXPECT_EQ(VariantType::Int8, id->getType()); EXPECT_EQ(1, id->getInt8()); } { auto second_element = variant.getArrayValueByIndex(1); - EXPECT_EQ(VariantType::VARIANT_NULL, second_element.getType()); + EXPECT_EQ(VariantType::Null, second_element.getType()); } { auto third_element = variant.getArrayValueByIndex(2); - EXPECT_EQ(VariantType::OBJECT, third_element.getType()); + EXPECT_EQ(VariantType::Object, third_element.getType()); auto third_element_info = third_element.getObjectInfo(); EXPECT_EQ(3, third_element_info.num_elements); auto id = third_element.getObjectValueByKey("id"); ASSERT_TRUE(id.has_value()); - EXPECT_EQ(VariantType::INT8, id->getType()); + EXPECT_EQ(VariantType::Int8, id->getType()); EXPECT_EQ(2, id->getInt8()); } } From 7628110688b69c1d8a0f8815ae56a060ac0c5228 Mon Sep 17 00:00:00 2001 From: mwish Date: Sat, 17 May 2025 03:59:08 +0800 Subject: [PATCH 23/31] Refactor with heavier value --- cpp/src/parquet/variant.cc | 273 +++++++++++++++----------------- cpp/src/parquet/variant.h | 59 +++---- cpp/src/parquet/variant_test.cc | 84 +++++----- 3 files changed, 189 insertions(+), 227 deletions(-) diff --git a/cpp/src/parquet/variant.cc b/cpp/src/parquet/variant.cc index fb1833576318..fed0bd54fd1c 100644 --- a/cpp/src/parquet/variant.cc +++ b/cpp/src/parquet/variant.cc @@ -266,17 +266,39 @@ ::arrow::internal::SmallVector VariantMetadata::GetMetadataId( return vector; } -VariantBasicType VariantValue::getBasicType() const { - if (value.empty()) { - throw ParquetException("Empty variant value"); +VariantValue::VariantValue(VariantMetadata metadata, std::string_view value) + : metadata_(metadata), value_(value) { + if (value_.empty()) { + throw ParquetException("Invalid Variant metadata: empty string"); + } + switch (getBasicType()) { + case VariantBasicType::Array: { + complex_info_ = getArrayInfo(value_); + break; + } + case VariantBasicType::Object: { + complex_info_ = getObjectInfo(value_); + break; + } + case VariantBasicType::ShortString: + case VariantBasicType::Primitive: { + break; + } } - return static_cast(value[0] & kBasicTypeMask); } + +VariantValue::VariantValue(std::string_view metadata, std::string_view value) + : VariantValue(VariantMetadata(metadata), value) {} + +VariantBasicType VariantValue::getBasicType() const { + return static_cast(value_[0] & kBasicTypeMask); +} + VariantType VariantValue::getType() const { VariantBasicType basic_type = getBasicType(); switch (basic_type) { case VariantBasicType::Primitive: { - auto primitive_type = static_cast(value[0] >> 2); + auto primitive_type = static_cast(value_[0] >> 2); switch (primitive_type) { case VariantPrimitiveType::NullType: return VariantType::Null; @@ -336,9 +358,11 @@ VariantType VariantValue::getType() const { } } -std::string VariantValue::typeDebugString() const { - VariantType type = getType(); - switch (type) { +const VariantMetadata& VariantValue::metadata() const { return metadata_; } + +std::string_view VariantValue::typeDebugString() const { + VariantType variant_type = getType(); + switch (variant_type) { case VariantType::Object: return "Object"; case VariantType::Array: @@ -383,8 +407,6 @@ std::string VariantValue::typeDebugString() const { return "TimestampNanosNtz"; case VariantType::Uuid: return "Uuid"; - default: - return "Unknown"; } } @@ -394,11 +416,11 @@ bool VariantValue::getBool() const { VariantBasicTypeToString(getBasicType())); } - int8_t primitive_type = static_cast(value[0]) >> 2; - if (primitive_type == static_cast(VariantPrimitiveType::BooleanTrue)) { + uint8_t primitive_type = static_cast(value_[0]) >> 2; + if (primitive_type == static_cast(VariantPrimitiveType::BooleanTrue)) { return true; } - if (primitive_type == static_cast(VariantPrimitiveType::BooleanFalse)) { + if (primitive_type == static_cast(VariantPrimitiveType::BooleanFalse)) { return false; } @@ -417,18 +439,18 @@ void VariantValue::checkPrimitiveType(VariantPrimitiveType type, size_t size_required) const { checkBasicType(VariantBasicType::Primitive); - auto primitive_type = static_cast(value[0] >> 2); + auto primitive_type = static_cast(value_[0] >> 2); if (primitive_type != type) { throw ParquetException( "Expected primitive type: " + VariantPrimitiveTypeToString(type) + ", but got: " + VariantPrimitiveTypeToString(primitive_type)); } - if (value.size() < size_required) { + if (value_.size() < size_required) { throw ParquetException("Invalid value: too short, expected at least " + std::to_string(size_required) + " bytes for type " + VariantPrimitiveTypeToString(type) + - ", but got: " + std::to_string(value.size()) + " bytes"); + ", but got: " + std::to_string(value_.size()) + " bytes"); } } @@ -437,7 +459,7 @@ PrimitiveType VariantValue::getPrimitiveType(VariantPrimitiveType type) const { checkPrimitiveType(type, sizeof(PrimitiveType) + 1); PrimitiveType primitive_value{}; - memcpy(&primitive_value, value.data() + 1, sizeof(PrimitiveType)); + memcpy(&primitive_value, value_.data() + 1, sizeof(PrimitiveType)); // Here we should cast from Little endian. primitive_value = ::arrow::bit_util::FromLittleEndian(primitive_value); return primitive_value; @@ -471,27 +493,27 @@ std::string_view VariantValue::getPrimitiveBinaryType(VariantPrimitiveType type) checkPrimitiveType(type, /*size_required=*/5); uint32_t length; - memcpy(&length, value.data() + 1, sizeof(uint32_t)); + memcpy(&length, value_.data() + 1, sizeof(uint32_t)); length = ::arrow::bit_util::FromLittleEndian(length); - if (value.size() < length + 5) { + if (value_.size() < length + 5) { throw ParquetException("Invalid string value: too short for specified length"); } - return {value.data() + 5, length}; + return {value_.data() + 5, length}; } std::string_view VariantValue::getString() const { VariantBasicType basic_type = getBasicType(); if (basic_type == VariantBasicType::ShortString) { - uint8_t length = (value[0] >> 2) & kMaxShortStrSizeMask; - if (value.size() < static_cast(length + 1)) { + uint8_t length = (value_[0] >> 2) & kMaxShortStrSizeMask; + if (value_.size() < static_cast(length + 1)) { throw ParquetException( - "Invalid short string: too short: " + std::to_string(value.size()) + + "Invalid short string: too short: " + std::to_string(value_.size()) + " for at least " + std::to_string(length + 1)); } - return {value.data() + 1, length}; + return {value_.data() + 1, length}; } if (basic_type == VariantBasicType::Primitive) { // TODO(mwish): Should we validate utf8 here? @@ -511,9 +533,9 @@ DecimalValue VariantValue::getPrimitiveDecimalType( using DecimalValueType = typename DecimalType::ValueType; checkPrimitiveType(type, sizeof(DecimalValueType) + 2); - uint8_t scale = value[1]; + uint8_t scale = value_[1]; DecimalValueType decimal_value; - memcpy(&decimal_value, value.data() + 2, sizeof(DecimalValueType)); + memcpy(&decimal_value, value_.data() + 2, sizeof(DecimalValueType)); decimal_value = ::arrow::bit_util::FromLittleEndian(decimal_value); return {scale, DecimalType(decimal_value)}; @@ -531,12 +553,12 @@ DecimalValue<::arrow::Decimal128> VariantValue::getDecimal16() const { checkPrimitiveType(VariantPrimitiveType::Decimal16, /*size_required=*/sizeof(int64_t) * 2 + 2); - uint8_t scale = value[1]; + uint8_t scale = value_[1]; // TODO(mwish): Do we have better way for this? std::array low_high_bits; - memcpy(&low_high_bits[0], value.data() + 2, sizeof(int64_t)); - memcpy(&low_high_bits[1], value.data() + 10, sizeof(int64_t)); + memcpy(&low_high_bits[0], value_.data() + 2, sizeof(int64_t)); + memcpy(&low_high_bits[1], value_.data() + 10, sizeof(int64_t)); ::arrow::bit_util::little_endian::ToNative(low_high_bits); return {scale, ::arrow::Decimal128(low_high_bits[1], low_high_bits[0])}; } @@ -560,7 +582,7 @@ int64_t VariantValue::getTimestampNtz() const { std::array VariantValue::getUuid() const { checkPrimitiveType(VariantPrimitiveType::Uuid, /*size_required=*/17); std::array uuid_value; - memcpy(uuid_value.data(), value.data() + 1, sizeof(uuid_value)); + memcpy(uuid_value.data(), value_.data() + 1, sizeof(uuid_value)); #if ARROW_LITTLE_ENDIAN std::array uuid_value_le; ::arrow::bit_util::ByteSwap(uuid_value_le.data(), uuid_value.data(), uuid_value.size()); @@ -570,20 +592,7 @@ std::array VariantValue::getUuid() const { #endif } -std::string VariantValue::ObjectInfo::toDebugString() const { - std::stringstream ss; - ss << "ObjectInfo{" - << "num_elements=" << num_elements << ", id_size=" << static_cast(id_size) - << ", id_size=" << static_cast(id_size) - << ", offset_size=" << static_cast(offset_size) - << ", id_start_offset=" << id_start_offset - << ", offset_start_offset=" << offset_start_offset - << ", data_start_offset=" << data_start_offset << "}"; - return ss.str(); -} - -VariantValue::ObjectInfo VariantValue::getObjectInfo() const { - checkBasicType(VariantBasicType::Object); +VariantValue::ComplexInfo VariantValue::getObjectInfo(std::string_view value) { uint8_t value_header = value[0] >> 2; uint8_t field_offset_size = (value_header & 0b11) + 1; uint8_t field_id_size = ((value_header >> 2) & 0b11) + 1; @@ -596,51 +605,60 @@ VariantValue::ObjectInfo VariantValue::getObjectInfo() const { } // parse num_elements uint32_t num_elements = readLittleEndianU32(value.data() + 1, num_elements_size); - ObjectInfo info{}; - info.num_elements = num_elements; - info.id_size = field_id_size; - info.offset_size = field_offset_size; - info.id_start_offset = 1 + num_elements_size; - info.offset_start_offset = info.id_start_offset + num_elements * field_id_size; - info.data_start_offset = - info.offset_start_offset + (num_elements + 1) * field_offset_size; + ComplexInfo complex_info{}; + complex_info.num_elements = num_elements; + complex_info.id_size = field_id_size; + complex_info.offset_size = field_offset_size; + complex_info.id_start_offset = 1 + num_elements_size; + complex_info.offset_start_offset = + complex_info.id_start_offset + num_elements * field_id_size; + complex_info.data_start_offset = + complex_info.offset_start_offset + (num_elements + 1) * field_offset_size; // Check the boundary with the final offset - if (info.data_start_offset > value.size()) { + if (complex_info.data_start_offset > value.size()) { throw ParquetException("Invalid object value: data_start_offset=" + - std::to_string(info.data_start_offset) + + std::to_string(complex_info.data_start_offset) + ", value_size=" + std::to_string(value.size())); } { - uint32_t final_offset = readLittleEndianU32( - value.data() + info.offset_start_offset + num_elements * field_offset_size, - field_offset_size); + uint32_t final_offset = + readLittleEndianU32(value.data() + complex_info.offset_start_offset + + num_elements * field_offset_size, + field_offset_size); // It could be less than value size since it could be a sub-object. - if (final_offset + info.data_start_offset > value.size()) { + if (final_offset + complex_info.data_start_offset > value.size()) { throw ParquetException( "Invalid object value: final_offset=" + std::to_string(final_offset) + - ", data_start_offset=" + std::to_string(info.data_start_offset) + + ", data_start_offset=" + std::to_string(complex_info.data_start_offset) + ", value_size=" + std::to_string(value.size())); } } - return info; + return complex_info; } -std::optional VariantValue::getObjectValueByKey( - std::string_view key) const { - ObjectInfo info = getObjectInfo(); - - return getObjectValueByKey(key, info); +uint32_t VariantValue::num_elements() const { + auto basic_type = getBasicType(); + switch (basic_type) { + case VariantBasicType::Object: + case VariantBasicType::Array: + return complex_info_.num_elements; + case VariantBasicType::Primitive: + case VariantBasicType::ShortString: { + throw ParquetException("Invalid call to num_elements() for basic type: " + + VariantBasicTypeToString(basic_type)); + } + } } std::optional VariantValue::getObjectValueByKey( - std::string_view key, const VariantValue::ObjectInfo& info) const { - ARROW_DCHECK_EQ(getObjectInfo(), info); - auto metadata_ids = metadata.GetMetadataId(key); + std::string_view key) const { + checkBasicType(VariantBasicType::Object); + auto metadata_ids = metadata_.GetMetadataId(key); if (metadata_ids.empty()) { return std::nullopt; } for (uint32_t variant_id : metadata_ids) { - auto variant_value = getObjectFieldByFieldId(variant_id, info); + auto variant_value = getObjectFieldByFieldId(variant_id); if (variant_value.has_value()) { return variant_value; } @@ -649,15 +667,15 @@ std::optional VariantValue::getObjectValueByKey( } std::optional VariantValue::getObjectFieldByFieldId( - uint32_t variant_id, const ObjectInfo& info) const { - ARROW_DCHECK_EQ(getObjectInfo(), info); - + uint32_t variant_id) const { + checkBasicType(VariantBasicType::Object); std::optional field_offset_opt; // Get the field offset // TODO(mwish): Using binary search to optimize it. - for (uint32_t i = 0; i < info.num_elements; ++i) { + for (uint32_t i = 0; i < complex_info_.num_elements; ++i) { uint32_t variant_field_id = readLittleEndianU32( - value.data() + info.id_start_offset + i * info.id_size, info.id_size); + value_.data() + complex_info_.id_start_offset + i * complex_info_.id_size, + complex_info_.id_size); if (variant_field_id == variant_id) { field_offset_opt = i; break; @@ -668,32 +686,26 @@ std::optional VariantValue::getObjectFieldByFieldId( } uint32_t field_offset = field_offset_opt.value(); // Read the offset and next offset - uint32_t offset = readLittleEndianU32( - value.data() + info.offset_start_offset + field_offset * info.offset_size, - info.offset_size); + uint32_t offset = + readLittleEndianU32(value_.data() + complex_info_.offset_start_offset + + field_offset * complex_info_.offset_size, + complex_info_.offset_size); - if (info.data_start_offset + offset > value.size()) { + if (complex_info_.data_start_offset + offset > value_.size()) { throw ParquetException("Invalid object field offsets: data_start_offset=" + - std::to_string(info.data_start_offset) + + std::to_string(complex_info_.data_start_offset) + ", offset=" + std::to_string(offset) + - ", value_size=" + std::to_string(value.size())); + ", value_size=" + std::to_string(value_.size())); } // Create a VariantValue for the field - VariantValue field_value{metadata, value.substr(info.data_start_offset + offset)}; + VariantValue field_value{metadata_, + value_.substr(complex_info_.data_start_offset + offset)}; return field_value; } -std::optional VariantValue::getObjectFieldByFieldId( - uint32_t variant_id) const { - ObjectInfo info = getObjectInfo(); - - return getObjectFieldByFieldId(variant_id, info); -} - -VariantValue::ArrayInfo VariantValue::getArrayInfo() const { - checkBasicType(VariantBasicType::Array); +VariantValue::ComplexInfo VariantValue::getArrayInfo(std::string_view value) { uint8_t value_header = value[0] >> 2; uint8_t field_offset_size = (value_header & 0b11) + 1; bool is_large = ((value_header >> 2) & 0b1); @@ -707,80 +719,47 @@ VariantValue::ArrayInfo VariantValue::getArrayInfo() const { } uint32_t num_elements = readLittleEndianU32(value.data() + 1, num_elements_size); - ArrayInfo info{}; - info.num_elements = num_elements; - info.offset_size = field_offset_size; - info.offset_start_offset = 1 + num_elements_size; - info.data_start_offset = - info.offset_start_offset + (num_elements + 1) * field_offset_size; + ComplexInfo complex_info{}; + complex_info.num_elements = num_elements; + complex_info.offset_size = field_offset_size; + complex_info.offset_start_offset = 1 + num_elements_size; + complex_info.data_start_offset = + complex_info.offset_start_offset + (num_elements + 1) * field_offset_size; // Boundary check - if (info.data_start_offset > value.size()) { + if (complex_info.data_start_offset > value.size()) { throw ParquetException("Invalid array value: data_start_offset=" + - std::to_string(info.data_start_offset) + + std::to_string(complex_info.data_start_offset) + ", value_size=" + std::to_string(value.size())); } - // Validate final offset is equal to the size of the value, - // it would work since even empty array would have an offset of 0. - { - uint32_t final_offset = readLittleEndianU32( - value.data() + info.offset_start_offset + num_elements * field_offset_size, - field_offset_size); - - if (info.data_start_offset + final_offset > value.size()) { - throw ParquetException( - "Invalid array value: final_offset=" + std::to_string(final_offset) + - ", data_start_offset=" + std::to_string(info.data_start_offset) + - ", value_size=" + std::to_string(value.size())); - } - } - - // checking the element is incremental. - // TODO(mwish): Remove this or encapsulate this range check to function - for (uint32_t i = 0; i < num_elements; ++i) { - uint32_t offset = readLittleEndianU32( - value.data() + info.offset_start_offset + i * field_offset_size, - field_offset_size); - uint32_t next_offset = readLittleEndianU32( - value.data() + info.offset_start_offset + (i + 1) * field_offset_size, - field_offset_size); - if (offset > next_offset) { - throw ParquetException( - "Invalid array value: offsets not monotonically increasing: " + - std::to_string(offset) + " > " + std::to_string(next_offset)); - } - } - - return info; + return complex_info; } -VariantValue VariantValue::getArrayValueByIndex(uint32_t index, - const ArrayInfo& info) const { - if (index >= info.num_elements) { +VariantValue VariantValue::getArrayValueByIndex(uint32_t index) const { + checkBasicType(VariantBasicType::Array); + if (index >= complex_info_.num_elements) { throw ParquetException("Array index out of range: " + std::to_string(index) + - " >= " + std::to_string(info.num_elements)); + " >= " + std::to_string(complex_info_.num_elements)); } // Read the offset and next offset - uint32_t offset = readLittleEndianU32( - value.data() + info.offset_start_offset + index * info.offset_size, - info.offset_size); - uint32_t next_offset = readLittleEndianU32( - value.data() + info.offset_start_offset + (index + 1) * info.offset_size, - info.offset_size); + uint32_t offset = + readLittleEndianU32(value_.data() + complex_info_.offset_start_offset + + index * complex_info_.offset_size, + complex_info_.offset_size); + uint32_t next_offset = + readLittleEndianU32(value_.data() + complex_info_.offset_start_offset + + (index + 1) * complex_info_.offset_size, + complex_info_.offset_size); // Create a VariantValue for the element VariantValue element_value{ - metadata, std::string_view(value.data() + info.data_start_offset + offset, - next_offset - offset)}; + metadata_, + std::string_view(value_.data() + complex_info_.data_start_offset + offset, + next_offset - offset)}; return element_value; } -VariantValue VariantValue::getArrayValueByIndex(uint32_t index) const { - ArrayInfo info = getArrayInfo(); - return getArrayValueByIndex(index, info); -} - } // namespace parquet::variant diff --git a/cpp/src/parquet/variant.h b/cpp/src/parquet/variant.h index 0d66969de4a5..1d0d4f1c6a97 100644 --- a/cpp/src/parquet/variant.h +++ b/cpp/src/parquet/variant.h @@ -166,13 +166,14 @@ struct PARQUET_EXPORT DecimalValue { DecimalType value; }; -struct PARQUET_EXPORT VariantValue { - VariantMetadata metadata; - std::string_view value; +class PARQUET_EXPORT VariantValue { + public: + VariantValue(std::string_view metadata, std::string_view value); VariantBasicType getBasicType() const; VariantType getType() const; - std::string typeDebugString() const; + std::string_view typeDebugString() const; + const VariantMetadata& metadata() const; /// \defgroup ValueAccessors /// @{ @@ -203,41 +204,13 @@ struct PARQUET_EXPORT VariantValue { /// }@ - struct ObjectInfo { - uint32_t num_elements; - uint32_t id_size; - uint32_t offset_size; - uint32_t id_start_offset; - uint32_t offset_start_offset; - uint32_t data_start_offset; - - bool operator==(const ObjectInfo& info) const { - return num_elements == info.num_elements && id_size == info.id_size && - offset_size == info.offset_size && id_start_offset == info.id_start_offset && - offset_start_offset == info.offset_start_offset && - data_start_offset == info.data_start_offset; - } + uint32_t num_elements() const; - std::string toDebugString() const; - }; - ObjectInfo getObjectInfo() const; std::optional getObjectValueByKey(std::string_view key) const; - std::optional getObjectValueByKey(std::string_view key, - const ObjectInfo& info) const; std::optional getObjectFieldByFieldId(uint32_t variant_id) const; - std::optional getObjectFieldByFieldId(uint32_t variant_id, - const ObjectInfo& info) const; - struct ArrayInfo { - uint32_t num_elements; - uint32_t offset_size; - uint32_t offset_start_offset; - uint32_t data_start_offset; - }; - ArrayInfo getArrayInfo() const; // Would throw ParquetException if index is out of range. VariantValue getArrayValueByIndex(uint32_t index) const; - VariantValue getArrayValueByIndex(uint32_t index, const ArrayInfo& info) const; private: static constexpr uint8_t kBasicTypeMask = 0b00000011; @@ -246,7 +219,18 @@ struct PARQUET_EXPORT VariantValue { * `SHORT_STR`. */ static constexpr uint8_t kMaxShortStrSizeMask = 0b00111111; + struct ComplexInfo { + uint32_t num_elements; + uint32_t id_start_offset; + uint32_t offset_start_offset; + uint32_t data_start_offset; + uint8_t id_size; + uint8_t offset_size; + }; + private: + VariantValue(VariantMetadata metadata, std::string_view value); + template PrimitiveType getPrimitiveType(VariantPrimitiveType type) const; @@ -258,6 +242,15 @@ struct PARQUET_EXPORT VariantValue { std::string_view getPrimitiveBinaryType(VariantPrimitiveType type) const; void checkBasicType(VariantBasicType type) const; void checkPrimitiveType(VariantPrimitiveType type, size_t size_required) const; + + static ComplexInfo getArrayInfo(std::string_view value); + static ComplexInfo getObjectInfo(std::string_view value); + + private: + VariantMetadata metadata_; + std::string_view value_; + + ComplexInfo complex_info_{}; }; } // namespace parquet::variant diff --git a/cpp/src/parquet/variant_test.cc b/cpp/src/parquet/variant_test.cc index 6d6e21c4780a..0a70fbe80d0f 100644 --- a/cpp/src/parquet/variant_test.cc +++ b/cpp/src/parquet/variant_test.cc @@ -100,8 +100,7 @@ VariantValue LoadVariantValue(const std::string& test_name, *value_buf_out = readFromFile(*file_system, value_path); std::string_view value{**value_buf_out}; - - VariantMetadata metadata(std::string_view{**metadata_buf_out}); + std::string_view metadata{**metadata_buf_out}; return VariantValue{metadata, value}; } @@ -111,7 +110,7 @@ TEST(ParquetVariant, BooleanValue) { std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; auto variant = LoadVariantValue("primitive_boolean_true", &metadata_buf, &value_buf); EXPECT_EQ(VariantType::Boolean, variant.getType()); - EXPECT_EQ("BOOLEAN", variant.typeDebugString()); + EXPECT_EQ("Boolean", variant.typeDebugString()); EXPECT_EQ(true, variant.getBool()); } // test false @@ -133,21 +132,21 @@ TEST(ParquetVariant, NumericValues) { std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; auto variant = LoadVariantValue("primitive_int8", &metadata_buf, &value_buf); EXPECT_EQ(VariantType::Int8, variant.getType()); - EXPECT_EQ("INT8", variant.typeDebugString()); + EXPECT_EQ("Int8", variant.typeDebugString()); EXPECT_EQ(42, variant.getInt8()); } { std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; auto variant = LoadVariantValue("primitive_int16", &metadata_buf, &value_buf); EXPECT_EQ(VariantType::Int16, variant.getType()); - EXPECT_EQ("INT16", variant.typeDebugString()); + EXPECT_EQ("Int16", variant.typeDebugString()); EXPECT_EQ(1234, variant.getInt16()); } { std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; auto variant = LoadVariantValue("primitive_int32", &metadata_buf, &value_buf); EXPECT_EQ(VariantType::Int32, variant.getType()); - EXPECT_EQ("INT32", variant.typeDebugString()); + EXPECT_EQ("Int32", variant.typeDebugString()); EXPECT_EQ(123456, variant.getInt32()); } { @@ -155,21 +154,21 @@ TEST(ParquetVariant, NumericValues) { std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; auto variant = LoadVariantValue("primitive_int64", &metadata_buf, &value_buf); EXPECT_EQ(VariantType::Int32, variant.getType()); - EXPECT_EQ("INT32", variant.typeDebugString()); + EXPECT_EQ("Int32", variant.typeDebugString()); EXPECT_EQ(12345678, variant.getInt32()); } { std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; auto variant = LoadVariantValue("primitive_float", &metadata_buf, &value_buf); EXPECT_EQ(VariantType::Float, variant.getType()); - EXPECT_EQ("FLOAT", variant.typeDebugString()); + EXPECT_EQ("Float", variant.typeDebugString()); EXPECT_FLOAT_EQ(1234567940.0f, variant.getFloat()); } { std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; auto variant = LoadVariantValue("primitive_double", &metadata_buf, &value_buf); EXPECT_EQ(VariantType::Double, variant.getType()); - EXPECT_EQ("DOUBLE", variant.typeDebugString()); + EXPECT_EQ("Double", variant.typeDebugString()); EXPECT_DOUBLE_EQ(1234567890.1234, variant.getDouble()); } { @@ -186,7 +185,7 @@ TEST(ParquetVariant, StringValues) { std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; auto variant = LoadVariantValue("primitive_string", &metadata_buf, &value_buf); EXPECT_EQ(VariantType::String, variant.getType()); - EXPECT_EQ("STRING", variant.typeDebugString()); + EXPECT_EQ("String", variant.typeDebugString()); std::string expected = R"(This string is longer than 64 bytes and therefore does not fit in a short_string and it also includes several non ascii characters such as 🐢, 💖, ♥️, 🎣 and 🤦!!)"; EXPECT_EQ(expected, variant.getString()); @@ -203,7 +202,7 @@ TEST(ParquetVariant, StringValues) { std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; auto variant = LoadVariantValue("primitive_binary", &metadata_buf, &value_buf); EXPECT_EQ(VariantType::Binary, variant.getType()); - EXPECT_EQ("BINARY", variant.typeDebugString()); + EXPECT_EQ("Binary", variant.typeDebugString()); auto binary_data = variant.getBinary(); std::string expected = ::arrow::util::base64_decode("AxM33q2+78r+"); EXPECT_EQ(expected, binary_data); @@ -221,10 +220,9 @@ TEST(ParquetVariant, ObjectValues) { std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; auto variant = LoadVariantValue("object_primitive", &metadata_buf, &value_buf); EXPECT_EQ(VariantType::Object, variant.getType()); - EXPECT_EQ("OBJECT", variant.typeDebugString()); + EXPECT_EQ("Object", variant.typeDebugString()); - auto obj_info = variant.getObjectInfo(); - EXPECT_EQ(7, obj_info.num_elements); + EXPECT_EQ(7, variant.num_elements()); auto handle_int_field = [](const std::optional& value) { EXPECT_TRUE(value.has_value()); EXPECT_EQ(VariantType::Int8, value->getType()); @@ -285,9 +283,9 @@ TEST(ParquetVariant, ObjectValues) { // Test get by index { ARROW_SCOPED_TRACE("Test getObjectFieldByFieldId with existing indexes"); - for (uint32_t i = 0; i < obj_info.num_elements; ++i) { + for (uint32_t i = 0; i < variant.num_elements(); ++i) { auto value = variant.getObjectFieldByFieldId(i); - auto key = variant.metadata.GetMetadataKey(i); + auto key = variant.metadata().GetMetadataKey(i); auto iter = key_handler.find(std::string(key)); ASSERT_TRUE(iter != key_handler.end()); auto handler = iter->second; @@ -301,30 +299,26 @@ TEST(ParquetVariant, NestedObjectValues) { std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; auto variant = LoadVariantValue("object_nested", &metadata_buf, &value_buf); EXPECT_EQ(VariantType::Object, variant.getType()); - EXPECT_EQ("OBJECT", variant.typeDebugString()); - auto info = variant.getObjectInfo(); - EXPECT_EQ(3, info.num_elements); + EXPECT_EQ("Object", variant.typeDebugString()); + EXPECT_EQ(3, variant.num_elements()); // Trying to get the exists key - auto id = variant.getObjectValueByKey("id", info); + auto id = variant.getObjectValueByKey("id"); ASSERT_TRUE(id.has_value()); EXPECT_EQ(VariantType::Int8, id->getType()); EXPECT_EQ(1, id->getInt8()); - auto observation = variant.getObjectValueByKey("observation", info); + auto observation = variant.getObjectValueByKey("observation"); ASSERT_TRUE(observation.has_value()); EXPECT_EQ(VariantType::Object, observation->getType()); - auto species = variant.getObjectValueByKey("species", info); + auto species = variant.getObjectValueByKey("species"); ASSERT_TRUE(species.has_value()); EXPECT_EQ(VariantType::Object, species->getType()); - auto species_info = species->getObjectInfo(); - EXPECT_EQ(2, species_info.num_elements); // Inner object works well { - auto species_object_info = species->getObjectInfo(); - EXPECT_EQ(2, species_object_info.num_elements); + EXPECT_EQ(2, species->num_elements()); auto name = species->getObjectValueByKey("name"); ASSERT_TRUE(name.has_value()); EXPECT_EQ(VariantType::String, name->getType()); @@ -366,7 +360,7 @@ TEST(ParquetVariant, DecimalValues) { std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; auto variant = LoadVariantValue("primitive_decimal4", &metadata_buf, &value_buf); EXPECT_EQ(VariantType::Decimal4, variant.getType()); - EXPECT_EQ("DECIMAL4", variant.typeDebugString()); + EXPECT_EQ("Decimal4", variant.typeDebugString()); auto decimal = variant.getDecimal4(); EXPECT_EQ(2, decimal.scale); EXPECT_EQ("12.34", decimal.value.ToString(decimal.scale)); @@ -375,7 +369,7 @@ TEST(ParquetVariant, DecimalValues) { std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; auto variant = LoadVariantValue("primitive_decimal8", &metadata_buf, &value_buf); EXPECT_EQ(VariantType::Decimal8, variant.getType()); - EXPECT_EQ("DECIMAL8", variant.typeDebugString()); + EXPECT_EQ("Decimal8", variant.typeDebugString()); auto decimal = variant.getDecimal8(); EXPECT_EQ(2, decimal.scale); EXPECT_EQ("12345678.90", decimal.value.ToString(decimal.scale)); @@ -384,7 +378,7 @@ TEST(ParquetVariant, DecimalValues) { std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; auto variant = LoadVariantValue("primitive_decimal16", &metadata_buf, &value_buf); EXPECT_EQ(VariantType::Decimal16, variant.getType()); - EXPECT_EQ("DECIMAL16", variant.typeDebugString()); + EXPECT_EQ("Decimal16", variant.typeDebugString()); auto decimal = variant.getDecimal16(); EXPECT_EQ(2, decimal.scale); EXPECT_EQ("12345678912345678.90", decimal.value.ToString(decimal.scale)); @@ -396,7 +390,7 @@ TEST(ParquetVariant, DateTimeValues) { std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; auto variant = LoadVariantValue("primitive_date", &metadata_buf, &value_buf); EXPECT_EQ(VariantType::Date, variant.getType()); - EXPECT_EQ("DATE", variant.typeDebugString()); + EXPECT_EQ("Date", variant.typeDebugString()); // 2025-04-16 EXPECT_EQ(20194, variant.getDate()); } @@ -404,14 +398,14 @@ TEST(ParquetVariant, DateTimeValues) { std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; auto variant = LoadVariantValue("primitive_timestamp", &metadata_buf, &value_buf); EXPECT_EQ(VariantType::TimestampTz, variant.getType()); - EXPECT_EQ("TIMESTAMP_TZ", variant.typeDebugString()); + EXPECT_EQ("TimestampTz", variant.typeDebugString()); EXPECT_EQ(1744821296780000, variant.getTimestamp()); } { std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; auto variant = LoadVariantValue("primitive_timestampntz", &metadata_buf, &value_buf); EXPECT_EQ(VariantType::TimestampNtz, variant.getType()); - EXPECT_EQ("TIMESTAMP_NTZ", variant.typeDebugString()); + EXPECT_EQ("TimestampNtz", variant.typeDebugString()); EXPECT_EQ(1744806896780000, variant.getTimestampNtz()); } } @@ -421,10 +415,9 @@ TEST(ParquetVariant, ArrayValues) { std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; auto variant = LoadVariantValue("array_primitive", &metadata_buf, &value_buf); EXPECT_EQ(VariantType::Array, variant.getType()); - EXPECT_EQ("ARRAY", variant.typeDebugString()); + EXPECT_EQ("Array", variant.typeDebugString()); - auto array_info = variant.getArrayInfo(); - EXPECT_EQ(4, array_info.num_elements); + EXPECT_EQ(4, variant.num_elements()); auto element0 = variant.getArrayValueByIndex(0); EXPECT_EQ(VariantType::Int8, element0.getType()); @@ -444,19 +437,19 @@ TEST(ParquetVariant, ArrayValues) { EXPECT_THROW(variant.getArrayValueByIndex(4), ParquetException); EXPECT_THROW(variant.getArrayValueByIndex(100), ParquetException); - EXPECT_THROW(variant.getObjectInfo(), ParquetException); + EXPECT_THROW(variant.getObjectValueByKey("10"), ParquetException); + EXPECT_THROW(variant.getObjectFieldByFieldId(10), ParquetException); } { // array_empty std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; auto variant = LoadVariantValue("array_empty", &metadata_buf, &value_buf); EXPECT_EQ(VariantType::Array, variant.getType()); - EXPECT_EQ("ARRAY", variant.typeDebugString()); - auto array_info = variant.getArrayInfo(); - EXPECT_EQ(0, array_info.num_elements); + EXPECT_EQ("Array", variant.typeDebugString()); + EXPECT_EQ(0, variant.num_elements()); EXPECT_THROW(variant.getArrayValueByIndex(0), ParquetException); - EXPECT_THROW(variant.getObjectInfo(), ParquetException); + EXPECT_THROW(variant.getObjectValueByKey("key"), ParquetException); } } @@ -464,14 +457,12 @@ TEST(ParquetVariant, ArrayValuesNested) { std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; auto variant = LoadVariantValue("array_nested", &metadata_buf, &value_buf); EXPECT_EQ(VariantType::Array, variant.getType()); - EXPECT_EQ("ARRAY", variant.typeDebugString()); - auto object_info = variant.getArrayInfo(); - EXPECT_EQ(3, object_info.num_elements); + EXPECT_EQ("Array", variant.typeDebugString()); + EXPECT_EQ(3, variant.num_elements()); { auto first_element = variant.getArrayValueByIndex(0); EXPECT_EQ(VariantType::Object, first_element.getType()); - auto first_element_info = first_element.getObjectInfo(); - EXPECT_EQ(2, first_element_info.num_elements); + EXPECT_EQ(2, first_element.num_elements()); auto id = first_element.getObjectValueByKey("id"); ASSERT_TRUE(id.has_value()); EXPECT_EQ(VariantType::Int8, id->getType()); @@ -484,8 +475,7 @@ TEST(ParquetVariant, ArrayValuesNested) { { auto third_element = variant.getArrayValueByIndex(2); EXPECT_EQ(VariantType::Object, third_element.getType()); - auto third_element_info = third_element.getObjectInfo(); - EXPECT_EQ(3, third_element_info.num_elements); + EXPECT_EQ(3, third_element.num_elements()); auto id = third_element.getObjectValueByKey("id"); ASSERT_TRUE(id.has_value()); EXPECT_EQ(VariantType::Int8, id->getType()); From 1b31d425120ac1af381c7ca8a1a077511ae8c05d Mon Sep 17 00:00:00 2001 From: mwish Date: Sat, 17 May 2025 04:08:55 +0800 Subject: [PATCH 24/31] More comments for the code interface --- cpp/src/parquet/variant.cc | 4 +++ cpp/src/parquet/variant.h | 63 +++++++++++++++++++++++++++++++------- 2 files changed, 56 insertions(+), 11 deletions(-) diff --git a/cpp/src/parquet/variant.cc b/cpp/src/parquet/variant.cc index fed0bd54fd1c..02cbf4dfb855 100644 --- a/cpp/src/parquet/variant.cc +++ b/cpp/src/parquet/variant.cc @@ -23,6 +23,7 @@ #include #include +#include #include "parquet/exception.h" @@ -407,6 +408,8 @@ std::string_view VariantValue::typeDebugString() const { return "TimestampNanosNtz"; case VariantType::Uuid: return "Uuid"; + default: + ::arrow::Unreachable(); } } @@ -648,6 +651,7 @@ uint32_t VariantValue::num_elements() const { VariantBasicTypeToString(basic_type)); } } + ::arrow::Unreachable(); } std::optional VariantValue::getObjectValueByKey( diff --git a/cpp/src/parquet/variant.h b/cpp/src/parquet/variant.h index 1d0d4f1c6a97..88a12b0a84f2 100644 --- a/cpp/src/parquet/variant.h +++ b/cpp/src/parquet/variant.h @@ -128,7 +128,7 @@ class PARQUET_EXPORT VariantMetadata { uint8_t version() const; /// \brief Get the metadata key for a given variant field id. /// \throw ParquetException if the variant_id is out of range(larger than - /// dictionary size). + /// dictionary_size). std::string_view GetMetadataKey(uint32_t variant_id) const; /// \brief Get the metadata id for a given key. /// From the discussion in ML: @@ -175,38 +175,77 @@ class PARQUET_EXPORT VariantValue { std::string_view typeDebugString() const; const VariantMetadata& metadata() const; - /// \defgroup ValueAccessors - /// @{ - // Note: Null doesn't need visitor. + + /// \brief Get the primitive boolean value. + /// \throw ParquetException if the type is not a boolean type. bool getBool() const; + /// \brief Get the primitive int8 value. + /// \throw ParquetException if the type is not an int8 type. int8_t getInt8() const; + /// \brief Get the primitive int16 value. + /// \throw ParquetException if the type is not an int16 type. int16_t getInt16() const; + /// \brief Get the primitive int32 value. + /// \throw ParquetException if the type is not an int32 type. int32_t getInt32() const; + /// \brief Get the primitive int64 value. + /// \throw ParquetException if the type is not an int64 type. int64_t getInt64() const; - /// Include short_string optimization and primitive string type + /// \brief Get the string value, including both short string optimization and primitive + /// string type. \throw ParquetException if the type is not a string type. std::string_view getString() const; + /// \brief Get the binary value. + /// \throw ParquetException if the type is not a binary type. std::string_view getBinary() const; + /// \brief Get the primitive float value. + /// \throw ParquetException if the type is not a float type. float getFloat() const; + /// \brief Get the primitive double value. + /// \throw ParquetException if the type is not a double type. double getDouble() const; + /// \brief Get the decimal value with 4 bytes precision. + /// \throw ParquetException if the type is not a decimal4 type. DecimalValue<::arrow::Decimal32> getDecimal4() const; + /// \brief Get the decimal value with 8 bytes precision. + /// \throw ParquetException if the type is not a decimal8 type. DecimalValue<::arrow::Decimal64> getDecimal8() const; + /// \brief Get the decimal value with 16 bytes precision. + /// \throw ParquetException if the type is not a decimal16 type. DecimalValue<::arrow::Decimal128> getDecimal16() const; + /// \brief Get the date value as days since Unix epoch. + /// \throw ParquetException if the type is not a date type. int32_t getDate() const; + /// \brief Get the time value without timezone as microseconds since midnight. + /// \throw ParquetException if the type is not a time type. int64_t getTimeNtz() const; - // timestamp with adjusted to UTC + /// \brief Get the timestamp value with UTC timezone as microseconds since Unix epoch. + /// \throw ParquetException if the type is not a timestamp type. int64_t getTimestamp() const; + /// \brief Get the timestamp value without timezone as microseconds since Unix epoch. + /// \throw ParquetException if the type is not a timestamp without timezone type. int64_t getTimestampNtz() const; - // 16 bytes UUID + /// \brief Get the UUID value as a 16-byte array. + /// \throw ParquetException if the type is not a UUID type. std::array getUuid() const; - /// }@ - + /// \brief Get the num_elements of the array or object. + /// For array, it returns the number of elements in the array. + /// For object, it returns the number of fields in the object. + /// \throw ParquetException if the type is not an array or object type. uint32_t num_elements() const; + /// \brief Get the value of the object field by key. + /// \return returns the value of the field with the given key, or empty if the key + /// doesn't exist. + /// \throw ParquetException if the type is not an object type. std::optional getObjectValueByKey(std::string_view key) const; + /// \brief Get the value of the object field by field id. + /// \return returns the value of the field with the given field id, or empty if the + /// field id doesn't exist. + /// \throw ParquetException if the type is not an object type. std::optional getObjectFieldByFieldId(uint32_t variant_id) const; // Would throw ParquetException if index is out of range. @@ -215,10 +254,12 @@ class PARQUET_EXPORT VariantValue { private: static constexpr uint8_t kBasicTypeMask = 0b00000011; static constexpr uint8_t kPrimitiveTypeMask = 0b00111111; - /** The inclusive maximum value of the type info value. It is the size limit of - * `SHORT_STR`. */ + /// The inclusive maximum value of the type info value. It is the size limit of + /// ShortString. static constexpr uint8_t kMaxShortStrSizeMask = 0b00111111; + /// ComplexInfo is used to store the metadata of the array or object. + /// For array, it doesn't have id_size and id_start_offset. struct ComplexInfo { uint32_t num_elements; uint32_t id_start_offset; From a83f8ccab376fdda68bab7100350caedc88df734 Mon Sep 17 00:00:00 2001 From: mwish Date: Sat, 17 May 2025 04:59:31 +0800 Subject: [PATCH 25/31] change str to view --- cpp/src/parquet/variant.cc | 6 +++--- cpp/src/parquet/variant.h | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/cpp/src/parquet/variant.cc b/cpp/src/parquet/variant.cc index 02cbf4dfb855..7e7719b6468c 100644 --- a/cpp/src/parquet/variant.cc +++ b/cpp/src/parquet/variant.cc @@ -29,7 +29,7 @@ namespace parquet::variant { -std::string VariantBasicTypeToString(VariantBasicType type) { +std::string_view VariantBasicTypeToString(VariantBasicType type) { switch (type) { case VariantBasicType::Primitive: return "Primitive"; @@ -44,7 +44,7 @@ std::string VariantBasicTypeToString(VariantBasicType type) { } } -std::string VariantPrimitiveTypeToString(VariantPrimitiveType type) { +std::string_view VariantPrimitiveTypeToString(VariantPrimitiveType type) { switch (type) { case VariantPrimitiveType::NullType: return "NullType"; @@ -93,7 +93,7 @@ std::string VariantPrimitiveTypeToString(VariantPrimitiveType type) { } } -std::string VariantTypeToString(VariantType type) { +std::string_view VariantTypeToString(VariantType type) { switch (type) { case VariantType::Object: return "OBJECT"; diff --git a/cpp/src/parquet/variant.h b/cpp/src/parquet/variant.h index 88a12b0a84f2..15504de3ac0d 100644 --- a/cpp/src/parquet/variant.h +++ b/cpp/src/parquet/variant.h @@ -43,7 +43,7 @@ enum class VariantBasicType { Array = 3 }; -PARQUET_EXPORT std::string VariantBasicTypeToString(VariantBasicType type); +PARQUET_EXPORT std::string_view VariantBasicTypeToString(VariantBasicType type); enum class VariantPrimitiveType : int8_t { /// Equivalent Parquet Type: UNKNOWN @@ -91,7 +91,7 @@ enum class VariantPrimitiveType : int8_t { Uuid = 20 }; -PARQUET_EXPORT std::string VariantPrimitiveTypeToString(VariantPrimitiveType type); +PARQUET_EXPORT std::string_view VariantPrimitiveTypeToString(VariantPrimitiveType type); /// VariantType is from basic type and primitive type. enum class VariantType { @@ -119,7 +119,7 @@ enum class VariantType { Uuid }; -PARQUET_EXPORT std::string VariantTypeToString(VariantType type); +PARQUET_EXPORT std::string_view VariantTypeToString(VariantType type); class PARQUET_EXPORT VariantMetadata { public: From 17a0637397528a63111fa051b7aeca9f4051bea2 Mon Sep 17 00:00:00 2001 From: mwish Date: Sat, 17 May 2025 11:52:11 +0800 Subject: [PATCH 26/31] Revert toString return view firstly --- cpp/src/parquet/variant.cc | 6 +++--- cpp/src/parquet/variant.h | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/cpp/src/parquet/variant.cc b/cpp/src/parquet/variant.cc index 7e7719b6468c..02cbf4dfb855 100644 --- a/cpp/src/parquet/variant.cc +++ b/cpp/src/parquet/variant.cc @@ -29,7 +29,7 @@ namespace parquet::variant { -std::string_view VariantBasicTypeToString(VariantBasicType type) { +std::string VariantBasicTypeToString(VariantBasicType type) { switch (type) { case VariantBasicType::Primitive: return "Primitive"; @@ -44,7 +44,7 @@ std::string_view VariantBasicTypeToString(VariantBasicType type) { } } -std::string_view VariantPrimitiveTypeToString(VariantPrimitiveType type) { +std::string VariantPrimitiveTypeToString(VariantPrimitiveType type) { switch (type) { case VariantPrimitiveType::NullType: return "NullType"; @@ -93,7 +93,7 @@ std::string_view VariantPrimitiveTypeToString(VariantPrimitiveType type) { } } -std::string_view VariantTypeToString(VariantType type) { +std::string VariantTypeToString(VariantType type) { switch (type) { case VariantType::Object: return "OBJECT"; diff --git a/cpp/src/parquet/variant.h b/cpp/src/parquet/variant.h index 15504de3ac0d..88a12b0a84f2 100644 --- a/cpp/src/parquet/variant.h +++ b/cpp/src/parquet/variant.h @@ -43,7 +43,7 @@ enum class VariantBasicType { Array = 3 }; -PARQUET_EXPORT std::string_view VariantBasicTypeToString(VariantBasicType type); +PARQUET_EXPORT std::string VariantBasicTypeToString(VariantBasicType type); enum class VariantPrimitiveType : int8_t { /// Equivalent Parquet Type: UNKNOWN @@ -91,7 +91,7 @@ enum class VariantPrimitiveType : int8_t { Uuid = 20 }; -PARQUET_EXPORT std::string_view VariantPrimitiveTypeToString(VariantPrimitiveType type); +PARQUET_EXPORT std::string VariantPrimitiveTypeToString(VariantPrimitiveType type); /// VariantType is from basic type and primitive type. enum class VariantType { @@ -119,7 +119,7 @@ enum class VariantType { Uuid }; -PARQUET_EXPORT std::string_view VariantTypeToString(VariantType type); +PARQUET_EXPORT std::string VariantTypeToString(VariantType type); class PARQUET_EXPORT VariantMetadata { public: From 01ec5eaedd045034b522356c84ca1fa8d764db43 Mon Sep 17 00:00:00 2001 From: mwish Date: Sat, 17 May 2025 12:53:50 +0800 Subject: [PATCH 27/31] Fix ci --- cpp/src/parquet/variant.cc | 48 +++++++++++++++++++------------------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/cpp/src/parquet/variant.cc b/cpp/src/parquet/variant.cc index 02cbf4dfb855..4804a234dc08 100644 --- a/cpp/src/parquet/variant.cc +++ b/cpp/src/parquet/variant.cc @@ -96,51 +96,51 @@ std::string VariantPrimitiveTypeToString(VariantPrimitiveType type) { std::string VariantTypeToString(VariantType type) { switch (type) { case VariantType::Object: - return "OBJECT"; + return "Object"; case VariantType::Array: - return "ARRAY"; + return "Array"; case VariantType::Null: - return "NULL"; + return "Null"; case VariantType::Boolean: - return "BOOLEAN"; + return "Boolean"; case VariantType::Int8: - return "INT8"; + return "Int8"; case VariantType::Int16: - return "INT16"; + return "Int16"; case VariantType::Int32: - return "INT32"; + return "Int32"; case VariantType::Int64: - return "INT64"; + return "Int64"; case VariantType::String: - return "STRING"; + return "String"; case VariantType::Double: - return "DOUBLE"; + return "Double"; case VariantType::Decimal4: - return "DECIMAL4"; + return "Decimal4"; case VariantType::Decimal8: - return "DECIMAL8"; + return "Decimal8"; case VariantType::Decimal16: - return "DECIMAL16"; + return "Decimal16"; case VariantType::Date: - return "DATE"; + return "Date"; case VariantType::TimestampTz: - return "TIMESTAMP_TZ"; + return "TimestampTz"; case VariantType::TimestampNtz: - return "TIMESTAMP_Ntz"; + return "TimestampNtz"; case VariantType::Float: - return "FLOAT"; + return "Float"; case VariantType::Binary: - return "BINARY"; + return "Binary"; case VariantType::Time: - return "TIME"; + return "Time"; case VariantType::TimestampNanosTz: - return "TIMESTAMP_NANOS_TZ"; + return "TimestampNanosTz"; case VariantType::TimestampNanosNtz: - return "TIMESTAMP_NANOS_Ntz"; + return "TimestampNanosNtz"; case VariantType::Uuid: - return "UUID"; + return "Uuid"; default: - return "UNKNOWN"; + return "Unknown"; } } @@ -588,7 +588,7 @@ std::array VariantValue::getUuid() const { memcpy(uuid_value.data(), value_.data() + 1, sizeof(uuid_value)); #if ARROW_LITTLE_ENDIAN std::array uuid_value_le; - ::arrow::bit_util::ByteSwap(uuid_value_le.data(), uuid_value.data(), uuid_value.size()); + ::arrow::bit_util::ByteSwap(uuid_value_le.data(), uuid_value.data(), 16); return uuid_value_le; #else return uuid_value; From 925a062ea03e1c0cc25ab41053c2d8b25ad0212e Mon Sep 17 00:00:00 2001 From: mwish Date: Mon, 19 May 2025 17:33:47 +0800 Subject: [PATCH 28/31] Add basic test and enhance interface for timestamp and uuid type --- cpp/src/parquet/variant.cc | 76 ++++++++-------- cpp/src/parquet/variant.h | 32 ++++--- cpp/src/parquet/variant_test.cc | 153 ++++++++++++++++++++++++++++++-- 3 files changed, 207 insertions(+), 54 deletions(-) diff --git a/cpp/src/parquet/variant.cc b/cpp/src/parquet/variant.cc index 4804a234dc08..c7a0b735448c 100644 --- a/cpp/src/parquet/variant.cc +++ b/cpp/src/parquet/variant.cc @@ -70,22 +70,22 @@ std::string VariantPrimitiveTypeToString(VariantPrimitiveType type) { return "Decimal16"; case VariantPrimitiveType::Date: return "Date"; - case VariantPrimitiveType::Timestamp: - return "Timestamp"; - case VariantPrimitiveType::TimestampNtz: - return "TimestampNtz"; + case VariantPrimitiveType::TimestampMicros: + return "TimestampMicros"; + case VariantPrimitiveType::TimestampMicrosNtz: + return "TimestampMicrosNtz"; case VariantPrimitiveType::Float: return "Float"; case VariantPrimitiveType::Binary: return "Binary"; case VariantPrimitiveType::String: return "String"; - case VariantPrimitiveType::TimeNtz: - return "TimeNtz"; - case VariantPrimitiveType::TimestampTz: - return "TimestampTZ"; - case VariantPrimitiveType::TimestampNtzNanos: - return "TimestampNtzNanos"; + case VariantPrimitiveType::TimeMicrosNtz: + return "TimeMicrosNtz"; + case VariantPrimitiveType::TimestampNanosTz: + return "TimestampNanosTz"; + case VariantPrimitiveType::TimestampNanosNtz: + return "TimestampNanosNtz"; case VariantPrimitiveType::Uuid: return "Uuid"; default: @@ -123,10 +123,10 @@ std::string VariantTypeToString(VariantType type) { return "Decimal16"; case VariantType::Date: return "Date"; - case VariantType::TimestampTz: - return "TimestampTz"; - case VariantType::TimestampNtz: - return "TimestampNtz"; + case VariantType::TimestampMicrosTz: + return "TimestampMicrosTz"; + case VariantType::TimestampMicrosNtz: + return "TimestampMicrosNtz"; case VariantType::Float: return "Float"; case VariantType::Binary: @@ -324,21 +324,21 @@ VariantType VariantValue::getType() const { return VariantType::Decimal16; case VariantPrimitiveType::Date: return VariantType::Date; - case VariantPrimitiveType::Timestamp: - return VariantType::TimestampTz; - case VariantPrimitiveType::TimestampNtz: - return VariantType::TimestampNtz; + case VariantPrimitiveType::TimestampMicros: + return VariantType::TimestampMicrosTz; + case VariantPrimitiveType::TimestampMicrosNtz: + return VariantType::TimestampMicrosNtz; case VariantPrimitiveType::Float: return VariantType::Float; case VariantPrimitiveType::Binary: return VariantType::Binary; case VariantPrimitiveType::String: return VariantType::String; - case VariantPrimitiveType::TimeNtz: + case VariantPrimitiveType::TimeMicrosNtz: return VariantType::Time; - case VariantPrimitiveType::TimestampTz: + case VariantPrimitiveType::TimestampNanosTz: return VariantType::TimestampNanosTz; - case VariantPrimitiveType::TimestampNtzNanos: + case VariantPrimitiveType::TimestampNanosNtz: return VariantType::TimestampNanosNtz; case VariantPrimitiveType::Uuid: return VariantType::Uuid; @@ -392,10 +392,10 @@ std::string_view VariantValue::typeDebugString() const { return "Decimal16"; case VariantType::Date: return "Date"; - case VariantType::TimestampTz: - return "TimestampTz"; - case VariantType::TimestampNtz: - return "TimestampNtz"; + case VariantType::TimestampMicrosTz: + return "TimestampMicrosTz"; + case VariantType::TimestampMicrosNtz: + return "TimestampMicrosNtz"; case VariantType::Float: return "Float"; case VariantType::Binary: @@ -570,29 +570,31 @@ int32_t VariantValue::getDate() const { return getPrimitiveType(VariantPrimitiveType::Date); } -int64_t VariantValue::getTimeNtz() const { - return getPrimitiveType(VariantPrimitiveType::TimeNtz); +int64_t VariantValue::getTimeMicrosNtz() const { + return getPrimitiveType(VariantPrimitiveType::TimeMicrosNtz); +} + +int64_t VariantValue::getTimestampMicros() const { + return getPrimitiveType(VariantPrimitiveType::TimestampMicros); +} + +int64_t VariantValue::getTimestampMicrosNtz() const { + return getPrimitiveType(VariantPrimitiveType::TimestampMicrosNtz); } -int64_t VariantValue::getTimestamp() const { - return getPrimitiveType(VariantPrimitiveType::Timestamp); +int64_t VariantValue::getTimestampNanosTz() const { + return getPrimitiveType(VariantPrimitiveType::TimestampNanosTz); } -int64_t VariantValue::getTimestampNtz() const { - return getPrimitiveType(VariantPrimitiveType::TimestampNtz); +int64_t VariantValue::getTimestampNanosNtz() const { + return getPrimitiveType(VariantPrimitiveType::TimestampNanosNtz); } std::array VariantValue::getUuid() const { checkPrimitiveType(VariantPrimitiveType::Uuid, /*size_required=*/17); std::array uuid_value; memcpy(uuid_value.data(), value_.data() + 1, sizeof(uuid_value)); -#if ARROW_LITTLE_ENDIAN - std::array uuid_value_le; - ::arrow::bit_util::ByteSwap(uuid_value_le.data(), uuid_value.data(), 16); - return uuid_value_le; -#else return uuid_value; -#endif } VariantValue::ComplexInfo VariantValue::getObjectInfo(std::string_view value) { diff --git a/cpp/src/parquet/variant.h b/cpp/src/parquet/variant.h index 88a12b0a84f2..a4c7aec28d19 100644 --- a/cpp/src/parquet/variant.h +++ b/cpp/src/parquet/variant.h @@ -71,9 +71,9 @@ enum class VariantPrimitiveType : int8_t { /// Equivalent Parquet Type: DATE Date = 11, /// Equivalent Parquet Type: TIMESTAMP(isAdjustedToUTC=true, MICROS) - Timestamp = 12, + TimestampMicros = 12, /// Equivalent Parquet Type: TIMESTAMP(isAdjustedToUTC=false, MICROS) - TimestampNtz = 13, + TimestampMicrosNtz = 13, /// Equivalent Parquet Type: FLOAT Float = 14, /// Equivalent Parquet Type: BINARY @@ -81,12 +81,12 @@ enum class VariantPrimitiveType : int8_t { /// Equivalent Parquet Type: STRING String = 16, /// Equivalent Parquet Type: TIME(isAdjustedToUTC=false, MICROS) - TimeNtz = 17, + TimeMicrosNtz = 17, /// Equivalent Parquet Type: TIMESTAMP(isAdjustedToUTC=true, NANOS) - TimestampTz = 18, // Assuming TZ stands for TimeZone, and follows the document's - // 'timestamp with time zone' + TimestampNanosTz = 18, // Assuming TZ stands for TimeZone, and follows the document's + // 'timestamp with time zone' /// Equivalent Parquet Type: TIMESTAMP(isAdjustedToUTC=false, NANOS) - TimestampNtzNanos = 19, // Differentiating from TimestampNtz (MICROS) + TimestampNanosNtz = 19, // Differentiating from TimestampNtz (MICROS) /// Equivalent Parquet Type: UUID Uuid = 20 }; @@ -109,8 +109,8 @@ enum class VariantType { Decimal8, Decimal16, Date, - TimestampTz, - TimestampNtz, + TimestampMicrosTz, + TimestampMicrosNtz, Float, Binary, Time, @@ -141,6 +141,10 @@ class PARQUET_EXPORT VariantMetadata { uint8_t offset_size() const; uint32_t dictionary_size() const; + /// Metadata for primitive types and any nested types + /// without key dictionary. + static constexpr char kEmptyMetadataChars[] = {0x1, 0x0, 0x0}; + private: static uint32_t loadDictionarySize(std::string_view metadata, uint8_t offset_size); @@ -220,13 +224,19 @@ class PARQUET_EXPORT VariantValue { int32_t getDate() const; /// \brief Get the time value without timezone as microseconds since midnight. /// \throw ParquetException if the type is not a time type. - int64_t getTimeNtz() const; + int64_t getTimeMicrosNtz() const; /// \brief Get the timestamp value with UTC timezone as microseconds since Unix epoch. /// \throw ParquetException if the type is not a timestamp type. - int64_t getTimestamp() const; + int64_t getTimestampMicros() const; /// \brief Get the timestamp value without timezone as microseconds since Unix epoch. /// \throw ParquetException if the type is not a timestamp without timezone type. - int64_t getTimestampNtz() const; + int64_t getTimestampMicrosNtz() const; + /// \brief Get the timestamp value with UTC timezone as nanoseconds since Unix epoch. + /// \throw ParquetException if the type is not a timestamp type. + int64_t getTimestampNanosTz() const; + /// \brief Get the timestamp value without timezone as nanoseconds since Unix epoch. + /// \throw ParquetException if the type is not a timestamp without timezone type. + int64_t getTimestampNanosNtz() const; /// \brief Get the UUID value as a 16-byte array. /// \throw ParquetException if the type is not a UUID type. std::array getUuid() const; diff --git a/cpp/src/parquet/variant_test.cc b/cpp/src/parquet/variant_test.cc index 0a70fbe80d0f..a55e601328e9 100644 --- a/cpp/src/parquet/variant_test.cc +++ b/cpp/src/parquet/variant_test.cc @@ -27,6 +27,9 @@ #include #include +#include +#include + namespace parquet::variant { std::string metadata_test_file_name(std::string_view test_name) { @@ -45,6 +48,11 @@ std::shared_ptr<::arrow::Buffer> readFromFile(::arrow::fs::FileSystem& fs, return buf; } +uint8_t primitiveHeader(VariantPrimitiveType primitive) { + return (static_cast(primitive) << 2); +} + +// TODO(mwish): Extract this to primitive metadata test TEST(ParquetVariant, MetadataBase) { std::string dir_string(parquet::test::get_variant_dir()); auto file_system = std::make_shared<::arrow::fs::LocalFileSystem>(); @@ -104,6 +112,15 @@ VariantValue LoadVariantValue(const std::string& test_name, return VariantValue{metadata, value}; } +TEST(ParquetVariant, NullValue) { + std::string_view empty_metadata(VariantMetadata::kEmptyMetadataChars, 3); + const uint8_t null_chars[] = {primitiveHeader(VariantPrimitiveType::NullType)}; + VariantValue variant{empty_metadata, + std::string_view{reinterpret_cast(null_chars), 1}}; + EXPECT_EQ(VariantType::Null, variant.getType()); + EXPECT_EQ("Null", variant.typeDebugString()); +} + TEST(ParquetVariant, BooleanValue) { // test true { @@ -151,12 +168,49 @@ TEST(ParquetVariant, NumericValues) { } { // FIXME(mwish): https://github.com/apache/parquet-testing/issues/82 + // The primitive_int64 is a int32 value, but the metadata is int64. std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; auto variant = LoadVariantValue("primitive_int64", &metadata_buf, &value_buf); EXPECT_EQ(VariantType::Int32, variant.getType()); EXPECT_EQ("Int32", variant.typeDebugString()); EXPECT_EQ(12345678, variant.getInt32()); } + { + // Test handwritten int64 + const uint8_t int64_chars[] = {primitiveHeader(VariantPrimitiveType::Int64), + 0xB1, + 0x1C, + 0x6C, + 0xB1, + 0xF4, + 0x10, + 0x22, + 0x11}; + std::string_view metadata(VariantMetadata::kEmptyMetadataChars, 3); + std::string_view value{reinterpret_cast(int64_chars), + sizeof(int64_chars)}; + VariantValue variant{metadata, value}; + EXPECT_EQ(VariantType::Int64, variant.getType()); + EXPECT_EQ(1234567890987654321L, variant.getInt64()); + } + { + // Test handwritten int64 negative + const uint8_t int64_chars[] = {primitiveHeader(VariantPrimitiveType::Int64), + 0xFF, + 0xFF, + 0xFF, + 0xFF, + 0xFF, + 0xFF, + 0xFF, + 0xFF}; + std::string_view metadata(VariantMetadata::kEmptyMetadataChars, 3); + std::string_view value{reinterpret_cast(int64_chars), + sizeof(int64_chars)}; + VariantValue variant{metadata, value}; + EXPECT_EQ(VariantType::Int64, variant.getType()); + EXPECT_EQ(-1L, variant.getInt64()); + } { std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; auto variant = LoadVariantValue("primitive_float", &metadata_buf, &value_buf); @@ -385,6 +439,36 @@ TEST(ParquetVariant, DecimalValues) { } } +TEST(ParquetVariant, Uuid) { + std::string_view empty_metadata(VariantMetadata::kEmptyMetadataChars, 3); + const uint8_t uuid_chars[] = {primitiveHeader(VariantPrimitiveType::Uuid), + 0x00, + 0x11, + 0x22, + 0x33, + 0x44, + 0x55, + 0x66, + 0x77, + 0x88, + 0x99, + 0xAA, + 0xBB, + 0xCC, + 0xDD, + 0xEE, + 0xFF}; + std::string_view value(reinterpret_cast(uuid_chars), sizeof(uuid_chars)); + VariantValue variant(empty_metadata, value); + ASSERT_EQ(VariantType::Uuid, variant.getType()); + auto uuid_val = variant.getUuid(); + boost::uuids::uuid uuid{}; + for (size_t i = 0; i < uuid.size(); ++i) { + uuid.data[i] = uuid_val[i]; + } + EXPECT_EQ("00112233-4455-6677-8899-aabbccddeeff", to_string(uuid)); +} + TEST(ParquetVariant, DateTimeValues) { { std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; @@ -397,16 +481,73 @@ TEST(ParquetVariant, DateTimeValues) { { std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; auto variant = LoadVariantValue("primitive_timestamp", &metadata_buf, &value_buf); - EXPECT_EQ(VariantType::TimestampTz, variant.getType()); - EXPECT_EQ("TimestampTz", variant.typeDebugString()); - EXPECT_EQ(1744821296780000, variant.getTimestamp()); + EXPECT_EQ(VariantType::TimestampMicrosTz, variant.getType()); + EXPECT_EQ("TimestampMicrosTz", variant.typeDebugString()); + EXPECT_EQ(1744821296780000, variant.getTimestampMicros()); } { std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; auto variant = LoadVariantValue("primitive_timestampntz", &metadata_buf, &value_buf); - EXPECT_EQ(VariantType::TimestampNtz, variant.getType()); - EXPECT_EQ("TimestampNtz", variant.typeDebugString()); - EXPECT_EQ(1744806896780000, variant.getTimestampNtz()); + EXPECT_EQ(VariantType::TimestampMicrosNtz, variant.getType()); + EXPECT_EQ("TimestampMicrosNtz", variant.typeDebugString()); + EXPECT_EQ(1744806896780000, variant.getTimestampMicrosNtz()); + } + { + // Timestamp Nanos tz negative + std::string_view empty_metadata(VariantMetadata::kEmptyMetadataChars, 3); + const uint8_t timestamp_nanos_ntz_chars[] = { + primitiveHeader(VariantPrimitiveType::TimestampNanosTz), + 0xFF, + 0xFF, + 0xFF, + 0xFF, + 0xFF, + 0xFF, + 0xFF, + 0xFF}; + std::string_view value{reinterpret_cast(timestamp_nanos_ntz_chars), + sizeof(timestamp_nanos_ntz_chars)}; + VariantValue variant{empty_metadata, value}; + EXPECT_EQ(VariantType::TimestampNanosTz, variant.getType()); + EXPECT_EQ(-1L, variant.getTimestampNanosTz()); + } + { + // Timestamp Nanos tz negative + std::string_view empty_metadata(VariantMetadata::kEmptyMetadataChars, 3); + const uint8_t timestamp_nanos_ntz_chars[] = { + primitiveHeader(VariantPrimitiveType::TimestampNanosTz), + 0x15, + 0xC9, + 0xBB, + 0x86, + 0xB4, + 0x0C, + 0x37, + 0x18}; + std::string_view value{reinterpret_cast(timestamp_nanos_ntz_chars), + sizeof(timestamp_nanos_ntz_chars)}; + VariantValue variant{empty_metadata, value}; + EXPECT_EQ(VariantType::TimestampNanosTz, variant.getType()); + EXPECT_EQ(1744877350123456789L, variant.getTimestampNanosTz()); + } + { + // Timestamp Nanos Ntz + std::string_view empty_metadata(VariantMetadata::kEmptyMetadataChars, 3); + const uint8_t timestamp_nanos_ntz_chars[] = { + primitiveHeader(VariantPrimitiveType::TimestampNanosNtz), + 0x15, + 0xC9, + 0xBB, + 0x86, + 0xB4, + 0x0C, + 0x37, + 0x18}; + std::string_view value{reinterpret_cast(timestamp_nanos_ntz_chars), + sizeof(timestamp_nanos_ntz_chars)}; + VariantValue variant{empty_metadata, value}; + EXPECT_EQ(VariantType::TimestampNanosNtz, variant.getType()); + EXPECT_EQ(1744877350123456789L, variant.getTimestampNanosNtz()); } } From 4630680ef50e41def0434f1bca545ffeca69a90e Mon Sep 17 00:00:00 2001 From: mwish Date: Sat, 24 May 2025 02:41:41 +0800 Subject: [PATCH 29/31] Enhancement --- cpp/src/parquet/variant.cc | 162 +++++++++++++++++--------------- cpp/src/parquet/variant.h | 10 ++ cpp/src/parquet/variant_test.cc | 45 ++++----- cpp/submodules/parquet-testing | 2 +- 4 files changed, 115 insertions(+), 104 deletions(-) diff --git a/cpp/src/parquet/variant.cc b/cpp/src/parquet/variant.cc index c7a0b735448c..ec1660e830da 100644 --- a/cpp/src/parquet/variant.cc +++ b/cpp/src/parquet/variant.cc @@ -171,7 +171,7 @@ VariantMetadata::VariantMetadata(std::string_view metadata) : metadata_(metadata std::to_string(offset_sz)); } dictionary_size_ = loadDictionarySize(metadata, offset_sz); - if (kHeaderSizeBytes + (dictionary_size_ + 1) * offset_sz > metadata_.size()) { + if (kHeaderSizeBytes + (dictionary_size_ + 2) * offset_sz > metadata_.size()) { throw ParquetException( "Invalid Variant metadata: offset out of range: " + std::to_string((dictionary_size_ + kHeaderSizeBytes) * offset_sz) + " > " + @@ -204,12 +204,12 @@ uint32_t VariantMetadata::dictionary_size() const { return dictionary_size_; } std::string_view VariantMetadata::GetMetadataKey(uint32_t variant_id) const { uint32_t offset_bytes = offset_size(); - uint32_t dictionary_bytes = dictionary_size(); + uint32_t dictionary_item_count = dictionary_size(); - if (variant_id >= dictionary_bytes) { + if (variant_id >= dictionary_item_count) { throw ParquetException("Invalid Variant metadata: variant_id out of range: " + std::to_string(variant_id) + - " >= " + std::to_string(dictionary_bytes)); + " >= " + std::to_string(dictionary_item_count)); } size_t offset_start_pos = kHeaderSizeBytes + offset_bytes + (variant_id * offset_bytes); @@ -221,8 +221,9 @@ std::string_view VariantMetadata::GetMetadataKey(uint32_t variant_id) const { metadata_.data() + offset_start_pos + offset_bytes, offset_bytes); uint32_t key_size = variant_next_offset - variant_offset; + // 2 for dictionary_size and an extra offset for the dictionary key string. size_t string_start = - kHeaderSizeBytes + offset_bytes * (dictionary_bytes + 2) + variant_offset; + kHeaderSizeBytes + offset_bytes * (dictionary_item_count + 2) + variant_offset; if (string_start + key_size > metadata_.size()) { throw ParquetException("Invalid Variant metadata: string data out of range: " + std::to_string(string_start) + " + " + @@ -237,26 +238,26 @@ ::arrow::internal::SmallVector VariantMetadata::GetMetadataId( uint32_t offset_bytes = offset_size(); uint32_t dictionary_bytes = dictionary_size(); - if ((dictionary_bytes + kHeaderSizeBytes) * offset_bytes > metadata_.size()) { - throw ParquetException("Invalid Variant metadata: offset out of range"); - } const bool sort_and_unique = sorted_and_unique(); // TODO(mwish): This can be optimized by using binary search if the metadata is sorted. ::arrow::internal::SmallVector vector; - uint32_t variant_offset = 0; - uint32_t variant_next_offset = 0; + uint32_t dictionary_key_offset = 0; + uint32_t dictionary_next_key_offset = 0; + const uint32_t dictionary_key_start_offset = + kHeaderSizeBytes + offset_bytes * (dictionary_bytes + 2); for (uint32_t i = 0; i < dictionary_bytes; ++i) { - size_t offset_start_pos = 1 + offset_bytes + (i * offset_bytes); - variant_offset = variant_next_offset; - variant_next_offset = readLittleEndianU32( + size_t offset_start_pos = kHeaderSizeBytes + (i + 1) * offset_bytes; + dictionary_key_offset = dictionary_next_key_offset; + dictionary_next_key_offset = readLittleEndianU32( metadata_.data() + offset_start_pos + offset_bytes, offset_bytes); - uint32_t key_size = variant_next_offset - variant_offset; + uint32_t dictionary_key_size = dictionary_next_key_offset - dictionary_key_offset; - size_t string_start = 1 + offset_bytes * (dictionary_bytes + 2) + variant_offset; - if (string_start + key_size > metadata_.size()) { + size_t dictionary_key_start = dictionary_key_start_offset + dictionary_key_offset; + if (dictionary_key_start + dictionary_key_size > metadata_.size()) { throw ParquetException("Invalid Variant metadata: string data out of range"); } - std::string_view current_key{metadata_.data() + string_start, key_size}; + std::string_view current_key{metadata_.data() + dictionary_key_start, + dictionary_key_size}; if (current_key == key) { vector.push_back(i); if (sort_and_unique) { @@ -299,7 +300,8 @@ VariantType VariantValue::getType() const { VariantBasicType basic_type = getBasicType(); switch (basic_type) { case VariantBasicType::Primitive: { - auto primitive_type = static_cast(value_[0] >> 2); + auto primitive_type = + static_cast(value_[0] >> kPrimitiveTypeBitShift); switch (primitive_type) { case VariantPrimitiveType::NullType: return VariantType::Null; @@ -419,7 +421,7 @@ bool VariantValue::getBool() const { VariantBasicTypeToString(getBasicType())); } - uint8_t primitive_type = static_cast(value_[0]) >> 2; + uint8_t primitive_type = static_cast(value_[0]) >> kPrimitiveTypeBitShift; if (primitive_type == static_cast(VariantPrimitiveType::BooleanTrue)) { return true; } @@ -438,11 +440,20 @@ void VariantValue::checkBasicType(VariantBasicType type) const { } } +void VariantValue::checkIsComplexType() const { + VariantBasicType basic_type = getBasicType(); + if (basic_type != VariantBasicType::Object && basic_type != VariantBasicType::Array) { + throw ParquetException("Expected complex type, but got: " + + VariantBasicTypeToString(basic_type)); + } +} + void VariantValue::checkPrimitiveType(VariantPrimitiveType type, size_t size_required) const { checkBasicType(VariantBasicType::Primitive); - auto primitive_type = static_cast(value_[0] >> 2); + auto primitive_type = + static_cast(value_[0] >> kPrimitiveTypeBitShift); if (primitive_type != type) { throw ParquetException( "Expected primitive type: " + VariantPrimitiveTypeToString(type) + @@ -459,10 +470,10 @@ void VariantValue::checkPrimitiveType(VariantPrimitiveType type, template PrimitiveType VariantValue::getPrimitiveType(VariantPrimitiveType type) const { - checkPrimitiveType(type, sizeof(PrimitiveType) + 1); + checkPrimitiveType(type, sizeof(PrimitiveType) + kHeaderSizeBytes); PrimitiveType primitive_value{}; - memcpy(&primitive_value, value_.data() + 1, sizeof(PrimitiveType)); + memcpy(&primitive_value, value_.data() + kHeaderSizeBytes, sizeof(PrimitiveType)); // Here we should cast from Little endian. primitive_value = ::arrow::bit_util::FromLittleEndian(primitive_value); return primitive_value; @@ -493,37 +504,39 @@ double VariantValue::getDouble() const { } std::string_view VariantValue::getPrimitiveBinaryType(VariantPrimitiveType type) const { - checkPrimitiveType(type, /*size_required=*/5); + checkPrimitiveType( + type, /*size_required=*/kHeaderSizeBytes + kPrimitiveStringLengthSizeBytes); uint32_t length; - memcpy(&length, value_.data() + 1, sizeof(uint32_t)); + memcpy(&length, value_.data() + kHeaderSizeBytes, sizeof(uint32_t)); length = ::arrow::bit_util::FromLittleEndian(length); - if (value_.size() < length + 5) { + if (value_.size() < length + kHeaderSizeBytes + kPrimitiveStringLengthSizeBytes) { throw ParquetException("Invalid string value: too short for specified length"); } - return {value_.data() + 5, length}; + return {value_.data() + kHeaderSizeBytes + kPrimitiveStringLengthSizeBytes, length}; } std::string_view VariantValue::getString() const { VariantBasicType basic_type = getBasicType(); if (basic_type == VariantBasicType::ShortString) { - uint8_t length = (value_[0] >> 2) & kMaxShortStrSizeMask; - if (value_.size() < static_cast(length + 1)) { + uint8_t short_string_length = (value_[0] >> 2) & kMaxShortStrSizeMask; + if (value_.size() < static_cast(short_string_length + kHeaderSizeBytes)) { throw ParquetException( "Invalid short string: too short: " + std::to_string(value_.size()) + - " for at least " + std::to_string(length + 1)); + " for at least " + std::to_string(short_string_length + kHeaderSizeBytes)); } - return {value_.data() + 1, length}; + return {value_.data() + kHeaderSizeBytes, short_string_length}; } if (basic_type == VariantBasicType::Primitive) { // TODO(mwish): Should we validate utf8 here? return getPrimitiveBinaryType(VariantPrimitiveType::String); } - throw ParquetException("Not a primitive or short string type calls getString"); + throw ParquetException("Expected ShortString or Primitive type, but got: " + + VariantBasicTypeToString(basic_type)); } std::string_view VariantValue::getBinary() const { @@ -534,11 +547,13 @@ template DecimalValue VariantValue::getPrimitiveDecimalType( VariantPrimitiveType type) const { using DecimalValueType = typename DecimalType::ValueType; - checkPrimitiveType(type, sizeof(DecimalValueType) + 2); + checkPrimitiveType( + type, sizeof(DecimalValueType) + kHeaderSizeBytes + kDecimalScaleSizeBytes); - uint8_t scale = value_[1]; + uint8_t scale = value_[kHeaderSizeBytes]; DecimalValueType decimal_value; - memcpy(&decimal_value, value_.data() + 2, sizeof(DecimalValueType)); + memcpy(&decimal_value, value_.data() + kHeaderSizeBytes + kDecimalScaleSizeBytes, + sizeof(DecimalValueType)); decimal_value = ::arrow::bit_util::FromLittleEndian(decimal_value); return {scale, DecimalType(decimal_value)}; @@ -553,8 +568,9 @@ DecimalValue<::arrow::Decimal64> VariantValue::getDecimal8() const { } DecimalValue<::arrow::Decimal128> VariantValue::getDecimal16() const { - checkPrimitiveType(VariantPrimitiveType::Decimal16, - /*size_required=*/sizeof(int64_t) * 2 + 2); + checkPrimitiveType( + VariantPrimitiveType::Decimal16, + /*size_required=*/sizeof(int64_t) * 2 + kHeaderSizeBytes + kDecimalScaleSizeBytes); uint8_t scale = value_[1]; @@ -591,25 +607,40 @@ int64_t VariantValue::getTimestampNanosNtz() const { } std::array VariantValue::getUuid() const { - checkPrimitiveType(VariantPrimitiveType::Uuid, /*size_required=*/17); + checkPrimitiveType(VariantPrimitiveType::Uuid, /*size_required=*/16 + kHeaderSizeBytes); std::array uuid_value; - memcpy(uuid_value.data(), value_.data() + 1, sizeof(uuid_value)); + memcpy(uuid_value.data(), value_.data() + kHeaderSizeBytes, sizeof(uuid_value)); return uuid_value; } +uint32_t VariantValue::complexOffsetAt(uint32_t field_index) const { + checkIsComplexType(); + return readLittleEndianU32(value_.data() + complex_info_.offset_start_offset + + field_index * complex_info_.offset_size, + complex_info_.offset_size); +} + +uint32_t VariantValue::complexFieldIdAt(uint32_t field_index) const { + checkBasicType(VariantBasicType::Object); + return readLittleEndianU32( + value_.data() + complex_info_.id_start_offset + field_index * complex_info_.id_size, + complex_info_.id_size); +} + VariantValue::ComplexInfo VariantValue::getObjectInfo(std::string_view value) { uint8_t value_header = value[0] >> 2; uint8_t field_offset_size = (value_header & 0b11) + 1; uint8_t field_id_size = ((value_header >> 2) & 0b11) + 1; bool is_large = ((value_header >> 4) & 0b1); uint8_t num_elements_size = is_large ? 4 : 1; - if (value.size() < static_cast(1 + num_elements_size)) { + if (value.size() < static_cast(kHeaderSizeBytes + num_elements_size)) { throw ParquetException( "Invalid object value: too short: " + std::to_string(value.size()) + - " for at least " + std::to_string(1 + num_elements_size)); + " for at least " + std::to_string(kHeaderSizeBytes + num_elements_size)); } // parse num_elements - uint32_t num_elements = readLittleEndianU32(value.data() + 1, num_elements_size); + uint32_t num_elements = + readLittleEndianU32(value.data() + kHeaderSizeBytes, num_elements_size); ComplexInfo complex_info{}; complex_info.num_elements = num_elements; complex_info.id_size = field_id_size; @@ -625,19 +656,6 @@ VariantValue::ComplexInfo VariantValue::getObjectInfo(std::string_view value) { std::to_string(complex_info.data_start_offset) + ", value_size=" + std::to_string(value.size())); } - { - uint32_t final_offset = - readLittleEndianU32(value.data() + complex_info.offset_start_offset + - num_elements * field_offset_size, - field_offset_size); - // It could be less than value size since it could be a sub-object. - if (final_offset + complex_info.data_start_offset > value.size()) { - throw ParquetException( - "Invalid object value: final_offset=" + std::to_string(final_offset) + - ", data_start_offset=" + std::to_string(complex_info.data_start_offset) + - ", value_size=" + std::to_string(value.size())); - } - } return complex_info; } @@ -675,27 +693,22 @@ std::optional VariantValue::getObjectValueByKey( std::optional VariantValue::getObjectFieldByFieldId( uint32_t variant_id) const { checkBasicType(VariantBasicType::Object); - std::optional field_offset_opt; + std::optional field_index_opt; // Get the field offset // TODO(mwish): Using binary search to optimize it. for (uint32_t i = 0; i < complex_info_.num_elements; ++i) { - uint32_t variant_field_id = readLittleEndianU32( - value_.data() + complex_info_.id_start_offset + i * complex_info_.id_size, - complex_info_.id_size); + uint32_t variant_field_id = complexFieldIdAt(i); if (variant_field_id == variant_id) { - field_offset_opt = i; + field_index_opt = i; break; } } - if (!field_offset_opt.has_value()) { + if (!field_index_opt.has_value()) { return std::nullopt; } - uint32_t field_offset = field_offset_opt.value(); + uint32_t field_index = field_index_opt.value(); // Read the offset and next offset - uint32_t offset = - readLittleEndianU32(value_.data() + complex_info_.offset_start_offset + - field_offset * complex_info_.offset_size, - complex_info_.offset_size); + uint32_t offset = complexOffsetAt(field_index); if (complex_info_.data_start_offset + offset > value_.size()) { throw ParquetException("Invalid object field offsets: data_start_offset=" + @@ -713,22 +726,23 @@ std::optional VariantValue::getObjectFieldByFieldId( VariantValue::ComplexInfo VariantValue::getArrayInfo(std::string_view value) { uint8_t value_header = value[0] >> 2; - uint8_t field_offset_size = (value_header & 0b11) + 1; + uint8_t field_offset_size = (value_header & 0b11) + kHeaderSizeBytes; bool is_large = ((value_header >> 2) & 0b1); // check the array header uint8_t num_elements_size = is_large ? 4 : 1; - if (value.size() < static_cast(1 + num_elements_size)) { + if (value.size() < static_cast(kHeaderSizeBytes + num_elements_size)) { throw ParquetException( "Invalid array value: too short: " + std::to_string(value.size()) + " for at least " + std::to_string(1 + num_elements_size)); } - uint32_t num_elements = readLittleEndianU32(value.data() + 1, num_elements_size); + uint32_t num_elements = + readLittleEndianU32(value.data() + kHeaderSizeBytes, num_elements_size); ComplexInfo complex_info{}; complex_info.num_elements = num_elements; complex_info.offset_size = field_offset_size; - complex_info.offset_start_offset = 1 + num_elements_size; + complex_info.offset_start_offset = kHeaderSizeBytes + num_elements_size; complex_info.data_start_offset = complex_info.offset_start_offset + (num_elements + 1) * field_offset_size; @@ -750,14 +764,8 @@ VariantValue VariantValue::getArrayValueByIndex(uint32_t index) const { } // Read the offset and next offset - uint32_t offset = - readLittleEndianU32(value_.data() + complex_info_.offset_start_offset + - index * complex_info_.offset_size, - complex_info_.offset_size); - uint32_t next_offset = - readLittleEndianU32(value_.data() + complex_info_.offset_start_offset + - (index + 1) * complex_info_.offset_size, - complex_info_.offset_size); + uint32_t offset = complexOffsetAt(index); + uint32_t next_offset = complexOffsetAt(index + 1); // Create a VariantValue for the element VariantValue element_value{ diff --git a/cpp/src/parquet/variant.h b/cpp/src/parquet/variant.h index a4c7aec28d19..cfa48a1bde9f 100644 --- a/cpp/src/parquet/variant.h +++ b/cpp/src/parquet/variant.h @@ -144,6 +144,8 @@ class PARQUET_EXPORT VariantMetadata { /// Metadata for primitive types and any nested types /// without key dictionary. static constexpr char kEmptyMetadataChars[] = {0x1, 0x0, 0x0}; + static constexpr std::string_view kEmptyMetadataStringView{kEmptyMetadataChars, + sizeof(kEmptyMetadataChars)}; private: static uint32_t loadDictionarySize(std::string_view metadata, uint8_t offset_size); @@ -262,7 +264,11 @@ class PARQUET_EXPORT VariantValue { VariantValue getArrayValueByIndex(uint32_t index) const; private: + static constexpr uint8_t kHeaderSizeBytes = 1; + static constexpr size_t kDecimalScaleSizeBytes = 1; + static constexpr size_t kPrimitiveStringLengthSizeBytes = 4; static constexpr uint8_t kBasicTypeMask = 0b00000011; + static constexpr uint8_t kPrimitiveTypeBitShift = 2; static constexpr uint8_t kPrimitiveTypeMask = 0b00111111; /// The inclusive maximum value of the type info value. It is the size limit of /// ShortString. @@ -292,11 +298,15 @@ class PARQUET_EXPORT VariantValue { // An extra function because binary/string uses 4 bytes for length. std::string_view getPrimitiveBinaryType(VariantPrimitiveType type) const; void checkBasicType(VariantBasicType type) const; + void checkIsComplexType() const; void checkPrimitiveType(VariantPrimitiveType type, size_t size_required) const; static ComplexInfo getArrayInfo(std::string_view value); static ComplexInfo getObjectInfo(std::string_view value); + uint32_t complexOffsetAt(uint32_t field_index) const; + uint32_t complexFieldIdAt(uint32_t field_index) const; + private: VariantMetadata metadata_; std::string_view value_; diff --git a/cpp/src/parquet/variant_test.cc b/cpp/src/parquet/variant_test.cc index a55e601328e9..335c8c080c0c 100644 --- a/cpp/src/parquet/variant_test.cc +++ b/cpp/src/parquet/variant_test.cc @@ -52,28 +52,27 @@ uint8_t primitiveHeader(VariantPrimitiveType primitive) { return (static_cast(primitive) << 2); } -// TODO(mwish): Extract this to primitive metadata test TEST(ParquetVariant, MetadataBase) { std::string dir_string(parquet::test::get_variant_dir()); auto file_system = std::make_shared<::arrow::fs::LocalFileSystem>(); std::vector primitive_metadatas = { - // FIXME(mwish): null metadata is corrupt, see - // https://github.com/apache/parquet-testing/issues/81 - // "primitive_null.metadata", - "primitive_boolean_true.metadata", "primitive_boolean_false.metadata", - "primitive_date.metadata", "primitive_decimal4.metadata", - "primitive_decimal8.metadata", "primitive_decimal16.metadata", - "primitive_float.metadata", "primitive_double.metadata", - "primitive_int8.metadata", "primitive_int16.metadata", - "primitive_int32.metadata", "primitive_int64.metadata", - "primitive_binary.metadata", "primitive_string.metadata", + "primitive_null.metadata", "primitive_boolean_true.metadata", + "primitive_boolean_false.metadata", "primitive_date.metadata", + "primitive_decimal4.metadata", "primitive_decimal8.metadata", + "primitive_decimal16.metadata", "primitive_float.metadata", + "primitive_double.metadata", "primitive_int8.metadata", + "primitive_int16.metadata", "primitive_int32.metadata", + "primitive_int64.metadata", "primitive_binary.metadata", + "primitive_string.metadata", }; for (auto& test_file : primitive_metadatas) { ARROW_SCOPED_TRACE("Testing file: " + test_file); std::string path = dir_string + "/" + test_file; auto buf = readFromFile(*file_system, path); - VariantMetadata metadata(std::string_view{*buf}); + std::string_view metadata_buf{*buf}; + EXPECT_EQ(metadata_buf, VariantMetadata::kEmptyMetadataStringView); + VariantMetadata metadata(metadata_buf); EXPECT_EQ(1, metadata.version()); EXPECT_THROW(metadata.GetMetadataKey(0), ParquetException); } @@ -167,13 +166,11 @@ TEST(ParquetVariant, NumericValues) { EXPECT_EQ(123456, variant.getInt32()); } { - // FIXME(mwish): https://github.com/apache/parquet-testing/issues/82 - // The primitive_int64 is a int32 value, but the metadata is int64. std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; auto variant = LoadVariantValue("primitive_int64", &metadata_buf, &value_buf); - EXPECT_EQ(VariantType::Int32, variant.getType()); - EXPECT_EQ("Int32", variant.typeDebugString()); - EXPECT_EQ(12345678, variant.getInt32()); + EXPECT_EQ(VariantType::Int64, variant.getType()); + EXPECT_EQ("Int64", variant.typeDebugString()); + EXPECT_EQ(1234567890123456789, variant.getInt64()); } { // Test handwritten int64 @@ -186,10 +183,9 @@ TEST(ParquetVariant, NumericValues) { 0x10, 0x22, 0x11}; - std::string_view metadata(VariantMetadata::kEmptyMetadataChars, 3); std::string_view value{reinterpret_cast(int64_chars), sizeof(int64_chars)}; - VariantValue variant{metadata, value}; + VariantValue variant{VariantMetadata::kEmptyMetadataStringView, value}; EXPECT_EQ(VariantType::Int64, variant.getType()); EXPECT_EQ(1234567890987654321L, variant.getInt64()); } @@ -204,10 +200,9 @@ TEST(ParquetVariant, NumericValues) { 0xFF, 0xFF, 0xFF}; - std::string_view metadata(VariantMetadata::kEmptyMetadataChars, 3); std::string_view value{reinterpret_cast(int64_chars), sizeof(int64_chars)}; - VariantValue variant{metadata, value}; + VariantValue variant{VariantMetadata::kEmptyMetadataStringView, value}; EXPECT_EQ(VariantType::Int64, variant.getType()); EXPECT_EQ(-1L, variant.getInt64()); } @@ -440,7 +435,7 @@ TEST(ParquetVariant, DecimalValues) { } TEST(ParquetVariant, Uuid) { - std::string_view empty_metadata(VariantMetadata::kEmptyMetadataChars, 3); + std::string_view empty_metadata = VariantMetadata::kEmptyMetadataStringView; const uint8_t uuid_chars[] = {primitiveHeader(VariantPrimitiveType::Uuid), 0x00, 0x11, @@ -494,7 +489,6 @@ TEST(ParquetVariant, DateTimeValues) { } { // Timestamp Nanos tz negative - std::string_view empty_metadata(VariantMetadata::kEmptyMetadataChars, 3); const uint8_t timestamp_nanos_ntz_chars[] = { primitiveHeader(VariantPrimitiveType::TimestampNanosTz), 0xFF, @@ -507,13 +501,12 @@ TEST(ParquetVariant, DateTimeValues) { 0xFF}; std::string_view value{reinterpret_cast(timestamp_nanos_ntz_chars), sizeof(timestamp_nanos_ntz_chars)}; - VariantValue variant{empty_metadata, value}; + VariantValue variant{VariantMetadata::kEmptyMetadataStringView, value}; EXPECT_EQ(VariantType::TimestampNanosTz, variant.getType()); EXPECT_EQ(-1L, variant.getTimestampNanosTz()); } { // Timestamp Nanos tz negative - std::string_view empty_metadata(VariantMetadata::kEmptyMetadataChars, 3); const uint8_t timestamp_nanos_ntz_chars[] = { primitiveHeader(VariantPrimitiveType::TimestampNanosTz), 0x15, @@ -526,7 +519,7 @@ TEST(ParquetVariant, DateTimeValues) { 0x18}; std::string_view value{reinterpret_cast(timestamp_nanos_ntz_chars), sizeof(timestamp_nanos_ntz_chars)}; - VariantValue variant{empty_metadata, value}; + VariantValue variant{VariantMetadata::kEmptyMetadataStringView, value}; EXPECT_EQ(VariantType::TimestampNanosTz, variant.getType()); EXPECT_EQ(1744877350123456789L, variant.getTimestampNanosTz()); } diff --git a/cpp/submodules/parquet-testing b/cpp/submodules/parquet-testing index 2dc8bf140ed6..e7b28dd520fb 160000 --- a/cpp/submodules/parquet-testing +++ b/cpp/submodules/parquet-testing @@ -1 +1 @@ -Subproject commit 2dc8bf140ed6e28652fc347211c7d661714c7f95 +Subproject commit e7b28dd520fb3c9f0908daa84c0ef20d83c73794 From e6c596380c8b344f122dd54a93deff5205a249d5 Mon Sep 17 00:00:00 2001 From: mwish Date: Sat, 24 May 2025 02:50:27 +0800 Subject: [PATCH 30/31] cleanup some constant using --- cpp/src/parquet/variant.cc | 16 ++++++++-------- cpp/src/parquet/variant.h | 6 +----- 2 files changed, 9 insertions(+), 13 deletions(-) diff --git a/cpp/src/parquet/variant.cc b/cpp/src/parquet/variant.cc index ec1660e830da..3450ae14132b 100644 --- a/cpp/src/parquet/variant.cc +++ b/cpp/src/parquet/variant.cc @@ -301,7 +301,7 @@ VariantType VariantValue::getType() const { switch (basic_type) { case VariantBasicType::Primitive: { auto primitive_type = - static_cast(value_[0] >> kPrimitiveTypeBitShift); + static_cast(value_[0] >> kValueHeaderBitShift); switch (primitive_type) { case VariantPrimitiveType::NullType: return VariantType::Null; @@ -421,7 +421,7 @@ bool VariantValue::getBool() const { VariantBasicTypeToString(getBasicType())); } - uint8_t primitive_type = static_cast(value_[0]) >> kPrimitiveTypeBitShift; + uint8_t primitive_type = static_cast(value_[0]) >> kValueHeaderBitShift; if (primitive_type == static_cast(VariantPrimitiveType::BooleanTrue)) { return true; } @@ -453,7 +453,7 @@ void VariantValue::checkPrimitiveType(VariantPrimitiveType type, checkBasicType(VariantBasicType::Primitive); auto primitive_type = - static_cast(value_[0] >> kPrimitiveTypeBitShift); + static_cast(value_[0] >> kValueHeaderBitShift); if (primitive_type != type) { throw ParquetException( "Expected primitive type: " + VariantPrimitiveTypeToString(type) + @@ -522,7 +522,7 @@ std::string_view VariantValue::getString() const { VariantBasicType basic_type = getBasicType(); if (basic_type == VariantBasicType::ShortString) { - uint8_t short_string_length = (value_[0] >> 2) & kMaxShortStrSizeMask; + uint8_t short_string_length = (value_[0] >> kValueHeaderBitShift); if (value_.size() < static_cast(short_string_length + kHeaderSizeBytes)) { throw ParquetException( "Invalid short string: too short: " + std::to_string(value_.size()) + @@ -628,9 +628,9 @@ uint32_t VariantValue::complexFieldIdAt(uint32_t field_index) const { } VariantValue::ComplexInfo VariantValue::getObjectInfo(std::string_view value) { - uint8_t value_header = value[0] >> 2; + uint8_t value_header = value[0] >> kValueHeaderBitShift; uint8_t field_offset_size = (value_header & 0b11) + 1; - uint8_t field_id_size = ((value_header >> 2) & 0b11) + 1; + uint8_t field_id_size = ((value_header >> kValueHeaderBitShift) & 0b11) + 1; bool is_large = ((value_header >> 4) & 0b1); uint8_t num_elements_size = is_large ? 4 : 1; if (value.size() < static_cast(kHeaderSizeBytes + num_elements_size)) { @@ -725,9 +725,9 @@ std::optional VariantValue::getObjectFieldByFieldId( } VariantValue::ComplexInfo VariantValue::getArrayInfo(std::string_view value) { - uint8_t value_header = value[0] >> 2; + uint8_t value_header = value[0] >> kValueHeaderBitShift; uint8_t field_offset_size = (value_header & 0b11) + kHeaderSizeBytes; - bool is_large = ((value_header >> 2) & 0b1); + bool is_large = ((value_header >> kValueHeaderBitShift) & 0b1); // check the array header uint8_t num_elements_size = is_large ? 4 : 1; diff --git a/cpp/src/parquet/variant.h b/cpp/src/parquet/variant.h index cfa48a1bde9f..9a2ba340ccab 100644 --- a/cpp/src/parquet/variant.h +++ b/cpp/src/parquet/variant.h @@ -268,11 +268,7 @@ class PARQUET_EXPORT VariantValue { static constexpr size_t kDecimalScaleSizeBytes = 1; static constexpr size_t kPrimitiveStringLengthSizeBytes = 4; static constexpr uint8_t kBasicTypeMask = 0b00000011; - static constexpr uint8_t kPrimitiveTypeBitShift = 2; - static constexpr uint8_t kPrimitiveTypeMask = 0b00111111; - /// The inclusive maximum value of the type info value. It is the size limit of - /// ShortString. - static constexpr uint8_t kMaxShortStrSizeMask = 0b00111111; + static constexpr uint8_t kValueHeaderBitShift = 2; /// ComplexInfo is used to store the metadata of the array or object. /// For array, it doesn't have id_size and id_start_offset. From 261cde1080a1ecaef5724e3622f83aa0f07ab7d8 Mon Sep 17 00:00:00 2001 From: mwish Date: Mon, 26 May 2025 23:49:36 +0800 Subject: [PATCH 31/31] Fix the logic for signed shift --- cpp/src/parquet/variant.cc | 29 +++++++++++++++++------------ cpp/src/parquet/variant.h | 2 ++ 2 files changed, 19 insertions(+), 12 deletions(-) diff --git a/cpp/src/parquet/variant.cc b/cpp/src/parquet/variant.cc index 3450ae14132b..942cf0ecd8ad 100644 --- a/cpp/src/parquet/variant.cc +++ b/cpp/src/parquet/variant.cc @@ -300,8 +300,7 @@ VariantType VariantValue::getType() const { VariantBasicType basic_type = getBasicType(); switch (basic_type) { case VariantBasicType::Primitive: { - auto primitive_type = - static_cast(value_[0] >> kValueHeaderBitShift); + auto primitive_type = static_cast(valueHeader()); switch (primitive_type) { case VariantPrimitiveType::NullType: return VariantType::Null; @@ -452,8 +451,7 @@ void VariantValue::checkPrimitiveType(VariantPrimitiveType type, size_t size_required) const { checkBasicType(VariantBasicType::Primitive); - auto primitive_type = - static_cast(value_[0] >> kValueHeaderBitShift); + auto primitive_type = static_cast(valueHeader()); if (primitive_type != type) { throw ParquetException( "Expected primitive type: " + VariantPrimitiveTypeToString(type) + @@ -522,7 +520,7 @@ std::string_view VariantValue::getString() const { VariantBasicType basic_type = getBasicType(); if (basic_type == VariantBasicType::ShortString) { - uint8_t short_string_length = (value_[0] >> kValueHeaderBitShift); + uint8_t short_string_length = valueHeader(); if (value_.size() < static_cast(short_string_length + kHeaderSizeBytes)) { throw ParquetException( "Invalid short string: too short: " + std::to_string(value_.size()) + @@ -627,8 +625,13 @@ uint32_t VariantValue::complexFieldIdAt(uint32_t field_index) const { complex_info_.id_size); } +uint8_t VariantValue::valueHeader() const { + // Using unsigned shift to avoid sign extension. + return static_cast(value_[0]) >> kValueHeaderBitShift; +} + VariantValue::ComplexInfo VariantValue::getObjectInfo(std::string_view value) { - uint8_t value_header = value[0] >> kValueHeaderBitShift; + uint8_t value_header = static_cast(value[0]) >> kValueHeaderBitShift; uint8_t field_offset_size = (value_header & 0b11) + 1; uint8_t field_id_size = ((value_header >> kValueHeaderBitShift) & 0b11) + 1; bool is_large = ((value_header >> 4) & 0b1); @@ -725,7 +728,7 @@ std::optional VariantValue::getObjectFieldByFieldId( } VariantValue::ComplexInfo VariantValue::getArrayInfo(std::string_view value) { - uint8_t value_header = value[0] >> kValueHeaderBitShift; + uint8_t value_header = static_cast(value[0]) >> kValueHeaderBitShift; uint8_t field_offset_size = (value_header & 0b11) + kHeaderSizeBytes; bool is_large = ((value_header >> kValueHeaderBitShift) & 0b1); @@ -763,15 +766,17 @@ VariantValue VariantValue::getArrayValueByIndex(uint32_t index) const { " >= " + std::to_string(complex_info_.num_elements)); } - // Read the offset and next offset uint32_t offset = complexOffsetAt(index); - uint32_t next_offset = complexOffsetAt(index + 1); + if (complex_info_.data_start_offset + offset > value_.size()) { + throw ParquetException("Invalid array value: data_start_offset=" + + std::to_string(complex_info_.data_start_offset) + + ", offset=" + std::to_string(offset) + + ", value_size=" + std::to_string(value_.size())); + } // Create a VariantValue for the element VariantValue element_value{ - metadata_, - std::string_view(value_.data() + complex_info_.data_start_offset + offset, - next_offset - offset)}; + metadata_, value_.substr(/*pos=*/complex_info_.data_start_offset + offset)}; return element_value; } diff --git a/cpp/src/parquet/variant.h b/cpp/src/parquet/variant.h index 9a2ba340ccab..6b066c2cf547 100644 --- a/cpp/src/parquet/variant.h +++ b/cpp/src/parquet/variant.h @@ -303,6 +303,8 @@ class PARQUET_EXPORT VariantValue { uint32_t complexOffsetAt(uint32_t field_index) const; uint32_t complexFieldIdAt(uint32_t field_index) const; + uint8_t valueHeader() const; + private: VariantMetadata metadata_; std::string_view value_;