diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index a8d92f190c32..9f4328fc3b27 100644 --- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ -189,7 +189,8 @@ set(PARQUET_SRCS statistics.cc stream_reader.cc stream_writer.cc - types.cc) + types.cc + variant.cc) if(ARROW_HAVE_RUNTIME_AVX2) # AVX2 is used as a proxy for BMI2. @@ -382,7 +383,8 @@ add_parquet_test(internals-test public_api_test.cc size_statistics_test.cc statistics_test.cc - types_test.cc) + types_test.cc + variant_test.cc) set_source_files_properties(public_api_test.cc PROPERTIES SKIP_UNITY_BUILD_INCLUSION ON) diff --git a/cpp/src/parquet/test_util.cc b/cpp/src/parquet/test_util.cc index 9d35413d36f9..ecd95500cab7 100644 --- a/cpp/src/parquet/test_util.cc +++ b/cpp/src/parquet/test_util.cc @@ -53,6 +53,15 @@ std::string get_bad_data_dir() { return ss.str(); } +std::string get_variant_dir() { + // PARQUET_TEST_DATA should point to ARROW_HOME/cpp/submodules/parquet-testing/data + // so need to reach one folder up to access the "variant" folder. + std::string data_dir(get_data_dir()); + std::stringstream ss; + ss << data_dir << "/../variant"; + return ss.str(); +} + std::string get_data_file(const std::string& filename, bool is_good) { std::stringstream ss; diff --git a/cpp/src/parquet/test_util.h b/cpp/src/parquet/test_util.h index 3ed9a1a007b9..6233844f552e 100644 --- a/cpp/src/parquet/test_util.h +++ b/cpp/src/parquet/test_util.h @@ -71,6 +71,7 @@ class ParquetTestException : public parquet::ParquetException { const char* get_data_dir(); std::string get_bad_data_dir(); +std::string get_variant_dir(); std::string get_data_file(const std::string& filename, bool is_good = true); diff --git a/cpp/src/parquet/variant.cc b/cpp/src/parquet/variant.cc new file mode 100644 index 000000000000..942cf0ecd8ad --- /dev/null +++ b/cpp/src/parquet/variant.cc @@ -0,0 +1,784 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "parquet/variant.h" + +#include +#include +#include + +#include +#include +#include + +#include "parquet/exception.h" + +namespace parquet::variant { + +std::string VariantBasicTypeToString(VariantBasicType type) { + switch (type) { + case VariantBasicType::Primitive: + return "Primitive"; + case VariantBasicType::ShortString: + return "ShortString"; + case VariantBasicType::Object: + return "Object"; + case VariantBasicType::Array: + return "Array"; + default: + return "Unknown"; + } +} + +std::string VariantPrimitiveTypeToString(VariantPrimitiveType type) { + switch (type) { + case VariantPrimitiveType::NullType: + return "NullType"; + case VariantPrimitiveType::BooleanTrue: + return "BooleanTrue"; + case VariantPrimitiveType::BooleanFalse: + return "BooleanFalse"; + case VariantPrimitiveType::Int8: + return "Int8"; + case VariantPrimitiveType::Int16: + return "Int16"; + case VariantPrimitiveType::Int32: + return "Int32"; + case VariantPrimitiveType::Int64: + return "Int64"; + case VariantPrimitiveType::Double: + return "Double"; + case VariantPrimitiveType::Decimal4: + return "Decimal4"; + case VariantPrimitiveType::Decimal8: + return "Decimal8"; + case VariantPrimitiveType::Decimal16: + return "Decimal16"; + case VariantPrimitiveType::Date: + return "Date"; + case VariantPrimitiveType::TimestampMicros: + return "TimestampMicros"; + case VariantPrimitiveType::TimestampMicrosNtz: + return "TimestampMicrosNtz"; + case VariantPrimitiveType::Float: + return "Float"; + case VariantPrimitiveType::Binary: + return "Binary"; + case VariantPrimitiveType::String: + return "String"; + case VariantPrimitiveType::TimeMicrosNtz: + return "TimeMicrosNtz"; + case VariantPrimitiveType::TimestampNanosTz: + return "TimestampNanosTz"; + case VariantPrimitiveType::TimestampNanosNtz: + return "TimestampNanosNtz"; + case VariantPrimitiveType::Uuid: + return "Uuid"; + default: + return "Unknown"; + } +} + +std::string VariantTypeToString(VariantType type) { + switch (type) { + case VariantType::Object: + return "Object"; + case VariantType::Array: + return "Array"; + case VariantType::Null: + return "Null"; + case VariantType::Boolean: + return "Boolean"; + case VariantType::Int8: + return "Int8"; + case VariantType::Int16: + return "Int16"; + case VariantType::Int32: + return "Int32"; + case VariantType::Int64: + return "Int64"; + case VariantType::String: + return "String"; + case VariantType::Double: + return "Double"; + case VariantType::Decimal4: + return "Decimal4"; + case VariantType::Decimal8: + return "Decimal8"; + case VariantType::Decimal16: + return "Decimal16"; + case VariantType::Date: + return "Date"; + case VariantType::TimestampMicrosTz: + return "TimestampMicrosTz"; + case VariantType::TimestampMicrosNtz: + return "TimestampMicrosNtz"; + case VariantType::Float: + return "Float"; + case VariantType::Binary: + return "Binary"; + case VariantType::Time: + return "Time"; + case VariantType::TimestampNanosTz: + return "TimestampNanosTz"; + case VariantType::TimestampNanosNtz: + return "TimestampNanosNtz"; + case VariantType::Uuid: + return "Uuid"; + default: + return "Unknown"; + } +} + +inline uint32_t readLittleEndianU32(const void* from, uint8_t size) { + ARROW_DCHECK_LE(size, 4); + ARROW_DCHECK_GE(size, 1); + + uint32_t result = 0; + memcpy(&result, from, size); + return ::arrow::bit_util::FromLittleEndian(result); +} + +VariantMetadata::VariantMetadata(std::string_view metadata) : metadata_(metadata) { + if (metadata.size() < kHeaderSizeBytes + kMinimalOffsetSizeBytes * 2) { + // Empty metadata is at least 3 bytes: version, dictionarySize and + // at least one offset. + throw ParquetException("Invalid Variant metadata: too short: size=" + + std::to_string(metadata.size())); + } + if (version() != kSupportedVersion) { + // Currently we only supports version 1. + throw ParquetException("Unsupported Variant metadata version: " + + std::to_string(version())); + } + uint8_t offset_sz = offset_size(); + if (offset_sz < kMinimalOffsetSizeBytes || offset_sz > kMaximumOffsetSizeBytes) { + throw ParquetException("Invalid Variant metadata: invalid offset size: " + + std::to_string(offset_sz)); + } + dictionary_size_ = loadDictionarySize(metadata, offset_sz); + if (kHeaderSizeBytes + (dictionary_size_ + 2) * offset_sz > metadata_.size()) { + throw ParquetException( + "Invalid Variant metadata: offset out of range: " + + std::to_string((dictionary_size_ + kHeaderSizeBytes) * offset_sz) + " > " + + std::to_string(metadata_.size())); + } +} + +uint8_t VariantMetadata::version() const { + return static_cast(metadata_[0]) & kVersionMask; +} + +bool VariantMetadata::sorted_and_unique() const { + return (metadata_[0] & kSortedStringMask) != 0; +} + +uint8_t VariantMetadata::offset_size() const { + // Since it stores offsetSize - 1, we add 1 here. + return ((metadata_[0] >> kOffsetSizeBitShift) & kOffsetSizeMask) + 1; +} + +uint32_t VariantMetadata::loadDictionarySize(std::string_view metadata, + uint8_t offset_size) { + if (static_cast(offset_size + kHeaderSizeBytes) > metadata.size()) { + throw ParquetException("Invalid Variant metadata: too short for dictionary size"); + } + return readLittleEndianU32(metadata.data() + kHeaderSizeBytes, offset_size); +} + +uint32_t VariantMetadata::dictionary_size() const { return dictionary_size_; } + +std::string_view VariantMetadata::GetMetadataKey(uint32_t variant_id) const { + uint32_t offset_bytes = offset_size(); + uint32_t dictionary_item_count = dictionary_size(); + + if (variant_id >= dictionary_item_count) { + throw ParquetException("Invalid Variant metadata: variant_id out of range: " + + std::to_string(variant_id) + + " >= " + std::to_string(dictionary_item_count)); + } + + size_t offset_start_pos = kHeaderSizeBytes + offset_bytes + (variant_id * offset_bytes); + + // Index range of offsets are already checked in ctor, so no need to check again. + uint32_t variant_offset = + readLittleEndianU32(metadata_.data() + offset_start_pos, offset_bytes); + uint32_t variant_next_offset = readLittleEndianU32( + metadata_.data() + offset_start_pos + offset_bytes, offset_bytes); + uint32_t key_size = variant_next_offset - variant_offset; + + // 2 for dictionary_size and an extra offset for the dictionary key string. + size_t string_start = + kHeaderSizeBytes + offset_bytes * (dictionary_item_count + 2) + variant_offset; + if (string_start + key_size > metadata_.size()) { + throw ParquetException("Invalid Variant metadata: string data out of range: " + + std::to_string(string_start) + " + " + + std::to_string(key_size) + " > " + + std::to_string(metadata_.size())); + } + return {metadata_.data() + string_start, key_size}; +} + +::arrow::internal::SmallVector VariantMetadata::GetMetadataId( + std::string_view key) const { + uint32_t offset_bytes = offset_size(); + uint32_t dictionary_bytes = dictionary_size(); + + const bool sort_and_unique = sorted_and_unique(); + // TODO(mwish): This can be optimized by using binary search if the metadata is sorted. + ::arrow::internal::SmallVector vector; + uint32_t dictionary_key_offset = 0; + uint32_t dictionary_next_key_offset = 0; + const uint32_t dictionary_key_start_offset = + kHeaderSizeBytes + offset_bytes * (dictionary_bytes + 2); + for (uint32_t i = 0; i < dictionary_bytes; ++i) { + size_t offset_start_pos = kHeaderSizeBytes + (i + 1) * offset_bytes; + dictionary_key_offset = dictionary_next_key_offset; + dictionary_next_key_offset = readLittleEndianU32( + metadata_.data() + offset_start_pos + offset_bytes, offset_bytes); + uint32_t dictionary_key_size = dictionary_next_key_offset - dictionary_key_offset; + + size_t dictionary_key_start = dictionary_key_start_offset + dictionary_key_offset; + if (dictionary_key_start + dictionary_key_size > metadata_.size()) { + throw ParquetException("Invalid Variant metadata: string data out of range"); + } + std::string_view current_key{metadata_.data() + dictionary_key_start, + dictionary_key_size}; + if (current_key == key) { + vector.push_back(i); + if (sort_and_unique) { + break; + } + } + } + return vector; +} + +VariantValue::VariantValue(VariantMetadata metadata, std::string_view value) + : metadata_(metadata), value_(value) { + if (value_.empty()) { + throw ParquetException("Invalid Variant metadata: empty string"); + } + switch (getBasicType()) { + case VariantBasicType::Array: { + complex_info_ = getArrayInfo(value_); + break; + } + case VariantBasicType::Object: { + complex_info_ = getObjectInfo(value_); + break; + } + case VariantBasicType::ShortString: + case VariantBasicType::Primitive: { + break; + } + } +} + +VariantValue::VariantValue(std::string_view metadata, std::string_view value) + : VariantValue(VariantMetadata(metadata), value) {} + +VariantBasicType VariantValue::getBasicType() const { + return static_cast(value_[0] & kBasicTypeMask); +} + +VariantType VariantValue::getType() const { + VariantBasicType basic_type = getBasicType(); + switch (basic_type) { + case VariantBasicType::Primitive: { + auto primitive_type = static_cast(valueHeader()); + switch (primitive_type) { + case VariantPrimitiveType::NullType: + return VariantType::Null; + case VariantPrimitiveType::BooleanTrue: + case VariantPrimitiveType::BooleanFalse: + return VariantType::Boolean; + case VariantPrimitiveType::Int8: + return VariantType::Int8; + case VariantPrimitiveType::Int16: + return VariantType::Int16; + case VariantPrimitiveType::Int32: + return VariantType::Int32; + case VariantPrimitiveType::Int64: + return VariantType::Int64; + case VariantPrimitiveType::Double: + return VariantType::Double; + case VariantPrimitiveType::Decimal4: + return VariantType::Decimal4; + case VariantPrimitiveType::Decimal8: + return VariantType::Decimal8; + case VariantPrimitiveType::Decimal16: + return VariantType::Decimal16; + case VariantPrimitiveType::Date: + return VariantType::Date; + case VariantPrimitiveType::TimestampMicros: + return VariantType::TimestampMicrosTz; + case VariantPrimitiveType::TimestampMicrosNtz: + return VariantType::TimestampMicrosNtz; + case VariantPrimitiveType::Float: + return VariantType::Float; + case VariantPrimitiveType::Binary: + return VariantType::Binary; + case VariantPrimitiveType::String: + return VariantType::String; + case VariantPrimitiveType::TimeMicrosNtz: + return VariantType::Time; + case VariantPrimitiveType::TimestampNanosTz: + return VariantType::TimestampNanosTz; + case VariantPrimitiveType::TimestampNanosNtz: + return VariantType::TimestampNanosNtz; + case VariantPrimitiveType::Uuid: + return VariantType::Uuid; + default: + throw ParquetException("Unknown primitive type: " + + std::to_string(static_cast(primitive_type))); + } + } + case VariantBasicType::ShortString: + return VariantType::String; + case VariantBasicType::Object: + return VariantType::Object; + case VariantBasicType::Array: + return VariantType::Array; + default: + throw ParquetException("Unknown basic type: " + + std::to_string(static_cast(basic_type))); + } +} + +const VariantMetadata& VariantValue::metadata() const { return metadata_; } + +std::string_view VariantValue::typeDebugString() const { + VariantType variant_type = getType(); + switch (variant_type) { + case VariantType::Object: + return "Object"; + case VariantType::Array: + return "Array"; + case VariantType::Null: + return "Null"; + case VariantType::Boolean: + return "Boolean"; + case VariantType::Int8: + return "Int8"; + case VariantType::Int16: + return "Int16"; + case VariantType::Int32: + return "Int32"; + case VariantType::Int64: + return "Int64"; + case VariantType::String: + return "String"; + case VariantType::Double: + return "Double"; + case VariantType::Decimal4: + return "Decimal4"; + case VariantType::Decimal8: + return "Decimal8"; + case VariantType::Decimal16: + return "Decimal16"; + case VariantType::Date: + return "Date"; + case VariantType::TimestampMicrosTz: + return "TimestampMicrosTz"; + case VariantType::TimestampMicrosNtz: + return "TimestampMicrosNtz"; + case VariantType::Float: + return "Float"; + case VariantType::Binary: + return "Binary"; + case VariantType::Time: + return "Time"; + case VariantType::TimestampNanosTz: + return "TimestampNanosTz"; + case VariantType::TimestampNanosNtz: + return "TimestampNanosNtz"; + case VariantType::Uuid: + return "Uuid"; + default: + ::arrow::Unreachable(); + } +} + +bool VariantValue::getBool() const { + if (getBasicType() != VariantBasicType::Primitive) { + throw ParquetException("Expected primitive type, but got: " + + VariantBasicTypeToString(getBasicType())); + } + + uint8_t primitive_type = static_cast(value_[0]) >> kValueHeaderBitShift; + if (primitive_type == static_cast(VariantPrimitiveType::BooleanTrue)) { + return true; + } + if (primitive_type == static_cast(VariantPrimitiveType::BooleanFalse)) { + return false; + } + + throw ParquetException("Not a variant primitive boolean type with primitive type: " + + std::to_string(primitive_type)); +} + +void VariantValue::checkBasicType(VariantBasicType type) const { + if (getBasicType() != type) { + throw ParquetException("Expected basic type: " + VariantBasicTypeToString(type) + + ", but got: " + VariantBasicTypeToString(getBasicType())); + } +} + +void VariantValue::checkIsComplexType() const { + VariantBasicType basic_type = getBasicType(); + if (basic_type != VariantBasicType::Object && basic_type != VariantBasicType::Array) { + throw ParquetException("Expected complex type, but got: " + + VariantBasicTypeToString(basic_type)); + } +} + +void VariantValue::checkPrimitiveType(VariantPrimitiveType type, + size_t size_required) const { + checkBasicType(VariantBasicType::Primitive); + + auto primitive_type = static_cast(valueHeader()); + if (primitive_type != type) { + throw ParquetException( + "Expected primitive type: " + VariantPrimitiveTypeToString(type) + + ", but got: " + VariantPrimitiveTypeToString(primitive_type)); + } + + if (value_.size() < size_required) { + throw ParquetException("Invalid value: too short, expected at least " + + std::to_string(size_required) + " bytes for type " + + VariantPrimitiveTypeToString(type) + + ", but got: " + std::to_string(value_.size()) + " bytes"); + } +} + +template +PrimitiveType VariantValue::getPrimitiveType(VariantPrimitiveType type) const { + checkPrimitiveType(type, sizeof(PrimitiveType) + kHeaderSizeBytes); + + PrimitiveType primitive_value{}; + memcpy(&primitive_value, value_.data() + kHeaderSizeBytes, sizeof(PrimitiveType)); + // Here we should cast from Little endian. + primitive_value = ::arrow::bit_util::FromLittleEndian(primitive_value); + return primitive_value; +} + +int8_t VariantValue::getInt8() const { + return getPrimitiveType(VariantPrimitiveType::Int8); +} + +int16_t VariantValue::getInt16() const { + return getPrimitiveType(VariantPrimitiveType::Int16); +} + +int32_t VariantValue::getInt32() const { + return getPrimitiveType(VariantPrimitiveType::Int32); +} + +int64_t VariantValue::getInt64() const { + return getPrimitiveType(VariantPrimitiveType::Int64); +} + +float VariantValue::getFloat() const { + return getPrimitiveType(VariantPrimitiveType::Float); +} + +double VariantValue::getDouble() const { + return getPrimitiveType(VariantPrimitiveType::Double); +} + +std::string_view VariantValue::getPrimitiveBinaryType(VariantPrimitiveType type) const { + checkPrimitiveType( + type, /*size_required=*/kHeaderSizeBytes + kPrimitiveStringLengthSizeBytes); + + uint32_t length; + memcpy(&length, value_.data() + kHeaderSizeBytes, sizeof(uint32_t)); + length = ::arrow::bit_util::FromLittleEndian(length); + + if (value_.size() < length + kHeaderSizeBytes + kPrimitiveStringLengthSizeBytes) { + throw ParquetException("Invalid string value: too short for specified length"); + } + + return {value_.data() + kHeaderSizeBytes + kPrimitiveStringLengthSizeBytes, length}; +} + +std::string_view VariantValue::getString() const { + VariantBasicType basic_type = getBasicType(); + + if (basic_type == VariantBasicType::ShortString) { + uint8_t short_string_length = valueHeader(); + if (value_.size() < static_cast(short_string_length + kHeaderSizeBytes)) { + throw ParquetException( + "Invalid short string: too short: " + std::to_string(value_.size()) + + " for at least " + std::to_string(short_string_length + kHeaderSizeBytes)); + } + return {value_.data() + kHeaderSizeBytes, short_string_length}; + } + if (basic_type == VariantBasicType::Primitive) { + // TODO(mwish): Should we validate utf8 here? + return getPrimitiveBinaryType(VariantPrimitiveType::String); + } + + throw ParquetException("Expected ShortString or Primitive type, but got: " + + VariantBasicTypeToString(basic_type)); +} + +std::string_view VariantValue::getBinary() const { + return getPrimitiveBinaryType(VariantPrimitiveType::Binary); +} + +template +DecimalValue VariantValue::getPrimitiveDecimalType( + VariantPrimitiveType type) const { + using DecimalValueType = typename DecimalType::ValueType; + checkPrimitiveType( + type, sizeof(DecimalValueType) + kHeaderSizeBytes + kDecimalScaleSizeBytes); + + uint8_t scale = value_[kHeaderSizeBytes]; + DecimalValueType decimal_value; + memcpy(&decimal_value, value_.data() + kHeaderSizeBytes + kDecimalScaleSizeBytes, + sizeof(DecimalValueType)); + decimal_value = ::arrow::bit_util::FromLittleEndian(decimal_value); + + return {scale, DecimalType(decimal_value)}; +} + +DecimalValue<::arrow::Decimal32> VariantValue::getDecimal4() const { + return getPrimitiveDecimalType<::arrow::Decimal32>(VariantPrimitiveType::Decimal4); +} + +DecimalValue<::arrow::Decimal64> VariantValue::getDecimal8() const { + return getPrimitiveDecimalType<::arrow::Decimal64>(VariantPrimitiveType::Decimal8); +} + +DecimalValue<::arrow::Decimal128> VariantValue::getDecimal16() const { + checkPrimitiveType( + VariantPrimitiveType::Decimal16, + /*size_required=*/sizeof(int64_t) * 2 + kHeaderSizeBytes + kDecimalScaleSizeBytes); + + uint8_t scale = value_[1]; + + // TODO(mwish): Do we have better way for this? + std::array low_high_bits; + memcpy(&low_high_bits[0], value_.data() + 2, sizeof(int64_t)); + memcpy(&low_high_bits[1], value_.data() + 10, sizeof(int64_t)); + ::arrow::bit_util::little_endian::ToNative(low_high_bits); + return {scale, ::arrow::Decimal128(low_high_bits[1], low_high_bits[0])}; +} + +int32_t VariantValue::getDate() const { + return getPrimitiveType(VariantPrimitiveType::Date); +} + +int64_t VariantValue::getTimeMicrosNtz() const { + return getPrimitiveType(VariantPrimitiveType::TimeMicrosNtz); +} + +int64_t VariantValue::getTimestampMicros() const { + return getPrimitiveType(VariantPrimitiveType::TimestampMicros); +} + +int64_t VariantValue::getTimestampMicrosNtz() const { + return getPrimitiveType(VariantPrimitiveType::TimestampMicrosNtz); +} + +int64_t VariantValue::getTimestampNanosTz() const { + return getPrimitiveType(VariantPrimitiveType::TimestampNanosTz); +} + +int64_t VariantValue::getTimestampNanosNtz() const { + return getPrimitiveType(VariantPrimitiveType::TimestampNanosNtz); +} + +std::array VariantValue::getUuid() const { + checkPrimitiveType(VariantPrimitiveType::Uuid, /*size_required=*/16 + kHeaderSizeBytes); + std::array uuid_value; + memcpy(uuid_value.data(), value_.data() + kHeaderSizeBytes, sizeof(uuid_value)); + return uuid_value; +} + +uint32_t VariantValue::complexOffsetAt(uint32_t field_index) const { + checkIsComplexType(); + return readLittleEndianU32(value_.data() + complex_info_.offset_start_offset + + field_index * complex_info_.offset_size, + complex_info_.offset_size); +} + +uint32_t VariantValue::complexFieldIdAt(uint32_t field_index) const { + checkBasicType(VariantBasicType::Object); + return readLittleEndianU32( + value_.data() + complex_info_.id_start_offset + field_index * complex_info_.id_size, + complex_info_.id_size); +} + +uint8_t VariantValue::valueHeader() const { + // Using unsigned shift to avoid sign extension. + return static_cast(value_[0]) >> kValueHeaderBitShift; +} + +VariantValue::ComplexInfo VariantValue::getObjectInfo(std::string_view value) { + uint8_t value_header = static_cast(value[0]) >> kValueHeaderBitShift; + uint8_t field_offset_size = (value_header & 0b11) + 1; + uint8_t field_id_size = ((value_header >> kValueHeaderBitShift) & 0b11) + 1; + bool is_large = ((value_header >> 4) & 0b1); + uint8_t num_elements_size = is_large ? 4 : 1; + if (value.size() < static_cast(kHeaderSizeBytes + num_elements_size)) { + throw ParquetException( + "Invalid object value: too short: " + std::to_string(value.size()) + + " for at least " + std::to_string(kHeaderSizeBytes + num_elements_size)); + } + // parse num_elements + uint32_t num_elements = + readLittleEndianU32(value.data() + kHeaderSizeBytes, num_elements_size); + ComplexInfo complex_info{}; + complex_info.num_elements = num_elements; + complex_info.id_size = field_id_size; + complex_info.offset_size = field_offset_size; + complex_info.id_start_offset = 1 + num_elements_size; + complex_info.offset_start_offset = + complex_info.id_start_offset + num_elements * field_id_size; + complex_info.data_start_offset = + complex_info.offset_start_offset + (num_elements + 1) * field_offset_size; + // Check the boundary with the final offset + if (complex_info.data_start_offset > value.size()) { + throw ParquetException("Invalid object value: data_start_offset=" + + std::to_string(complex_info.data_start_offset) + + ", value_size=" + std::to_string(value.size())); + } + return complex_info; +} + +uint32_t VariantValue::num_elements() const { + auto basic_type = getBasicType(); + switch (basic_type) { + case VariantBasicType::Object: + case VariantBasicType::Array: + return complex_info_.num_elements; + case VariantBasicType::Primitive: + case VariantBasicType::ShortString: { + throw ParquetException("Invalid call to num_elements() for basic type: " + + VariantBasicTypeToString(basic_type)); + } + } + ::arrow::Unreachable(); +} + +std::optional VariantValue::getObjectValueByKey( + std::string_view key) const { + checkBasicType(VariantBasicType::Object); + auto metadata_ids = metadata_.GetMetadataId(key); + if (metadata_ids.empty()) { + return std::nullopt; + } + for (uint32_t variant_id : metadata_ids) { + auto variant_value = getObjectFieldByFieldId(variant_id); + if (variant_value.has_value()) { + return variant_value; + } + } + return std::nullopt; +} + +std::optional VariantValue::getObjectFieldByFieldId( + uint32_t variant_id) const { + checkBasicType(VariantBasicType::Object); + std::optional field_index_opt; + // Get the field offset + // TODO(mwish): Using binary search to optimize it. + for (uint32_t i = 0; i < complex_info_.num_elements; ++i) { + uint32_t variant_field_id = complexFieldIdAt(i); + if (variant_field_id == variant_id) { + field_index_opt = i; + break; + } + } + if (!field_index_opt.has_value()) { + return std::nullopt; + } + uint32_t field_index = field_index_opt.value(); + // Read the offset and next offset + uint32_t offset = complexOffsetAt(field_index); + + if (complex_info_.data_start_offset + offset > value_.size()) { + throw ParquetException("Invalid object field offsets: data_start_offset=" + + std::to_string(complex_info_.data_start_offset) + + ", offset=" + std::to_string(offset) + + ", value_size=" + std::to_string(value_.size())); + } + + // Create a VariantValue for the field + VariantValue field_value{metadata_, + value_.substr(complex_info_.data_start_offset + offset)}; + + return field_value; +} + +VariantValue::ComplexInfo VariantValue::getArrayInfo(std::string_view value) { + uint8_t value_header = static_cast(value[0]) >> kValueHeaderBitShift; + uint8_t field_offset_size = (value_header & 0b11) + kHeaderSizeBytes; + bool is_large = ((value_header >> kValueHeaderBitShift) & 0b1); + + // check the array header + uint8_t num_elements_size = is_large ? 4 : 1; + if (value.size() < static_cast(kHeaderSizeBytes + num_elements_size)) { + throw ParquetException( + "Invalid array value: too short: " + std::to_string(value.size()) + + " for at least " + std::to_string(1 + num_elements_size)); + } + + uint32_t num_elements = + readLittleEndianU32(value.data() + kHeaderSizeBytes, num_elements_size); + ComplexInfo complex_info{}; + complex_info.num_elements = num_elements; + complex_info.offset_size = field_offset_size; + complex_info.offset_start_offset = kHeaderSizeBytes + num_elements_size; + complex_info.data_start_offset = + complex_info.offset_start_offset + (num_elements + 1) * field_offset_size; + + // Boundary check + if (complex_info.data_start_offset > value.size()) { + throw ParquetException("Invalid array value: data_start_offset=" + + std::to_string(complex_info.data_start_offset) + + ", value_size=" + std::to_string(value.size())); + } + + return complex_info; +} + +VariantValue VariantValue::getArrayValueByIndex(uint32_t index) const { + checkBasicType(VariantBasicType::Array); + if (index >= complex_info_.num_elements) { + throw ParquetException("Array index out of range: " + std::to_string(index) + + " >= " + std::to_string(complex_info_.num_elements)); + } + + uint32_t offset = complexOffsetAt(index); + if (complex_info_.data_start_offset + offset > value_.size()) { + throw ParquetException("Invalid array value: data_start_offset=" + + std::to_string(complex_info_.data_start_offset) + + ", offset=" + std::to_string(offset) + + ", value_size=" + std::to_string(value_.size())); + } + + // Create a VariantValue for the element + VariantValue element_value{ + metadata_, value_.substr(/*pos=*/complex_info_.data_start_offset + offset)}; + + return element_value; +} + +} // namespace parquet::variant diff --git a/cpp/src/parquet/variant.h b/cpp/src/parquet/variant.h new file mode 100644 index 000000000000..6b066c2cf547 --- /dev/null +++ b/cpp/src/parquet/variant.h @@ -0,0 +1,315 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include +#include + +#include +#include + +#include "parquet/platform.h" + +namespace parquet::variant { + +// TODO(mwish): Should I use parquet::ByteArray rather than +// std::string_view? + +enum class VariantBasicType { + /// One of the primitive types + Primitive = 0, + /// A string with a length less than 64 bytes + ShortString = 1, + /// A collection of (string-key, variant-value) pairs + Object = 2, + /// An ordered sequence of variant values + Array = 3 +}; + +PARQUET_EXPORT std::string VariantBasicTypeToString(VariantBasicType type); + +enum class VariantPrimitiveType : int8_t { + /// Equivalent Parquet Type: UNKNOWN + NullType = 0, + /// Equivalent Parquet Type: BOOLEAN + BooleanTrue = 1, + /// Equivalent Parquet Type: BOOLEAN + BooleanFalse = 2, + /// Equivalent Parquet Type: INT(8, signed) + Int8 = 3, + /// Equivalent Parquet Type: INT(16, signed) + Int16 = 4, + /// Equivalent Parquet Type: INT(32, signed) + Int32 = 5, + /// Equivalent Parquet Type: INT(64, signed) + Int64 = 6, + /// Equivalent Parquet Type: DOUBLE + Double = 7, + /// Equivalent Parquet Type: DECIMAL(precision, scale) + Decimal4 = 8, + /// Equivalent Parquet Type: DECIMAL(precision, scale) + Decimal8 = 9, + /// Equivalent Parquet Type: DECIMAL(precision, scale) + Decimal16 = 10, + /// Equivalent Parquet Type: DATE + Date = 11, + /// Equivalent Parquet Type: TIMESTAMP(isAdjustedToUTC=true, MICROS) + TimestampMicros = 12, + /// Equivalent Parquet Type: TIMESTAMP(isAdjustedToUTC=false, MICROS) + TimestampMicrosNtz = 13, + /// Equivalent Parquet Type: FLOAT + Float = 14, + /// Equivalent Parquet Type: BINARY + Binary = 15, + /// Equivalent Parquet Type: STRING + String = 16, + /// Equivalent Parquet Type: TIME(isAdjustedToUTC=false, MICROS) + TimeMicrosNtz = 17, + /// Equivalent Parquet Type: TIMESTAMP(isAdjustedToUTC=true, NANOS) + TimestampNanosTz = 18, // Assuming TZ stands for TimeZone, and follows the document's + // 'timestamp with time zone' + /// Equivalent Parquet Type: TIMESTAMP(isAdjustedToUTC=false, NANOS) + TimestampNanosNtz = 19, // Differentiating from TimestampNtz (MICROS) + /// Equivalent Parquet Type: UUID + Uuid = 20 +}; + +PARQUET_EXPORT std::string VariantPrimitiveTypeToString(VariantPrimitiveType type); + +/// VariantType is from basic type and primitive type. +enum class VariantType { + Object, + Array, + Null, + Boolean, + Int8, + Int16, + Int32, + Int64, + String, + Double, + Decimal4, + Decimal8, + Decimal16, + Date, + TimestampMicrosTz, + TimestampMicrosNtz, + Float, + Binary, + Time, + TimestampNanosTz, + TimestampNanosNtz, + Uuid +}; + +PARQUET_EXPORT std::string VariantTypeToString(VariantType type); + +class PARQUET_EXPORT VariantMetadata { + public: + explicit VariantMetadata(std::string_view metadata); + /// \brief Get the variant metadata version. Currently, always 1. + uint8_t version() const; + /// \brief Get the metadata key for a given variant field id. + /// \throw ParquetException if the variant_id is out of range(larger than + /// dictionary_size). + std::string_view GetMetadataKey(uint32_t variant_id) const; + /// \brief Get the metadata id for a given key. + /// From the discussion in ML: + /// https://lists.apache.org/thread/b68tjmrjmy64mbv9dknpmqs28vnzjj96 if + /// !sorted_and_unique(), the metadata key is not guaranteed to be unique, so we use a + /// vector to store all the metadata ids. + ::arrow::internal::SmallVector GetMetadataId(std::string_view key) const; + + bool sorted_and_unique() const; + uint8_t offset_size() const; + uint32_t dictionary_size() const; + + /// Metadata for primitive types and any nested types + /// without key dictionary. + static constexpr char kEmptyMetadataChars[] = {0x1, 0x0, 0x0}; + static constexpr std::string_view kEmptyMetadataStringView{kEmptyMetadataChars, + sizeof(kEmptyMetadataChars)}; + + private: + static uint32_t loadDictionarySize(std::string_view metadata, uint8_t offset_size); + + private: + static constexpr uint8_t kVersionMask = 0b1111; + static constexpr uint8_t kSortedStringMask = 0b10000; + static constexpr size_t kHeaderSizeBytes = 1; + static constexpr size_t kMinimalOffsetSizeBytes = 1; + static constexpr size_t kMaximumOffsetSizeBytes = 4; + // mask is applied after shift, it's like 0b11000000 before shift. + static constexpr uint8_t kOffsetSizeMask = 0b11; + static constexpr uint8_t kOffsetSizeBitShift = 6; + static constexpr uint8_t kSupportedVersion = 1; + + private: + std::string_view metadata_; + uint32_t dictionary_size_{0}; +}; + +template +struct PARQUET_EXPORT DecimalValue { + uint8_t scale; + DecimalType value; +}; + +class PARQUET_EXPORT VariantValue { + public: + VariantValue(std::string_view metadata, std::string_view value); + + VariantBasicType getBasicType() const; + VariantType getType() const; + std::string_view typeDebugString() const; + const VariantMetadata& metadata() const; + + // Note: Null doesn't need visitor. + + /// \brief Get the primitive boolean value. + /// \throw ParquetException if the type is not a boolean type. + bool getBool() const; + /// \brief Get the primitive int8 value. + /// \throw ParquetException if the type is not an int8 type. + int8_t getInt8() const; + /// \brief Get the primitive int16 value. + /// \throw ParquetException if the type is not an int16 type. + int16_t getInt16() const; + /// \brief Get the primitive int32 value. + /// \throw ParquetException if the type is not an int32 type. + int32_t getInt32() const; + /// \brief Get the primitive int64 value. + /// \throw ParquetException if the type is not an int64 type. + int64_t getInt64() const; + /// \brief Get the string value, including both short string optimization and primitive + /// string type. \throw ParquetException if the type is not a string type. + std::string_view getString() const; + /// \brief Get the binary value. + /// \throw ParquetException if the type is not a binary type. + std::string_view getBinary() const; + /// \brief Get the primitive float value. + /// \throw ParquetException if the type is not a float type. + float getFloat() const; + /// \brief Get the primitive double value. + /// \throw ParquetException if the type is not a double type. + double getDouble() const; + + /// \brief Get the decimal value with 4 bytes precision. + /// \throw ParquetException if the type is not a decimal4 type. + DecimalValue<::arrow::Decimal32> getDecimal4() const; + /// \brief Get the decimal value with 8 bytes precision. + /// \throw ParquetException if the type is not a decimal8 type. + DecimalValue<::arrow::Decimal64> getDecimal8() const; + /// \brief Get the decimal value with 16 bytes precision. + /// \throw ParquetException if the type is not a decimal16 type. + DecimalValue<::arrow::Decimal128> getDecimal16() const; + + /// \brief Get the date value as days since Unix epoch. + /// \throw ParquetException if the type is not a date type. + int32_t getDate() const; + /// \brief Get the time value without timezone as microseconds since midnight. + /// \throw ParquetException if the type is not a time type. + int64_t getTimeMicrosNtz() const; + /// \brief Get the timestamp value with UTC timezone as microseconds since Unix epoch. + /// \throw ParquetException if the type is not a timestamp type. + int64_t getTimestampMicros() const; + /// \brief Get the timestamp value without timezone as microseconds since Unix epoch. + /// \throw ParquetException if the type is not a timestamp without timezone type. + int64_t getTimestampMicrosNtz() const; + /// \brief Get the timestamp value with UTC timezone as nanoseconds since Unix epoch. + /// \throw ParquetException if the type is not a timestamp type. + int64_t getTimestampNanosTz() const; + /// \brief Get the timestamp value without timezone as nanoseconds since Unix epoch. + /// \throw ParquetException if the type is not a timestamp without timezone type. + int64_t getTimestampNanosNtz() const; + /// \brief Get the UUID value as a 16-byte array. + /// \throw ParquetException if the type is not a UUID type. + std::array getUuid() const; + + /// \brief Get the num_elements of the array or object. + /// For array, it returns the number of elements in the array. + /// For object, it returns the number of fields in the object. + /// \throw ParquetException if the type is not an array or object type. + uint32_t num_elements() const; + + /// \brief Get the value of the object field by key. + /// \return returns the value of the field with the given key, or empty if the key + /// doesn't exist. + /// \throw ParquetException if the type is not an object type. + std::optional getObjectValueByKey(std::string_view key) const; + /// \brief Get the value of the object field by field id. + /// \return returns the value of the field with the given field id, or empty if the + /// field id doesn't exist. + /// \throw ParquetException if the type is not an object type. + std::optional getObjectFieldByFieldId(uint32_t variant_id) const; + + // Would throw ParquetException if index is out of range. + VariantValue getArrayValueByIndex(uint32_t index) const; + + private: + static constexpr uint8_t kHeaderSizeBytes = 1; + static constexpr size_t kDecimalScaleSizeBytes = 1; + static constexpr size_t kPrimitiveStringLengthSizeBytes = 4; + static constexpr uint8_t kBasicTypeMask = 0b00000011; + static constexpr uint8_t kValueHeaderBitShift = 2; + + /// ComplexInfo is used to store the metadata of the array or object. + /// For array, it doesn't have id_size and id_start_offset. + struct ComplexInfo { + uint32_t num_elements; + uint32_t id_start_offset; + uint32_t offset_start_offset; + uint32_t data_start_offset; + uint8_t id_size; + uint8_t offset_size; + }; + + private: + VariantValue(VariantMetadata metadata, std::string_view value); + + template + PrimitiveType getPrimitiveType(VariantPrimitiveType type) const; + + // An extra function because decimal uses 1 byte for scale. + template + DecimalValue getPrimitiveDecimalType(VariantPrimitiveType type) const; + + // An extra function because binary/string uses 4 bytes for length. + std::string_view getPrimitiveBinaryType(VariantPrimitiveType type) const; + void checkBasicType(VariantBasicType type) const; + void checkIsComplexType() const; + void checkPrimitiveType(VariantPrimitiveType type, size_t size_required) const; + + static ComplexInfo getArrayInfo(std::string_view value); + static ComplexInfo getObjectInfo(std::string_view value); + + uint32_t complexOffsetAt(uint32_t field_index) const; + uint32_t complexFieldIdAt(uint32_t field_index) const; + + uint8_t valueHeader() const; + + private: + VariantMetadata metadata_; + std::string_view value_; + + ComplexInfo complex_info_{}; +}; + +} // namespace parquet::variant diff --git a/cpp/src/parquet/variant_test.cc b/cpp/src/parquet/variant_test.cc new file mode 100644 index 000000000000..335c8c080c0c --- /dev/null +++ b/cpp/src/parquet/variant_test.cc @@ -0,0 +1,620 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include + +#include + +#include "parquet/exception.h" +#include "parquet/test_util.h" +#include "parquet/variant.h" + +#include +#include +#include + +#include +#include + +namespace parquet::variant { + +std::string metadata_test_file_name(std::string_view test_name) { + return std::string(test_name) + ".metadata"; +} + +std::string value_test_file_name(std::string_view test_name) { + return std::string(test_name) + ".value"; +} + +std::shared_ptr<::arrow::Buffer> readFromFile(::arrow::fs::FileSystem& fs, + const std::string& path) { + ASSIGN_OR_ABORT(auto file, fs.OpenInputFile(path)); + ASSIGN_OR_ABORT(auto file_size, file->GetSize()); + ASSIGN_OR_ABORT(auto buf, file->Read(file_size)); + return buf; +} + +uint8_t primitiveHeader(VariantPrimitiveType primitive) { + return (static_cast(primitive) << 2); +} + +TEST(ParquetVariant, MetadataBase) { + std::string dir_string(parquet::test::get_variant_dir()); + auto file_system = std::make_shared<::arrow::fs::LocalFileSystem>(); + std::vector primitive_metadatas = { + "primitive_null.metadata", "primitive_boolean_true.metadata", + "primitive_boolean_false.metadata", "primitive_date.metadata", + "primitive_decimal4.metadata", "primitive_decimal8.metadata", + "primitive_decimal16.metadata", "primitive_float.metadata", + "primitive_double.metadata", "primitive_int8.metadata", + "primitive_int16.metadata", "primitive_int32.metadata", + "primitive_int64.metadata", "primitive_binary.metadata", + "primitive_string.metadata", + }; + for (auto& test_file : primitive_metadatas) { + ARROW_SCOPED_TRACE("Testing file: " + test_file); + std::string path = dir_string + "/" + test_file; + auto buf = readFromFile(*file_system, path); + + std::string_view metadata_buf{*buf}; + EXPECT_EQ(metadata_buf, VariantMetadata::kEmptyMetadataStringView); + VariantMetadata metadata(metadata_buf); + EXPECT_EQ(1, metadata.version()); + EXPECT_THROW(metadata.GetMetadataKey(0), ParquetException); + } + { + std::string object_metadata = "object_primitive.metadata"; + ARROW_SCOPED_TRACE("Testing file: " + object_metadata); + std::string path = dir_string + "/" + object_metadata; + auto buf = readFromFile(*file_system, path); + + VariantMetadata metadata(std::string_view{*buf}); + EXPECT_EQ("int_field", metadata.GetMetadataKey(0)); + EXPECT_EQ("double_field", metadata.GetMetadataKey(1)); + EXPECT_EQ("boolean_true_field", metadata.GetMetadataKey(2)); + EXPECT_EQ("boolean_false_field", metadata.GetMetadataKey(3)); + EXPECT_EQ("string_field", metadata.GetMetadataKey(4)); + EXPECT_EQ("null_field", metadata.GetMetadataKey(5)); + EXPECT_EQ("timestamp_field", metadata.GetMetadataKey(6)); + } +} + +VariantValue LoadVariantValue(const std::string& test_name, + std::shared_ptr<::arrow::Buffer>* metadata_buf_out, + std::shared_ptr<::arrow::Buffer>* value_buf_out) { + std::string dir_string(parquet::test::get_variant_dir()); + // TODO(mwish): Share in a base class? + auto file_system = std::make_shared<::arrow::fs::LocalFileSystem>(); + + std::string metadata_path = dir_string + "/" + metadata_test_file_name(test_name); + *metadata_buf_out = readFromFile(*file_system, metadata_path); + + std::string value_path = dir_string + "/" + value_test_file_name(test_name); + *value_buf_out = readFromFile(*file_system, value_path); + + std::string_view value{**value_buf_out}; + std::string_view metadata{**metadata_buf_out}; + return VariantValue{metadata, value}; +} + +TEST(ParquetVariant, NullValue) { + std::string_view empty_metadata(VariantMetadata::kEmptyMetadataChars, 3); + const uint8_t null_chars[] = {primitiveHeader(VariantPrimitiveType::NullType)}; + VariantValue variant{empty_metadata, + std::string_view{reinterpret_cast(null_chars), 1}}; + EXPECT_EQ(VariantType::Null, variant.getType()); + EXPECT_EQ("Null", variant.typeDebugString()); +} + +TEST(ParquetVariant, BooleanValue) { + // test true + { + std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; + auto variant = LoadVariantValue("primitive_boolean_true", &metadata_buf, &value_buf); + EXPECT_EQ(VariantType::Boolean, variant.getType()); + EXPECT_EQ("Boolean", variant.typeDebugString()); + EXPECT_EQ(true, variant.getBool()); + } + // test false + { + std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; + auto variant = LoadVariantValue("primitive_boolean_false", &metadata_buf, &value_buf); + EXPECT_EQ(VariantType::Boolean, variant.getType()); + EXPECT_EQ(false, variant.getBool()); + } + { + std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; + auto variant = LoadVariantValue("primitive_int32", &metadata_buf, &value_buf); + EXPECT_THROW(variant.getBool(), ParquetException); + } +} + +TEST(ParquetVariant, NumericValues) { + { + std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; + auto variant = LoadVariantValue("primitive_int8", &metadata_buf, &value_buf); + EXPECT_EQ(VariantType::Int8, variant.getType()); + EXPECT_EQ("Int8", variant.typeDebugString()); + EXPECT_EQ(42, variant.getInt8()); + } + { + std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; + auto variant = LoadVariantValue("primitive_int16", &metadata_buf, &value_buf); + EXPECT_EQ(VariantType::Int16, variant.getType()); + EXPECT_EQ("Int16", variant.typeDebugString()); + EXPECT_EQ(1234, variant.getInt16()); + } + { + std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; + auto variant = LoadVariantValue("primitive_int32", &metadata_buf, &value_buf); + EXPECT_EQ(VariantType::Int32, variant.getType()); + EXPECT_EQ("Int32", variant.typeDebugString()); + EXPECT_EQ(123456, variant.getInt32()); + } + { + std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; + auto variant = LoadVariantValue("primitive_int64", &metadata_buf, &value_buf); + EXPECT_EQ(VariantType::Int64, variant.getType()); + EXPECT_EQ("Int64", variant.typeDebugString()); + EXPECT_EQ(1234567890123456789, variant.getInt64()); + } + { + // Test handwritten int64 + const uint8_t int64_chars[] = {primitiveHeader(VariantPrimitiveType::Int64), + 0xB1, + 0x1C, + 0x6C, + 0xB1, + 0xF4, + 0x10, + 0x22, + 0x11}; + std::string_view value{reinterpret_cast(int64_chars), + sizeof(int64_chars)}; + VariantValue variant{VariantMetadata::kEmptyMetadataStringView, value}; + EXPECT_EQ(VariantType::Int64, variant.getType()); + EXPECT_EQ(1234567890987654321L, variant.getInt64()); + } + { + // Test handwritten int64 negative + const uint8_t int64_chars[] = {primitiveHeader(VariantPrimitiveType::Int64), + 0xFF, + 0xFF, + 0xFF, + 0xFF, + 0xFF, + 0xFF, + 0xFF, + 0xFF}; + std::string_view value{reinterpret_cast(int64_chars), + sizeof(int64_chars)}; + VariantValue variant{VariantMetadata::kEmptyMetadataStringView, value}; + EXPECT_EQ(VariantType::Int64, variant.getType()); + EXPECT_EQ(-1L, variant.getInt64()); + } + { + std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; + auto variant = LoadVariantValue("primitive_float", &metadata_buf, &value_buf); + EXPECT_EQ(VariantType::Float, variant.getType()); + EXPECT_EQ("Float", variant.typeDebugString()); + EXPECT_FLOAT_EQ(1234567940.0f, variant.getFloat()); + } + { + std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; + auto variant = LoadVariantValue("primitive_double", &metadata_buf, &value_buf); + EXPECT_EQ(VariantType::Double, variant.getType()); + EXPECT_EQ("Double", variant.typeDebugString()); + EXPECT_DOUBLE_EQ(1234567890.1234, variant.getDouble()); + } + { + std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; + auto variant = LoadVariantValue("primitive_int32", &metadata_buf, &value_buf); + EXPECT_THROW(variant.getInt64(), ParquetException); + EXPECT_THROW(variant.getFloat(), ParquetException); + EXPECT_THROW(variant.getDouble(), ParquetException); + } +} + +TEST(ParquetVariant, StringValues) { + { + std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; + auto variant = LoadVariantValue("primitive_string", &metadata_buf, &value_buf); + EXPECT_EQ(VariantType::String, variant.getType()); + EXPECT_EQ("String", variant.typeDebugString()); + std::string expected = + R"(This string is longer than 64 bytes and therefore does not fit in a short_string and it also includes several non ascii characters such as 🐢, 💖, ♥️, 🎣 and 🤦!!)"; + EXPECT_EQ(expected, variant.getString()); + } + { + std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; + auto variant = LoadVariantValue("short_string", &metadata_buf, &value_buf); + EXPECT_EQ(VariantType::String, variant.getType()); + EXPECT_EQ(VariantBasicType::ShortString, variant.getBasicType()); + std::string expected = R"(Less than 64 bytes (❤️ with utf8))"; + EXPECT_EQ(expected, variant.getString()); + } + { + std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; + auto variant = LoadVariantValue("primitive_binary", &metadata_buf, &value_buf); + EXPECT_EQ(VariantType::Binary, variant.getType()); + EXPECT_EQ("Binary", variant.typeDebugString()); + auto binary_data = variant.getBinary(); + std::string expected = ::arrow::util::base64_decode("AxM33q2+78r+"); + EXPECT_EQ(expected, binary_data); + } + + { + std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; + auto variant = LoadVariantValue("primitive_int32", &metadata_buf, &value_buf); + EXPECT_THROW(variant.getString(), ParquetException); + EXPECT_THROW(variant.getBinary(), ParquetException); + } +} + +TEST(ParquetVariant, ObjectValues) { + std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; + auto variant = LoadVariantValue("object_primitive", &metadata_buf, &value_buf); + EXPECT_EQ(VariantType::Object, variant.getType()); + EXPECT_EQ("Object", variant.typeDebugString()); + + EXPECT_EQ(7, variant.num_elements()); + auto handle_int_field = [](const std::optional& value) { + EXPECT_TRUE(value.has_value()); + EXPECT_EQ(VariantType::Int8, value->getType()); + EXPECT_EQ(1, value->getInt8()); + }; + auto handle_double_field = [](const std::optional& value) { + EXPECT_TRUE(value.has_value()); + EXPECT_EQ(VariantType::Decimal4, value->getType()); + auto decimal_value = value->getDecimal4(); + EXPECT_EQ("1.23456789", decimal_value.value.ToString(decimal_value.scale)); + }; + auto handle_boolean_true_field = [](const std::optional& value) { + EXPECT_TRUE(value.has_value()); + EXPECT_EQ(VariantType::Boolean, value->getType()); + EXPECT_TRUE(value->getBool()); + }; + auto handle_boolean_false_field = [](const std::optional& value) { + EXPECT_TRUE(value.has_value()); + EXPECT_EQ(VariantType::Boolean, value->getType()); + EXPECT_FALSE(value->getBool()); + }; + auto handle_string_field = [](const std::optional& value) { + EXPECT_TRUE(value.has_value()); + EXPECT_EQ(VariantType::String, value->getType()); + EXPECT_EQ("Apache Parquet", value->getString()); + }; + auto handle_null_field = [](const std::optional& value) { + EXPECT_TRUE(value.has_value()); + EXPECT_EQ(VariantType::Null, value->getType()); + }; + auto handle_timestamp_field = [](const std::optional& value) { + EXPECT_TRUE(value.has_value()); + EXPECT_EQ(VariantType::String, value->getType()); + EXPECT_EQ("2025-04-16T12:34:56.78", value->getString()); + }; + + std::map& value)>> + key_handler = {{"int_field", handle_int_field}, + {"double_field", handle_double_field}, + {"boolean_true_field", handle_boolean_true_field}, + {"boolean_false_field", handle_boolean_false_field}, + {"string_field", handle_string_field}, + {"null_field", handle_null_field}, + {"timestamp_field", handle_timestamp_field}}; + // Test getObjectValueByKey with existing keys + { + ARROW_SCOPED_TRACE("Test getObjectValueByKey with existing keys"); + for (auto& [key, handler] : key_handler) { + auto value = variant.getObjectValueByKey(key); + handler(value); + } + } + // Test non-existing key + { + auto ne = variant.getObjectValueByKey("non_exists"); + EXPECT_FALSE(ne.has_value()); + } + // Test get by index + { + ARROW_SCOPED_TRACE("Test getObjectFieldByFieldId with existing indexes"); + for (uint32_t i = 0; i < variant.num_elements(); ++i) { + auto value = variant.getObjectFieldByFieldId(i); + auto key = variant.metadata().GetMetadataKey(i); + auto iter = key_handler.find(std::string(key)); + ASSERT_TRUE(iter != key_handler.end()); + auto handler = iter->second; + handler(value); + } + } + EXPECT_FALSE(variant.getObjectFieldByFieldId(100).has_value()); +} + +TEST(ParquetVariant, NestedObjectValues) { + std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; + auto variant = LoadVariantValue("object_nested", &metadata_buf, &value_buf); + EXPECT_EQ(VariantType::Object, variant.getType()); + EXPECT_EQ("Object", variant.typeDebugString()); + EXPECT_EQ(3, variant.num_elements()); + + // Trying to get the exists key + auto id = variant.getObjectValueByKey("id"); + ASSERT_TRUE(id.has_value()); + EXPECT_EQ(VariantType::Int8, id->getType()); + EXPECT_EQ(1, id->getInt8()); + + auto observation = variant.getObjectValueByKey("observation"); + ASSERT_TRUE(observation.has_value()); + EXPECT_EQ(VariantType::Object, observation->getType()); + + auto species = variant.getObjectValueByKey("species"); + ASSERT_TRUE(species.has_value()); + EXPECT_EQ(VariantType::Object, species->getType()); + + // Inner object works well + { + EXPECT_EQ(2, species->num_elements()); + auto name = species->getObjectValueByKey("name"); + ASSERT_TRUE(name.has_value()); + EXPECT_EQ(VariantType::String, name->getType()); + EXPECT_EQ("lava monster", name->getString()); + + auto population = species->getObjectValueByKey("population"); + ASSERT_TRUE(population.has_value()); + EXPECT_EQ(VariantType::Int16, population->getType()); + EXPECT_EQ(6789, population->getInt16()); + } + + // Get inner key outside will fail + { + std::vector observation_keys = {"location", "time", "value"}; + for (auto& key : observation_keys) { + // Only observation would get it successfully. + auto inner_value = observation->getObjectValueByKey(key); + ASSERT_TRUE(inner_value.has_value()); + + inner_value = variant.getObjectValueByKey(key); + ASSERT_FALSE(inner_value.has_value()); + + inner_value = species->getObjectValueByKey(key); + ASSERT_FALSE(inner_value.has_value()); + } + } + // Get outside keys in inner object + { + auto inner_value = observation->getObjectValueByKey("id"); + EXPECT_FALSE(inner_value.has_value()); + + inner_value = species->getObjectValueByKey("id"); + EXPECT_FALSE(inner_value.has_value()); + } +} + +TEST(ParquetVariant, DecimalValues) { + { + std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; + auto variant = LoadVariantValue("primitive_decimal4", &metadata_buf, &value_buf); + EXPECT_EQ(VariantType::Decimal4, variant.getType()); + EXPECT_EQ("Decimal4", variant.typeDebugString()); + auto decimal = variant.getDecimal4(); + EXPECT_EQ(2, decimal.scale); + EXPECT_EQ("12.34", decimal.value.ToString(decimal.scale)); + } + { + std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; + auto variant = LoadVariantValue("primitive_decimal8", &metadata_buf, &value_buf); + EXPECT_EQ(VariantType::Decimal8, variant.getType()); + EXPECT_EQ("Decimal8", variant.typeDebugString()); + auto decimal = variant.getDecimal8(); + EXPECT_EQ(2, decimal.scale); + EXPECT_EQ("12345678.90", decimal.value.ToString(decimal.scale)); + } + { + std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; + auto variant = LoadVariantValue("primitive_decimal16", &metadata_buf, &value_buf); + EXPECT_EQ(VariantType::Decimal16, variant.getType()); + EXPECT_EQ("Decimal16", variant.typeDebugString()); + auto decimal = variant.getDecimal16(); + EXPECT_EQ(2, decimal.scale); + EXPECT_EQ("12345678912345678.90", decimal.value.ToString(decimal.scale)); + } +} + +TEST(ParquetVariant, Uuid) { + std::string_view empty_metadata = VariantMetadata::kEmptyMetadataStringView; + const uint8_t uuid_chars[] = {primitiveHeader(VariantPrimitiveType::Uuid), + 0x00, + 0x11, + 0x22, + 0x33, + 0x44, + 0x55, + 0x66, + 0x77, + 0x88, + 0x99, + 0xAA, + 0xBB, + 0xCC, + 0xDD, + 0xEE, + 0xFF}; + std::string_view value(reinterpret_cast(uuid_chars), sizeof(uuid_chars)); + VariantValue variant(empty_metadata, value); + ASSERT_EQ(VariantType::Uuid, variant.getType()); + auto uuid_val = variant.getUuid(); + boost::uuids::uuid uuid{}; + for (size_t i = 0; i < uuid.size(); ++i) { + uuid.data[i] = uuid_val[i]; + } + EXPECT_EQ("00112233-4455-6677-8899-aabbccddeeff", to_string(uuid)); +} + +TEST(ParquetVariant, DateTimeValues) { + { + std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; + auto variant = LoadVariantValue("primitive_date", &metadata_buf, &value_buf); + EXPECT_EQ(VariantType::Date, variant.getType()); + EXPECT_EQ("Date", variant.typeDebugString()); + // 2025-04-16 + EXPECT_EQ(20194, variant.getDate()); + } + { + std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; + auto variant = LoadVariantValue("primitive_timestamp", &metadata_buf, &value_buf); + EXPECT_EQ(VariantType::TimestampMicrosTz, variant.getType()); + EXPECT_EQ("TimestampMicrosTz", variant.typeDebugString()); + EXPECT_EQ(1744821296780000, variant.getTimestampMicros()); + } + { + std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; + auto variant = LoadVariantValue("primitive_timestampntz", &metadata_buf, &value_buf); + EXPECT_EQ(VariantType::TimestampMicrosNtz, variant.getType()); + EXPECT_EQ("TimestampMicrosNtz", variant.typeDebugString()); + EXPECT_EQ(1744806896780000, variant.getTimestampMicrosNtz()); + } + { + // Timestamp Nanos tz negative + const uint8_t timestamp_nanos_ntz_chars[] = { + primitiveHeader(VariantPrimitiveType::TimestampNanosTz), + 0xFF, + 0xFF, + 0xFF, + 0xFF, + 0xFF, + 0xFF, + 0xFF, + 0xFF}; + std::string_view value{reinterpret_cast(timestamp_nanos_ntz_chars), + sizeof(timestamp_nanos_ntz_chars)}; + VariantValue variant{VariantMetadata::kEmptyMetadataStringView, value}; + EXPECT_EQ(VariantType::TimestampNanosTz, variant.getType()); + EXPECT_EQ(-1L, variant.getTimestampNanosTz()); + } + { + // Timestamp Nanos tz negative + const uint8_t timestamp_nanos_ntz_chars[] = { + primitiveHeader(VariantPrimitiveType::TimestampNanosTz), + 0x15, + 0xC9, + 0xBB, + 0x86, + 0xB4, + 0x0C, + 0x37, + 0x18}; + std::string_view value{reinterpret_cast(timestamp_nanos_ntz_chars), + sizeof(timestamp_nanos_ntz_chars)}; + VariantValue variant{VariantMetadata::kEmptyMetadataStringView, value}; + EXPECT_EQ(VariantType::TimestampNanosTz, variant.getType()); + EXPECT_EQ(1744877350123456789L, variant.getTimestampNanosTz()); + } + { + // Timestamp Nanos Ntz + std::string_view empty_metadata(VariantMetadata::kEmptyMetadataChars, 3); + const uint8_t timestamp_nanos_ntz_chars[] = { + primitiveHeader(VariantPrimitiveType::TimestampNanosNtz), + 0x15, + 0xC9, + 0xBB, + 0x86, + 0xB4, + 0x0C, + 0x37, + 0x18}; + std::string_view value{reinterpret_cast(timestamp_nanos_ntz_chars), + sizeof(timestamp_nanos_ntz_chars)}; + VariantValue variant{empty_metadata, value}; + EXPECT_EQ(VariantType::TimestampNanosNtz, variant.getType()); + EXPECT_EQ(1744877350123456789L, variant.getTimestampNanosNtz()); + } +} + +TEST(ParquetVariant, ArrayValues) { + { + std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; + auto variant = LoadVariantValue("array_primitive", &metadata_buf, &value_buf); + EXPECT_EQ(VariantType::Array, variant.getType()); + EXPECT_EQ("Array", variant.typeDebugString()); + + EXPECT_EQ(4, variant.num_elements()); + + auto element0 = variant.getArrayValueByIndex(0); + EXPECT_EQ(VariantType::Int8, element0.getType()); + EXPECT_EQ(2, element0.getInt8()); + + auto element1 = variant.getArrayValueByIndex(1); + EXPECT_EQ(VariantType::Int8, element1.getType()); + EXPECT_EQ(1, element1.getInt8()); + + auto element2 = variant.getArrayValueByIndex(2); + EXPECT_EQ(VariantType::Int8, element2.getType()); + EXPECT_EQ(5, element2.getInt8()); + + auto element3 = variant.getArrayValueByIndex(3); + EXPECT_EQ(VariantType::Int8, element3.getType()); + EXPECT_EQ(9, element3.getInt8()); + + EXPECT_THROW(variant.getArrayValueByIndex(4), ParquetException); + EXPECT_THROW(variant.getArrayValueByIndex(100), ParquetException); + EXPECT_THROW(variant.getObjectValueByKey("10"), ParquetException); + EXPECT_THROW(variant.getObjectFieldByFieldId(10), ParquetException); + } + { + // array_empty + std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; + auto variant = LoadVariantValue("array_empty", &metadata_buf, &value_buf); + EXPECT_EQ(VariantType::Array, variant.getType()); + EXPECT_EQ("Array", variant.typeDebugString()); + EXPECT_EQ(0, variant.num_elements()); + + EXPECT_THROW(variant.getArrayValueByIndex(0), ParquetException); + EXPECT_THROW(variant.getObjectValueByKey("key"), ParquetException); + } +} + +TEST(ParquetVariant, ArrayValuesNested) { + std::shared_ptr<::arrow::Buffer> metadata_buf, value_buf; + auto variant = LoadVariantValue("array_nested", &metadata_buf, &value_buf); + EXPECT_EQ(VariantType::Array, variant.getType()); + EXPECT_EQ("Array", variant.typeDebugString()); + EXPECT_EQ(3, variant.num_elements()); + { + auto first_element = variant.getArrayValueByIndex(0); + EXPECT_EQ(VariantType::Object, first_element.getType()); + EXPECT_EQ(2, first_element.num_elements()); + auto id = first_element.getObjectValueByKey("id"); + ASSERT_TRUE(id.has_value()); + EXPECT_EQ(VariantType::Int8, id->getType()); + EXPECT_EQ(1, id->getInt8()); + } + { + auto second_element = variant.getArrayValueByIndex(1); + EXPECT_EQ(VariantType::Null, second_element.getType()); + } + { + auto third_element = variant.getArrayValueByIndex(2); + EXPECT_EQ(VariantType::Object, third_element.getType()); + EXPECT_EQ(3, third_element.num_elements()); + auto id = third_element.getObjectValueByKey("id"); + ASSERT_TRUE(id.has_value()); + EXPECT_EQ(VariantType::Int8, id->getType()); + EXPECT_EQ(2, id->getInt8()); + } +} + +} // namespace parquet::variant diff --git a/cpp/submodules/parquet-testing b/cpp/submodules/parquet-testing index 18d17540097f..e7b28dd520fb 160000 --- a/cpp/submodules/parquet-testing +++ b/cpp/submodules/parquet-testing @@ -1 +1 @@ -Subproject commit 18d17540097fca7c40be3d42c167e6bfad90763c +Subproject commit e7b28dd520fb3c9f0908daa84c0ef20d83c73794