From d5862ef979a1bf2c2ada7b0e22720349bfeae49b Mon Sep 17 00:00:00 2001 From: "jinli.zjw" Date: Fri, 23 Jan 2026 17:49:57 +0800 Subject: [PATCH 01/12] feat(format): optimize avro read performance and support reading various nested types --- LICENSE | 3 + src/paimon/common/utils/status.cpp | 2 +- .../core/io/row_to_arrow_array_converter.h | 2 +- .../core/operation/abstract_split_read.cpp | 2 +- src/paimon/format/avro/CMakeLists.txt | 6 +- src/paimon/format/avro/avro_adaptor_test.cpp | 12 - .../format/avro/avro_array_data_getter.cpp | 91 ---- .../format/avro/avro_array_data_getter.h | 105 ---- .../avro/avro_array_data_getter_test.cpp | 96 ---- .../format/avro/avro_datum_data_getter.h | 149 ------ .../format/avro/avro_direct_decoder.cpp | 459 ++++++++++++++++++ src/paimon/format/avro/avro_direct_decoder.h | 63 +++ .../format/avro/avro_file_batch_reader.cpp | 151 ++++-- .../format/avro/avro_file_batch_reader.h | 44 +- .../avro/avro_file_batch_reader_test.cpp | 21 +- src/paimon/format/avro/avro_file_format.cpp | 1 + .../format/avro/avro_file_format_factory.cpp | 6 + .../format/avro/avro_file_format_factory.h | 8 + .../format/avro/avro_output_stream_impl.cpp | 13 +- .../format/avro/avro_output_stream_impl.h | 2 + src/paimon/format/avro/avro_reader_builder.h | 16 +- .../format/avro/avro_record_converter.cpp | 90 ---- .../format/avro/avro_record_converter.h | 62 --- .../avro/avro_record_converter_test.cpp | 99 ---- .../format/avro/avro_record_data_getter.cpp | 127 ----- .../format/avro/avro_record_data_getter.h | 76 --- .../format/avro/avro_schema_converter.cpp | 162 ++++++- .../format/avro/avro_schema_converter.h | 14 +- .../avro/avro_schema_converter_test.cpp | 2 + 29 files changed, 850 insertions(+), 1034 deletions(-) delete mode 100644 src/paimon/format/avro/avro_array_data_getter.cpp delete mode 100644 src/paimon/format/avro/avro_array_data_getter.h delete mode 100644 src/paimon/format/avro/avro_array_data_getter_test.cpp delete mode 100644 src/paimon/format/avro/avro_datum_data_getter.h create mode 100644 src/paimon/format/avro/avro_direct_decoder.cpp create mode 100644 src/paimon/format/avro/avro_direct_decoder.h delete mode 100644 src/paimon/format/avro/avro_record_converter.cpp delete mode 100644 src/paimon/format/avro/avro_record_converter.h delete mode 100644 src/paimon/format/avro/avro_record_converter_test.cpp delete mode 100644 src/paimon/format/avro/avro_record_data_getter.cpp delete mode 100644 src/paimon/format/avro/avro_record_data_getter.h diff --git a/LICENSE b/LICENSE index fc0ef881..6f30c0b5 100644 --- a/LICENSE +++ b/LICENSE @@ -249,6 +249,9 @@ This product includes code from Apache Iceberg C++. * .devcontainer/devcontainer.json.template * CI utilities: * .pre-commit-config.yaml +* Avro direct decoder/encoder: + * src/paimon/format/avro/avro_direct_decoder.cpp + * src/paimon/format/avro/avro_direct_decoder.h Copyright: 2024-2025 The Apache Software Foundation. Home page: https://iceberg.apache.org/ diff --git a/src/paimon/common/utils/status.cpp b/src/paimon/common/utils/status.cpp index 5f21b08b..0ee1a533 100644 --- a/src/paimon/common/utils/status.cpp +++ b/src/paimon/common/utils/status.cpp @@ -142,7 +142,7 @@ void Status::AddContextLine(const char* filename, int line, const char* function const char* expr) { assert(!ok() && "Cannot add context line to ok status"); std::stringstream ss; - ss << "\nIn " << filename << ", line " << line << ", function: " << function_name + ss << "\nIn " << filename << ":" << line << ", function: " << function_name << ", code: " << expr; state_->msg += ss.str(); } diff --git a/src/paimon/core/io/row_to_arrow_array_converter.h b/src/paimon/core/io/row_to_arrow_array_converter.h index fd8b307b..9d8ea932 100644 --- a/src/paimon/core/io/row_to_arrow_array_converter.h +++ b/src/paimon/core/io/row_to_arrow_array_converter.h @@ -167,7 +167,7 @@ Status RowToArrowArrayConverter::Reserve(arrow::ArrayBuilder* array_builde PAIMON_ASSIGN_OR_RAISE(auto* struct_builder, CastToTypedBuilder(array_builder)); for (int32_t i = 0; i < struct_builder->num_fields(); i++) { - // reserve item builder in map + // reserve item builder in struct PAIMON_RETURN_NOT_OK(Reserve(struct_builder->field_builder(i), idx)); } break; diff --git a/src/paimon/core/operation/abstract_split_read.cpp b/src/paimon/core/operation/abstract_split_read.cpp index 9a7cc001..a792c09f 100644 --- a/src/paimon/core/operation/abstract_split_read.cpp +++ b/src/paimon/core/operation/abstract_split_read.cpp @@ -149,7 +149,7 @@ Result> AbstractSplitRead::CreateFileBatchReade } // TODO(zhanyu.fyh): orc format support prefetch if (context_->EnablePrefetch() && file_format_identifier != "blob" && - file_format_identifier != "orc") { + file_format_identifier != "orc" && file_format_identifier != "avro") { PAIMON_ASSIGN_OR_RAISE(std::unique_ptr prefetch_reader, PrefetchFileBatchReaderImpl::Create( data_file_path, reader_builder, options_.GetFileSystem(), diff --git a/src/paimon/format/avro/CMakeLists.txt b/src/paimon/format/avro/CMakeLists.txt index 0811b2cf..97c4113a 100644 --- a/src/paimon/format/avro/CMakeLists.txt +++ b/src/paimon/format/avro/CMakeLists.txt @@ -16,15 +16,13 @@ if(PAIMON_ENABLE_AVRO) set(PAIMON_AVRO_FILE_FORMAT avro_adaptor.cpp - avro_array_data_getter.cpp + avro_direct_decoder.cpp avro_file_batch_reader.cpp avro_file_format.cpp avro_file_format_factory.cpp avro_format_writer.cpp avro_input_stream_impl.cpp avro_output_stream_impl.cpp - avro_record_converter.cpp - avro_record_data_getter.cpp avro_schema_converter.cpp) add_paimon_lib(paimon_avro_file_format @@ -55,10 +53,8 @@ if(PAIMON_ENABLE_AVRO) avro_file_batch_reader_test.cpp avro_file_format_test.cpp avro_input_stream_impl_test.cpp - avro_record_converter_test.cpp avro_schema_converter_test.cpp avro_writer_builder_test.cpp - avro_array_data_getter_test.cpp EXTRA_INCLUDES ${AVRO_INCLUDE_DIR} STATIC_LINK_LIBS diff --git a/src/paimon/format/avro/avro_adaptor_test.cpp b/src/paimon/format/avro/avro_adaptor_test.cpp index 0cc8140c..34cf6624 100644 --- a/src/paimon/format/avro/avro_adaptor_test.cpp +++ b/src/paimon/format/avro/avro_adaptor_test.cpp @@ -27,7 +27,6 @@ #include "gtest/gtest.h" #include "paimon/common/utils/arrow/mem_utils.h" #include "paimon/core/utils/manifest_meta_reader.h" -#include "paimon/format/avro/avro_record_converter.h" #include "paimon/format/avro/avro_schema_converter.h" #include "paimon/memory/memory_pool.h" #include "paimon/status.h" @@ -62,17 +61,6 @@ TEST(AvroAdaptorTest, Simple) { ASSERT_OK_AND_ASSIGN(std::vector<::avro::GenericDatum> datums, adaptor.ConvertArrayToGenericDatums(array, avro_schema)); ASSERT_EQ(4, datums.size()); - ASSERT_OK_AND_ASSIGN(auto record_converter, - AvroRecordConverter::Create(data_type, GetDefaultPool())); - auto read_batch_result = record_converter->NextBatch(datums); - ASSERT_OK(read_batch_result); - auto [c_array, c_schema] = std::move(read_batch_result).value(); - - auto arrow_array = arrow::ImportArray(c_array.get(), c_schema.get()).ValueOrDie(); - auto arrow_pool = GetArrowPool(GetDefaultPool()); - ASSERT_OK_AND_ASSIGN(arrow_array, ManifestMetaReader::AlignArrayWithSchema( - arrow_array, data_type, arrow_pool.get())); - ASSERT_TRUE(array->Equals(arrow_array)); } } // namespace paimon::avro::test diff --git a/src/paimon/format/avro/avro_array_data_getter.cpp b/src/paimon/format/avro/avro_array_data_getter.cpp deleted file mode 100644 index 1801247b..00000000 --- a/src/paimon/format/avro/avro_array_data_getter.cpp +++ /dev/null @@ -1,91 +0,0 @@ -/* - * Copyright 2024-present Alibaba Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "paimon/format/avro/avro_array_data_getter.h" - -#include "avro/GenericDatum.hh" -#include "paimon/format/avro/avro_datum_data_getter.h" - -namespace paimon::avro { - -bool AvroArrayDataGetter::IsNullAt(int32_t pos) const { - assert(pos < Size()); - return AvroDatumDataGetter::IsNullAt(array_.value()[pos]); -} -bool AvroArrayDataGetter::GetBoolean(int32_t pos) const { - assert(pos < Size()); - return AvroDatumDataGetter::GetBoolean(array_.value()[pos]); -} -char AvroArrayDataGetter::GetByte(int32_t pos) const { - assert(pos < Size()); - return AvroDatumDataGetter::GetByte(array_.value()[pos]); -} -int16_t AvroArrayDataGetter::GetShort(int32_t pos) const { - assert(pos < Size()); - return AvroDatumDataGetter::GetShort(array_.value()[pos]); -} -int32_t AvroArrayDataGetter::GetInt(int32_t pos) const { - assert(pos < Size()); - return AvroDatumDataGetter::GetInt(array_.value()[pos]); -} -int32_t AvroArrayDataGetter::GetDate(int32_t pos) const { - assert(pos < Size()); - return AvroDatumDataGetter::GetDate(array_.value()[pos]); -} -int64_t AvroArrayDataGetter::GetLong(int32_t pos) const { - assert(pos < Size()); - return AvroDatumDataGetter::GetLong(array_.value()[pos]); -} -float AvroArrayDataGetter::GetFloat(int32_t pos) const { - assert(pos < Size()); - return AvroDatumDataGetter::GetFloat(array_.value()[pos]); -} -double AvroArrayDataGetter::GetDouble(int32_t pos) const { - assert(pos < Size()); - return AvroDatumDataGetter::GetDouble(array_.value()[pos]); -} -BinaryString AvroArrayDataGetter::GetString(int32_t pos) const { - assert(pos < Size()); - return AvroDatumDataGetter::GetString(array_.value()[pos], pool_); -} -std::string_view AvroArrayDataGetter::GetStringView(int32_t pos) const { - assert(pos < Size()); - return AvroDatumDataGetter::GetStringView(array_.value()[pos]); -} -Decimal AvroArrayDataGetter::GetDecimal(int32_t pos, int32_t precision, int32_t scale) const { - assert(pos < Size()); - return AvroDatumDataGetter::GetDecimal(array_.value()[pos], precision, scale, pool_); -} -Timestamp AvroArrayDataGetter::GetTimestamp(int32_t pos, int32_t precision) const { - assert(pos < Size()); - return AvroDatumDataGetter::GetTimestamp(array_.value()[pos], precision); -} -std::shared_ptr AvroArrayDataGetter::GetBinary(int32_t pos) const { - assert(pos < Size()); - return AvroDatumDataGetter::GetBinary(array_.value()[pos], pool_); -} - -std::shared_ptr AvroArrayDataGetter::GetArray(int32_t pos) const { - assert(pos < Size()); - return AvroDatumDataGetter::GetArray(array_.value()[pos], pool_); -} - -std::shared_ptr AvroArrayDataGetter::GetRow(int32_t pos, int32_t num_fields) const { - assert(pos < Size()); - return AvroDatumDataGetter::GetRow(array_.value()[pos], num_fields, pool_); -} - -} // namespace paimon::avro diff --git a/src/paimon/format/avro/avro_array_data_getter.h b/src/paimon/format/avro/avro_array_data_getter.h deleted file mode 100644 index cd102073..00000000 --- a/src/paimon/format/avro/avro_array_data_getter.h +++ /dev/null @@ -1,105 +0,0 @@ -/* - * Copyright 2024-present Alibaba Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include -#include -#include - -#include "avro/GenericDatum.hh" -#include "paimon/common/data/binary_string.h" -#include "paimon/common/data/internal_array.h" -#include "paimon/common/data/internal_row.h" -#include "paimon/data/decimal.h" -#include "paimon/data/timestamp.h" -#include "paimon/memory/bytes.h" -#include "paimon/result.h" -#include "paimon/status.h" - -namespace paimon { -class InternalMap; -class MemoryPool; -} // namespace paimon - -namespace paimon::avro { - -class AvroArrayDataGetter : public InternalArray { - public: - AvroArrayDataGetter(const ::avro::GenericArray& array, const std::shared_ptr& pool) - : array_(array), pool_(pool) {} - - bool IsNullAt(int32_t pos) const override; - bool GetBoolean(int32_t pos) const override; - char GetByte(int32_t pos) const override; - int16_t GetShort(int32_t pos) const override; - int32_t GetInt(int32_t pos) const override; - int32_t GetDate(int32_t pos) const override; - int64_t GetLong(int32_t pos) const override; - float GetFloat(int32_t pos) const override; - double GetDouble(int32_t pos) const override; - BinaryString GetString(int32_t pos) const override; - std::string_view GetStringView(int32_t pos) const override; - Decimal GetDecimal(int32_t pos, int32_t precision, int32_t scale) const override; - Timestamp GetTimestamp(int32_t pos, int32_t precision) const override; - std::shared_ptr GetBinary(int32_t pos) const override; - std::shared_ptr GetArray(int32_t pos) const override; - std::shared_ptr GetRow(int32_t pos, int32_t num_fields) const override; - - int32_t Size() const override { - return array_.value().size(); - } - std::shared_ptr GetMap(int32_t pos) const override { - assert(false); - return nullptr; - } - Result> ToBooleanArray() const override { - assert(false); - return Status::NotImplemented("not implemented"); - } - Result> ToByteArray() const override { - assert(false); - return Status::NotImplemented("not implemented"); - } - Result> ToShortArray() const override { - assert(false); - return Status::NotImplemented("not implemented"); - } - Result> ToIntArray() const override { - assert(false); - return Status::NotImplemented("not implemented"); - } - Result> ToLongArray() const override { - assert(false); - return Status::NotImplemented("not implemented"); - } - Result> ToFloatArray() const override { - assert(false); - return Status::NotImplemented("not implemented"); - } - Result> ToDoubleArray() const override { - assert(false); - return Status::NotImplemented("not implemented"); - } - - private: - const ::avro::GenericArray& array_; - std::shared_ptr pool_; -}; - -} // namespace paimon::avro diff --git a/src/paimon/format/avro/avro_array_data_getter_test.cpp b/src/paimon/format/avro/avro_array_data_getter_test.cpp deleted file mode 100644 index 7de5b38e..00000000 --- a/src/paimon/format/avro/avro_array_data_getter_test.cpp +++ /dev/null @@ -1,96 +0,0 @@ -/* - * Copyright 2024-present Alibaba Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "paimon/format/avro/avro_array_data_getter.h" - -#include -#include - -#include "avro/GenericDatum.hh" -#include "avro/Schema.hh" -#include "gtest/gtest.h" -#include "paimon/memory/memory_pool.h" - -namespace paimon::avro::test { - -TEST(AvroArrayDataGetterTest, TestBasic) { - { - ::avro::GenericArray bool_array(::avro::ArraySchema(::avro::BoolSchema()).root()); - bool_array.value().push_back(true); - bool_array.value().push_back(false); - AvroArrayDataGetter getter(bool_array, GetDefaultPool()); - ASSERT_EQ(true, getter.GetBoolean(0)); - ASSERT_EQ(false, getter.GetBoolean(1)); - } - { - ::avro::GenericArray char_array(::avro::ArraySchema(::avro::IntSchema()).root()); - char_array.value().push_back(std::numeric_limits::max()); - char_array.value().push_back(std::numeric_limits::min()); - AvroArrayDataGetter getter(char_array, GetDefaultPool()); - ASSERT_EQ(std::numeric_limits::max(), getter.GetByte(0)); - ASSERT_EQ(std::numeric_limits::min(), getter.GetByte(1)); - } - { - ::avro::GenericArray short_array(::avro::ArraySchema(::avro::IntSchema()).root()); - short_array.value().push_back(std::numeric_limits::max()); - short_array.value().push_back(std::numeric_limits::min()); - AvroArrayDataGetter getter(short_array, GetDefaultPool()); - ASSERT_EQ(std::numeric_limits::max(), getter.GetShort(0)); - ASSERT_EQ(std::numeric_limits::min(), getter.GetShort(1)); - } - { - ::avro::GenericArray int_array(::avro::ArraySchema(::avro::IntSchema()).root()); - int_array.value().push_back(std::numeric_limits::max()); - int_array.value().push_back(std::numeric_limits::min()); - AvroArrayDataGetter getter(int_array, GetDefaultPool()); - ASSERT_EQ(std::numeric_limits::max(), getter.GetInt(0)); - ASSERT_EQ(std::numeric_limits::min(), getter.GetInt(1)); - } - { - ::avro::GenericArray long_array(::avro::ArraySchema(::avro::LongSchema()).root()); - long_array.value().push_back(std::numeric_limits::max()); - long_array.value().push_back(std::numeric_limits::min()); - AvroArrayDataGetter getter(long_array, GetDefaultPool()); - ASSERT_EQ(std::numeric_limits::max(), getter.GetLong(0)); - ASSERT_EQ(std::numeric_limits::min(), getter.GetLong(1)); - } - { - ::avro::GenericArray float_array(::avro::ArraySchema(::avro::FloatSchema()).root()); - float_array.value().push_back(std::numeric_limits::max()); - float_array.value().push_back(std::numeric_limits::min()); - AvroArrayDataGetter getter(float_array, GetDefaultPool()); - ASSERT_EQ(std::numeric_limits::max(), getter.GetFloat(0)); - ASSERT_EQ(std::numeric_limits::min(), getter.GetFloat(1)); - } - { - ::avro::GenericArray double_array(::avro::ArraySchema(::avro::DoubleSchema()).root()); - double_array.value().push_back(std::numeric_limits::max()); - double_array.value().push_back(std::numeric_limits::min()); - AvroArrayDataGetter getter(double_array, GetDefaultPool()); - ASSERT_EQ(std::numeric_limits::max(), getter.GetDouble(0)); - ASSERT_EQ(std::numeric_limits::min(), getter.GetDouble(1)); - } - { - ::avro::GenericArray string_array(::avro::ArraySchema(::avro::StringSchema()).root()); - string_array.value().push_back(std::string("apple")); - string_array.value().push_back(std::string("banana")); - AvroArrayDataGetter getter(string_array, GetDefaultPool()); - ASSERT_EQ("apple", getter.GetString(0).ToString()); - ASSERT_EQ("banana", getter.GetString(1).ToString()); - } -} - -} // namespace paimon::avro::test diff --git a/src/paimon/format/avro/avro_datum_data_getter.h b/src/paimon/format/avro/avro_datum_data_getter.h deleted file mode 100644 index 1f354f00..00000000 --- a/src/paimon/format/avro/avro_datum_data_getter.h +++ /dev/null @@ -1,149 +0,0 @@ -/* - * Copyright 2024-present Alibaba Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include - -#include "avro/GenericDatum.hh" -#include "paimon/common/data/internal_array.h" -#include "paimon/common/data/internal_row.h" -#include "paimon/common/utils/date_time_utils.h" -#include "paimon/format/avro/avro_array_data_getter.h" -#include "paimon/format/avro/avro_record_data_getter.h" - -namespace paimon::avro { - -class AvroDatumDataGetter { - public: - static bool IsNullAt(const ::avro::GenericDatum& datum) { - return (datum.type() == ::avro::AVRO_NULL); - } - static bool GetBoolean(const ::avro::GenericDatum& datum) { - assert(datum.type() == ::avro::AVRO_BOOL); - return datum.value(); - } - static char GetByte(const ::avro::GenericDatum& datum) { - assert(datum.type() == ::avro::AVRO_INT); - return datum.value(); - } - static int16_t GetShort(const ::avro::GenericDatum& datum) { - assert(datum.type() == ::avro::AVRO_INT); - return datum.value(); - } - static int32_t GetInt(const ::avro::GenericDatum& datum) { - assert(datum.type() == ::avro::AVRO_INT); - return datum.value(); - } - static int32_t GetDate(const ::avro::GenericDatum& datum) { - assert(datum.type() == ::avro::AVRO_INT); - return datum.value(); - } - static int64_t GetLong(const ::avro::GenericDatum& datum) { - assert(datum.type() == ::avro::AVRO_LONG); - return datum.value(); - } - static float GetFloat(const ::avro::GenericDatum& datum) { - assert(datum.type() == ::avro::AVRO_FLOAT); - return datum.value(); - } - static double GetDouble(const ::avro::GenericDatum& datum) { - assert(datum.type() == ::avro::AVRO_DOUBLE); - return datum.value(); - } - static BinaryString GetString(const ::avro::GenericDatum& datum, - const std::shared_ptr& pool) { - assert(datum.type() == ::avro::AVRO_STRING); - return BinaryString::FromString(datum.value(), pool.get()); - } - static std::string_view GetStringView(const ::avro::GenericDatum& datum) { - if (datum.type() == ::avro::AVRO_STRING) { - return {datum.value()}; - } else if (datum.type() == ::avro::AVRO_BYTES) { - const auto& binary = datum.value>(); - return {reinterpret_cast(binary.data()), binary.size()}; - } else { - assert(false); - return {""}; - } - } - static Decimal GetDecimal(const ::avro::GenericDatum& datum, int32_t precision, int32_t scale, - const std::shared_ptr& pool) { - assert(datum.type() == ::avro::AVRO_BYTES); - auto logical_type = datum.logicalType(); - switch (logical_type.type()) { - case ::avro::LogicalType::DECIMAL: { - auto bytes = GetBinary(datum, pool); - assert(logical_type.precision() == precision && logical_type.scale() == scale); - return Decimal::FromUnscaledBytes(precision, scale, bytes.get()); - } - default: - assert(false); - return Decimal::FromUnscaledLong(0, 0, 0); - } - } - static Timestamp GetTimestamp(const ::avro::GenericDatum& datum, int32_t precision) { - assert(datum.type() == ::avro::AVRO_LONG); - switch (datum.logicalType().type()) { - case ::avro::LogicalType::TIMESTAMP_MILLIS: - case ::avro::LogicalType::LOCAL_TIMESTAMP_MILLIS: - return Timestamp(/*millisecond=*/datum.value(), /*nano_of_millisecond=*/0); - case ::avro::LogicalType::TIMESTAMP_MICROS: - case ::avro::LogicalType::LOCAL_TIMESTAMP_MICROS: { - auto [milliseconds, nanoseconds] = DateTimeUtils::TimestampConverter( - datum.value(), DateTimeUtils::MICROSECOND, DateTimeUtils::MILLISECOND, - DateTimeUtils::NANOSECOND); - return Timestamp(milliseconds, nanoseconds); - } - case ::avro::LogicalType::TIMESTAMP_NANOS: - case ::avro::LogicalType::LOCAL_TIMESTAMP_NANOS: { - assert(false); // Java Avro do not support TIMESTAMP_NANOS, should not call this - auto [milliseconds, nanoseconds] = DateTimeUtils::TimestampConverter( - datum.value(), DateTimeUtils::NANOSECOND, DateTimeUtils::MILLISECOND, - DateTimeUtils::NANOSECOND); - return Timestamp(milliseconds, nanoseconds); - } - default: - assert(false); // do not have TIMESTAMP_SECONDS/LOCAL_TIMESTAMP_SECONDS - return Timestamp(/*millisecond=*/0, /*nano_of_millisecond=*/0); - } - } - - static std::shared_ptr GetBinary(const ::avro::GenericDatum& datum, - const std::shared_ptr& pool) { - assert(datum.type() == ::avro::AVRO_BYTES); - const auto& binary = datum.value>(); - return std::make_shared( - std::string(reinterpret_cast(binary.data()), binary.size()), pool.get()); - } - static std::shared_ptr GetArray(const ::avro::GenericDatum& datum, - const std::shared_ptr& pool) { - assert(datum.type() == ::avro::AVRO_ARRAY); - return std::make_shared(datum.value<::avro::GenericArray>(), pool); - } - static std::shared_ptr GetRow(const ::avro::GenericDatum& datum, - int32_t num_fields, - const std::shared_ptr& pool) { - assert(datum.type() == ::avro::AVRO_RECORD); - const auto& record = datum.value<::avro::GenericRecord>(); - assert(record.fieldCount() == static_cast(num_fields)); - return std::make_shared(record, pool); - } -}; - -} // namespace paimon::avro diff --git a/src/paimon/format/avro/avro_direct_decoder.cpp b/src/paimon/format/avro/avro_direct_decoder.cpp new file mode 100644 index 00000000..209f6822 --- /dev/null +++ b/src/paimon/format/avro/avro_direct_decoder.cpp @@ -0,0 +1,459 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +// Adapted from Apache Iceberg C++ +// https://github.com/apache/iceberg-cpp/blob/main/src/iceberg/avro/avro_direct_decoder.cc + +#include "paimon/format/avro/avro_direct_decoder.h" + +#include "arrow/api.h" +#include "arrow/util/checked_cast.h" +#include "avro/Decoder.hh" +#include "avro/Node.hh" +#include "avro/Types.hh" +#include "paimon/common/utils/arrow/status_utils.h" +#include "paimon/common/utils/date_time_utils.h" + +namespace paimon::avro { + +namespace { + +std::string ToString(const ::avro::NodePtr& node) { + std::stringstream ss; + ss << *node; + return ss.str(); +} + +bool HasMapLogicalType(const ::avro::NodePtr& node) { + return node->logicalType().type() == ::avro::LogicalType::CUSTOM && + node->logicalType().customLogicalType() != nullptr && + node->logicalType().customLogicalType()->name() == "map"; +} + +/// Forward declaration for mutual recursion. +Status DecodeFieldToBuilder(const ::avro::NodePtr& avro_node, + const std::optional>& projection, + ::avro::Decoder& decoder, arrow::ArrayBuilder* array_builder, + AvroDirectDecoder::DecodeContext& ctx); + +/// \brief Skip an Avro value based on its schema without decoding +Status SkipAvroValue(const ::avro::NodePtr& avro_node, ::avro::Decoder& decoder) { + switch (avro_node->type()) { + case ::avro::AVRO_NULL: + decoder.decodeNull(); + return Status::OK(); + + case ::avro::AVRO_BOOL: + decoder.decodeBool(); + return Status::OK(); + + case ::avro::AVRO_INT: + decoder.decodeInt(); + return Status::OK(); + + case ::avro::AVRO_LONG: + decoder.decodeLong(); + return Status::OK(); + + case ::avro::AVRO_FLOAT: + decoder.decodeFloat(); + return Status::OK(); + + case ::avro::AVRO_DOUBLE: + decoder.decodeDouble(); + return Status::OK(); + + case ::avro::AVRO_STRING: + decoder.skipString(); + return Status::OK(); + + case ::avro::AVRO_BYTES: + decoder.skipBytes(); + return Status::OK(); + + case ::avro::AVRO_FIXED: + decoder.skipFixed(avro_node->fixedSize()); + return Status::OK(); + + case ::avro::AVRO_RECORD: { + // Skip all fields in order + for (size_t i = 0; i < avro_node->leaves(); ++i) { + PAIMON_RETURN_NOT_OK(SkipAvroValue(avro_node->leafAt(i), decoder)); + } + return Status::OK(); + } + + case ::avro::AVRO_ENUM: + decoder.decodeEnum(); + return Status::OK(); + + case ::avro::AVRO_ARRAY: { + const auto& element_node = avro_node->leafAt(0); + // skipArray() returns count like arrayStart(), must handle all blocks + int64_t block_count = decoder.skipArray(); + while (block_count > 0) { + for (int64_t i = 0; i < block_count; ++i) { + PAIMON_RETURN_NOT_OK(SkipAvroValue(element_node, decoder)); + } + block_count = decoder.arrayNext(); + } + return Status::OK(); + } + + case ::avro::AVRO_MAP: { + const auto& value_node = avro_node->leafAt(1); + // skipMap() returns count like mapStart(), must handle all blocks + int64_t block_count = decoder.skipMap(); + while (block_count > 0) { + for (int64_t i = 0; i < block_count; ++i) { + decoder.skipString(); // Skip key (always string in Avro maps) + PAIMON_RETURN_NOT_OK(SkipAvroValue(value_node, decoder)); + } + block_count = decoder.mapNext(); + } + return Status::OK(); + } + + case ::avro::AVRO_UNION: { + const size_t branch_index = decoder.decodeUnionIndex(); + // Validate branch index + const size_t num_branches = avro_node->leaves(); + if (branch_index >= num_branches) { + return Status::Invalid(fmt::format("Union branch index {} out of range [0, {})", + branch_index, num_branches)); + } + return SkipAvroValue(avro_node->leafAt(branch_index), decoder); + } + + default: + return Status::Invalid( + fmt::format("Unsupported Avro type for skipping: {}", ToString(avro_node))); + } +} + +/// Decode Avro record directly to Arrow struct builder. +Status DecodeStructToBuilder(const ::avro::NodePtr& avro_node, + const std::optional>& projection, + ::avro::Decoder& decoder, arrow::ArrayBuilder* array_builder, + AvroDirectDecoder::DecodeContext& ctx) { + if (avro_node->type() != ::avro::AVRO_RECORD) { + return Status::Invalid( + fmt::format("Expected Avro record, got type: {}", ToString(avro_node))); + } + + auto* struct_builder = arrow::internal::checked_cast(array_builder); + PAIMON_RETURN_NOT_OK_FROM_ARROW(struct_builder->Append()); + + size_t skipped_fields = 0; + // Read all Avro fields in order (must maintain decoder position) + for (size_t avro_idx = 0; avro_idx < avro_node->leaves(); ++avro_idx) { + if (projection && projection->find(avro_idx) == projection->end()) { + skipped_fields++; + PAIMON_RETURN_NOT_OK(SkipAvroValue(avro_node->leafAt(avro_idx), decoder)); + } else { + // Decode this field + const auto& avro_field_node = avro_node->leafAt(avro_idx); + auto* field_builder = struct_builder->field_builder(avro_idx - skipped_fields); + PAIMON_RETURN_NOT_OK(DecodeFieldToBuilder(avro_field_node, /*projection=*/std::nullopt, + decoder, field_builder, ctx)); + } + } + + return Status::OK(); +} + +/// Decode Avro array directly to Arrow list builder. +Status DecodeListToBuilder(const ::avro::NodePtr& avro_node, ::avro::Decoder& decoder, + arrow::ArrayBuilder* array_builder, + AvroDirectDecoder::DecodeContext& ctx) { + if (avro_node->type() != ::avro::AVRO_ARRAY) { + return Status::Invalid( + fmt::format("Expected Avro array, got type: {}", ToString(avro_node))); + } + + auto* list_builder = arrow::internal::checked_cast(array_builder); + PAIMON_RETURN_NOT_OK_FROM_ARROW(list_builder->Append()); + + auto* value_builder = list_builder->value_builder(); + const auto& element_node = avro_node->leafAt(0); + + // Read array block count + int64_t block_count = decoder.arrayStart(); + while (block_count != 0) { + for (int64_t i = 0; i < block_count; ++i) { + PAIMON_RETURN_NOT_OK(DecodeFieldToBuilder(element_node, /*projection=*/std::nullopt, + decoder, value_builder, ctx)); + } + block_count = decoder.arrayNext(); + } + + return Status::OK(); +} + +/// Decode Avro map directly to Arrow map builder. +Status DecodeMapToBuilder(const ::avro::NodePtr& avro_node, ::avro::Decoder& decoder, + arrow::ArrayBuilder* array_builder, + AvroDirectDecoder::DecodeContext& ctx) { + auto* map_builder = arrow::internal::checked_cast(array_builder); + + if (avro_node->type() == ::avro::AVRO_MAP) { + // Handle regular Avro map: map + const auto& key_node = avro_node->leafAt(0); + const auto& value_node = avro_node->leafAt(1); + + PAIMON_RETURN_NOT_OK_FROM_ARROW(map_builder->Append()); + auto* key_builder = map_builder->key_builder(); + auto* item_builder = map_builder->item_builder(); + + // Read map block count + int64_t block_count = decoder.mapStart(); + while (block_count != 0) { + for (int64_t i = 0; i < block_count; ++i) { + PAIMON_RETURN_NOT_OK(DecodeFieldToBuilder(key_node, /*projection=*/std::nullopt, + decoder, key_builder, ctx)); + PAIMON_RETURN_NOT_OK(DecodeFieldToBuilder(value_node, /*projection=*/std::nullopt, + decoder, item_builder, ctx)); + } + block_count = decoder.mapNext(); + } + return Status::OK(); + } else if (avro_node->type() == ::avro::AVRO_ARRAY && HasMapLogicalType(avro_node)) { + // Handle array-based map: list> + PAIMON_RETURN_NOT_OK_FROM_ARROW(map_builder->Append()); + auto* key_builder = map_builder->key_builder(); + auto* item_builder = map_builder->item_builder(); + + const auto& record_node = avro_node->leafAt(0); + if (record_node->type() != ::avro::AVRO_RECORD || record_node->leaves() != 2) { + return Status::Invalid( + fmt::format("Array-based map must contain records with exactly 2 fields, got: {}", + ToString(record_node))); + } + const auto& key_node = record_node->leafAt(0); + const auto& value_node = record_node->leafAt(1); + + // Read array block count + int64_t block_count = decoder.arrayStart(); + while (block_count != 0) { + for (int64_t i = 0; i < block_count; ++i) { + PAIMON_RETURN_NOT_OK(DecodeFieldToBuilder(key_node, /*projection=*/std::nullopt, + decoder, key_builder, ctx)); + PAIMON_RETURN_NOT_OK(DecodeFieldToBuilder(value_node, /*projection=*/std::nullopt, + decoder, item_builder, ctx)); + } + block_count = decoder.arrayNext(); + } + return Status::OK(); + } else { + return Status::Invalid(fmt::format( + "Expected Avro map or array with map logical type, got: {}", ToString(avro_node))); + } +} + +/// Decode Avro data directly to Arrow array builder. +Status DecodeAvroValueToBuilder(const ::avro::NodePtr& avro_node, + const std::optional>& projection, + ::avro::Decoder& decoder, arrow::ArrayBuilder* array_builder, + AvroDirectDecoder::DecodeContext& ctx) { + auto type = avro_node->type(); + auto logical_type = avro_node->logicalType(); + + switch (type) { + case ::avro::AVRO_BOOL: { + auto* builder = arrow::internal::checked_cast(array_builder); + bool value = decoder.decodeBool(); + PAIMON_RETURN_NOT_OK_FROM_ARROW(builder->Append(value)); + return Status::OK(); + } + + case ::avro::AVRO_INT: { + int32_t value = decoder.decodeInt(); + auto arrow_type = array_builder->type(); + switch (arrow_type->id()) { + case arrow::Type::INT8: { + auto* builder = + arrow::internal::checked_cast(array_builder); + PAIMON_RETURN_NOT_OK_FROM_ARROW(builder->Append(value)); + return Status::OK(); + } + case arrow::Type::INT16: { + auto* builder = + arrow::internal::checked_cast(array_builder); + PAIMON_RETURN_NOT_OK_FROM_ARROW(builder->Append(value)); + return Status::OK(); + } + case arrow::Type::INT32: { + auto* builder = + arrow::internal::checked_cast(array_builder); + PAIMON_RETURN_NOT_OK_FROM_ARROW(builder->Append(value)); + return Status::OK(); + } + case arrow::Type::DATE32: { + if (logical_type.type() != ::avro::LogicalType::Type::DATE) { + return Status::TypeError( + fmt::format("Unexpected avro type [{}] with arrow type [{}].", + toString(type), arrow_type->ToString())); + } + auto* builder = + arrow::internal::checked_cast(array_builder); + PAIMON_RETURN_NOT_OK_FROM_ARROW(builder->Append(value)); + return Status::OK(); + } + default: + return Status::TypeError( + fmt::format("Unexpected avro type [{}] with arrow type [{}].", + toString(type), arrow_type->ToString())); + } + } + + case ::avro::AVRO_LONG: { + int64_t value = decoder.decodeLong(); + switch (logical_type.type()) { + case ::avro::LogicalType::Type::NONE: { + auto* builder = + arrow::internal::checked_cast(array_builder); + PAIMON_RETURN_NOT_OK_FROM_ARROW(builder->Append(value)); + return Status::OK(); + } + case ::avro::LogicalType::Type::TIMESTAMP_MILLIS: + case ::avro::LogicalType::Type::TIMESTAMP_MICROS: + case ::avro::LogicalType::Type::TIMESTAMP_NANOS: + case ::avro::LogicalType::Type::LOCAL_TIMESTAMP_MILLIS: + case ::avro::LogicalType::Type::LOCAL_TIMESTAMP_MICROS: + case ::avro::LogicalType::Type::LOCAL_TIMESTAMP_NANOS: { + auto* builder = + arrow::internal::checked_cast(array_builder); + auto ts_type = + arrow::internal::checked_cast(builder->type().get()); + // for arrow second, we need to convert it from avro millisecond + if (ts_type->unit() == arrow::TimeUnit::type::SECOND) { + value /= DateTimeUtils::CONVERSION_FACTORS[DateTimeUtils::MILLISECOND]; + } + PAIMON_RETURN_NOT_OK_FROM_ARROW(builder->Append(value)); + return Status::OK(); + } + default: + return Status::TypeError( + fmt::format("Unexpected avro type [{}] with arrow type [{}].", + toString(type), array_builder->type()->ToString())); + } + } + + case ::avro::AVRO_FLOAT: { + auto* builder = arrow::internal::checked_cast(array_builder); + float value = decoder.decodeFloat(); + PAIMON_RETURN_NOT_OK_FROM_ARROW(builder->Append(value)); + return Status::OK(); + } + case ::avro::AVRO_DOUBLE: { + auto* builder = arrow::internal::checked_cast(array_builder); + double value = decoder.decodeDouble(); + PAIMON_RETURN_NOT_OK_FROM_ARROW(builder->Append(value)); + return Status::OK(); + } + case ::avro::AVRO_STRING: { + auto* builder = arrow::internal::checked_cast(array_builder); + decoder.decodeString(ctx.string_scratch); + PAIMON_RETURN_NOT_OK_FROM_ARROW(builder->Append(ctx.string_scratch)); + return Status::OK(); + } + + case ::avro::AVRO_BYTES: { + decoder.decodeBytes(ctx.bytes_scratch); + switch (logical_type.type()) { + case ::avro::LogicalType::Type::NONE: { + auto* builder = + arrow::internal::checked_cast(array_builder); + PAIMON_RETURN_NOT_OK_FROM_ARROW(builder->Append( + ctx.bytes_scratch.data(), static_cast(ctx.bytes_scratch.size()))); + return Status::OK(); + } + case ::avro::LogicalType::Type::DECIMAL: { + auto* builder = + arrow::internal::checked_cast(array_builder); + PAIMON_ASSIGN_OR_RAISE_FROM_ARROW( + arrow::Decimal128 decimal, + arrow::Decimal128::FromBigEndian(ctx.bytes_scratch.data(), + ctx.bytes_scratch.size())); + PAIMON_RETURN_NOT_OK_FROM_ARROW(builder->Append(decimal)); + return Status::OK(); + } + default: + return Status::TypeError( + fmt::format("Unexpected avro type [{}] with arrow type [{}].", + toString(type), array_builder->type()->ToString())); + } + } + + case ::avro::AVRO_RECORD: { + return DecodeStructToBuilder(avro_node, projection, decoder, array_builder, ctx); + } + case ::avro::AVRO_ARRAY: { + if (HasMapLogicalType(avro_node)) { + return DecodeMapToBuilder(avro_node, decoder, array_builder, ctx); + } else { + return DecodeListToBuilder(avro_node, decoder, array_builder, ctx); + } + } + case ::avro::AVRO_MAP: { + return DecodeMapToBuilder(avro_node, decoder, array_builder, ctx); + } + default: + return Status::Invalid( + fmt::format("Unsupported avro type: {}", ::avro::toString(type))); + } +} + +Status DecodeFieldToBuilder(const ::avro::NodePtr& avro_node, + const std::optional>& projection, + ::avro::Decoder& decoder, arrow::ArrayBuilder* array_builder, + AvroDirectDecoder::DecodeContext& ctx) { + if (avro_node->type() == ::avro::AVRO_UNION) { + const size_t branch_index = decoder.decodeUnionIndex(); + + // Validate branch index + const size_t num_branches = avro_node->leaves(); + if (branch_index >= num_branches) { + return Status::Invalid(fmt::format("Union branch index {} out of range [0, {})", + branch_index, num_branches)); + } + + const auto& branch_node = avro_node->leafAt(branch_index); + if (branch_node->type() == ::avro::AVRO_NULL) { + PAIMON_RETURN_NOT_OK_FROM_ARROW(array_builder->AppendNull()); + return Status::OK(); + } else { + return DecodeFieldToBuilder(branch_node, projection, decoder, array_builder, ctx); + } + } + + return DecodeAvroValueToBuilder(avro_node, projection, decoder, array_builder, ctx); +} + +} // namespace + +Status AvroDirectDecoder::DecodeAvroToBuilder(const ::avro::NodePtr& avro_node, + const std::optional>& projection, + ::avro::Decoder& decoder, + arrow::ArrayBuilder* array_builder, + DecodeContext& ctx) { + return DecodeFieldToBuilder(avro_node, projection, decoder, array_builder, ctx); +} + +} // namespace paimon::avro diff --git a/src/paimon/format/avro/avro_direct_decoder.h b/src/paimon/format/avro/avro_direct_decoder.h new file mode 100644 index 00000000..e63b6e84 --- /dev/null +++ b/src/paimon/format/avro/avro_direct_decoder.h @@ -0,0 +1,63 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +// Adapted from Apache Iceberg C++ +// https://github.com/apache/iceberg-cpp/blob/main/src/iceberg/avro/avro_direct_decoder_internal.h + +#pragma once + +#include + +#include "arrow/array/builder_base.h" +#include "avro/Decoder.hh" +#include "avro/Node.hh" +#include "paimon/status.h" + +namespace paimon::avro { + +class AvroDirectDecoder { + public: + /// Context for reusing scratch buffers during Avro decoding + /// + /// Avoids frequent small allocations by reusing temporary buffers across multiple decode + /// operations. This is particularly important for string, binary, and decimal data types. + struct DecodeContext { + // Scratch buffer for string decoding (reused across rows) + std::string string_scratch; + // Scratch buffer for binary/decimal data (reused across rows) + std::vector bytes_scratch; + }; + + /// Directly decode Avro data to Arrow array builders without GenericDatum + /// + /// Eliminates the GenericDatum intermediate layer by directly calling Avro decoder + /// methods and immediately appending to Arrow builders. + /// + /// @param avro_node The Avro schema node for the data being decoded + /// @param decoder The Avro decoder positioned at the data to read + /// @param array_builder The Arrow array builder to append decoded data to + /// @param ctx Decode context for reusing scratch buffers + /// @return Status indicating success, or an error status + static Status DecodeAvroToBuilder(const ::avro::NodePtr& avro_node, + const std::optional>& projection, + ::avro::Decoder& decoder, arrow::ArrayBuilder* array_builder, + DecodeContext& ctx); +}; + +} // namespace paimon::avro diff --git a/src/paimon/format/avro/avro_file_batch_reader.cpp b/src/paimon/format/avro/avro_file_batch_reader.cpp index c5c02a45..0fa6fb35 100644 --- a/src/paimon/format/avro/avro_file_batch_reader.cpp +++ b/src/paimon/format/avro/avro_file_batch_reader.cpp @@ -18,24 +18,30 @@ #include #include -#include #include "arrow/c/bridge.h" -#include "avro/Generic.hh" -#include "avro/GenericDatum.hh" #include "fmt/format.h" +#include "paimon/common/metrics/metrics_impl.h" #include "paimon/common/utils/arrow/status_utils.h" +#include "paimon/format/avro/avro_input_stream_impl.h" #include "paimon/format/avro/avro_schema_converter.h" #include "paimon/reader/batch_reader.h" namespace paimon::avro { -AvroFileBatchReader::AvroFileBatchReader( - std::unique_ptr<::avro::DataFileReader<::avro::GenericDatum>>&& reader, - std::unique_ptr&& record_converter, int32_t batch_size) - : reader_(std::move(reader)), - record_converter_(std::move(record_converter)), - batch_size_(batch_size) {} +AvroFileBatchReader::AvroFileBatchReader(const std::shared_ptr& input_stream, + const std::shared_ptr<::arrow::DataType>& file_data_type, + std::unique_ptr<::avro::DataFileReaderBase>&& reader, + std::unique_ptr&& array_builder, + int32_t batch_size, + const std::shared_ptr& pool) + : pool_(pool), + input_stream_(input_stream), + file_data_type_(file_data_type), + reader_(std::move(reader)), + array_builder_(std::move(array_builder)), + batch_size_(batch_size), + metrics_(std::make_shared()) {} AvroFileBatchReader::~AvroFileBatchReader() { DoClose(); @@ -49,44 +55,67 @@ void AvroFileBatchReader::DoClose() { } Result> AvroFileBatchReader::Create( - std::unique_ptr<::avro::DataFileReader<::avro::GenericDatum>>&& reader, int32_t batch_size, + const std::shared_ptr& input_stream, int32_t batch_size, const std::shared_ptr& pool) { if (batch_size <= 0) { return Status::Invalid( fmt::format("invalid batch size {}, must be larger than 0", batch_size)); } - const auto& avro_read_schema = reader->readerSchema(); - PAIMON_ASSIGN_OR_RAISE(std::shared_ptr<::arrow::DataType> arrow_data_type, - AvroSchemaConverter::AvroSchemaToArrowDataType(avro_read_schema)); - PAIMON_ASSIGN_OR_RAISE(std::unique_ptr record_converter, - AvroRecordConverter::Create(arrow_data_type, pool)); + PAIMON_ASSIGN_OR_RAISE(std::unique_ptr<::avro::DataFileReaderBase> reader, + CreateDataFileReader(input_stream, pool)); + const auto& avro_file_schema = reader->dataSchema(); + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr<::arrow::DataType> file_data_type, + AvroSchemaConverter::AvroSchemaToArrowDataType(avro_file_schema)); + PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(std::unique_ptr array_builder, + arrow::MakeBuilder(file_data_type)); return std::unique_ptr( - new AvroFileBatchReader(std::move(reader), std::move(record_converter), batch_size)); + new AvroFileBatchReader(input_stream, file_data_type, std::move(reader), + std::move(array_builder), batch_size, pool)); +} + +Result> AvroFileBatchReader::CreateDataFileReader( + const std::shared_ptr& input_stream, const std::shared_ptr& pool) { + PAIMON_RETURN_NOT_OK(input_stream->Seek(0, SeekOrigin::FS_SEEK_SET)); + try { + PAIMON_ASSIGN_OR_RAISE(std::unique_ptr<::avro::InputStream> in, + AvroInputStreamImpl::Create(input_stream, BUFFER_SIZE, pool)); + auto reader = std::make_unique<::avro::DataFileReaderBase>(std::move(in)); + reader->init(); + return reader; + } catch (const ::avro::Exception& e) { + return Status::Invalid(fmt::format("build avro reader failed. {}", e.what())); + } catch (const std::exception& e) { + return Status::Invalid(fmt::format("build avro reader failed. {}", e.what())); + } catch (...) { + return Status::Invalid("build avro reader failed. unknown error"); + } } Result AvroFileBatchReader::NextBatch() { - std::vector<::avro::GenericDatum> datums; - datums.reserve(batch_size_); + if (next_row_to_read_ == std::numeric_limits::max()) { + next_row_to_read_ = 0; + } try { - for (int32_t i = 0; i < batch_size_; i++) { - ::avro::GenericDatum datum(reader_->readerSchema()); - if (!reader_->read(datum)) { - // reach eof + while (array_builder_->length() < batch_size_) { + if (!reader_->hasMore()) { break; } - if (datum.type() != ::avro::AVRO_RECORD) { - return Status::Invalid( - fmt::format("avro reader next batch failed. invalid datum type: {}", - ::avro::toString(datum.type()))); - } - datums.emplace_back(datum); + reader_->decr(); + PAIMON_RETURN_NOT_OK(AvroDirectDecoder::DecodeAvroToBuilder( + reader_->dataSchema().root(), read_fields_projection_, reader_->decoder(), + array_builder_.get(), decode_context_)); } - if (datums.empty()) { + previous_first_row_ = next_row_to_read_; + next_row_to_read_ += array_builder_->length(); + if (array_builder_->length() == 0) { return BatchReader::MakeEofBatch(); } - // TODO(jinli.zjw) when support SetReadSchema(), may need convert file timestamp (milli) to - // target read type timestamp(second) - return record_converter_->NextBatch(datums); + PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(std::shared_ptr array, + array_builder_->Finish()); + std::unique_ptr c_array = std::make_unique(); + std::unique_ptr c_schema = std::make_unique(); + PAIMON_RETURN_NOT_OK_FROM_ARROW(arrow::ExportArray(*array, c_array.get(), c_schema.get())); + return make_pair(std::move(c_array), std::move(c_schema)); } catch (const ::avro::Exception& e) { return Status::Invalid(fmt::format("avro reader next batch failed. {}", e.what())); } catch (const std::exception& e) { @@ -99,28 +128,50 @@ Result AvroFileBatchReader::NextBatch() { Status AvroFileBatchReader::SetReadSchema(::ArrowSchema* read_schema, const std::shared_ptr& predicate, const std::optional& selection_bitmap) { - assert(false); - return Status::NotImplemented("avro reader not support set read schema"); + if (!read_schema) { + return Status::Invalid("SetReadSchema failed: read schema cannot be nullptr"); + } + // TODO(menglingda.mld): support predicate + if (selection_bitmap) { + // TODO(menglingda.mld): support bitmap + } + previous_first_row_ = std::numeric_limits::max(); + next_row_to_read_ = std::numeric_limits::max(); + PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(std::shared_ptr arrow_read_schema, + arrow::ImportSchema(read_schema)); + std::shared_ptr<::arrow::DataType> read_data_type = arrow::struct_(arrow_read_schema->fields()); + PAIMON_ASSIGN_OR_RAISE(read_fields_projection_, + CalculateReadFieldsProjection(file_data_type_, read_data_type)); + array_builder_->Reset(); + PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(array_builder_, arrow::MakeBuilder(read_data_type)); + return Status::OK(); +} + +Result> AvroFileBatchReader::CalculateReadFieldsProjection( + const std::shared_ptr<::arrow::DataType>& file_data_type, + const std::shared_ptr<::arrow::DataType>& read_data_type) { + if (file_data_type->id() != arrow::Type::STRUCT || + read_data_type->id() != arrow::Type::STRUCT) { + return Status::Invalid( + fmt::format("Expected struct data type, file data type: {}, read data type: {}", + file_data_type->ToString(), read_data_type->ToString())); + } + const auto& file_struct_type = std::static_pointer_cast(file_data_type); + const auto& read_struct_type = std::static_pointer_cast(read_data_type); + std::set projection; + for (const auto& field : read_struct_type->fields()) { + auto field_index = file_struct_type->GetFieldIndex(field->name()); + assert(field_index != -1); + projection.insert(field_index); + } + return projection; } Result> AvroFileBatchReader::GetFileSchema() const { assert(reader_); - try { - const auto& avro_file_schema = reader_->dataSchema(); - bool nullable = false; - PAIMON_ASSIGN_OR_RAISE( - std::shared_ptr arrow_file_type, - AvroSchemaConverter::GetArrowType(avro_file_schema.root(), &nullable)); - auto c_schema = std::make_unique<::ArrowSchema>(); - PAIMON_RETURN_NOT_OK_FROM_ARROW(arrow::ExportType(*arrow_file_type, c_schema.get())); - return c_schema; - } catch (const ::avro::Exception& e) { - return Status::Invalid(fmt::format("get file schema failed. {}", e.what())); - } catch (const std::exception& e) { - return Status::Invalid(fmt::format("get file schema batch failed. {}", e.what())); - } catch (...) { - return Status::Invalid("get file schema failed. unknown error"); - } + auto c_schema = std::make_unique<::ArrowSchema>(); + PAIMON_RETURN_NOT_OK_FROM_ARROW(arrow::ExportType(*file_data_type_, c_schema.get())); + return c_schema; } } // namespace paimon::avro diff --git a/src/paimon/format/avro/avro_file_batch_reader.h b/src/paimon/format/avro/avro_file_batch_reader.h index 08645b92..8e23050b 100644 --- a/src/paimon/format/avro/avro_file_batch_reader.h +++ b/src/paimon/format/avro/avro_file_batch_reader.h @@ -17,12 +17,14 @@ #pragma once #include +#include #include #include #include "avro/DataFile.hh" -#include "paimon/format/avro/avro_record_converter.h" +#include "paimon/format/avro/avro_direct_decoder.h" #include "paimon/memory/memory_pool.h" +#include "paimon/metrics.h" #include "paimon/reader/file_batch_reader.h" #include "paimon/result.h" @@ -31,7 +33,7 @@ namespace paimon::avro { class AvroFileBatchReader : public FileBatchReader { public: static Result> Create( - std::unique_ptr<::avro::DataFileReader<::avro::GenericDatum>>&& reader, int32_t batch_size, + const std::shared_ptr& input_stream, int32_t batch_size, const std::shared_ptr& pool); ~AvroFileBatchReader() override; @@ -44,8 +46,7 @@ class AvroFileBatchReader : public FileBatchReader { const std::optional& selection_bitmap) override; uint64_t GetPreviousBatchFirstRowNumber() const override { - assert(false); - return -1; + return previous_first_row_; } uint64_t GetNumberOfRows() const override { @@ -54,8 +55,7 @@ class AvroFileBatchReader : public FileBatchReader { } std::shared_ptr GetReaderMetrics() const override { - assert(false); - return nullptr; + return metrics_; } void Close() override { @@ -69,14 +69,34 @@ class AvroFileBatchReader : public FileBatchReader { private: void DoClose(); - AvroFileBatchReader(std::unique_ptr<::avro::DataFileReader<::avro::GenericDatum>>&& reader, - std::unique_ptr&& record_converter, - int32_t batch_size); - - std::unique_ptr<::avro::DataFileReader<::avro::GenericDatum>> reader_; - std::unique_ptr record_converter_; + static Result> CreateDataFileReader( + const std::shared_ptr& input_stream, const std::shared_ptr& pool); + + static Result> CalculateReadFieldsProjection( + const std::shared_ptr<::arrow::DataType>& file_data_type, + const std::shared_ptr<::arrow::DataType>& read_data_type); + + AvroFileBatchReader(const std::shared_ptr& input_stream, + const std::shared_ptr<::arrow::DataType>& file_data_type, + std::unique_ptr<::avro::DataFileReaderBase>&& reader, + std::unique_ptr&& array_builder, int32_t batch_size, + const std::shared_ptr& pool); + + static constexpr size_t BUFFER_SIZE = 1024 * 1024; // 1M + + std::shared_ptr pool_; + std::shared_ptr input_stream_; + std::shared_ptr<::arrow::DataType> file_data_type_; + std::unique_ptr<::avro::DataFileReaderBase> reader_; + std::unique_ptr array_builder_; + std::optional> read_fields_projection_; + uint64_t previous_first_row_ = std::numeric_limits::max(); + uint64_t next_row_to_read_ = std::numeric_limits::max(); const int32_t batch_size_; bool close_ = false; + std::shared_ptr metrics_; + // Decode context for reusing scratch buffers + AvroDirectDecoder::DecodeContext decode_context_; }; } // namespace paimon::avro diff --git a/src/paimon/format/avro/avro_file_batch_reader_test.cpp b/src/paimon/format/avro/avro_file_batch_reader_test.cpp index 6ab2639c..845d62dd 100644 --- a/src/paimon/format/avro/avro_file_batch_reader_test.cpp +++ b/src/paimon/format/avro/avro_file_batch_reader_test.cpp @@ -215,30 +215,27 @@ TEST_P(AvroFileBatchReaderTest, TestReadTimestampTypes) { ASSERT_OK_AND_ASSIGN(std::shared_ptr in, fs_->Open(path)); ASSERT_OK_AND_ASSIGN(auto batch_reader, reader_builder->Build(in)); - // check file schema - ASSERT_OK_AND_ASSIGN(auto c_file_schema, batch_reader->GetFileSchema()); - auto result_file_schema = arrow::ImportSchema(c_file_schema.get()).ValueOr(nullptr); - ASSERT_TRUE(result_file_schema); auto timezone = DateTimeUtils::GetLocalTimezoneName(); - arrow::FieldVector fields = { - arrow::field("ts_sec", arrow::timestamp(arrow::TimeUnit::MILLI)), + arrow::FieldVector read_fields = { + arrow::field("ts_sec", arrow::timestamp(arrow::TimeUnit::SECOND)), arrow::field("ts_milli", arrow::timestamp(arrow::TimeUnit::MILLI)), arrow::field("ts_micro", arrow::timestamp(arrow::TimeUnit::MICRO)), - arrow::field("ts_tz_sec", arrow::timestamp(arrow::TimeUnit::MILLI, timezone)), + arrow::field("ts_tz_sec", arrow::timestamp(arrow::TimeUnit::SECOND, timezone)), arrow::field("ts_tz_milli", arrow::timestamp(arrow::TimeUnit::MILLI, timezone)), arrow::field("ts_tz_micro", arrow::timestamp(arrow::TimeUnit::MICRO, timezone)), }; - auto expected_file_schema = arrow::schema(fields); - ASSERT_TRUE(result_file_schema->Equals(expected_file_schema)) << result_file_schema->ToString(); + auto read_schema = arrow::schema(read_fields); + std::unique_ptr c_schema = std::make_unique(); + ASSERT_TRUE(arrow::ExportSchema(*read_schema, c_schema.get()).ok()); + EXPECT_OK(batch_reader->SetReadSchema(c_schema.get(), /*predicate=*/nullptr, + /*selection_bitmap=*/std::nullopt)); // check array ASSERT_OK_AND_ASSIGN(auto result_array, ::paimon::test::ReadResultCollector::CollectResult(batch_reader.get())); - // TODO(jinli.zjw) after support SetReadSchema, need change ts_sec/ts_tz_sec type from milli - // to second std::shared_ptr expected_array; auto array_status = - arrow::ipc::internal::json::ChunkedArrayFromJSON(arrow::struct_(fields), {R"([ + arrow::ipc::internal::json::ChunkedArrayFromJSON(arrow::struct_(read_fields), {R"([ ["1970-01-01T00:00:01","1970-01-01T00:00:00.001","1970-01-01T00:00:00.000001","1970-01-01T00:00:02","1970-01-01T00:00:00.002","1970-01-01T00:00:00.000002"], [null,"1970-01-01T00:00:00.003",null,null,"1970-01-01T00:00:00.004",null], ["1970-01-01T00:00:05",null,"1970-01-01T00:00:00.000005","1970-01-01T00:00:06",null,"1970-01-01T00:00:00.000006"] diff --git a/src/paimon/format/avro/avro_file_format.cpp b/src/paimon/format/avro/avro_file_format.cpp index bfdeacce..0d9e7908 100644 --- a/src/paimon/format/avro/avro_file_format.cpp +++ b/src/paimon/format/avro/avro_file_format.cpp @@ -16,6 +16,7 @@ #include "paimon/format/avro/avro_file_format.h" +#include #include #include "arrow/c/bridge.h" diff --git a/src/paimon/format/avro/avro_file_format_factory.cpp b/src/paimon/format/avro/avro_file_format_factory.cpp index 365417e4..d1c8c0cd 100644 --- a/src/paimon/format/avro/avro_file_format_factory.cpp +++ b/src/paimon/format/avro/avro_file_format_factory.cpp @@ -27,9 +27,15 @@ const char AvroFileFormatFactory::IDENTIFIER[] = "avro"; Result> AvroFileFormatFactory::Create( const std::map& options) const { + RegisterLogicalTypes(); return std::make_unique(options); } +void AvroFileFormatFactory::RegisterLogicalTypes() { + ::avro::CustomLogicalTypeRegistry::instance().registerType( + "map", [](const std::string&) { return std::make_shared(); }); +} + REGISTER_PAIMON_FACTORY(AvroFileFormatFactory); } // namespace paimon::avro diff --git a/src/paimon/format/avro/avro_file_format_factory.h b/src/paimon/format/avro/avro_file_format_factory.h index 64daf6ac..3a3ecfd1 100644 --- a/src/paimon/format/avro/avro_file_format_factory.h +++ b/src/paimon/format/avro/avro_file_format_factory.h @@ -20,12 +20,17 @@ #include #include +#include "avro/LogicalType.hh" #include "paimon/format/file_format.h" #include "paimon/format/file_format_factory.h" #include "paimon/result.h" namespace paimon::avro { +struct MapLogicalType : public ::avro::CustomLogicalType { + MapLogicalType() : ::avro::CustomLogicalType("map") {} +}; + class AvroFileFormatFactory : public FileFormatFactory { public: static const char IDENTIFIER[]; @@ -36,6 +41,9 @@ class AvroFileFormatFactory : public FileFormatFactory { Result> Create( const std::map& options) const override; + + private: + static void RegisterLogicalTypes(); }; } // namespace paimon::avro diff --git a/src/paimon/format/avro/avro_output_stream_impl.cpp b/src/paimon/format/avro/avro_output_stream_impl.cpp index a590bf69..4a0d8819 100644 --- a/src/paimon/format/avro/avro_output_stream_impl.cpp +++ b/src/paimon/format/avro/avro_output_stream_impl.cpp @@ -39,12 +39,13 @@ AvroOutputStreamImpl::AvroOutputStreamImpl(const std::shared_ptrFree(buffer_, buffer_size_); } bool AvroOutputStreamImpl::next(uint8_t** data, size_t* len) { if (available_ == 0) { - flush(); + FlushBuffer(); } *data = next_; *len = available_; @@ -60,7 +61,7 @@ void AvroOutputStreamImpl::backup(size_t len) { byte_count_ -= len; } -void AvroOutputStreamImpl::flush() { +void AvroOutputStreamImpl::FlushBuffer() { size_t length = buffer_size_ - available_; Result write_len = out_->Write(reinterpret_cast(buffer_), length); if (!write_len.ok()) { @@ -76,4 +77,12 @@ void AvroOutputStreamImpl::flush() { available_ = buffer_size_; } +void AvroOutputStreamImpl::flush() { + // ::avro::OutputStream's flush do nothing, because in the avro-cpp impl, calling flush() too + // frequently generates many small I/O operations, affecting write performance. + // + // And In avro-java impl, there is an option to control flush frequency. + // See: https://github.com/apache/avro/commit/35750393891c40f0ceb925a852162ec764bcae6c +} + } // namespace paimon::avro diff --git a/src/paimon/format/avro/avro_output_stream_impl.h b/src/paimon/format/avro/avro_output_stream_impl.h index 389a7eb8..349c0c86 100644 --- a/src/paimon/format/avro/avro_output_stream_impl.h +++ b/src/paimon/format/avro/avro_output_stream_impl.h @@ -45,6 +45,8 @@ class AvroOutputStreamImpl : public ::avro::OutputStream { } private: + void FlushBuffer(); + std::shared_ptr pool_; const size_t buffer_size_; uint8_t* const buffer_; diff --git a/src/paimon/format/avro/avro_reader_builder.h b/src/paimon/format/avro/avro_reader_builder.h index 48c9fc92..aa0afd7a 100644 --- a/src/paimon/format/avro/avro_reader_builder.h +++ b/src/paimon/format/avro/avro_reader_builder.h @@ -42,19 +42,7 @@ class AvroReaderBuilder : public ReaderBuilder { Result> Build( const std::shared_ptr& path) const override { - try { - PAIMON_ASSIGN_OR_RAISE(std::unique_ptr<::avro::InputStream> in, - AvroInputStreamImpl::Create(path, BUFFER_SIZE, pool_)); - auto data_file_reader = - std::make_unique<::avro::DataFileReader<::avro::GenericDatum>>(std::move(in)); - return AvroFileBatchReader::Create(std::move(data_file_reader), batch_size_, pool_); - } catch (const ::avro::Exception& e) { - return Status::Invalid(fmt::format("build avro reader failed. {}", e.what())); - } catch (const std::exception& e) { - return Status::Invalid(fmt::format("build avro reader failed. {}", e.what())); - } catch (...) { - return Status::Invalid("build avro reader failed. unknown error"); - } + return AvroFileBatchReader::Create(path, batch_size_, pool_); } Result> Build(const std::string& path) const override { @@ -62,8 +50,6 @@ class AvroReaderBuilder : public ReaderBuilder { } private: - static constexpr size_t BUFFER_SIZE = 1024 * 1024; // 1M - const int32_t batch_size_; std::shared_ptr pool_; const std::map options_; diff --git a/src/paimon/format/avro/avro_record_converter.cpp b/src/paimon/format/avro/avro_record_converter.cpp deleted file mode 100644 index b74aaf1b..00000000 --- a/src/paimon/format/avro/avro_record_converter.cpp +++ /dev/null @@ -1,90 +0,0 @@ -/* - * Copyright 2024-present Alibaba Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "paimon/format/avro/avro_record_converter.h" - -#include -#include -#include -#include - -#include "arrow/api.h" -#include "arrow/array/builder_base.h" -#include "arrow/array/builder_nested.h" -#include "arrow/c/abi.h" -#include "arrow/util/checked_cast.h" -#include "avro/GenericDatum.hh" -#include "paimon/common/utils/arrow/mem_utils.h" -#include "paimon/common/utils/arrow/status_utils.h" -#include "paimon/format/avro/avro_record_data_getter.h" -#include "paimon/status.h" - -namespace paimon { -class MemoryPool; -} // namespace paimon - -namespace paimon::avro { - -Result> AvroRecordConverter::Create( - const std::shared_ptr<::arrow::DataType>& type, const std::shared_ptr& pool) { - auto arrow_pool = GetArrowPool(pool); - std::unique_ptr array_builder; - PAIMON_RETURN_NOT_OK_FROM_ARROW(arrow::MakeBuilder(arrow_pool.get(), type, &array_builder)); - - auto struct_builder = - arrow::internal::checked_pointer_cast(std::move(array_builder)); - assert(struct_builder); - std::vector appenders; - // first is the root struct array - int32_t reserve_count = 1; - for (size_t i = 0; i < type->fields().size(); i++) { - PAIMON_ASSIGN_OR_RAISE( - RowToArrowArrayConverter::AppendValueFunc func, - AppendField(/*use_view=*/true, struct_builder->field_builder(i), &reserve_count)); - appenders.emplace_back(func); - } - return std::unique_ptr( - new AvroRecordConverter(reserve_count, std::move(appenders), std::move(struct_builder), - std::move(arrow_pool), type, pool)); -} - -Result AvroRecordConverter::NextBatch( - const std::vector<::avro::GenericDatum>& avro_datums) { - PAIMON_RETURN_NOT_OK(ResetAndReserve()); - PAIMON_RETURN_NOT_OK_FROM_ARROW( - array_builder_->AppendValues(avro_datums.size(), /*valid_bytes=*/nullptr)); - const auto& fields = type_->fields(); - for (size_t i = 0; i < fields.size(); i++) { - for (const auto& avro_datum : avro_datums) { - PAIMON_RETURN_NOT_OK_FROM_ARROW(appenders_[i]( - AvroRecordDataGetter(avro_datum.value<::avro::GenericRecord>(), pool_), i)); - } - } - return FinishAndAccumulate(); -} - -AvroRecordConverter::AvroRecordConverter(int32_t reserve_count, - std::vector&& appenders, - std::unique_ptr&& array_builder, - std::unique_ptr&& arrow_pool, - const std::shared_ptr<::arrow::DataType>& type, - const std::shared_ptr& pool) - : RowToArrowArrayConverter(reserve_count, std::move(appenders), std::move(array_builder), - std::move(arrow_pool)), - type_(type), - pool_(pool) {} - -} // namespace paimon::avro diff --git a/src/paimon/format/avro/avro_record_converter.h b/src/paimon/format/avro/avro_record_converter.h deleted file mode 100644 index 33765199..00000000 --- a/src/paimon/format/avro/avro_record_converter.h +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Copyright 2024-present Alibaba Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include - -#include "avro/GenericDatum.hh" -#include "paimon/core/io/row_to_arrow_array_converter.h" -#include "paimon/reader/batch_reader.h" -#include "paimon/result.h" - -namespace arrow { -class DataType; -class MemoryPool; -class StructBuilder; -} // namespace arrow -namespace avro { -class GenericDatum; -} // namespace avro -namespace paimon { -class MemoryPool; -} // namespace paimon - -namespace paimon::avro { - -class AvroRecordConverter - : public RowToArrowArrayConverter<::avro::GenericDatum, BatchReader::ReadBatch> { - public: - static Result> Create( - const std::shared_ptr<::arrow::DataType>& type, const std::shared_ptr& pool); - - Result NextBatch( - const std::vector<::avro::GenericDatum>& avro_datums) override; - - private: - AvroRecordConverter(int32_t reserve_count, std::vector&& appenders, - std::unique_ptr&& array_builder, - std::unique_ptr&& arrow_pool, - const std::shared_ptr<::arrow::DataType>& type, - const std::shared_ptr& pool); - - std::shared_ptr type_; - std::shared_ptr pool_; -}; - -} // namespace paimon::avro diff --git a/src/paimon/format/avro/avro_record_converter_test.cpp b/src/paimon/format/avro/avro_record_converter_test.cpp deleted file mode 100644 index 558f2769..00000000 --- a/src/paimon/format/avro/avro_record_converter_test.cpp +++ /dev/null @@ -1,99 +0,0 @@ -/* - * Copyright 2024-present Alibaba Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "paimon/format/avro/avro_record_converter.h" - -#include -#include - -#include "arrow/api.h" -#include "arrow/array/array_base.h" -#include "arrow/c/abi.h" -#include "arrow/c/bridge.h" -#include "arrow/ipc/json_simple.h" -#include "avro/Compiler.hh" -#include "avro/GenericDatum.hh" -#include "avro/ValidSchema.hh" -#include "gtest/gtest.h" -#include "paimon/memory/memory_pool.h" -#include "paimon/status.h" -#include "paimon/testing/utils/testharness.h" - -namespace paimon::avro::test { -class AvroRecordConverterTest : public testing::Test { - public: - std::shared_ptr CreateSimpleSchema() { - std::string schema_str = R"({ - "type": "record", - "name": "TestRecord", - "fields": [ - {"name": "id", "type": "int"}, - {"name": "name", "type": "string"}, - {"name": "is_active", "type": "boolean"} - ] - })"; - return std::make_shared( - ::avro::compileJsonSchemaFromString(schema_str)); - } - - std::vector<::avro::GenericDatum> CreateTestDatums(int count) { - auto valid_schema = CreateSimpleSchema(); - std::vector<::avro::GenericDatum> datums; - for (int i = 0; i < count; ++i) { - ::avro::GenericRecord record(valid_schema->root()); - record.setFieldAt(0, ::avro::GenericDatum(i + 1)); - record.setFieldAt(1, ::avro::GenericDatum("user_" + std::to_string(i + 1))); - record.setFieldAt(2, ::avro::GenericDatum(i % 2 == 0)); - - ::avro::GenericDatum datum(*valid_schema); - auto& value = datum.value<::avro::GenericRecord>(); - value = std::move(record); - datums.emplace_back(std::move(datum)); - } - return datums; - } - - std::shared_ptr CreateExpectedArrowSchema() { - auto id_field = arrow::field("id", arrow::int32()); - auto name_field = arrow::field("name", arrow::utf8()); - auto active_field = arrow::field("is_active", arrow::boolean()); - return arrow::schema({id_field, name_field, active_field}); - } -}; - -TEST_F(AvroRecordConverterTest, NextBatchConvertsSimpleDataCorrectly) { - auto arrow_schema = CreateExpectedArrowSchema(); - auto arrow_type = arrow::struct_({arrow_schema->fields()}); - ASSERT_OK_AND_ASSIGN(auto converter, AvroRecordConverter::Create(arrow_type, GetDefaultPool())); - - auto avro_datums = CreateTestDatums(3); - ASSERT_OK_AND_ASSIGN(auto batch, converter->NextBatch(avro_datums)); - auto [c_array, c_schema] = std::move(batch); - ASSERT_EQ(c_array->length, 3); - auto array = arrow::ImportArray(c_array.get(), c_schema.get()).ValueOr(nullptr); - ASSERT_TRUE(array); - std::string data_str = R"([ -[1, "user_1", true], -[2, "user_2", false], -[3, "user_3", true] -])"; - auto expected_array = - arrow::ipc::internal::json::ArrayFromJSON(arrow_type, data_str).ValueOr(nullptr); - ASSERT_TRUE(expected_array); - ASSERT_TRUE(expected_array->Equals(array)); -} - -} // namespace paimon::avro::test diff --git a/src/paimon/format/avro/avro_record_data_getter.cpp b/src/paimon/format/avro/avro_record_data_getter.cpp deleted file mode 100644 index 39f93db1..00000000 --- a/src/paimon/format/avro/avro_record_data_getter.cpp +++ /dev/null @@ -1,127 +0,0 @@ -/* - * Copyright 2024-present Alibaba Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "paimon/format/avro/avro_record_data_getter.h" - -#include - -#include "avro/GenericDatum.hh" -#include "paimon/format/avro/avro_datum_data_getter.h" -#include "paimon/status.h" - -namespace paimon { -class InternalMap; -class MemoryPool; -} // namespace paimon - -namespace paimon::avro { - -AvroRecordDataGetter::AvroRecordDataGetter(const ::avro::GenericRecord& record, - const std::shared_ptr& pool) - : record_(record), pool_(pool) {} - -bool AvroRecordDataGetter::IsNullAt(int32_t pos) const { - assert(pos < GetFieldCount()); - return AvroDatumDataGetter::IsNullAt(record_.fieldAt(pos)); -} -bool AvroRecordDataGetter::GetBoolean(int32_t pos) const { - assert(pos < GetFieldCount()); - return AvroDatumDataGetter::GetBoolean(record_.fieldAt(pos)); -} -char AvroRecordDataGetter::GetByte(int32_t pos) const { - assert(pos < GetFieldCount()); - return AvroDatumDataGetter::GetByte(record_.fieldAt(pos)); -} -int16_t AvroRecordDataGetter::GetShort(int32_t pos) const { - assert(pos < GetFieldCount()); - return AvroDatumDataGetter::GetShort(record_.fieldAt(pos)); -} -int32_t AvroRecordDataGetter::GetInt(int32_t pos) const { - assert(pos < GetFieldCount()); - return AvroDatumDataGetter::GetInt(record_.fieldAt(pos)); -} -int32_t AvroRecordDataGetter::GetDate(int32_t pos) const { - assert(pos < GetFieldCount()); - return AvroDatumDataGetter::GetDate(record_.fieldAt(pos)); -} -int64_t AvroRecordDataGetter::GetLong(int32_t pos) const { - assert(pos < GetFieldCount()); - return AvroDatumDataGetter::GetLong(record_.fieldAt(pos)); -} -float AvroRecordDataGetter::GetFloat(int32_t pos) const { - assert(pos < GetFieldCount()); - return AvroDatumDataGetter::GetFloat(record_.fieldAt(pos)); -} -double AvroRecordDataGetter::GetDouble(int32_t pos) const { - assert(pos < GetFieldCount()); - return AvroDatumDataGetter::GetDouble(record_.fieldAt(pos)); -} -BinaryString AvroRecordDataGetter::GetString(int32_t pos) const { - assert(false); - assert(pos < GetFieldCount()); - return AvroDatumDataGetter::GetString(record_.fieldAt(pos), pool_); -} -std::string_view AvroRecordDataGetter::GetStringView(int32_t pos) const { - assert(pos < GetFieldCount()); - return AvroDatumDataGetter::GetStringView(record_.fieldAt(pos)); -} -Decimal AvroRecordDataGetter::GetDecimal(int32_t pos, int32_t precision, int32_t scale) const { - assert(pos < GetFieldCount()); - return AvroDatumDataGetter::GetDecimal(record_.fieldAt(pos), precision, scale, pool_); -} -Timestamp AvroRecordDataGetter::GetTimestamp(int32_t pos, int32_t precision) const { - assert(pos < GetFieldCount()); - return AvroDatumDataGetter::GetTimestamp(record_.fieldAt(pos), precision); -} -std::shared_ptr AvroRecordDataGetter::GetBinary(int32_t pos) const { - assert(pos < GetFieldCount()); - return AvroDatumDataGetter::GetBinary(record_.fieldAt(pos), pool_); -} - -std::shared_ptr AvroRecordDataGetter::GetArray(int32_t pos) const { - assert(pos < GetFieldCount()); - return AvroDatumDataGetter::GetArray(record_.fieldAt(pos), pool_); -} - -std::shared_ptr AvroRecordDataGetter::GetRow(int32_t pos, int32_t num_fields) const { - assert(pos < GetFieldCount()); - return AvroDatumDataGetter::GetRow(record_.fieldAt(pos), num_fields, pool_); -} - -int32_t AvroRecordDataGetter::GetFieldCount() const { - return record_.fieldCount(); -} - -std::shared_ptr AvroRecordDataGetter::GetMap(int32_t pos) const { - assert(false); - return nullptr; -} - -Result AvroRecordDataGetter::GetRowKind() const { - assert(false); - return Status::Invalid("avro record do not have row kind."); -} - -void AvroRecordDataGetter::SetRowKind(const RowKind* kind) { - assert(false); -} - -std::string AvroRecordDataGetter::ToString() const { - assert(false); - return ""; -} - -} // namespace paimon::avro diff --git a/src/paimon/format/avro/avro_record_data_getter.h b/src/paimon/format/avro/avro_record_data_getter.h deleted file mode 100644 index 6ce406fe..00000000 --- a/src/paimon/format/avro/avro_record_data_getter.h +++ /dev/null @@ -1,76 +0,0 @@ -/* - * Copyright 2024-present Alibaba Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include -#include - -#include "avro/GenericDatum.hh" -#include "paimon/common/data/binary_string.h" -#include "paimon/common/data/data_define.h" -#include "paimon/common/data/internal_array.h" -#include "paimon/common/data/internal_row.h" -#include "paimon/common/types/row_kind.h" -#include "paimon/data/decimal.h" -#include "paimon/data/timestamp.h" -#include "paimon/memory/bytes.h" -#include "paimon/result.h" - -namespace avro { -class GenericRecord; -} // namespace avro -namespace paimon { -class MemoryPool; -} // namespace paimon - -namespace paimon::avro { - -class AvroRecordDataGetter : public InternalRow { - public: - AvroRecordDataGetter(const ::avro::GenericRecord& record, - const std::shared_ptr& pool); - - bool IsNullAt(int32_t pos) const override; - bool GetBoolean(int32_t pos) const override; - char GetByte(int32_t pos) const override; - int16_t GetShort(int32_t pos) const override; - int32_t GetInt(int32_t pos) const override; - int32_t GetDate(int32_t pos) const override; - int64_t GetLong(int32_t pos) const override; - float GetFloat(int32_t pos) const override; - double GetDouble(int32_t pos) const override; - BinaryString GetString(int32_t pos) const override; - std::string_view GetStringView(int32_t pos) const override; - Decimal GetDecimal(int32_t pos, int32_t precision, int32_t scale) const override; - Timestamp GetTimestamp(int32_t pos, int32_t precision) const override; - std::shared_ptr GetBinary(int32_t pos) const override; - std::shared_ptr GetArray(int32_t pos) const override; - std::shared_ptr GetRow(int32_t pos, int32_t num_fields) const override; - int32_t GetFieldCount() const override; - std::shared_ptr GetMap(int32_t pos) const override; - Result GetRowKind() const override; - void SetRowKind(const RowKind* kind) override; - std::string ToString() const override; - - private: - const ::avro::GenericRecord& record_; - std::shared_ptr pool_; -}; - -} // namespace paimon::avro diff --git a/src/paimon/format/avro/avro_schema_converter.cpp b/src/paimon/format/avro/avro_schema_converter.cpp index 621e5151..b1889482 100644 --- a/src/paimon/format/avro/avro_schema_converter.cpp +++ b/src/paimon/format/avro/avro_schema_converter.cpp @@ -30,9 +30,9 @@ #include "avro/ValidSchema.hh" #include "fmt/format.h" #include "paimon/common/utils/date_time_utils.h" +#include "paimon/format/avro/avro_file_format_factory.h" #include "paimon/macros.h" #include "paimon/status.h" - namespace paimon::avro { /// Returns schema with nullable true. @@ -44,6 +44,18 @@ ::avro::Schema AvroSchemaConverter::NullableSchema(const ::avro::Schema& schema) return union_schema; } +void AvroSchemaConverter::AddRecordField(::avro::RecordSchema* record_schema, + const std::string& field_name, + const ::avro::Schema& field_schema) { + if (field_schema.type() == ::avro::Type::AVRO_UNION) { + ::avro::CustomAttributes attrs; + attrs.addAttribute("default", "null", /*addQuotes=*/false); + record_schema->addField(field_name, field_schema, attrs); + } else { + record_schema->addField(field_name, field_schema); + } +} + Result AvroSchemaConverter::CheckUnionType(const ::avro::NodePtr& avro_node) { auto type = avro_node->type(); if (type == ::avro::AVRO_UNION) { @@ -142,9 +154,41 @@ Result> AvroSchemaConverter::GetArrowType( auto timezone = DateTimeUtils::GetLocalTimezoneName(); return arrow::timestamp(arrow::TimeUnit::NANO, timezone); } + case ::avro::LogicalType::Type::CUSTOM: { + if (!HasMapLogicalType(avro_node)) { + return Status::TypeError("invalid avro logical map type"); + } + if (type != ::avro::AVRO_ARRAY) { + return Status::TypeError("invalid avro logical map stored as ", toString(type)); + } + size_t subtype_count = avro_node->leaves(); + if (subtype_count != 1) { + return Status::TypeError("invalid avro logical map type"); + } + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr logical_map_field, + GetArrowField("item", avro_node->leafAt(0))); + auto logical_map_type = logical_map_field->type(); + if (logical_map_type->id() != arrow::Type::STRUCT) { + return Status::TypeError("invalid avro logical map item type"); + } + auto struct_type = + arrow::internal::checked_pointer_cast(logical_map_type); + const auto& fields = struct_type->fields(); + if (fields.size() != 2) { + return Status::TypeError("invalid avro logical map struct fields size"); + } + auto key_field = fields[0]; + key_field = key_field->WithNullable(false); + auto value_field = fields[1]; + if (key_field->name() != "key" || value_field->name() != "value") { + return Status::TypeError("invalid avro logical map struct field names"); + } + return std::make_shared(std::move(key_field), std::move(value_field)); + } default: - return Status::NotImplemented("not support logical type ", - std::to_string(logical_type.type())); + std::stringstream logical_type_str; + logical_type.printJson(logical_type_str); + return Status::NotImplemented("not support logical type: ", logical_type_str.str()); } size_t subtype_count = avro_node->leaves(); @@ -205,7 +249,7 @@ Result> AvroSchemaConverter::GetArrowType( } Result<::avro::Schema> AvroSchemaConverter::ArrowTypeToAvroSchema( - const std::shared_ptr& field) { + const std::shared_ptr& field, const std::string& row_name) { bool nullable = field->nullable(); auto arrow_type = field->type(); switch (arrow_type->id()) { @@ -232,20 +276,37 @@ Result<::avro::Schema> AvroSchemaConverter::ArrowTypeToAvroSchema( return nullable ? NullableSchema(date_schema) : date_schema; } case arrow::Type::type::TIMESTAMP: { - // TODO(jinli.zjw): support convert with multiple precision & timezone const auto& arrow_timestamp_type = arrow::internal::checked_pointer_cast(arrow_type); - if (!arrow_timestamp_type->timezone().empty()) { - return Status::Invalid("Unsupported TimestampType with timezone"); - } - if (arrow_timestamp_type->unit() != arrow::TimeUnit::type::NANO) { - return Status::Invalid("Only supported TimestampType with nano time unit"); - } - // NOTE: Java Avro only support TIMESTAMP_MILLIS && TIMESTAMP_MICROS - ::avro::LogicalType timestamp_type = - ::avro::LogicalType(::avro::LogicalType::TIMESTAMP_MICROS); + bool has_timezone = !arrow_timestamp_type->timezone().empty(); ::avro::LongSchema timestamp_schema; - timestamp_schema.root()->setLogicalType(timestamp_type); + switch (arrow_timestamp_type->unit()) { + // Avro doesn't support seconds, convert to milliseconds + case arrow::TimeUnit::type::SECOND: + case arrow::TimeUnit::type::MILLI: { + ::avro::LogicalType logical_type = ::avro::LogicalType( + has_timezone ? ::avro::LogicalType::LOCAL_TIMESTAMP_MILLIS + : ::avro::LogicalType::TIMESTAMP_MILLIS); + timestamp_schema.root()->setLogicalType(logical_type); + break; + } + case arrow::TimeUnit::type::MICRO: { + ::avro::LogicalType logical_type = ::avro::LogicalType( + has_timezone ? ::avro::LogicalType::LOCAL_TIMESTAMP_MICROS + : ::avro::LogicalType::TIMESTAMP_MICROS); + timestamp_schema.root()->setLogicalType(logical_type); + break; + } + case arrow::TimeUnit::type::NANO: { + ::avro::LogicalType logical_type = ::avro::LogicalType( + has_timezone ? ::avro::LogicalType::LOCAL_TIMESTAMP_NANOS + : ::avro::LogicalType::TIMESTAMP_NANOS); + timestamp_schema.root()->setLogicalType(logical_type); + break; + } + default: + return Status::Invalid("Unknown TimeUnit in TimestampType"); + } return nullable ? NullableSchema(timestamp_schema) : timestamp_schema; } case arrow::Type::type::DECIMAL128: { @@ -258,6 +319,56 @@ Result<::avro::Schema> AvroSchemaConverter::ArrowTypeToAvroSchema( decimal_schema.root()->setLogicalType(decimal_type); return nullable ? NullableSchema(decimal_schema) : decimal_schema; } + case arrow::Type::LIST: { + const auto& list_type = + arrow::internal::checked_pointer_cast(arrow_type); + const auto& value_field = list_type->value_field(); + PAIMON_ASSIGN_OR_RAISE(auto value_schema, ArrowTypeToAvroSchema(value_field, row_name)); + ::avro::ArraySchema array_schema(value_schema); + return nullable ? NullableSchema(array_schema) : array_schema; + } + case arrow::Type::STRUCT: { + const auto& struct_type = + arrow::internal::checked_pointer_cast(arrow_type); + const auto& fields = struct_type->fields(); + + ::avro::RecordSchema record_schema(row_name); + for (const auto& f : fields) { + PAIMON_ASSIGN_OR_RAISE(auto field_schema, + ArrowTypeToAvroSchema(f, row_name + "_" + f->name())); + AddRecordField(&record_schema, f->name(), field_schema); + } + return nullable ? NullableSchema(record_schema) : record_schema; + } + case arrow::Type::MAP: { + const auto& map_type = + arrow::internal::checked_pointer_cast(arrow_type); + const auto& key_field = map_type->key_field(); + const auto& item_field = map_type->item_field(); + if (key_field->nullable()) { + return Status::Invalid("Avro Map key cannot be nullable"); + } + if (key_field->type()->id() == arrow::Type::STRING) { + PAIMON_ASSIGN_OR_RAISE(auto item_schema, + ArrowTypeToAvroSchema(item_field, row_name)); + ::avro::MapSchema map_schema(item_schema); + return nullable ? NullableSchema(map_schema) : map_schema; + } else { + // convert to list> + PAIMON_ASSIGN_OR_RAISE(auto key_schema, + ArrowTypeToAvroSchema(key_field, row_name + "_key")); + PAIMON_ASSIGN_OR_RAISE(auto item_schema, + ArrowTypeToAvroSchema(item_field, row_name + "_value")); + ::avro::LogicalType logical_map_type = + ::avro::LogicalType(std::make_shared()); + ::avro::RecordSchema record_schema(row_name); + AddRecordField(&record_schema, "key", key_schema); + AddRecordField(&record_schema, "value", item_schema); + ::avro::ArraySchema logical_map_schema(record_schema); + logical_map_schema.root()->setLogicalType(logical_map_type); + return nullable ? NullableSchema(logical_map_schema) : logical_map_schema; + } + } default: return Status::Invalid(fmt::format("Not support arrow type '{}' convert to avro", field->type()->ToString())); @@ -265,19 +376,20 @@ Result<::avro::Schema> AvroSchemaConverter::ArrowTypeToAvroSchema( } Result<::avro::ValidSchema> AvroSchemaConverter::ArrowSchemaToAvroSchema( - const std::shared_ptr& arrow_schema) { - ::avro::RecordSchema record_schema("record"); + const std::shared_ptr& arrow_schema, const std::string& row_name) { + ::avro::RecordSchema record_schema(row_name); for (const auto& field : arrow_schema->fields()) { - PAIMON_ASSIGN_OR_RAISE(::avro::Schema schema, ArrowTypeToAvroSchema(field)); - if (schema.type() == ::avro::Type::AVRO_UNION) { - ::avro::CustomAttributes attrs; - attrs.addAttribute("default", "null", /*addQuotes=*/false); - record_schema.addField(field->name(), schema, attrs); - } else { - record_schema.addField(field->name(), schema); - } + PAIMON_ASSIGN_OR_RAISE(::avro::Schema field_schema, + ArrowTypeToAvroSchema(field, row_name + "_" + field->name())); + AddRecordField(&record_schema, field->name(), field_schema); } return ::avro::ValidSchema(record_schema); } +bool AvroSchemaConverter::HasMapLogicalType(const ::avro::NodePtr& node) { + return node->logicalType().type() == ::avro::LogicalType::CUSTOM && + node->logicalType().customLogicalType() != nullptr && + node->logicalType().customLogicalType()->name() == "map"; +} + } // namespace paimon::avro diff --git a/src/paimon/format/avro/avro_schema_converter.h b/src/paimon/format/avro/avro_schema_converter.h index f963ff43..da7c01da 100644 --- a/src/paimon/format/avro/avro_schema_converter.h +++ b/src/paimon/format/avro/avro_schema_converter.h @@ -32,24 +32,32 @@ class AvroSchemaConverter { AvroSchemaConverter() = delete; ~AvroSchemaConverter() = delete; + // TODO(menglingda.mld): avro添加field id static Result<::avro::ValidSchema> ArrowSchemaToAvroSchema( - const std::shared_ptr& arrow_schema); + const std::shared_ptr& arrow_schema, + const std::string& row_name = "org.apache.paimon.avro.generated.record"); static Result> AvroSchemaToArrowDataType( const ::avro::ValidSchema& avro_schema); + private: static Result> GetArrowType(const ::avro::NodePtr& avro_node, bool* nullable); - private: - static Result<::avro::Schema> ArrowTypeToAvroSchema(const std::shared_ptr& field); + static Result<::avro::Schema> ArrowTypeToAvroSchema(const std::shared_ptr& field, + const std::string& row_name); static ::avro::Schema NullableSchema(const ::avro::Schema& schema); + static void AddRecordField(::avro::RecordSchema* record_schema, const std::string& field_name, + const ::avro::Schema& field_schema); + static Result CheckUnionType(const ::avro::NodePtr& avro_node); static Result> GetArrowField(const std::string& name, const ::avro::NodePtr& avro_node); + + static bool HasMapLogicalType(const ::avro::NodePtr& node); }; } // namespace paimon::avro diff --git a/src/paimon/format/avro/avro_schema_converter_test.cpp b/src/paimon/format/avro/avro_schema_converter_test.cpp index 08550aae..c18e4bcb 100644 --- a/src/paimon/format/avro/avro_schema_converter_test.cpp +++ b/src/paimon/format/avro/avro_schema_converter_test.cpp @@ -32,6 +32,7 @@ TEST(AvroSchemaConverterTest, TestSimple) { // Test a basic record with primitive types std::string schema_json = R"({ "type": "record", + "namespace": "org.apache.paimon.avro.generated", "name": "record", "fields": [ {"name": "f_bool", "type": "boolean"}, @@ -196,6 +197,7 @@ TEST(AvroSchemaConverterTest, TestAvroSchemaToArrowDataTypeWithNullableAndComple TEST(AvroSchemaConverterTest, TestAvroSchemaToArrowDataTypeWithTimestampType) { std::string schema_json = R"({ "type": "record", + "namespace": "org.apache.paimon.avro.generated", "name": "record", "fields": [ { From 30e67a214092ee142023af0014a9003b0ad09989 Mon Sep 17 00:00:00 2001 From: "jinli.zjw" Date: Mon, 26 Jan 2026 11:23:35 +0800 Subject: [PATCH 02/12] add inte test & ut --- .../append_multiple.db/append_multiple/README | 14 ++ ...c670e9a-cc3a-4c29-9ba2-775908cb5650-0.avro | Bin 0 -> 1716 bytes ...c670e9a-cc3a-4c29-9ba2-775908cb5650-1.avro | Bin 0 -> 1722 bytes ...55f9742-6651-4b8e-9ba6-95e8b74b0549-0.avro | Bin 0 -> 1703 bytes ...55f9742-6651-4b8e-9ba6-95e8b74b0549-1.avro | Bin 0 -> 1710 bytes ...fbfe712-94fa-4b8a-870c-c500dec838a5-0.avro | Bin 0 -> 1759 bytes ...fbfe712-94fa-4b8a-870c-c500dec838a5-1.avro | Bin 0 -> 1724 bytes ...est-39146ee7-fda8-42a9-894d-5395bc44e8a2-0 | Bin 0 -> 2349 bytes ...est-b91f4949-0601-4535-8b29-84b79666b306-0 | Bin 0 -> 2338 bytes ...ist-21081c8d-4a00-4e90-a265-84f4487d1ec0-0 | Bin 0 -> 1006 bytes ...ist-21081c8d-4a00-4e90-a265-84f4487d1ec0-1 | Bin 0 -> 1125 bytes ...ist-6f4689de-a277-459f-955a-e91a9b52b4ad-0 | Bin 0 -> 1125 bytes ...ist-6f4689de-a277-459f-955a-e91a9b52b4ad-1 | Bin 0 -> 1122 bytes .../append_multiple/schema/schema-0 | 124 ++++++++++++++++++ .../append_multiple/snapshot/EARLIEST | 1 + .../append_multiple/snapshot/LATEST | 1 + .../append_multiple/snapshot/snapshot-1 | 19 +++ .../append_multiple/snapshot/snapshot-2 | 19 +++ .../append_simple.db/append_simple/README | 10 ++ ...7d1c416-6e34-4834-af87-341d09418f0c-0.avro | Bin 0 -> 636 bytes ...est-f77f2a65-c6be-4e7c-accd-c8dc75578c9a-0 | Bin 0 -> 2093 bytes ...ist-3602dcaf-7dc8-4879-9dce-9215f674e866-0 | Bin 0 -> 1006 bytes ...ist-3602dcaf-7dc8-4879-9dce-9215f674e866-1 | Bin 0 -> 1108 bytes .../append_simple/schema/schema-0 | 48 +++++++ .../append_simple/snapshot/EARLIEST | 1 + .../append_simple/snapshot/LATEST | 1 + .../append_simple/snapshot/snapshot-1 | 19 +++ .../append_with_multiple_map/README | 7 + ...2442742-e49e-48a4-a736-a2475aac2d2c-0.avro | Bin 0 -> 2195 bytes ...est-3c01ce68-8b9e-4bc8-8a56-b481d26faab6-0 | Bin 0 -> 2086 bytes ...ist-6a56727e-83dc-4b80-a420-86a036632e80-0 | Bin 0 -> 1006 bytes ...ist-6a56727e-83dc-4b80-a420-86a036632e80-1 | Bin 0 -> 1110 bytes .../append_with_multiple_map/schema/schema-0 | 100 ++++++++++++++ .../snapshot/EARLIEST | 1 + .../append_with_multiple_map/snapshot/LATEST | 1 + .../snapshot/snapshot-1 | 19 +++ .../pk_with_multiple_type/README | 16 +++ ...dbda3f3-c4d4-4e2e-b771-c48fc89f30a7-0.avro | Bin 0 -> 1320 bytes ...dbda3f3-c4d4-4e2e-b771-c48fc89f30a7-1.avro | Bin 0 -> 1294 bytes ...est-4cd8b157-3a02-4a9b-8b65-4976410cb4a9-0 | Bin 0 -> 2157 bytes ...est-4cd8b157-3a02-4a9b-8b65-4976410cb4a9-1 | Bin 0 -> 2157 bytes ...ist-bc16121e-12e3-4ba0-a499-bf054e158852-0 | Bin 0 -> 1006 bytes ...ist-bc16121e-12e3-4ba0-a499-bf054e158852-1 | Bin 0 -> 1119 bytes ...ist-bc16121e-12e3-4ba0-a499-bf054e158852-2 | Bin 0 -> 1119 bytes ...ist-bc16121e-12e3-4ba0-a499-bf054e158852-3 | Bin 0 -> 1119 bytes .../pk_with_multiple_type/schema/schema-0 | 81 ++++++++++++ .../pk_with_multiple_type/snapshot/EARLIEST | 1 + .../pk_with_multiple_type/snapshot/LATEST | 1 + .../pk_with_multiple_type/snapshot/snapshot-1 | 19 +++ .../pk_with_multiple_type/snapshot/snapshot-2 | 19 +++ 50 files changed, 522 insertions(+) create mode 100644 test/test_data/avro/append_multiple.db/append_multiple/README create mode 100644 test/test_data/avro/append_multiple.db/append_multiple/f1=10/bucket-0/data-3c670e9a-cc3a-4c29-9ba2-775908cb5650-0.avro create mode 100644 test/test_data/avro/append_multiple.db/append_multiple/f1=10/bucket-0/data-3c670e9a-cc3a-4c29-9ba2-775908cb5650-1.avro create mode 100644 test/test_data/avro/append_multiple.db/append_multiple/f1=10/bucket-1/data-155f9742-6651-4b8e-9ba6-95e8b74b0549-0.avro create mode 100644 test/test_data/avro/append_multiple.db/append_multiple/f1=10/bucket-1/data-155f9742-6651-4b8e-9ba6-95e8b74b0549-1.avro create mode 100644 test/test_data/avro/append_multiple.db/append_multiple/f1=11/bucket-1/data-3fbfe712-94fa-4b8a-870c-c500dec838a5-0.avro create mode 100644 test/test_data/avro/append_multiple.db/append_multiple/f1=11/bucket-1/data-3fbfe712-94fa-4b8a-870c-c500dec838a5-1.avro create mode 100644 test/test_data/avro/append_multiple.db/append_multiple/manifest/manifest-39146ee7-fda8-42a9-894d-5395bc44e8a2-0 create mode 100644 test/test_data/avro/append_multiple.db/append_multiple/manifest/manifest-b91f4949-0601-4535-8b29-84b79666b306-0 create mode 100644 test/test_data/avro/append_multiple.db/append_multiple/manifest/manifest-list-21081c8d-4a00-4e90-a265-84f4487d1ec0-0 create mode 100644 test/test_data/avro/append_multiple.db/append_multiple/manifest/manifest-list-21081c8d-4a00-4e90-a265-84f4487d1ec0-1 create mode 100644 test/test_data/avro/append_multiple.db/append_multiple/manifest/manifest-list-6f4689de-a277-459f-955a-e91a9b52b4ad-0 create mode 100644 test/test_data/avro/append_multiple.db/append_multiple/manifest/manifest-list-6f4689de-a277-459f-955a-e91a9b52b4ad-1 create mode 100644 test/test_data/avro/append_multiple.db/append_multiple/schema/schema-0 create mode 100644 test/test_data/avro/append_multiple.db/append_multiple/snapshot/EARLIEST create mode 100644 test/test_data/avro/append_multiple.db/append_multiple/snapshot/LATEST create mode 100644 test/test_data/avro/append_multiple.db/append_multiple/snapshot/snapshot-1 create mode 100644 test/test_data/avro/append_multiple.db/append_multiple/snapshot/snapshot-2 create mode 100644 test/test_data/avro/append_simple.db/append_simple/README create mode 100644 test/test_data/avro/append_simple.db/append_simple/bucket-0/data-d7d1c416-6e34-4834-af87-341d09418f0c-0.avro create mode 100644 test/test_data/avro/append_simple.db/append_simple/manifest/manifest-f77f2a65-c6be-4e7c-accd-c8dc75578c9a-0 create mode 100644 test/test_data/avro/append_simple.db/append_simple/manifest/manifest-list-3602dcaf-7dc8-4879-9dce-9215f674e866-0 create mode 100644 test/test_data/avro/append_simple.db/append_simple/manifest/manifest-list-3602dcaf-7dc8-4879-9dce-9215f674e866-1 create mode 100644 test/test_data/avro/append_simple.db/append_simple/schema/schema-0 create mode 100644 test/test_data/avro/append_simple.db/append_simple/snapshot/EARLIEST create mode 100644 test/test_data/avro/append_simple.db/append_simple/snapshot/LATEST create mode 100644 test/test_data/avro/append_simple.db/append_simple/snapshot/snapshot-1 create mode 100644 test/test_data/avro/append_with_multiple_map.db/append_with_multiple_map/README create mode 100644 test/test_data/avro/append_with_multiple_map.db/append_with_multiple_map/bucket-0/data-72442742-e49e-48a4-a736-a2475aac2d2c-0.avro create mode 100644 test/test_data/avro/append_with_multiple_map.db/append_with_multiple_map/manifest/manifest-3c01ce68-8b9e-4bc8-8a56-b481d26faab6-0 create mode 100644 test/test_data/avro/append_with_multiple_map.db/append_with_multiple_map/manifest/manifest-list-6a56727e-83dc-4b80-a420-86a036632e80-0 create mode 100644 test/test_data/avro/append_with_multiple_map.db/append_with_multiple_map/manifest/manifest-list-6a56727e-83dc-4b80-a420-86a036632e80-1 create mode 100644 test/test_data/avro/append_with_multiple_map.db/append_with_multiple_map/schema/schema-0 create mode 100644 test/test_data/avro/append_with_multiple_map.db/append_with_multiple_map/snapshot/EARLIEST create mode 100644 test/test_data/avro/append_with_multiple_map.db/append_with_multiple_map/snapshot/LATEST create mode 100644 test/test_data/avro/append_with_multiple_map.db/append_with_multiple_map/snapshot/snapshot-1 create mode 100644 test/test_data/avro/pk_with_multiple_type.db/pk_with_multiple_type/README create mode 100644 test/test_data/avro/pk_with_multiple_type.db/pk_with_multiple_type/f1=10/bucket-0/data-0dbda3f3-c4d4-4e2e-b771-c48fc89f30a7-0.avro create mode 100644 test/test_data/avro/pk_with_multiple_type.db/pk_with_multiple_type/f1=10/bucket-0/data-0dbda3f3-c4d4-4e2e-b771-c48fc89f30a7-1.avro create mode 100644 test/test_data/avro/pk_with_multiple_type.db/pk_with_multiple_type/manifest/manifest-4cd8b157-3a02-4a9b-8b65-4976410cb4a9-0 create mode 100644 test/test_data/avro/pk_with_multiple_type.db/pk_with_multiple_type/manifest/manifest-4cd8b157-3a02-4a9b-8b65-4976410cb4a9-1 create mode 100644 test/test_data/avro/pk_with_multiple_type.db/pk_with_multiple_type/manifest/manifest-list-bc16121e-12e3-4ba0-a499-bf054e158852-0 create mode 100644 test/test_data/avro/pk_with_multiple_type.db/pk_with_multiple_type/manifest/manifest-list-bc16121e-12e3-4ba0-a499-bf054e158852-1 create mode 100644 test/test_data/avro/pk_with_multiple_type.db/pk_with_multiple_type/manifest/manifest-list-bc16121e-12e3-4ba0-a499-bf054e158852-2 create mode 100644 test/test_data/avro/pk_with_multiple_type.db/pk_with_multiple_type/manifest/manifest-list-bc16121e-12e3-4ba0-a499-bf054e158852-3 create mode 100644 test/test_data/avro/pk_with_multiple_type.db/pk_with_multiple_type/schema/schema-0 create mode 100644 test/test_data/avro/pk_with_multiple_type.db/pk_with_multiple_type/snapshot/EARLIEST create mode 100644 test/test_data/avro/pk_with_multiple_type.db/pk_with_multiple_type/snapshot/LATEST create mode 100644 test/test_data/avro/pk_with_multiple_type.db/pk_with_multiple_type/snapshot/snapshot-1 create mode 100644 test/test_data/avro/pk_with_multiple_type.db/pk_with_multiple_type/snapshot/snapshot-2 diff --git a/test/test_data/avro/append_multiple.db/append_multiple/README b/test/test_data/avro/append_multiple.db/append_multiple/README new file mode 100644 index 00000000..58d086c9 --- /dev/null +++ b/test/test_data/avro/append_multiple.db/append_multiple/README @@ -0,0 +1,14 @@ +f0:tinyint f1:smallint f2:int f3:bigint f4:float f5:double f6:string f7:bytes f8:date f9:decimal(5,2) f10:TIMESTAMP(0) f11:TIMESTAMP(3) f12:TIMESTAMP(6) f13:TIMESTAMP_WITH_LOCAL_TIME_ZONE(0) f14:TIMESTAMP_WITH_LOCAL_TIME_ZONE(3) f15:TIMESTAMP_WITH_LOCAL_TIME_ZONE(6) f16:struct,array> +bucket count: 2 + +Msgs: +snapshot-1 (3 data files) +Add: 1, 10, 0, 100, 1.0, 1.0, "one", "aaa", 123, "123.45", "1970-01-01 00:00:00", "1970-01-01 00:00:00.000", "1970-01-01 00:00:00.000000", "1970-01-01 00:00:00", "1970-01-01 00:00:00.000", "1970-01-01 00:00:00.000000",[[["key",123]],[1,2,3]] +Add: 2, 10, 1, 100, 2.0, 2.0, "two", "bbb", 123, "123.45", "1970-01-02 00:00:00", "1970-01-02 00:00:00.000", "1970-01-02 00:00:00.000000", "1970-01-02 00:00:00", "1970-01-02 00:00:00.000", "1970-01-02 00:00:00.000000",[[["key",123]],[1,2,3]] +Add: 3, 11, 0, 100, null, 3.0, "three", "ccc", 123, "123.45", "1970-01-03 00:00:00", "1970-01-03 00:00:00.000", "1970-01-03 00:00:00.000000", "1970-01-03 00:00:00", "1970-01-03 00:00:00.000", "1970-01-03 00:00:00.000000",[[["key",123]],[1,2,3]] +Add: 4, 11, 0, 100, 4.0, null, "four", "ddd", 123, "123.45", "1970-01-04 00:00:00", "1970-01-04 00:00:00.000", "1970-01-04 00:00:00.000000", "1970-01-04 00:00:00", "1970-01-04 00:00:00.000", "1970-01-04 00:00:00.000000",[[["key",123]],[1,2,3]] + +snapshot-2 (3 data files) +Add: 5, 10, 0, 100, 5.0, 2.0, null, "eee", 123, "123.45", "1970-01-01 00:00:00", "1970-01-05 00:00:00.000", "1970-01-05 00:00:00.000000", "1970-01-05 00:00:00", "1970-01-05 00:00:00.000", "1970-01-05 00:00:00.000000",[[["key",123]],[1,2,3]] +Add: 6, 10, 1, 100, 6.0, 4.0, "six", "fff", 123, "123.45", "1970-01-02 00:00:00", "1970-01-06 00:00:00.000", "1970-01-06 00:00:00.000000", "1970-01-06 00:00:00", "1970-01-06 00:00:00.000", "1970-01-06 00:00:00.000000",[[["key",123]],[1,2,3]] +Add: 7, 11, 0, 100, 7.0, 6.0, "seven", "ggg", 123, "123.45", "1970-01-03 00:00:00", "1970-01-07 00:00:00.000", "1970-01-07 00:00:00.000000", "1970-01-07 00:00:00", "1970-01-07 00:00:00.000", "1970-01-07 00:00:00.000000",[[["key",123]],[1,2,3]] \ No newline at end of file diff --git a/test/test_data/avro/append_multiple.db/append_multiple/f1=10/bucket-0/data-3c670e9a-cc3a-4c29-9ba2-775908cb5650-0.avro b/test/test_data/avro/append_multiple.db/append_multiple/f1=10/bucket-0/data-3c670e9a-cc3a-4c29-9ba2-775908cb5650-0.avro new file mode 100644 index 0000000000000000000000000000000000000000..c1f943110d952894ae22f1cb5460e9a9d8c7a4fd GIT binary patch literal 1716 zcmeZI%3@>@ODrqO*DFrWNX<=LBvGwYQdy9yWTjM;nw(#hqNJmgmzayeFD^(-1_|aD zrRyaE*%_&N1&Nut`FVO^!_rgpQi~ExQbF3&GE;L>ij}OQt6?Uk832VLwnQuCmFDCC zxtVz-O0hahDXD3Rr8y-^Rv_V89fTG`(zO_ouEiKrOHO`XI$oEVVCqTB$xp=Z9aBtA zDfy*IIjMN9GQ-qWTvC)tz=7tNx{@kOQj76gWr3jyo-jaBrK6OSpPreVm=gj?EGfXq zR;tApftI)pgt!L7$kgP_+(b~?C;%qK%;L=aJS8hr9i?KRt-z#bgx5Ah%p?YPKR8Cw zZ7a#lO)V}-%q`H(&CJQkEXMB{L(JqzRpT*}F7d`E7v&RjJ!Ue;Z+uQZFwAvH3?j^Y zK?6%L^AV|*5D{FM*$!?A_Chi~%@CxAsFDwERBmDcP)}K6PAM>JTA>z<_|rP>oRwHq zlvoLr&MZmICBo2HOnYK$MZhI`a(+r`vQQN;i{zyw7Nsz(l;@Lt9(QY+-ve$&qt|Zf zOzRjmw(9?7@F-3Cp$ufVB(mXrSWx%LO-d)QeR92%ytS^Ol8!5iHs02~}bEC2ui literal 0 HcmV?d00001 diff --git a/test/test_data/avro/append_multiple.db/append_multiple/f1=10/bucket-0/data-3c670e9a-cc3a-4c29-9ba2-775908cb5650-1.avro b/test/test_data/avro/append_multiple.db/append_multiple/f1=10/bucket-0/data-3c670e9a-cc3a-4c29-9ba2-775908cb5650-1.avro new file mode 100644 index 0000000000000000000000000000000000000000..244423ecca75bec6787cfe39374c7e40a07e6a7e GIT binary patch literal 1722 zcmcIkF;5gh6rLT|ttgBkMoXu%SWXrNWgCbZVrOFP$OYc*&K@H(JDZs~;t7}FU$8(E zIuoO%jg1wFjj^ENH_%hrNtnG0M`UEUOSIX{yxH&j-pu>nd)9gxgg0PA$%f9>sDKYQ zH;L(1l%zz_Osbs3qy*vYbp=@+l&akTyI!M>3Yd^(qfgeRB~@T(4KEnwxhCoI<|$J# zZNmeVWm0xHw_8>k(u_$?3+Qk|(mLFW&mhV8keTrz_gn~IFC9cJW`S&Y&Ud=E;wEvn;9seo4Yu(Z5 zGbJy}-nF@(o=f}YQ)51n+ZX>q#Oi&X->>eS*E<2F!Z*JF literal 0 HcmV?d00001 diff --git a/test/test_data/avro/append_multiple.db/append_multiple/f1=10/bucket-1/data-155f9742-6651-4b8e-9ba6-95e8b74b0549-0.avro b/test/test_data/avro/append_multiple.db/append_multiple/f1=10/bucket-1/data-155f9742-6651-4b8e-9ba6-95e8b74b0549-0.avro new file mode 100644 index 0000000000000000000000000000000000000000..80fb4017934c7ca9d6c6ce8255fc88e4cfa4de0f GIT binary patch literal 1703 zcmcIk&1w`u5biCP0S{gzNX`v{gvpwXM$KGu_2$J}vS{s0&8D?~hVI_L8WwW!5%L0x zPap_Bfq3-=JbDuZ!IOeLySQT7bap{+UEO@&S6%g0Js3Xebg$u&H9c1zlERnk$J|dE z;sv)<8e8$4YgAF}8Z6sOV@Ev<=R@i>D3sB?8QF+5S@hJxYeBMdyx2ef#Z+&F>$%E4 z*JCL|D75E;oL5xiSbAQx*VFtrq`x|3dv(aI7$S`xB{RDnV`^zI{hgf{RGINmQc~(p zjB?(J3lh8=qlObtE-7^{0xgWuwkqe+j6{j@;oK}$Xf~gw8n}Bt;XIp&l1ibl^+pq{ zD4Z~w7dtt3K{Z%!JE_)>Ew;#i*3q+SUMO-NRkNvtl)|My(~s@=Pw8XpE>B-tb5ZuO z&C~SKgqm-zSfY3?{)0$7N2`h~QLcFG79xLNl8@?si&<*(EvOVW0ds_M97ePFb1|m9 ze*R>kwKxgS!joEpbP($drfai}UYd%^tIxuS=n8GcKGvTO)?ZwI^}YW3;qAuf*Kpdo z@#f|acK-^n=a&Fr4H&%d0AnY6z`pE3W;8*Di14ig-L2j54B1nfgpffN0A0u!F!%1@ J@#VARKLOd>F~$G@ literal 0 HcmV?d00001 diff --git a/test/test_data/avro/append_multiple.db/append_multiple/f1=10/bucket-1/data-155f9742-6651-4b8e-9ba6-95e8b74b0549-1.avro b/test/test_data/avro/append_multiple.db/append_multiple/f1=10/bucket-1/data-155f9742-6651-4b8e-9ba6-95e8b74b0549-1.avro new file mode 100644 index 0000000000000000000000000000000000000000..73a9a15922595db3f3e76a31075df9a7f82307ad GIT binary patch literal 1710 zcmeZI%3@>@ODrqO*DFrWNX<=LBvGwYQdy9yWTjM;nw(#hqNJmgmzayeFD^(-1_|aD zrRyaE*%_&N1&Nut`FVO^!_rgpQi~ExQbF3&GE;L>ij}OQt6?Uk832VLwnQuCmFDCC zxtVz-O0hahDXD3Rr8y-^Rv_V89fTG`(zO_ouEiKrOHO`XI$oEVVCqTB$xp=Z9aBtA zDfy*IIjMN9GQ-qWTvC)tz=7tNx{@kOQj76gWr3jyo-jaBrK6OSpPreVm=gj?EGfXq zR;tApftI)pgt!L7$kgP_+(b~?C;%qK%;L=aJS8hr9i?KRt-z#bgx5Ah%p?YPKR8Cw zZ7a#lO)V}-%q`H(&CJQkEXMB{L(JqzRpT*}F7d`E7v&RjJ!Ue;Z+uQZFwAvH3?j^Y zK?6%L^AV|*5D{FM*$!?A_Chi~%@CxAsFDwERBmDcP)}K6PAM>JTA>z<_|rP>oRwHq zlvoLr&MZmICBo2HOnYK$MZhI`a(+r`vQQN;i{zyw7Nsy~s!7e5+2x~I=v{CiV%;7A zrbUbzTlN1kc(^h#D7ElO9#ECdtIy=QoPTSp84uI?eFxs(-@LceGn#Gh>H6!}T=>2# r2fb7XdfB6LERy5YwsU88FPpvSZl5)u3=2a7!weU1#WM`k(Cq*K(WpSa literal 0 HcmV?d00001 diff --git a/test/test_data/avro/append_multiple.db/append_multiple/f1=11/bucket-1/data-3fbfe712-94fa-4b8a-870c-c500dec838a5-0.avro b/test/test_data/avro/append_multiple.db/append_multiple/f1=11/bucket-1/data-3fbfe712-94fa-4b8a-870c-c500dec838a5-0.avro new file mode 100644 index 0000000000000000000000000000000000000000..9315ec5297eb8e8952826bbfeaadce43a4080cfa GIT binary patch literal 1759 zcmeZI%3@>@ODrqO*DFrWNX<=LBvGwYQdy9yWTjM;nw(#hqNJmgmzayeFD^(-1_|aD zrRyaE*%_&N1&Nut`FVO^!_rgpQi~ExQbF3&GE;L>ij}OQt6?Uk832VLwnQuCmFDCC zxtVz-O0hahDXD3Rr8y-^Rv_V89fTG`(zO_ouEiKrOHO`XI$oEVVCqTB$xp=Z9aBtA zDfy*IIjMN9GQ-qWTvC)tz=7tNx{@kOQj76gWr3jyo-jaBrK6OSpPreVm=gj?EGfXq zR;tApftI)pgt!L7$kgP_+(b~?C;%qK%;L=aJS8hr9i?KRt-z#bgx5Ah%p?YPKR8Cw zZ7a#lO)V}-%q`H(&CJQkEXMB{L(JqzRpT*}F7d`E7v&RjJ!Ue;Z+uQZFwAvH3?j^Y zK?6%L^AV|*5D{FM*$!?A_Chi~%@CxAsFDwERBmDcP)}K6PAM>JTA>z<_|rP>oRwHq zlvoLr&MZmICBo2HOnYK$MZhI`a(+r`vQQN;i{zyw7NsyeesF5eBaiz`(ydKPyLLRt zVrgO0*sA}R!Q(75LlT>~;s*6z#t4R`jd>4(4#;e~aKuC?aq+hK50`~%1U|So>5EFL z=Vck+S|T8)#o_OP}_B{cc0A9&xdUfa{W|)7U_Fta~GGpvpCCy!f>yrJ-RK< qQI6jqK9Vkw<6?+gz*W%TS<*EjZpv4QPYsEQLbY7I+!q+p9R&c4FI+7E literal 0 HcmV?d00001 diff --git a/test/test_data/avro/append_multiple.db/append_multiple/f1=11/bucket-1/data-3fbfe712-94fa-4b8a-870c-c500dec838a5-1.avro b/test/test_data/avro/append_multiple.db/append_multiple/f1=11/bucket-1/data-3fbfe712-94fa-4b8a-870c-c500dec838a5-1.avro new file mode 100644 index 0000000000000000000000000000000000000000..556c10d16a921400b9f2f8dff412e0cc2114ed68 GIT binary patch literal 1724 zcmcIkziSjh6rOvoCsfQ{f^AXrHLfY{jy&fejP8MCJA**TB%zBXkv`2RWTL1;3*UL#%9TUv+1y|f$OIejAM3NzK_n@~wsdQ6{* zo*&KR&&62p_0uPd8N&wtnmd$Jkaj|y&TwwD(NmpadijYT5tT8Mk!PPjy@}^8m@jv~ zT{(>IKZW;=rGu+K$mRi%=Mh|jc_8qy0fc@ODrqO*DFrWNX<>$CtIylQdy9yWTjM;nw(#hqNJmgmzayeFD^(-1_|aD zrRyaE*%_&N1&Nut`FVO^!_rgpQi~ExQbF3&GE;L>ij}OQt6?U^hq(p?d;0qUC82g@ z=9MVb>L3(&d-}QHmJM(W3h@j-~-QA$ZoODxSP zQL+N*tc|TjPK;nj!2JU;5^gm1Tmnf6@ep6awGRED0L5IeYhb9WpEEEuhx$6X1|iZI zW-0_GG2&GRJA1hLI>vhnLR!|}ip(=`GfG;mWO86TfZAZ3n!Xpl3a z)PZXUg)lzNNO=R83w;p54l@BBX1IbMpXtE(bPe(Y7SjQaAs(21gBGCpOmOoI3Jw8> zGE%yNxs7lThX;8ABZ*2L1C}bWwIbm9F*!daHCd<%m|XHw5{pt8wpLm*JzekD{qw!i zbxYAt9c(RZ8e8@MGI*TjW+)4=G%WDFByl0`{?|v+{0spV4PmNQ0=IZ_xo_qgvp<|Y zVRd!>CUZshKPqM#x4CwhGB{3JqQjufpwDpOf>=k4War8Y5!IJhXXJcxJh3R!Q%(8R zjVT(b!PnT-wKlGtIioH8)VcW|gk=?eS8tHAKUUAcblBIg@aTWjg$tJ*d*f62f37+I zpVuGxcx5NKibbqFHAzcx`q^{w^6R)yrarFm_#X8%LG<B{Hz+V=Fi zP@_qLyLU4Qu!zTJA9Hl}U2)1e^2UZA#)8vbeOGWDNIvi}ZHm&`8Q0`ew;Vi?wnk?5 zH141k2Mt4)Y}v3PDw!vtxkFV}=s-b4gn?3|(~${+5(hcbIu^||xaoR~E%H#Jg5it0 z!mhm!{;;&2F8IkND8a#Yvp#gftfF}-j)@w2u?nK^rhKhoiLhK*toTfQV^2oaWhr4J zZ_XcQO&6YJ5%*@>w07Ne(fbvWOEtLPWkmN}^s^C2jnO`0D8s>US=FdV-XyR2B6_3% E0FGoNBme*a literal 0 HcmV?d00001 diff --git a/test/test_data/avro/append_multiple.db/append_multiple/manifest/manifest-b91f4949-0601-4535-8b29-84b79666b306-0 b/test/test_data/avro/append_multiple.db/append_multiple/manifest/manifest-b91f4949-0601-4535-8b29-84b79666b306-0 new file mode 100644 index 0000000000000000000000000000000000000000..9e82634bf1ff6b66bcf778f40904460bb61ea417 GIT binary patch literal 2338 zcmeZI%3@>@ODrqO*DFrWNX<>$CtIylQdy9yWTjM;nw(#hqNJmgmzayeFD^(-1_|aD zrRyaE*%_&N1&Nut`FVO^!_rgpQi~ExQbF3&GE;L>ij}OQt6?U^hq(p?d;0qUC82g@ z=9MVb>L3(&d-}QHmJM(W3h@j-~-QA$ZoODxSP zQL+N*tc|TjPK;nj!2JU;5^gm1Tmnf6@ep6awGRED0L5IeYhb9WpEEEuhx$6X1|iZI zW-0_GG2&GRJA1hLI>vhnLR!|}ip(=`GfG;mWO86TfZAZ3n!Xpl3a z)PZXUg)lzNNO=R83w;p54l@BBX1IbMpXtE(bPe(Y7SjQaAs(21gBGCpOmOoI3Jw8> zGE%yNxs7lThX;8ABZ*2L1C}bWwIbm9F*!daHCd<%m|XHw5{pt8O!KC#zBH@zMN)CE z#F`tyui4(QYHZd2%ivMU%@B6M!l)qolEjA?mL>+@Rmp$s^$t2bh?NfUe7O6x%dLBh z{@>@W&fjFNxVW+UY+VzZz>Yws-Ai>CR2j4xE(&ThBo3661o42JV+0PIzbfN}>1K&&=Vm(h=UDkh$iJ>aP&B*%Qo_MOU|GzP-AZ_t@NtyXH=xZ0&hr*9+HB zCjS)^eO*LEmn_+`p@*@cIV+V%=s`k7%of90f)brR7RHUK>i1_@F{-_Osy@}>@Y53x zvkrdibKLS_=Yu8Pfz2EC2=XRsA7*9t&w9}pozHg3C)b&0lk5Bz^E;0o^M6`2=^!V6 uHMh9{^U8lF=~JTK)Ms0E8(V%UY&7od;V;v%`B=GBzNF;D%$LmQF#!PBYAOE! literal 0 HcmV?d00001 diff --git a/test/test_data/avro/append_multiple.db/append_multiple/manifest/manifest-list-21081c8d-4a00-4e90-a265-84f4487d1ec0-0 b/test/test_data/avro/append_multiple.db/append_multiple/manifest/manifest-list-21081c8d-4a00-4e90-a265-84f4487d1ec0-0 new file mode 100644 index 0000000000000000000000000000000000000000..70ec039db9ccc64ca4929018b3b1b36d4af9cf0a GIT binary patch literal 1006 zcmbVLO-sWt7*-IE9z?u|mmEBeg5YVlx<*z^TUpyFjuNsq+baDSNhZwM-{Noc;=w=Q z)nv}Ku0!W;P4j%@dEUh7KkV)ulSh_oJRH%KydTVfn2srE0!G7}MW6vPlGfIIOhWZD zXG4w1?-A9;Bu?{8n~M!;Mi~)Q(FZY2A`Y6p*|!qz!j5Y?w%lBJ#hCy_qXb-;2x42e zUG8d!_0hiAw6APNENiFul zyrZB98X%$rGD!qzDrONB)nU1pqYvo{*pxLG6Mq^^mce?5fm4wcB}Aj%4QcSEh@JF4+A)azDR5 zd39duQ}60Y#!2E6=7ZpUw$b?h`o+8ai?gY`xhZ|Dtbe%k1MGj6z}IT^X-twZpc$99 zoA;DmO84ZzCkxxBEw>@;&mB$JN9;q!IaY>kOvj z5rqxNsTXq}Rv;tM(tk1{o>+|euuAg!kXA<|h+p#N7c$s)fJf?2pu4=f&5e?`NljKZNnNhh6<^-os&Dr_MWHpaDL?$UejRK807_sD+>|6KFC`$pqAG*? PcV68%zu399zbN|?e&J&s literal 0 HcmV?d00001 diff --git a/test/test_data/avro/append_multiple.db/append_multiple/manifest/manifest-list-6f4689de-a277-459f-955a-e91a9b52b4ad-1 b/test/test_data/avro/append_multiple.db/append_multiple/manifest/manifest-list-6f4689de-a277-459f-955a-e91a9b52b4ad-1 new file mode 100644 index 0000000000000000000000000000000000000000..c1f8ca832f6e34cf4d17c3da126c24ab40ccd1df GIT binary patch literal 1122 zcmeZI%3@>@ODrqO*DFrWNX<=r#aFFVQdy9yWTjM;nw(#hqNJmgmzayeFD^(-1_|aD zrRyaE*%_&N1&Nut`FVO^!_rgpQi~ExQbF3&GE;L>ij}OQt6?U^hq(p?d;0qUC82g@ z=9MVb>L3)jdHT4<`#Ji$B9s)D6lLb6W2y@Fj6zbClaHpxFVr{Q(Z$8pB_3pEFv2{n zDqUQCTtkRZ8{ile;u!+;LVR$DV+g`j0)ZDFkEsz$@c4TA#fLfigd#gPsj?)s7{&L# zjuAwt@(cCxiFfu7^+R%Hv{GJaPL2{VyrIFLSX7i)2@IOdlGI#KOhM(9z+n_y8>^#~ zlA4xSnp2`=1=3j?TZVXi*pSxc@~pshjv;mAP(_bnt$(L)r?yQFE3trY?1y5#(n)MTNm;*!L?l*FPG zhHux8mM!U>@y6HnUzJuehXd0gMvblde;GWEGBG@A5t7|tD$8?t!y*M20cYg`Rnh5p zpG>>UH}9_34o,array> +bucket count: -1 + +Msgs: +snapshot-1 (1 data files) +Add: 1, 2.0, "test", [[["key",123]],[1,2,3]] +Add: 1, 2.0, "test", [[["key",123]],[1,2,3]] +Add: 1, 2.0, "test", [[["key",123]],[1,2,3]] +Add: 1, 2.0, "test", [[["key",123]],[1,2,3]] +NoCompact \ No newline at end of file diff --git a/test/test_data/avro/append_simple.db/append_simple/bucket-0/data-d7d1c416-6e34-4834-af87-341d09418f0c-0.avro b/test/test_data/avro/append_simple.db/append_simple/bucket-0/data-d7d1c416-6e34-4834-af87-341d09418f0c-0.avro new file mode 100644 index 0000000000000000000000000000000000000000..b811fb470ae77490152fdb5c7d128caab9565e9d GIT binary patch literal 636 zcmb7>%SyvQ6ow~530aHu0mhAs5HHLl; zT=+1e3&BaniUw-ko&Ui19p<0YU~2XHFjLYi!vu4SOqm=WgMK*fq%9KN4m>Fahw&ncjq4{4ED<>!g zwXvoe^9`hv?wv`qz5KvXatyaFOAUg9;$(rhM`sVo&S*Nizj?ZPbAsKwy?1hQY>`(= gDH&2nW2wz(UtmpNEJAG)dekPA3>MFi!~So@2S=pQmH+?% literal 0 HcmV?d00001 diff --git a/test/test_data/avro/append_simple.db/append_simple/manifest/manifest-f77f2a65-c6be-4e7c-accd-c8dc75578c9a-0 b/test/test_data/avro/append_simple.db/append_simple/manifest/manifest-f77f2a65-c6be-4e7c-accd-c8dc75578c9a-0 new file mode 100644 index 0000000000000000000000000000000000000000..2be8356742ec2b3cc1f202d5b6596ebf853077a4 GIT binary patch literal 2093 zcmdT_Pe>F|9A3>5Pk~fW*rBIr#I{P8(g+-PpX}(&>^gtclw}xqpX=oOS!TyTWkV{c zAc=$s>L?w84w4jfsY8i`NL@NaM?2M}yOzC~*_|18++7b{y6x`!e&753zW4pUO-x*? zYd9fYH}pt8J%v>1?df@7&gKy40R~FzMh1j|CaHn%`J9xte$j3|Wp$F!-=|m#mqf5cU9rBonKAnj~1B%37_BzN_0?MWmrE+LHD$VHlkQW?=!M>A~FQ+k%7Ev$Xa$!(>97D$9yP7 zFT=2j5iyz&7~gCG3{5iPAiU}}4ubm7#Aq!DW)L}^;5~T`B!Fn~x`q2_VRbi5Py+Ob zD>PUe#RwGT$l;NV18qi80QOVSUXo!*vset-M5^V);yJK&OBGUKkU^8ujADWwtFn+P z_|0fH;m+ewy43q`iQ@#(nbIx&|D~{Aj)Ox2WTM#30^J8$&tpLsVv8Ld%|+voCPdP= z_yI-=4nxYju{<{5D9;l15_)EEU#iRm?)v$kI?6)A3Ko$keVbbHi%Vtq6~^|-%4tdA zoxC(<73EDy&7D_eMUinD1$^a8xEQ1$55?dzmo6)sv)r0K3W(*8#bC@HSj345M_p-L z7H9FRb>d`BLSUm_>bT{W!qtj>yMecm@?Tt#WxRyjK-6#D;8zTLP&Q(xQL z2A1k~x0Y{pojTh1XXC-vC9t!$+x)3@=i~13qjz1P_xnP!ix}v5^Xzh*9zm-u8^Uz{ o%lo$FpHtIgFV~-(>0W6*66!;th2t;oALede_*K{4+fd%rUy9YRjsO4v literal 0 HcmV?d00001 diff --git a/test/test_data/avro/append_simple.db/append_simple/manifest/manifest-list-3602dcaf-7dc8-4879-9dce-9215f674e866-0 b/test/test_data/avro/append_simple.db/append_simple/manifest/manifest-list-3602dcaf-7dc8-4879-9dce-9215f674e866-0 new file mode 100644 index 0000000000000000000000000000000000000000..da13848b828935508c21ba314c45e3c408e899b4 GIT binary patch literal 1006 zcmbVL&rX9d9M5ypZFXhTe`%dEs*)!10MGn7WQMwE==*ryO z7o!%!tJ$tf&0U-3Nx70)+OJOM8=Z^$KeC(cPF|gt_{`hBP6ds9%Kf$T_xq>ko!8sD LtDA?nm&c7yO$SE5 literal 0 HcmV?d00001 diff --git a/test/test_data/avro/append_simple.db/append_simple/manifest/manifest-list-3602dcaf-7dc8-4879-9dce-9215f674e866-1 b/test/test_data/avro/append_simple.db/append_simple/manifest/manifest-list-3602dcaf-7dc8-4879-9dce-9215f674e866-1 new file mode 100644 index 0000000000000000000000000000000000000000..fd00865cc70dae4895d9542fd80ede6904715d82 GIT binary patch literal 1108 zcmeZI%3@>@ODrqO*DFrWNX<=r#aFFVQdy9yWTjM;nw(#hqNJmgmzayeFD^(-1_|aD zrRyaE*%_&N1&Nut`FVO^!_rgpQi~ExQbF3&GE;L>ij}OQt6?U^hq(p?d;0qUC82g@ z=9MVb>L3)jdHT4<`#Ji$B9s)D6lLb6W2y@Fj6zbClaHpxFVr{Q(Z$8pB_3pEFv2{n zDqUQCTtkRZ8{ile;u!+;LVR$DV+g`j0)ZDFkEsz$@c4TA#fLfigd#gPsj?)s7{&L# zjuAwt@(cCxiFfu7^+R%Hv{GJaPL2{VyrIFLSX7i)2@IOdlGI#KOhM(9z+n_y8>^#~ zlA4xSnp2`=1=3j?TZVXi*pSxc@~pshjv;mAP(_bnt$(L)r?yQFE3trY?1y5#(n)MTNm;*!L?l*FPG zhU0vCZ97&+ekzJl^-kS+l+EMqgxv^AsN*1tx|A L4~!W^&}{(#k^OQs literal 0 HcmV?d00001 diff --git a/test/test_data/avro/append_simple.db/append_simple/schema/schema-0 b/test/test_data/avro/append_simple.db/append_simple/schema/schema-0 new file mode 100644 index 00000000..be12e26c --- /dev/null +++ b/test/test_data/avro/append_simple.db/append_simple/schema/schema-0 @@ -0,0 +1,48 @@ +{ + "version" : 3, + "id" : 0, + "fields" : [ { + "id" : 0, + "name" : "f0", + "type" : "INT" + }, { + "id" : 1, + "name" : "f1", + "type" : "DOUBLE" + }, { + "id" : 2, + "name" : "f2", + "type" : "STRING" + }, { + "id" : 3, + "name" : "f3", + "type" : { + "type" : "ROW", + "fields" : [ { + "id" : 4, + "name" : "f0", + "type" : { + "type" : "MAP", + "key" : "STRING", + "value" : "INT" + } + }, { + "id" : 5, + "name" : "f1", + "type" : { + "type" : "ARRAY", + "element" : "INT" + } + } ] + } + } ], + "highestFieldId" : 5, + "partitionKeys" : [ ], + "primaryKeys" : [ ], + "options" : { + "bucket" : "-1", + "manifest.format" : "avro", + "file.format" : "avro" + }, + "timeMillis" : 1767779394352 +} \ No newline at end of file diff --git a/test/test_data/avro/append_simple.db/append_simple/snapshot/EARLIEST b/test/test_data/avro/append_simple.db/append_simple/snapshot/EARLIEST new file mode 100644 index 00000000..56a6051c --- /dev/null +++ b/test/test_data/avro/append_simple.db/append_simple/snapshot/EARLIEST @@ -0,0 +1 @@ +1 \ No newline at end of file diff --git a/test/test_data/avro/append_simple.db/append_simple/snapshot/LATEST b/test/test_data/avro/append_simple.db/append_simple/snapshot/LATEST new file mode 100644 index 00000000..56a6051c --- /dev/null +++ b/test/test_data/avro/append_simple.db/append_simple/snapshot/LATEST @@ -0,0 +1 @@ +1 \ No newline at end of file diff --git a/test/test_data/avro/append_simple.db/append_simple/snapshot/snapshot-1 b/test/test_data/avro/append_simple.db/append_simple/snapshot/snapshot-1 new file mode 100644 index 00000000..ec396696 --- /dev/null +++ b/test/test_data/avro/append_simple.db/append_simple/snapshot/snapshot-1 @@ -0,0 +1,19 @@ +{ + "version" : 3, + "id" : 1, + "schemaId" : 0, + "baseManifestList" : "manifest-list-3602dcaf-7dc8-4879-9dce-9215f674e866-0", + "baseManifestListSize" : 1006, + "deltaManifestList" : "manifest-list-3602dcaf-7dc8-4879-9dce-9215f674e866-1", + "deltaManifestListSize" : 1108, + "changelogManifestList" : null, + "commitUser" : "4c4a17b4-c139-4fd0-91f6-332509cc3eb1", + "commitIdentifier" : 9223372036854775807, + "commitKind" : "APPEND", + "timeMillis" : 1767779395600, + "logOffsets" : { }, + "totalRecordCount" : 4, + "deltaRecordCount" : 4, + "changelogRecordCount" : 0, + "nextRowId" : 0 +} \ No newline at end of file diff --git a/test/test_data/avro/append_with_multiple_map.db/append_with_multiple_map/README b/test/test_data/avro/append_with_multiple_map.db/append_with_multiple_map/README new file mode 100644 index 00000000..6b21711e --- /dev/null +++ b/test/test_data/avro/append_with_multiple_map.db/append_with_multiple_map/README @@ -0,0 +1,7 @@ +f0:map f1:map f2:map f3:map f4:map f5:map> f6:map> f7:map> +bucket count: -1 + +Msgs: +snapshot-1 (1 data files) +Add: [[1,10],[2,20]],[[1.1,10.1],[2.2,20.2]],[["key1","val1"],["key2","val2"]],[["123456","abcdef"]],[["2023-01-01 12:00:00.123000","2023-01-01 12:00:00.123000"],["2023-01-02 13:30:00.456000","2023-01-02 13:30:00.456000"]], + [["arr_key",[1.5, 2.5, 3.5]]],[["outer_key",[[99.9,"nested_val"]]]],[[1000, [42, "row_str", "123.45"]]] diff --git a/test/test_data/avro/append_with_multiple_map.db/append_with_multiple_map/bucket-0/data-72442742-e49e-48a4-a736-a2475aac2d2c-0.avro b/test/test_data/avro/append_with_multiple_map.db/append_with_multiple_map/bucket-0/data-72442742-e49e-48a4-a736-a2475aac2d2c-0.avro new file mode 100644 index 0000000000000000000000000000000000000000..1c423b98063ff8088369da7e68494fd5509778a2 GIT binary patch literal 2195 zcmeZI%3@>@ODrqO*DFrWNX<>`lB-rKsVqoUvQjEaP0lY$QPNS$OU%XK7Z)TZg9P)7 z()AL7?2J^sg2c?+{5(CdVd<%PsYQt;sUYoXnW;G`#Y$Gu)i9IN41mHATcVZnN^^3Q zbl|oo78NB{0;MxcQgeZt;NsX_6Av~D(}wI+kQt~}C}rlAD8<$St%7=}EHS4v6^Ak% zrIggP#L}D+B`c5vYk~Tda`MwNlM{17Ai?k~dYU1I(Qwa@>Pe7MM0zqM zzceW)l^B1*RHA!<46hpD_9`ftbdmn^YP^7qA}(IwQJ<3!^fr20D#^@EEiOsSEzr%)OfJeV1|}?m>6By>sSwYmxPuOR zJi{G{7P9bk084~G593P-X4JA6?jT&{3CJemqErd)c0{JfQKZ09JF!uKH!RF?hXqzGm(+TynOo@ZSlWaM8x_g&Dc-oB!RqDyE^dNF+7XB4Ph4CC`X^pAYe;{Ly`? zv$E+btD=%Q+k`VdXX+oS*sf91VYzR%+|X-T?rqk61(WU-pAT1F(zGa9NuhaT;=#g) zhP(MA3Ns4MadHTIvRK~lC<-esd3eQP%H~-MxtF)yHOpjJ>tOU$+1S`{bGCu?r_Idl Y3<}8#<_TV#ayX3E&72|D!-(!F03nj>#Q*>R literal 0 HcmV?d00001 diff --git a/test/test_data/avro/append_with_multiple_map.db/append_with_multiple_map/manifest/manifest-3c01ce68-8b9e-4bc8-8a56-b481d26faab6-0 b/test/test_data/avro/append_with_multiple_map.db/append_with_multiple_map/manifest/manifest-3c01ce68-8b9e-4bc8-8a56-b481d26faab6-0 new file mode 100644 index 0000000000000000000000000000000000000000..830f9b7c1ee78f820830e4c7e9f5f4c144071bd2 GIT binary patch literal 2086 zcmdT_zi-n(6uy`i=vGjvkPyouA%-|0V+eFXZgN$vI!^JAwopax)xM@iwi7unQmMkk z&Vd225hiBX5d#~7vz^#Z+$0@Xh%&@^?|t{)ci;P5-@aFvn#cG3R;gp| z5*NQce*lC1Hh~M!CuYmHU=e!Q%{_P8*bM({`OOlhzjsNgjcvE(l_Ibv@raKD5|;1S z#IZVXVe_DmWUL`Y73(q$_Rg}s0CtOM!ip$YM@Lr$MH96FfbIQ&bOs<6^~ws;MuTW| zO^}Q@OC6oIBuXfWPTZvC$?E841l3@qL4VBo7!-%&=w^4)^*};2G02ji_ZljwOW7I@4PYjtr zAHF5A5{!7tr1bx%BD^^j-PVy@p=#D^i%3Z|mRBK_*wM4q$}*}6hL~CW9HNBQkd&S{ zY#V4pQ-o*)MWEQ#Z=#D&uffBq0qM`DN}O))VKjG&2y0(qFBjYBaDx9TRON)FdeJ;*aYj_MEz| zR8rST%0q_76=!yXZlRRY-bY9{8%6&(bb6mik*Q6u3fgjhww{5GLs$|ORf|~8=&Ik_ zWQx}nkzUe28jQM1qdOP9AI+9U%+n93Ts#Z=7AVU<=AV7O@coN(@b1>*GcTE6g-fq4 z{|3uXj)6zhQ{XHpY*{$KY?d7@0i~6Ov{5IOX$b`42a2yfhOQA$an}EAf-udosVds zer9~2(c*i^v=NPxEY;>>1C}yQ1yl6fh{a(Jn!VY#66vCjW7t-)x$ufo0c4{DoEaDq z3!12eDf223z)dncb!?IXE`tiXHMj7^ zsEhD+w%bw**N%Nztz?<@tJBp+SK|JU?8~G1s|&J_1qahy&@`kx+@ODrqO*DFrWNX<=r#aFFVQdy9yWTjM;nw(#hqNJmgmzayeFD^(-1_|aD zrRyaE*%_&N1&Nut`FVO^!_rgpQi~ExQbF3&GE;L>ij}OQt6?U^hq(p?d;0qUC82g@ z=9MVb>L3)jdHT4<`#Ji$B9s)D6lLb6W2y@Fj6zbClaHpxFVr{Q(Z$8pB_3pEFv2{n zDqUQCTtkRZ8{ile;u!+;LVR$DV+g`j0)ZDFkEsz$@c4TA#fLfigd#gPsj?)s7{&L# zjuAwt@(cCxiFfu7^+R%Hv{GJaPL2{VyrIFLSX7i)2@IOdlGI#KOhM(9z+n_y8>^#~ zlA4xSnp2`=1=3j?TZVXi*pSxc@~pshjv;mAP(_bnt$(L)r?yQFE3trY?1y5#(n)MTNm;*!L?l*FPG zhHD97V%iJ8>fPG@Ajs&l`(~z9j2c_@|1x-_GBG5z2uK}R&iw!X`}PB~w(ib-7T*}- z>3!?a{rw7;Js7vW|GlqPUfw?A!oyW&B7IY`_&Lv>-ufzHV`XQK>Xx=-v1,array> +primary key: f2,f7 +partition key: f1 +bucket key: f2 +bucket count: 1 + +Msgs: +snapshot-1 (1 data files) +Add: 0, false, 10, 1, 1, 1000, 1.5, 2.5, "Alice", "abcdef", 100, "123.45", [[["key",123]],[1,2,3]] +Add: 0, false, 10, 1, 1, 1000, 1.5, 2.5, "Bob", "abcdef", 100, "123.45", [[["key",123]],[1,2,3]] +Add: 0, true, 10, 1, 1, 1000, 1.5, 2.5, "Emily", "abcdef", 100, "123.45", [[["key",123]],[1,2,3]] +Add: 0, true, 10, 1, 1, 1000, 1.5, 2.5, "Tony", "abcdef", 100, "123.45", [[["key",123]],[1,2,3]] + +snapshot-2 (1 data files) +Delete: 0, true, 10, 1, 1, 1000, 1.5, 2.5, "Emily", "abcdef", 100, "123.45", [[["key",123]],[1,2,3]] +Add: 0, true, 10, 1, 1, 1000, 1.5, 2.5, "Lucy", "abcdef", 100, "123.45", [[["key",123]],[1,2,3]] diff --git a/test/test_data/avro/pk_with_multiple_type.db/pk_with_multiple_type/f1=10/bucket-0/data-0dbda3f3-c4d4-4e2e-b771-c48fc89f30a7-0.avro b/test/test_data/avro/pk_with_multiple_type.db/pk_with_multiple_type/f1=10/bucket-0/data-0dbda3f3-c4d4-4e2e-b771-c48fc89f30a7-0.avro new file mode 100644 index 0000000000000000000000000000000000000000..f17c067d7850b4f3ba52a9c6bfaa257b4cd413ac GIT binary patch literal 1320 zcmeZI%3@>@ODrqO*DFrWNX<>$BUr6eQdy9yWTjM;nw(#hqNJmgmzayeFD^(-1_|aD zrRyaE*%_&N1&Nut`FVO^!_rgpQi~ExQbF3&GE;L>ij}OQt6?U^d%H%)rx^j|pjKt( zl_=HfpvjvfSG6Q%pT+Ir)kBoo$AxDJ8!&DJK=LXYod1ngynrNtGq3#dwXh#Lxs!u%MvV zQOe0r&rD9t34tWy6ksA$s>K%^hL~{=Hxc3-3^P-cGjkI`X`}#{M>2~u^YfIfOm&os zfz|@^gb{x0Fw+d&I_xBu@WeqS(2JdgrTvR_QckTfQyIZ{FKyWp{nAN#JrTmq7;T)haGFe<}bd} zq*87BVXovxjz5eVTlN1kc&ue+XmS%*T%i8)|NPqht5!Kxv~St=YF>>}aqGn0H^u&6 z+SRKv`;JQrZ%oiJHa$(d;~LKw{g7F_f^~AuL6!Wze|we}=5+pHXZjM^lfk}?=XmeV r_7zO8SX`|eKFaUd4&pJhGfY?D6rLvJVY^Yvw@GDVOKV&L54wW@Yc#>* literal 0 HcmV?d00001 diff --git a/test/test_data/avro/pk_with_multiple_type.db/pk_with_multiple_type/f1=10/bucket-0/data-0dbda3f3-c4d4-4e2e-b771-c48fc89f30a7-1.avro b/test/test_data/avro/pk_with_multiple_type.db/pk_with_multiple_type/f1=10/bucket-0/data-0dbda3f3-c4d4-4e2e-b771-c48fc89f30a7-1.avro new file mode 100644 index 0000000000000000000000000000000000000000..82ef1bfe8fcb2ba80396b78878b85c772061d77a GIT binary patch literal 1294 zcmeZI%3@>@ODrqO*DFrWNX<>$BUr6eQdy9yWTjM;nw(#hqNJmgmzayeFD^(-1_|aD zrRyaE*%_&N1&Nut`FVO^!_rgpQi~ExQbF3&GE;L>ij}OQt6?U^d%H%)rx^j|pjKt( zl_=HfpvjvfSG6Q%pT+Ir)kBoo$AxDJ8!&DJK=LXYod1ngynrNtGq3#dwXh#Lxs!u%MvV zQOe0r&rD9t34tWy6ksA$s>K%^hL~{=Hxc3-3^P-cGjkI`X`}#{M>2~u^YfIfOm&os zfz|@^gb{x0Fw+d&I_xBu@WeqS(2JdgrTvR_QckTfQyIZ{FKyWp{nAN#JrTmq7(+PV86JQxSqUY zbLU^*;p*hWa)MD~tNvdGkGo6^LaxFJ8*OdD{VpGeauRA#umK@Ub zZF%GE!lt7)B{MSS#g+5xms{@1U0`1{$2ZZD+u(rHp7V}Vr%YCFIm3N;|HjCRcI=^6 S%nZ{NP8u*euo|$TTMGcIZLVqn literal 0 HcmV?d00001 diff --git a/test/test_data/avro/pk_with_multiple_type.db/pk_with_multiple_type/manifest/manifest-4cd8b157-3a02-4a9b-8b65-4976410cb4a9-0 b/test/test_data/avro/pk_with_multiple_type.db/pk_with_multiple_type/manifest/manifest-4cd8b157-3a02-4a9b-8b65-4976410cb4a9-0 new file mode 100644 index 0000000000000000000000000000000000000000..3b15c396c144080f4e2f6d364c7aaffce63c7256 GIT binary patch literal 2157 zcmeZI%3@>@ODrqO*DFrWNX<>$CtIylQdy9yWTjM;nw(#hqNJmgmzayeFD^(-1_|aD zrRyaE*%_&N1&Nut`FVO^!_rgpQi~ExQbF3&GE;L>ij}OQt6?U^hq(p?d;0qUC82g@ z=9MVb>L3(&d-}QHmJM(W3h@j-~-QA$ZoODxSP zQL+N*tc|TjPK;nj!2JU;5^gm1Tmnf6@ep6awGRED0L5IeYhb9WpEEEuhx$6X1|iZI zW-0_GG2&GRJA1hLI>vhnLR!|}ip(=`GfG;mWO86TfZAZ3n!Xpl3a z)PZXUg)lzNNO=R83w;p54l@BBX1IbMpXtE(bPe(Y7SjQaAs(21gBGCpOmOoI3Jw8> zGE%yNxs7lThX;8ABZ*2L1C}bWwIbm9F*!daHCd<%m|XHw5{pt8KK+{A-!sXnkSU=o z%IA}U98(Lk##a5m3?64$8H76JH5)=Rd5YbnSoO{|`AF<8X_Wi-sABTon+%_QdjZL{xa{(t?N=R=N_%wN|1N>ePotZvmm{>;d5NV-m}Dt2~S9naIIjC);^ zP9|^rcliOsE-mpD1=^0{zFA$xRfXb{Le?JLaL?}NBRAuwO_TB#vYzUQoAI|ZT9U!C zm4W#{RbxS%j0yvD1Vcj1$r|UJUQfp6hDL26eq{wymLish?9#6c9|{-Pu`}zR5J;KB IsE6(;08j?W;s5{u literal 0 HcmV?d00001 diff --git a/test/test_data/avro/pk_with_multiple_type.db/pk_with_multiple_type/manifest/manifest-4cd8b157-3a02-4a9b-8b65-4976410cb4a9-1 b/test/test_data/avro/pk_with_multiple_type.db/pk_with_multiple_type/manifest/manifest-4cd8b157-3a02-4a9b-8b65-4976410cb4a9-1 new file mode 100644 index 0000000000000000000000000000000000000000..a2b645c1a00b3d6f63bd99a6c038370103f7601e GIT binary patch literal 2157 zcmeZI%3@>@ODrqO*DFrWNX<>$CtIylQdy9yWTjM;nw(#hqNJmgmzayeFD^(-1_|aD zrRyaE*%_&N1&Nut`FVO^!_rgpQi~ExQbF3&GE;L>ij}OQt6?U^hq(p?d;0qUC82g@ z=9MVb>L3(&d-}QHmJM(W3h@j-~-QA$ZoODxSP zQL+N*tc|TjPK;nj!2JU;5^gm1Tmnf6@ep6awGRED0L5IeYhb9WpEEEuhx$6X1|iZI zW-0_GG2&GRJA1hLI>vhnLR!|}ip(=`GfG;mWO86TfZAZ3n!Xpl3a z)PZXUg)lzNNO=R83w;p54l@BBX1IbMpXtE(bPe(Y7SjQaAs(21gBGCpOmOoI3Jw8> zGE%yNxs7lThX;8ABZ*2L1C}bWwIbm9F*!daHCd<%m|XHw5{pt8PPrd{b9MCzd8^jm zshPiu{g_&qHMZ*iW$-x5${@t4pw$qX$y4km)qJCH=D`dF#hNq|&7-s5oMrRrdfau! z@{?uZWtl9MJ$Ik@G5&wO%6P%#hZ$emS=O`l>ip#X&UK!h;noAr`rmuj`RUh-Y4e^* zpWxn782$g0sM(}WR;49N6HI(pcWtgJRPEX&=ADo~zou}xrDNsE3`JJ2cD}p+4(mxU zShg}SAJ}J5=)fSuz_)n^BhM=b2^O}-hDL?YJcchP|6pc(5EX07P$RSA{J{g$0UyDG z2j0M&Eh3wYC~{lZ?@zz)*IK>1!=qF3z)~gihcqUO+6?gNn1Tji)K6Ie>L4L;Wu1+Q zFMpnmeBb-WlKcrERfkt=st%N(UZR?gPHdkI@!a-gy0OvY_ z*i;QzKr+t4WKc!f`fUjnr5hO2HL$8_P{UHwF7&n{HHe_I1NKU7JGvtxVB1lhLiLaG zFs>4})MMx-?x@HuAGbHP;YiXnJz-OWw~bQlwFWaq6mn-SM?Ld$W&AlavIA? zt@gpPBQN*rAfSCRi8yFTX71(XVcD$ogpnG$L zFM=J2_OsoWDqP#vb-9vt+OJMG8{LTeKeCsfEMJ|U2Gl=(%s5E`!h(ZQ?fvQHqxt%L K_I5vCjEql3iAD+l literal 0 HcmV?d00001 diff --git a/test/test_data/avro/pk_with_multiple_type.db/pk_with_multiple_type/manifest/manifest-list-bc16121e-12e3-4ba0-a499-bf054e158852-1 b/test/test_data/avro/pk_with_multiple_type.db/pk_with_multiple_type/manifest/manifest-list-bc16121e-12e3-4ba0-a499-bf054e158852-1 new file mode 100644 index 0000000000000000000000000000000000000000..f33419919174812f81c1ccb714fdea72055bf3ba GIT binary patch literal 1119 zcmeZI%3@>@ODrqO*DFrWNX<=r#aFFVQdy9yWTjM;nw(#hqNJmgmzayeFD^(-1_|aD zrRyaE*%_&N1&Nut`FVO^!_rgpQi~ExQbF3&GE;L>ij}OQt6?U^hq(p?d;0qUC82g@ z=9MVb>L3)jdHT4<`#Ji$B9s)D6lLb6W2y@Fj6zbClaHpxFVr{Q(Z$8pB_3pEFv2{n zDqUQCTtkRZ8{ile;u!+;LVR$DV+g`j0)ZDFkEsz$@c4TA#fLfigd#gPsj?)s7{&L# zjuAwt@(cCxiFfu7^+R%Hv{GJaPL2{VyrIFLSX7i)2@IOdlGI#KOhM(9z+n_y8>^#~ zlA4xSnp2`=1=3j?TZVXi*pSxc@~pshjv;mAP(_bnt$(L)r?yQFE3trY?1y5#(n)MTNm;*!L?l*FPG z2ASvm5r)hO)e6tGxO>-g_A%{Y)Yz*3m%(E#6T>AJA=wS4vOI?kdYEivJbH85&pfPt zzxUEDXG1GRmLPA&ZSQ~Yd;4Eb=7wI4Qp1JJcg~vP#hqEXhf8N=DA-O|xkb0Kv%x)5 U!d=>hgPFm?o`E@^Z2`K40Itz)uK)l5 literal 0 HcmV?d00001 diff --git a/test/test_data/avro/pk_with_multiple_type.db/pk_with_multiple_type/manifest/manifest-list-bc16121e-12e3-4ba0-a499-bf054e158852-2 b/test/test_data/avro/pk_with_multiple_type.db/pk_with_multiple_type/manifest/manifest-list-bc16121e-12e3-4ba0-a499-bf054e158852-2 new file mode 100644 index 0000000000000000000000000000000000000000..782b362592fd1e28dbd80b4cd7f9731ecc286ed2 GIT binary patch literal 1119 zcmeZI%3@>@ODrqO*DFrWNX<=r#aFFVQdy9yWTjM;nw(#hqNJmgmzayeFD^(-1_|aD zrRyaE*%_&N1&Nut`FVO^!_rgpQi~ExQbF3&GE;L>ij}OQt6?U^hq(p?d;0qUC82g@ z=9MVb>L3)jdHT4<`#Ji$B9s)D6lLb6W2y@Fj6zbClaHpxFVr{Q(Z$8pB_3pEFv2{n zDqUQCTtkRZ8{ile;u!+;LVR$DV+g`j0)ZDFkEsz$@c4TA#fLfigd#gPsj?)s7{&L# zjuAwt@(cCxiFfu7^+R%Hv{GJaPL2{VyrIFLSX7i)2@IOdlGI#KOhM(9z+n_y8>^#~ zlA4xSnp2`=1=3j?TZVXi*pSxc@~pshjv;mAP(_bnt$(L)r?yQFE3trY?1y5#(n)MTNm;*!L?l*FPG zhR}`u8}{#Ldm*sxBI~gQ7C)HwFlubo|I6U9mWkn#i;(OFQ(2zF20ctRG9JA-?PnfV zzu$Z5mb0OiB1@1r@ODrqO*DFrWNX<=r#aFFVQdy9yWTjM;nw(#hqNJmgmzayeFD^(-1_|aD zrRyaE*%_&N1&Nut`FVO^!_rgpQi~ExQbF3&GE;L>ij}OQt6?U^hq(p?d;0qUC82g@ z=9MVb>L3)jdHT4<`#Ji$B9s)D6lLb6W2y@Fj6zbClaHpxFVr{Q(Z$8pB_3pEFv2{n zDqUQCTtkRZ8{ile;u!+;LVR$DV+g`j0)ZDFkEsz$@c4TA#fLfigd#gPsj?)s7{&L# zjuAwt@(cCxiFfu7^+R%Hv{GJaPL2{VyrIFLSX7i)2@IOdlGI#KOhM(9z+n_y8>^#~ zlA4xSnp2`=1=3j?TZVXi*pSxc@~pshjv;mAP(_bnt$(L)r?yQFE3trY?1y5#(n)MTNm;*!L?l*FPG z2BD07UM`P!X7Af%63^#S@ttW8qsCVKzYHF0nHVm)2+3|RmE}2X(8FXS=~Hz*%qK%2mo~%anS$( literal 0 HcmV?d00001 diff --git a/test/test_data/avro/pk_with_multiple_type.db/pk_with_multiple_type/schema/schema-0 b/test/test_data/avro/pk_with_multiple_type.db/pk_with_multiple_type/schema/schema-0 new file mode 100644 index 00000000..608d0df8 --- /dev/null +++ b/test/test_data/avro/pk_with_multiple_type.db/pk_with_multiple_type/schema/schema-0 @@ -0,0 +1,81 @@ +{ + "version" : 3, + "id" : 0, + "fields" : [ { + "id" : 0, + "name" : "f0", + "type" : "BOOLEAN" + }, { + "id" : 1, + "name" : "f1", + "type" : "TINYINT" + }, { + "id" : 2, + "name" : "f2", + "type" : "SMALLINT NOT NULL" + }, { + "id" : 3, + "name" : "f3", + "type" : "INT" + }, { + "id" : 4, + "name" : "f4", + "type" : "BIGINT" + }, { + "id" : 5, + "name" : "f5", + "type" : "FLOAT" + }, { + "id" : 6, + "name" : "f6", + "type" : "DOUBLE" + }, { + "id" : 7, + "name" : "f7", + "type" : "STRING NOT NULL" + }, { + "id" : 8, + "name" : "f8", + "type" : "BYTES" + }, { + "id" : 9, + "name" : "f9", + "type" : "DATE" + }, { + "id" : 10, + "name" : "f10", + "type" : "DECIMAL(5, 2)" + }, { + "id" : 11, + "name" : "f11", + "type" : { + "type" : "ROW", + "fields" : [ { + "id" : 12, + "name" : "f0", + "type" : { + "type" : "MAP", + "key" : "STRING", + "value" : "INT" + } + }, { + "id" : 13, + "name" : "f1", + "type" : { + "type" : "ARRAY", + "element" : "INT" + } + } ] + } + } ], + "highestFieldId" : 13, + "partitionKeys" : [ "f1" ], + "primaryKeys" : [ "f2", "f7" ], + "options" : { + "bucket" : "1", + "bucket-key" : "f2", + "manifest.format" : "avro", + "file.format" : "avro" + }, + "timeMillis" : 1768206714821 +} \ No newline at end of file diff --git a/test/test_data/avro/pk_with_multiple_type.db/pk_with_multiple_type/snapshot/EARLIEST b/test/test_data/avro/pk_with_multiple_type.db/pk_with_multiple_type/snapshot/EARLIEST new file mode 100644 index 00000000..56a6051c --- /dev/null +++ b/test/test_data/avro/pk_with_multiple_type.db/pk_with_multiple_type/snapshot/EARLIEST @@ -0,0 +1 @@ +1 \ No newline at end of file diff --git a/test/test_data/avro/pk_with_multiple_type.db/pk_with_multiple_type/snapshot/LATEST b/test/test_data/avro/pk_with_multiple_type.db/pk_with_multiple_type/snapshot/LATEST new file mode 100644 index 00000000..d8263ee9 --- /dev/null +++ b/test/test_data/avro/pk_with_multiple_type.db/pk_with_multiple_type/snapshot/LATEST @@ -0,0 +1 @@ +2 \ No newline at end of file diff --git a/test/test_data/avro/pk_with_multiple_type.db/pk_with_multiple_type/snapshot/snapshot-1 b/test/test_data/avro/pk_with_multiple_type.db/pk_with_multiple_type/snapshot/snapshot-1 new file mode 100644 index 00000000..b3afcbc0 --- /dev/null +++ b/test/test_data/avro/pk_with_multiple_type.db/pk_with_multiple_type/snapshot/snapshot-1 @@ -0,0 +1,19 @@ +{ + "version" : 3, + "id" : 1, + "schemaId" : 0, + "baseManifestList" : "manifest-list-bc16121e-12e3-4ba0-a499-bf054e158852-0", + "baseManifestListSize" : 1006, + "deltaManifestList" : "manifest-list-bc16121e-12e3-4ba0-a499-bf054e158852-1", + "deltaManifestListSize" : 1119, + "changelogManifestList" : null, + "commitUser" : "65d24992-0149-4bd8-92bf-715e6e4804ca", + "commitIdentifier" : 1, + "commitKind" : "APPEND", + "timeMillis" : 1768211240603, + "logOffsets" : { }, + "totalRecordCount" : 4, + "deltaRecordCount" : 4, + "changelogRecordCount" : 0, + "nextRowId" : 0 +} \ No newline at end of file diff --git a/test/test_data/avro/pk_with_multiple_type.db/pk_with_multiple_type/snapshot/snapshot-2 b/test/test_data/avro/pk_with_multiple_type.db/pk_with_multiple_type/snapshot/snapshot-2 new file mode 100644 index 00000000..f24c3dbb --- /dev/null +++ b/test/test_data/avro/pk_with_multiple_type.db/pk_with_multiple_type/snapshot/snapshot-2 @@ -0,0 +1,19 @@ +{ + "version" : 3, + "id" : 2, + "schemaId" : 0, + "baseManifestList" : "manifest-list-bc16121e-12e3-4ba0-a499-bf054e158852-2", + "baseManifestListSize" : 1119, + "deltaManifestList" : "manifest-list-bc16121e-12e3-4ba0-a499-bf054e158852-3", + "deltaManifestListSize" : 1119, + "changelogManifestList" : null, + "commitUser" : "65d24992-0149-4bd8-92bf-715e6e4804ca", + "commitIdentifier" : 2, + "commitKind" : "APPEND", + "timeMillis" : 1768211240667, + "logOffsets" : { }, + "totalRecordCount" : 6, + "deltaRecordCount" : 2, + "changelogRecordCount" : 0, + "nextRowId" : 0 +} \ No newline at end of file From 1697e7e190eaf9dc718b62abf85b692a29dcc59b Mon Sep 17 00:00:00 2001 From: "jinli.zjw" Date: Mon, 26 Jan 2026 11:30:19 +0800 Subject: [PATCH 03/12] fix --- .../avro/avro_file_batch_reader_test.cpp | 98 ++++++++ test/inte/scan_and_read_inte_test.cpp | 231 ++++++++++++++++++ 2 files changed, 329 insertions(+) diff --git a/src/paimon/format/avro/avro_file_batch_reader_test.cpp b/src/paimon/format/avro/avro_file_batch_reader_test.cpp index 845d62dd..5f03eade 100644 --- a/src/paimon/format/avro/avro_file_batch_reader_test.cpp +++ b/src/paimon/format/avro/avro_file_batch_reader_test.cpp @@ -246,6 +246,104 @@ TEST_P(AvroFileBatchReaderTest, TestReadTimestampTypes) { ASSERT_TRUE(expected_array->Equals(result_array)); } +TEST_P(AvroFileBatchReaderTest, TestReadMapTypes) { + std::string path = paimon::test::GetDataDir() + + "/avro/append_with_multiple_map.db/" + "append_with_multiple_map/bucket-0/" + "data-72442742-e49e-48a4-a736-a2475aac2d2c-0.avro"; + + ASSERT_OK_AND_ASSIGN(auto reader_builder, + file_format_->CreateReaderBuilder(/*batch_size=*/1024)); + ASSERT_OK_AND_ASSIGN(std::shared_ptr in, fs_->Open(path)); + ASSERT_OK_AND_ASSIGN(auto batch_reader, reader_builder->Build(in)); + + arrow::FieldVector read_fields = { + arrow::field("f0", arrow::map(arrow::int32(), arrow::int32())), + arrow::field("f1", arrow::map(arrow::float64(), arrow::float64())), + arrow::field("f2", arrow::map(arrow::utf8(), arrow::utf8())), + arrow::field("f3", arrow::map(arrow::utf8(), arrow::binary())), + arrow::field("f4", arrow::map(arrow::timestamp(arrow::TimeUnit::MICRO), + arrow::timestamp(arrow::TimeUnit::MICRO))), + arrow::field("f5", arrow::map(arrow::utf8(), arrow::list(arrow::float64()))), + arrow::field("f6", arrow::map(arrow::utf8(), arrow::map(arrow::float64(), arrow::utf8()))), + arrow::field("f7", arrow::map(arrow::int64(), + arrow::struct_({field("f0", arrow::int32()), + field("f1", arrow::utf8()), + field("f2", arrow::decimal128(5, 2))})))}; + auto read_schema = arrow::schema(read_fields); + std::unique_ptr c_schema = std::make_unique(); + ASSERT_TRUE(arrow::ExportSchema(*read_schema, c_schema.get()).ok()); + EXPECT_OK(batch_reader->SetReadSchema(c_schema.get(), /*predicate=*/nullptr, + /*selection_bitmap=*/std::nullopt)); + + // check array + ASSERT_OK_AND_ASSIGN(auto result_array, + ::paimon::test::ReadResultCollector::CollectResult(batch_reader.get())); + std::shared_ptr expected_array; + auto array_status = + arrow::ipc::internal::json::ChunkedArrayFromJSON(arrow::struct_(read_fields), {R"([ + [ + [[1,10],[2,20]], + [[1.1,10.1],[2.2,20.2]], + [["key1","val1"],["key2","val2"]], + [["123456","abcdef"]], + [["2023-01-01 12:00:00.123000","2023-01-01 12:00:00.123000"],["2023-01-02 13:30:00.456000","2023-01-02 13:30:00.456000"]], + [["arr_key",[1.5, 2.5, 3.5]]], + [["outer_key",[[99.9,"nested_val"]]]], + [[1000, [42, "row_str", "123.45"]]] + ] + ])"}, + &expected_array); + ASSERT_TRUE(array_status.ok()) << array_status.ToString(); + ASSERT_TRUE(result_array->Equals(expected_array)) << result_array->ToString() << std::endl; + ASSERT_TRUE(expected_array->Equals(result_array)); +} + +TEST_P(AvroFileBatchReaderTest, TestReadRowNumbers) { + std::string path = paimon::test::GetDataDir() + + "/avro/append_simple.db/" + "append_simple/bucket-0/" + "data-d7d1c416-6e34-4834-af87-341d09418f0c-0.avro"; + + ASSERT_OK_AND_ASSIGN(auto reader_builder, file_format_->CreateReaderBuilder(/*batch_size=*/1)); + ASSERT_OK_AND_ASSIGN(std::shared_ptr in, fs_->Open(path)); + ASSERT_OK_AND_ASSIGN(auto reader, reader_builder->Build(in)); + + arrow::FieldVector read_fields = { + arrow::field("f0", arrow::int32()), arrow::field("f1", arrow::float64()), + arrow::field("f2", arrow::utf8()), + arrow::field("f3", + arrow::struct_({arrow::field("f0", arrow::map(arrow::utf8(), arrow::int32())), + arrow::field("f1", arrow::list(arrow::int32()))}))}; + + auto read_schema = arrow::schema(read_fields); + std::unique_ptr c_schema = std::make_unique(); + ASSERT_TRUE(arrow::ExportSchema(*read_schema, c_schema.get()).ok()); + EXPECT_OK(reader->SetReadSchema(c_schema.get(), /*predicate=*/nullptr, + /*selection_bitmap=*/std::nullopt)); + + ASSERT_EQ(std::numeric_limits::max(), reader->GetPreviousBatchFirstRowNumber()); + ASSERT_OK_AND_ASSIGN(auto batch1, reader->NextBatch()); + ArrowArrayRelease(batch1.first.get()); + ArrowSchemaRelease(batch1.second.get()); + ASSERT_EQ(0, reader->GetPreviousBatchFirstRowNumber()); + ASSERT_OK_AND_ASSIGN(auto batch2, reader->NextBatch()); + ASSERT_EQ(1, reader->GetPreviousBatchFirstRowNumber()); + ArrowArrayRelease(batch2.first.get()); + ArrowSchemaRelease(batch2.second.get()); + ASSERT_OK_AND_ASSIGN(auto batch3, reader->NextBatch()); + ASSERT_EQ(2, reader->GetPreviousBatchFirstRowNumber()); + ArrowArrayRelease(batch3.first.get()); + ArrowSchemaRelease(batch3.second.get()); + ASSERT_OK_AND_ASSIGN(auto batch4, reader->NextBatch()); + ASSERT_EQ(3, reader->GetPreviousBatchFirstRowNumber()); + ArrowArrayRelease(batch4.first.get()); + ArrowSchemaRelease(batch4.second.get()); + ASSERT_OK_AND_ASSIGN(auto batch5, reader->NextBatch()); + ASSERT_EQ(4, reader->GetPreviousBatchFirstRowNumber()); + ASSERT_TRUE(BatchReader::IsEofBatch(batch5)); +} + INSTANTIATE_TEST_SUITE_P(TestParam, AvroFileBatchReaderTest, ::testing::Values(false, true)); } // namespace paimon::avro::test diff --git a/test/inte/scan_and_read_inte_test.cpp b/test/inte/scan_and_read_inte_test.cpp index 247ff70d..a1cce60c 100644 --- a/test/inte/scan_and_read_inte_test.cpp +++ b/test/inte/scan_and_read_inte_test.cpp @@ -2566,6 +2566,237 @@ TEST_P(ScanAndReadInteTest, TestCastTimestampType) { ASSERT_TRUE(expected->Equals(read_result)) << read_result->ToString(); } +TEST_P(ScanAndReadInteTest, TestAvroWithAppendSnapshot1) { + auto [file_format, enable_prefetch] = GetParam(); + if (file_format != "avro") { + return; + } + std::string table_path = GetDataDir() + "/avro/append_multiple.db/append_multiple"; + + // scan + ScanContextBuilder scan_context_builder(table_path); + scan_context_builder.AddOption(Options::SCAN_SNAPSHOT_ID, "1"); + ASSERT_OK_AND_ASSIGN(auto scan_context, scan_context_builder.Finish()); + ASSERT_OK_AND_ASSIGN(auto table_scan, TableScan::Create(std::move(scan_context))); + ASSERT_OK_AND_ASSIGN(auto result_plan, table_scan->CreatePlan()); + ASSERT_EQ(result_plan->SnapshotId().value(), 1); + + auto splits = result_plan->Splits(); + ASSERT_EQ(3, splits.size()); + + // read + ReadContextBuilder read_context_builder(table_path); + ASSERT_OK_AND_ASSIGN(std::unique_ptr read_context, read_context_builder.Finish()); + ASSERT_OK_AND_ASSIGN(auto table_read, TableRead::Create(std::move(read_context))); + ASSERT_OK_AND_ASSIGN(auto batch_reader, table_read->CreateReader(splits)); + ASSERT_OK_AND_ASSIGN(auto read_result, ReadResultCollector::CollectResult(batch_reader.get())); + + // check result + auto timezone = DateTimeUtils::GetLocalTimezoneName(); + arrow::FieldVector fields = { + arrow::field("_VALUE_KIND", arrow::int8()), + arrow::field("f0", arrow::int8()), + arrow::field("f1", arrow::int16()), + arrow::field("f2", arrow::int32()), + arrow::field("f3", arrow::int64()), + arrow::field("f4", arrow::float32()), + arrow::field("f5", arrow::float64()), + arrow::field("f6", arrow::utf8()), + arrow::field("f7", arrow::binary()), + arrow::field("f8", arrow::date32()), + arrow::field("f9", arrow::decimal128(5, 2)), + arrow::field("f10", arrow::timestamp(arrow::TimeUnit::SECOND)), + arrow::field("f11", arrow::timestamp(arrow::TimeUnit::MILLI)), + arrow::field("f12", arrow::timestamp(arrow::TimeUnit::MICRO)), + arrow::field("f13", arrow::timestamp(arrow::TimeUnit::SECOND, timezone)), + arrow::field("f14", arrow::timestamp(arrow::TimeUnit::MILLI, timezone)), + arrow::field("f15", arrow::timestamp(arrow::TimeUnit::MICRO, timezone)), + arrow::field("f16", + arrow::struct_({arrow::field("f0", arrow::map(arrow::utf8(), arrow::int32())), + arrow::field("f1", arrow::list(arrow::int32()))})), + }; + auto expected = std::make_shared( + arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(fields), R"([ +[0, 2, 10, 1, 100, 2.0, 2.0, "two", "bbb", 123, "123.45", "1970-01-02 00:00:00", "1970-01-02 00:00:00.000", "1970-01-02 00:00:00.000000", "1970-01-02 00:00:00", "1970-01-02 00:00:00.000", "1970-01-02 00:00:00.000000",[[["key",123]],[1,2,3]]], +[0, 1, 10, 0, 100, 1.0, 1.0, "one", "aaa", 123, "123.45", "1970-01-01 00:00:00", "1970-01-01 00:00:00.000", "1970-01-01 00:00:00.000000", "1970-01-01 00:00:00", "1970-01-01 00:00:00.000", "1970-01-01 00:00:00.000000",[[["key",123]],[1,2,3]]], +[0, 3, 11, 0, 100, null, 3.0, "three", "ccc", 123, "123.45", "1970-01-03 00:00:00", "1970-01-03 00:00:00.000", "1970-01-03 00:00:00.000000", "1970-01-03 00:00:00", "1970-01-03 00:00:00.000", "1970-01-03 00:00:00.000000",[[["key",123]],[1,2,3]]], +[0, 4, 11, 0, 100, 4.0, null, "four", "ddd", 123, "123.45", "1970-01-04 00:00:00", "1970-01-04 00:00:00.000", "1970-01-04 00:00:00.000000", "1970-01-04 00:00:00", "1970-01-04 00:00:00.000", "1970-01-04 00:00:00.000000",[[["key",123]],[1,2,3]]] +])") + .ValueOrDie()); + ASSERT_TRUE(expected); + ASSERT_TRUE(expected->Equals(read_result)) << read_result->ToString(); +} + +TEST_P(ScanAndReadInteTest, TestAvroWithAppendSnapshot2) { + auto [file_format, enable_prefetch] = GetParam(); + if (file_format != "avro") { + return; + } + std::string table_path = GetDataDir() + "/avro/append_multiple.db/append_multiple"; + + // scan + ScanContextBuilder scan_context_builder(table_path); + scan_context_builder.AddOption(Options::SCAN_SNAPSHOT_ID, "2"); + ASSERT_OK_AND_ASSIGN(auto scan_context, scan_context_builder.Finish()); + ASSERT_OK_AND_ASSIGN(auto table_scan, TableScan::Create(std::move(scan_context))); + ASSERT_OK_AND_ASSIGN(auto result_plan, table_scan->CreatePlan()); + ASSERT_EQ(result_plan->SnapshotId().value(), 2); + + auto splits = result_plan->Splits(); + ASSERT_EQ(3, splits.size()); + + // read + ReadContextBuilder read_context_builder(table_path); + ASSERT_OK_AND_ASSIGN(std::unique_ptr read_context, read_context_builder.Finish()); + ASSERT_OK_AND_ASSIGN(auto table_read, TableRead::Create(std::move(read_context))); + ASSERT_OK_AND_ASSIGN(auto batch_reader, table_read->CreateReader(splits)); + ASSERT_OK_AND_ASSIGN(auto read_result, ReadResultCollector::CollectResult(batch_reader.get())); + + // check result + auto timezone = DateTimeUtils::GetLocalTimezoneName(); + arrow::FieldVector fields = { + arrow::field("_VALUE_KIND", arrow::int8()), + arrow::field("f0", arrow::int8()), + arrow::field("f1", arrow::int16()), + arrow::field("f2", arrow::int32()), + arrow::field("f3", arrow::int64()), + arrow::field("f4", arrow::float32()), + arrow::field("f5", arrow::float64()), + arrow::field("f6", arrow::utf8()), + arrow::field("f7", arrow::binary()), + arrow::field("f8", arrow::date32()), + arrow::field("f9", arrow::decimal128(5, 2)), + arrow::field("f10", arrow::timestamp(arrow::TimeUnit::SECOND)), + arrow::field("f11", arrow::timestamp(arrow::TimeUnit::MILLI)), + arrow::field("f12", arrow::timestamp(arrow::TimeUnit::MICRO)), + arrow::field("f13", arrow::timestamp(arrow::TimeUnit::SECOND, timezone)), + arrow::field("f14", arrow::timestamp(arrow::TimeUnit::MILLI, timezone)), + arrow::field("f15", arrow::timestamp(arrow::TimeUnit::MICRO, timezone)), + arrow::field("f16", + arrow::struct_({arrow::field("f0", arrow::map(arrow::utf8(), arrow::int32())), + arrow::field("f1", arrow::list(arrow::int32()))})), + }; + auto expected = std::make_shared( + arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(fields), R"([ +[0, 6, 10, 1, 100, 6.0, 4.0, "six", "fff", 123, "123.45", "1970-01-02 00:00:00", "1970-01-06 00:00:00.000", "1970-01-06 00:00:00.000000", "1970-01-06 00:00:00", "1970-01-06 00:00:00.000", "1970-01-06 00:00:00.000000",[[["key",123]],[1,2,3]]], +[0, 5, 10, 0, 100, 5.0, 2.0, null, "eee", 123, "123.45", "1970-01-01 00:00:00", "1970-01-05 00:00:00.000", "1970-01-05 00:00:00.000000", "1970-01-05 00:00:00", "1970-01-05 00:00:00.000", "1970-01-05 00:00:00.000000",[[["key",123]],[1,2,3]]], +[0, 7, 11, 0, 100, 7.0, 6.0, "seven", "ggg", 123, "123.45", "1970-01-03 00:00:00", "1970-01-07 00:00:00.000", "1970-01-07 00:00:00.000000", "1970-01-07 00:00:00", "1970-01-07 00:00:00.000", "1970-01-07 00:00:00.000000",[[["key",123]],[1,2,3]]] +])") + .ValueOrDie()); + ASSERT_TRUE(expected); + ASSERT_TRUE(expected->Equals(read_result)) << read_result->ToString(); +} + +TEST_P(ScanAndReadInteTest, TestAvroWithPkSnapshot1) { + auto [file_format, enable_prefetch] = GetParam(); + if (file_format != "avro") { + return; + } + std::string table_path = GetDataDir() + "/avro/pk_with_multiple_type.db/pk_with_multiple_type"; + + // scan + ScanContextBuilder scan_context_builder(table_path); + scan_context_builder.AddOption(Options::SCAN_SNAPSHOT_ID, "1"); + ASSERT_OK_AND_ASSIGN(auto scan_context, scan_context_builder.Finish()); + ASSERT_OK_AND_ASSIGN(auto table_scan, TableScan::Create(std::move(scan_context))); + ASSERT_OK_AND_ASSIGN(auto result_plan, table_scan->CreatePlan()); + ASSERT_EQ(result_plan->SnapshotId().value(), 1); + + auto splits = result_plan->Splits(); + ASSERT_EQ(1, splits.size()); + + // read + ReadContextBuilder read_context_builder(table_path); + ASSERT_OK_AND_ASSIGN(std::unique_ptr read_context, read_context_builder.Finish()); + ASSERT_OK_AND_ASSIGN(auto table_read, TableRead::Create(std::move(read_context))); + ASSERT_OK_AND_ASSIGN(auto batch_reader, table_read->CreateReader(splits)); + ASSERT_OK_AND_ASSIGN(auto read_result, ReadResultCollector::CollectResult(batch_reader.get())); + + // check result + arrow::FieldVector fields = { + arrow::field("_VALUE_KIND", arrow::int8()), + arrow::field("f0", arrow::boolean()), + arrow::field("f1", arrow::int8()), + arrow::field("f2", arrow::int16()), + arrow::field("f3", arrow::int32()), + arrow::field("f4", arrow::int64()), + arrow::field("f5", arrow::float32()), + arrow::field("f6", arrow::float64()), + arrow::field("f7", arrow::utf8()), + arrow::field("f8", arrow::binary()), + arrow::field("f9", arrow::date32()), + arrow::field("f10", arrow::decimal128(5, 2)), + arrow::field("f11", + arrow::struct_({arrow::field("f0", arrow::map(arrow::utf8(), arrow::int32())), + arrow::field("f1", arrow::list(arrow::int32()))})), + }; + auto expected = std::make_shared( + arrow::ipc::internal::json::ArrayFromJSON(struct_(fields), R"([ +[0, false, 10, 1, 1, 1000, 1.5, 2.5, "Alice", "abcdef", 100, "123.45", [[["key",123]],[1,2,3]]], +[0, false, 10, 1, 1, 1000, 1.5, 2.5, "Bob", "abcdef", 100, "123.45", [[["key",123]],[1,2,3]]], +[0, true, 10, 1, 1, 1000, 1.5, 2.5, "Emily", "abcdef", 100, "123.45", [[["key",123]],[1,2,3]]], +[0, true, 10, 1, 1, 1000, 1.5, 2.5, "Tony", "abcdef", 100, "123.45", [[["key",123]],[1,2,3]]] + ])") + .ValueOrDie()); + ASSERT_TRUE(expected); + ASSERT_TRUE(expected->Equals(read_result)) << read_result->ToString(); +} + +TEST_P(ScanAndReadInteTest, TestAvroWithPkSnapshot2) { + auto [file_format, enable_prefetch] = GetParam(); + if (file_format != "avro") { + return; + } + std::string table_path = GetDataDir() + "/avro/pk_with_multiple_type.db/pk_with_multiple_type"; + + // scan + ScanContextBuilder scan_context_builder(table_path); + scan_context_builder.AddOption(Options::SCAN_SNAPSHOT_ID, "2"); + ASSERT_OK_AND_ASSIGN(auto scan_context, scan_context_builder.Finish()); + ASSERT_OK_AND_ASSIGN(auto table_scan, TableScan::Create(std::move(scan_context))); + ASSERT_OK_AND_ASSIGN(auto result_plan, table_scan->CreatePlan()); + ASSERT_EQ(result_plan->SnapshotId().value(), 2); + + auto splits = result_plan->Splits(); + ASSERT_EQ(1, splits.size()); + + // read + ReadContextBuilder read_context_builder(table_path); + ASSERT_OK_AND_ASSIGN(std::unique_ptr read_context, read_context_builder.Finish()); + ASSERT_OK_AND_ASSIGN(auto table_read, TableRead::Create(std::move(read_context))); + ASSERT_OK_AND_ASSIGN(auto batch_reader, table_read->CreateReader(splits)); + ASSERT_OK_AND_ASSIGN(auto read_result, ReadResultCollector::CollectResult(batch_reader.get())); + + // check result + arrow::FieldVector fields = { + arrow::field("_VALUE_KIND", arrow::int8()), + arrow::field("f0", arrow::boolean()), + arrow::field("f1", arrow::int8()), + arrow::field("f2", arrow::int16()), + arrow::field("f3", arrow::int32()), + arrow::field("f4", arrow::int64()), + arrow::field("f5", arrow::float32()), + arrow::field("f6", arrow::float64()), + arrow::field("f7", arrow::utf8()), + arrow::field("f8", arrow::binary()), + arrow::field("f9", arrow::date32()), + arrow::field("f10", arrow::decimal128(5, 2)), + arrow::field("f11", + arrow::struct_({arrow::field("f0", arrow::map(arrow::utf8(), arrow::int32())), + arrow::field("f1", arrow::list(arrow::int32()))})), + }; + auto expected = std::make_shared( + arrow::ipc::internal::json::ArrayFromJSON(struct_(fields), R"([ +[0, false, 10, 1, 1, 1000, 1.5, 2.5, "Alice", "abcdef", 100, "123.45", [[["key",123]],[1,2,3]]], +[0, false, 10, 1, 1, 1000, 1.5, 2.5, "Bob", "abcdef", 100, "123.45", [[["key",123]],[1,2,3]]], +[0, true, 10, 1, 1, 1000, 1.5, 2.5, "Lucy", "abcdef", 100, "123.45", [[["key",123]],[1,2,3]]], +[0, true, 10, 1, 1, 1000, 1.5, 2.5, "Tony", "abcdef", 100, "123.45", [[["key",123]],[1,2,3]]] + ])") + .ValueOrDie()); + ASSERT_TRUE(expected); + ASSERT_TRUE(expected->Equals(read_result)) << read_result->ToString(); +} + std::vector> GetTestValuesForScanAndReadInteTest() { std::vector> values = {{"parquet", false}, {"parquet", true}}; #ifdef PAIMON_ENABLE_ORC From f2c649a3ccfc3c09407944348031ad6f6092a4a4 Mon Sep 17 00:00:00 2001 From: "jinli.zjw" Date: Mon, 26 Jan 2026 11:59:31 +0800 Subject: [PATCH 04/12] fix --- src/paimon/format/avro/avro_schema_converter.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/paimon/format/avro/avro_schema_converter.h b/src/paimon/format/avro/avro_schema_converter.h index da7c01da..30afd93e 100644 --- a/src/paimon/format/avro/avro_schema_converter.h +++ b/src/paimon/format/avro/avro_schema_converter.h @@ -32,7 +32,7 @@ class AvroSchemaConverter { AvroSchemaConverter() = delete; ~AvroSchemaConverter() = delete; - // TODO(menglingda.mld): avro添加field id + // TODO(menglingda.mld): add field id for avro static Result<::avro::ValidSchema> ArrowSchemaToAvroSchema( const std::shared_ptr& arrow_schema, const std::string& row_name = "org.apache.paimon.avro.generated.record"); From 976f4e99c1cfbcfd119c6dd0b1589562145888ee Mon Sep 17 00:00:00 2001 From: "jinli.zjw" Date: Mon, 26 Jan 2026 16:41:18 +0800 Subject: [PATCH 05/12] fix2 --- .../format/avro/avro_direct_decoder.cpp | 101 +++++++++--------- src/paimon/format/avro/avro_direct_decoder.h | 4 +- .../format/avro/avro_file_batch_reader.cpp | 4 +- 3 files changed, 55 insertions(+), 54 deletions(-) diff --git a/src/paimon/format/avro/avro_direct_decoder.cpp b/src/paimon/format/avro/avro_direct_decoder.cpp index 209f6822..cd5a7a44 100644 --- a/src/paimon/format/avro/avro_direct_decoder.cpp +++ b/src/paimon/format/avro/avro_direct_decoder.cpp @@ -49,46 +49,46 @@ bool HasMapLogicalType(const ::avro::NodePtr& node) { /// Forward declaration for mutual recursion. Status DecodeFieldToBuilder(const ::avro::NodePtr& avro_node, const std::optional>& projection, - ::avro::Decoder& decoder, arrow::ArrayBuilder* array_builder, - AvroDirectDecoder::DecodeContext& ctx); + ::avro::Decoder* decoder, arrow::ArrayBuilder* array_builder, + AvroDirectDecoder::DecodeContext* ctx); /// \brief Skip an Avro value based on its schema without decoding -Status SkipAvroValue(const ::avro::NodePtr& avro_node, ::avro::Decoder& decoder) { +Status SkipAvroValue(const ::avro::NodePtr& avro_node, ::avro::Decoder* decoder) { switch (avro_node->type()) { case ::avro::AVRO_NULL: - decoder.decodeNull(); + decoder->decodeNull(); return Status::OK(); case ::avro::AVRO_BOOL: - decoder.decodeBool(); + decoder->decodeBool(); return Status::OK(); case ::avro::AVRO_INT: - decoder.decodeInt(); + decoder->decodeInt(); return Status::OK(); case ::avro::AVRO_LONG: - decoder.decodeLong(); + decoder->decodeLong(); return Status::OK(); case ::avro::AVRO_FLOAT: - decoder.decodeFloat(); + decoder->decodeFloat(); return Status::OK(); case ::avro::AVRO_DOUBLE: - decoder.decodeDouble(); + decoder->decodeDouble(); return Status::OK(); case ::avro::AVRO_STRING: - decoder.skipString(); + decoder->skipString(); return Status::OK(); case ::avro::AVRO_BYTES: - decoder.skipBytes(); + decoder->skipBytes(); return Status::OK(); case ::avro::AVRO_FIXED: - decoder.skipFixed(avro_node->fixedSize()); + decoder->skipFixed(avro_node->fixedSize()); return Status::OK(); case ::avro::AVRO_RECORD: { @@ -100,18 +100,18 @@ Status SkipAvroValue(const ::avro::NodePtr& avro_node, ::avro::Decoder& decoder) } case ::avro::AVRO_ENUM: - decoder.decodeEnum(); + decoder->decodeEnum(); return Status::OK(); case ::avro::AVRO_ARRAY: { const auto& element_node = avro_node->leafAt(0); // skipArray() returns count like arrayStart(), must handle all blocks - int64_t block_count = decoder.skipArray(); + int64_t block_count = decoder->skipArray(); while (block_count > 0) { for (int64_t i = 0; i < block_count; ++i) { PAIMON_RETURN_NOT_OK(SkipAvroValue(element_node, decoder)); } - block_count = decoder.arrayNext(); + block_count = decoder->arrayNext(); } return Status::OK(); } @@ -119,19 +119,19 @@ Status SkipAvroValue(const ::avro::NodePtr& avro_node, ::avro::Decoder& decoder) case ::avro::AVRO_MAP: { const auto& value_node = avro_node->leafAt(1); // skipMap() returns count like mapStart(), must handle all blocks - int64_t block_count = decoder.skipMap(); + int64_t block_count = decoder->skipMap(); while (block_count > 0) { for (int64_t i = 0; i < block_count; ++i) { - decoder.skipString(); // Skip key (always string in Avro maps) + decoder->skipString(); // Skip key (always string in Avro maps) PAIMON_RETURN_NOT_OK(SkipAvroValue(value_node, decoder)); } - block_count = decoder.mapNext(); + block_count = decoder->mapNext(); } return Status::OK(); } case ::avro::AVRO_UNION: { - const size_t branch_index = decoder.decodeUnionIndex(); + const size_t branch_index = decoder->decodeUnionIndex(); // Validate branch index const size_t num_branches = avro_node->leaves(); if (branch_index >= num_branches) { @@ -150,8 +150,8 @@ Status SkipAvroValue(const ::avro::NodePtr& avro_node, ::avro::Decoder& decoder) /// Decode Avro record directly to Arrow struct builder. Status DecodeStructToBuilder(const ::avro::NodePtr& avro_node, const std::optional>& projection, - ::avro::Decoder& decoder, arrow::ArrayBuilder* array_builder, - AvroDirectDecoder::DecodeContext& ctx) { + ::avro::Decoder* decoder, arrow::ArrayBuilder* array_builder, + AvroDirectDecoder::DecodeContext* ctx) { if (avro_node->type() != ::avro::AVRO_RECORD) { return Status::Invalid( fmt::format("Expected Avro record, got type: {}", ToString(avro_node))); @@ -179,9 +179,9 @@ Status DecodeStructToBuilder(const ::avro::NodePtr& avro_node, } /// Decode Avro array directly to Arrow list builder. -Status DecodeListToBuilder(const ::avro::NodePtr& avro_node, ::avro::Decoder& decoder, +Status DecodeListToBuilder(const ::avro::NodePtr& avro_node, ::avro::Decoder* decoder, arrow::ArrayBuilder* array_builder, - AvroDirectDecoder::DecodeContext& ctx) { + AvroDirectDecoder::DecodeContext* ctx) { if (avro_node->type() != ::avro::AVRO_ARRAY) { return Status::Invalid( fmt::format("Expected Avro array, got type: {}", ToString(avro_node))); @@ -194,22 +194,22 @@ Status DecodeListToBuilder(const ::avro::NodePtr& avro_node, ::avro::Decoder& de const auto& element_node = avro_node->leafAt(0); // Read array block count - int64_t block_count = decoder.arrayStart(); + int64_t block_count = decoder->arrayStart(); while (block_count != 0) { for (int64_t i = 0; i < block_count; ++i) { PAIMON_RETURN_NOT_OK(DecodeFieldToBuilder(element_node, /*projection=*/std::nullopt, decoder, value_builder, ctx)); } - block_count = decoder.arrayNext(); + block_count = decoder->arrayNext(); } return Status::OK(); } /// Decode Avro map directly to Arrow map builder. -Status DecodeMapToBuilder(const ::avro::NodePtr& avro_node, ::avro::Decoder& decoder, +Status DecodeMapToBuilder(const ::avro::NodePtr& avro_node, ::avro::Decoder* decoder, arrow::ArrayBuilder* array_builder, - AvroDirectDecoder::DecodeContext& ctx) { + AvroDirectDecoder::DecodeContext* ctx) { auto* map_builder = arrow::internal::checked_cast(array_builder); if (avro_node->type() == ::avro::AVRO_MAP) { @@ -222,7 +222,7 @@ Status DecodeMapToBuilder(const ::avro::NodePtr& avro_node, ::avro::Decoder& dec auto* item_builder = map_builder->item_builder(); // Read map block count - int64_t block_count = decoder.mapStart(); + int64_t block_count = decoder->mapStart(); while (block_count != 0) { for (int64_t i = 0; i < block_count; ++i) { PAIMON_RETURN_NOT_OK(DecodeFieldToBuilder(key_node, /*projection=*/std::nullopt, @@ -230,7 +230,7 @@ Status DecodeMapToBuilder(const ::avro::NodePtr& avro_node, ::avro::Decoder& dec PAIMON_RETURN_NOT_OK(DecodeFieldToBuilder(value_node, /*projection=*/std::nullopt, decoder, item_builder, ctx)); } - block_count = decoder.mapNext(); + block_count = decoder->mapNext(); } return Status::OK(); } else if (avro_node->type() == ::avro::AVRO_ARRAY && HasMapLogicalType(avro_node)) { @@ -249,7 +249,7 @@ Status DecodeMapToBuilder(const ::avro::NodePtr& avro_node, ::avro::Decoder& dec const auto& value_node = record_node->leafAt(1); // Read array block count - int64_t block_count = decoder.arrayStart(); + int64_t block_count = decoder->arrayStart(); while (block_count != 0) { for (int64_t i = 0; i < block_count; ++i) { PAIMON_RETURN_NOT_OK(DecodeFieldToBuilder(key_node, /*projection=*/std::nullopt, @@ -257,7 +257,7 @@ Status DecodeMapToBuilder(const ::avro::NodePtr& avro_node, ::avro::Decoder& dec PAIMON_RETURN_NOT_OK(DecodeFieldToBuilder(value_node, /*projection=*/std::nullopt, decoder, item_builder, ctx)); } - block_count = decoder.arrayNext(); + block_count = decoder->arrayNext(); } return Status::OK(); } else { @@ -269,21 +269,21 @@ Status DecodeMapToBuilder(const ::avro::NodePtr& avro_node, ::avro::Decoder& dec /// Decode Avro data directly to Arrow array builder. Status DecodeAvroValueToBuilder(const ::avro::NodePtr& avro_node, const std::optional>& projection, - ::avro::Decoder& decoder, arrow::ArrayBuilder* array_builder, - AvroDirectDecoder::DecodeContext& ctx) { + ::avro::Decoder* decoder, arrow::ArrayBuilder* array_builder, + AvroDirectDecoder::DecodeContext* ctx) { auto type = avro_node->type(); auto logical_type = avro_node->logicalType(); switch (type) { case ::avro::AVRO_BOOL: { auto* builder = arrow::internal::checked_cast(array_builder); - bool value = decoder.decodeBool(); + bool value = decoder->decodeBool(); PAIMON_RETURN_NOT_OK_FROM_ARROW(builder->Append(value)); return Status::OK(); } case ::avro::AVRO_INT: { - int32_t value = decoder.decodeInt(); + int32_t value = decoder->decodeInt(); auto arrow_type = array_builder->type(); switch (arrow_type->id()) { case arrow::Type::INT8: { @@ -323,7 +323,7 @@ Status DecodeAvroValueToBuilder(const ::avro::NodePtr& avro_node, } case ::avro::AVRO_LONG: { - int64_t value = decoder.decodeLong(); + int64_t value = decoder->decodeLong(); switch (logical_type.type()) { case ::avro::LogicalType::Type::NONE: { auto* builder = @@ -357,31 +357,32 @@ Status DecodeAvroValueToBuilder(const ::avro::NodePtr& avro_node, case ::avro::AVRO_FLOAT: { auto* builder = arrow::internal::checked_cast(array_builder); - float value = decoder.decodeFloat(); + float value = decoder->decodeFloat(); PAIMON_RETURN_NOT_OK_FROM_ARROW(builder->Append(value)); return Status::OK(); } case ::avro::AVRO_DOUBLE: { auto* builder = arrow::internal::checked_cast(array_builder); - double value = decoder.decodeDouble(); + double value = decoder->decodeDouble(); PAIMON_RETURN_NOT_OK_FROM_ARROW(builder->Append(value)); return Status::OK(); } case ::avro::AVRO_STRING: { auto* builder = arrow::internal::checked_cast(array_builder); - decoder.decodeString(ctx.string_scratch); - PAIMON_RETURN_NOT_OK_FROM_ARROW(builder->Append(ctx.string_scratch)); + decoder->decodeString(ctx->string_scratch); + PAIMON_RETURN_NOT_OK_FROM_ARROW(builder->Append(ctx->string_scratch)); return Status::OK(); } case ::avro::AVRO_BYTES: { - decoder.decodeBytes(ctx.bytes_scratch); + decoder->decodeBytes(ctx->bytes_scratch); switch (logical_type.type()) { case ::avro::LogicalType::Type::NONE: { auto* builder = arrow::internal::checked_cast(array_builder); - PAIMON_RETURN_NOT_OK_FROM_ARROW(builder->Append( - ctx.bytes_scratch.data(), static_cast(ctx.bytes_scratch.size()))); + PAIMON_RETURN_NOT_OK_FROM_ARROW( + builder->Append(ctx->bytes_scratch.data(), + static_cast(ctx->bytes_scratch.size()))); return Status::OK(); } case ::avro::LogicalType::Type::DECIMAL: { @@ -389,8 +390,8 @@ Status DecodeAvroValueToBuilder(const ::avro::NodePtr& avro_node, arrow::internal::checked_cast(array_builder); PAIMON_ASSIGN_OR_RAISE_FROM_ARROW( arrow::Decimal128 decimal, - arrow::Decimal128::FromBigEndian(ctx.bytes_scratch.data(), - ctx.bytes_scratch.size())); + arrow::Decimal128::FromBigEndian(ctx->bytes_scratch.data(), + ctx->bytes_scratch.size())); PAIMON_RETURN_NOT_OK_FROM_ARROW(builder->Append(decimal)); return Status::OK(); } @@ -422,10 +423,10 @@ Status DecodeAvroValueToBuilder(const ::avro::NodePtr& avro_node, Status DecodeFieldToBuilder(const ::avro::NodePtr& avro_node, const std::optional>& projection, - ::avro::Decoder& decoder, arrow::ArrayBuilder* array_builder, - AvroDirectDecoder::DecodeContext& ctx) { + ::avro::Decoder* decoder, arrow::ArrayBuilder* array_builder, + AvroDirectDecoder::DecodeContext* ctx) { if (avro_node->type() == ::avro::AVRO_UNION) { - const size_t branch_index = decoder.decodeUnionIndex(); + const size_t branch_index = decoder->decodeUnionIndex(); // Validate branch index const size_t num_branches = avro_node->leaves(); @@ -450,9 +451,9 @@ Status DecodeFieldToBuilder(const ::avro::NodePtr& avro_node, Status AvroDirectDecoder::DecodeAvroToBuilder(const ::avro::NodePtr& avro_node, const std::optional>& projection, - ::avro::Decoder& decoder, + ::avro::Decoder* decoder, arrow::ArrayBuilder* array_builder, - DecodeContext& ctx) { + DecodeContext* ctx) { return DecodeFieldToBuilder(avro_node, projection, decoder, array_builder, ctx); } diff --git a/src/paimon/format/avro/avro_direct_decoder.h b/src/paimon/format/avro/avro_direct_decoder.h index e63b6e84..a5bb7cde 100644 --- a/src/paimon/format/avro/avro_direct_decoder.h +++ b/src/paimon/format/avro/avro_direct_decoder.h @@ -56,8 +56,8 @@ class AvroDirectDecoder { /// @return Status indicating success, or an error status static Status DecodeAvroToBuilder(const ::avro::NodePtr& avro_node, const std::optional>& projection, - ::avro::Decoder& decoder, arrow::ArrayBuilder* array_builder, - DecodeContext& ctx); + ::avro::Decoder* decoder, arrow::ArrayBuilder* array_builder, + DecodeContext* ctx); }; } // namespace paimon::avro diff --git a/src/paimon/format/avro/avro_file_batch_reader.cpp b/src/paimon/format/avro/avro_file_batch_reader.cpp index 0fa6fb35..51bc2842 100644 --- a/src/paimon/format/avro/avro_file_batch_reader.cpp +++ b/src/paimon/format/avro/avro_file_batch_reader.cpp @@ -102,8 +102,8 @@ Result AvroFileBatchReader::NextBatch() { } reader_->decr(); PAIMON_RETURN_NOT_OK(AvroDirectDecoder::DecodeAvroToBuilder( - reader_->dataSchema().root(), read_fields_projection_, reader_->decoder(), - array_builder_.get(), decode_context_)); + reader_->dataSchema().root(), read_fields_projection_, &reader_->decoder(), + array_builder_.get(), &decode_context_)); } previous_first_row_ = next_row_to_read_; next_row_to_read_ += array_builder_->length(); From 1376cb3787c90353907e568105eaa6b7457f7058 Mon Sep 17 00:00:00 2001 From: "jinli.zjw" Date: Wed, 28 Jan 2026 15:41:12 +0800 Subject: [PATCH 06/12] fix review --- src/paimon/CMakeLists.txt | 1 + src/paimon/common/utils/arrow/arrow_utils.h | 41 ++- .../common/utils/arrow/arrow_utils_test.cpp | 47 +++ src/paimon/core/io/single_file_writer.h | 2 +- .../core/operation/abstract_split_read.cpp | 16 +- .../core/operation/merge_file_split_read.cpp | 17 +- .../core/operation/merge_file_split_read.h | 4 - .../operation/merge_file_split_read_test.cpp | 22 -- .../format/avro/avro_direct_decoder.cpp | 51 ++- .../format/avro/avro_file_batch_reader.cpp | 46 +-- .../format/avro/avro_file_batch_reader.h | 7 +- .../format/avro/avro_output_stream_impl.cpp | 9 +- .../format/avro/avro_schema_converter.cpp | 24 +- .../format/avro/avro_schema_converter.h | 5 +- src/paimon/format/avro/avro_utils.h | 50 +++ test/inte/scan_and_read_inte_test.cpp | 291 ++++++------------ 16 files changed, 310 insertions(+), 323 deletions(-) create mode 100644 src/paimon/common/utils/arrow/arrow_utils_test.cpp create mode 100644 src/paimon/format/avro/avro_utils.h diff --git a/src/paimon/CMakeLists.txt b/src/paimon/CMakeLists.txt index 1b470ff7..76a3b63a 100644 --- a/src/paimon/CMakeLists.txt +++ b/src/paimon/CMakeLists.txt @@ -382,6 +382,7 @@ if(PAIMON_BUILD_TESTS) common/types/data_type_json_parser_test.cpp common/types/row_kind_test.cpp common/types/data_type_test.cpp + common/utils/arrow/arrow_utils_test.cpp common/utils/arrow/mem_utils_test.cpp common/utils/arrow/status_utils_test.cpp common/utils/concurrent_hash_map_test.cpp diff --git a/src/paimon/common/utils/arrow/arrow_utils.h b/src/paimon/common/utils/arrow/arrow_utils.h index 995b3fff..5c23ec83 100644 --- a/src/paimon/common/utils/arrow/arrow_utils.h +++ b/src/paimon/common/utils/arrow/arrow_utils.h @@ -16,14 +16,41 @@ #pragma once -#include "arrow/c/helpers.h" +#include + +#include "arrow/api.h" +#include "fmt/format.h" +#include "paimon/result.h" namespace paimon { -inline void ArrowArrayInit(struct ArrowArray* array) { - ArrowArrayMarkReleased(array); -} -inline void ArrowSchemaInit(struct ArrowSchema* schema) { - ArrowSchemaMarkReleased(schema); -} +class ArrowUtils { + public: + ArrowUtils() = delete; + ~ArrowUtils() = delete; + + static Result> DataTypeToSchema( + const std::shared_ptr<::arrow::DataType>& data_type) { + if (data_type->id() != arrow::Type::STRUCT) { + return Status::Invalid(fmt::format("Expected struct data type, actual data type: {}", + data_type->ToString())); + } + const auto& struct_type = std::static_pointer_cast(data_type); + return std::make_shared(struct_type->fields()); + } + + static std::vector CreateProjection( + const std::shared_ptr<::arrow::Schema>& file_schema, + const arrow::FieldVector& read_fields) { + std::vector target_to_src_mapping; + target_to_src_mapping.reserve(read_fields.size()); + for (const auto& field : read_fields) { + auto src_field_idx = file_schema->GetFieldIndex(field->name()); + assert(src_field_idx >= 0); + target_to_src_mapping.push_back(src_field_idx); + } + return target_to_src_mapping; + } +}; + } // namespace paimon diff --git a/src/paimon/common/utils/arrow/arrow_utils_test.cpp b/src/paimon/common/utils/arrow/arrow_utils_test.cpp new file mode 100644 index 00000000..afc86d10 --- /dev/null +++ b/src/paimon/common/utils/arrow/arrow_utils_test.cpp @@ -0,0 +1,47 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "paimon/common/utils/arrow/arrow_utils.h" + +#include "arrow/api.h" +#include "gtest/gtest.h" +#include "paimon/common/types/data_field.h" + +namespace paimon::test { + +TEST(ArrowUtilsTest, TestCreateProjection) { + std::vector read_fields = {DataField(1, arrow::field("k1", arrow::int32())), + DataField(3, arrow::field("p1", arrow::int32())), + DataField(5, arrow::field("s1", arrow::utf8())), + DataField(6, arrow::field("v0", arrow::float64())), + DataField(7, arrow::field("v1", arrow::boolean()))}; + auto read_schema = DataField::ConvertDataFieldsToArrowSchema(read_fields); + + std::vector file_fields = {DataField(0, arrow::field("k0", arrow::int32())), + DataField(1, arrow::field("k1", arrow::int32())), + DataField(3, arrow::field("p1", arrow::int32())), + DataField(5, arrow::field("s1", arrow::utf8())), + DataField(6, arrow::field("v0", arrow::float64())), + DataField(7, arrow::field("v1", arrow::boolean())), + DataField(4, arrow::field("s0", arrow::utf8()))}; + auto file_schema = DataField::ConvertDataFieldsToArrowSchema(file_fields); + + auto projection = ArrowUtils::CreateProjection(file_schema, read_schema->fields()); + std::vector expected_projection = {1, 2, 3, 4, 5}; + ASSERT_EQ(projection, expected_projection); +} + +} // namespace paimon::test diff --git a/src/paimon/core/io/single_file_writer.h b/src/paimon/core/io/single_file_writer.h index ca4f19b3..862772eb 100644 --- a/src/paimon/core/io/single_file_writer.h +++ b/src/paimon/core/io/single_file_writer.h @@ -175,7 +175,7 @@ Status SingleFileWriter::Write(T record) { } } else { ArrowArray array; - ArrowArrayInit(&array); + ArrowArrayMarkReleased(&array); // reset array ScopeGuard inner_guard([&array]() { ArrowArrayRelease(&array); }); PAIMON_RETURN_NOT_OK(converter_(std::move(record), &array)); record_count = array.length; diff --git a/src/paimon/core/operation/abstract_split_read.cpp b/src/paimon/core/operation/abstract_split_read.cpp index c5a3ec3c..8fa3f0c8 100644 --- a/src/paimon/core/operation/abstract_split_read.cpp +++ b/src/paimon/core/operation/abstract_split_read.cpp @@ -150,13 +150,15 @@ Result> AbstractSplitRead::CreateFileBatchReade // TODO(zhanyu.fyh): orc format support prefetch if (context_->EnablePrefetch() && file_format_identifier != "blob" && file_format_identifier != "avro") { - PAIMON_ASSIGN_OR_RAISE(std::unique_ptr prefetch_reader, - PrefetchFileBatchReaderImpl::Create( - data_file_path, reader_builder, options_.GetFileSystem(), - context_->GetPrefetchMaxParallelNum(), - options_.GetReadBatchSize(), context_->GetPrefetchBatchCount(), - options_.EnableAdaptivePrefetchStrategy(), executor_, - /*initialize_read_ranges=*/false)); + PAIMON_ASSIGN_OR_RAISE( + std::unique_ptr prefetch_reader, + PrefetchFileBatchReaderImpl::Create( + data_file_path, reader_builder, options_.GetFileSystem(), + context_->GetPrefetchMaxParallelNum(), options_.GetReadBatchSize(), + context_->GetPrefetchBatchCount(), options_.EnableAdaptivePrefetchStrategy(), + executor_, + /*initialize_read_ranges=*/false, context_->EnablePrefetchCache(), + context_->GetCacheConfig(), pool_)); return std::make_unique(std::move(prefetch_reader)); } else { PAIMON_ASSIGN_OR_RAISE(std::shared_ptr input_stream, diff --git a/src/paimon/core/operation/merge_file_split_read.cpp b/src/paimon/core/operation/merge_file_split_read.cpp index 9de26f34..c6688e07 100644 --- a/src/paimon/core/operation/merge_file_split_read.cpp +++ b/src/paimon/core/operation/merge_file_split_read.cpp @@ -32,6 +32,7 @@ #include "paimon/common/reader/concat_batch_reader.h" #include "paimon/common/table/special_fields.h" #include "paimon/common/types/data_field.h" +#include "paimon/common/utils/arrow/arrow_utils.h" #include "paimon/common/utils/arrow/status_utils.h" #include "paimon/core/core_options.h" #include "paimon/core/deletionvectors/apply_deletion_vector_batch_reader.h" @@ -105,7 +106,8 @@ Result> MergeFileSplitRead::Create( int32_t key_arity = trimmed_primary_key.size(); // projection is the mapping from value_schema in KeyValue object to raw_read_schema - std::vector projection = CreateProjection(context->GetReadSchema(), value_schema); + std::vector projection = + ArrowUtils::CreateProjection(value_schema, context->GetReadSchema()->fields()); return std::unique_ptr(new MergeFileSplitRead( path_factory, context, @@ -368,19 +370,6 @@ Result> MergeFileSplitRead::GenerateKeyPredicates( return PredicateUtils::ExcludePredicateWithFields(predicate, non_primary_keys); } -std::vector MergeFileSplitRead::CreateProjection( - const std::shared_ptr& raw_read_schema, - const std::shared_ptr& value_schema) { - std::vector target_to_src_mapping; - target_to_src_mapping.reserve(raw_read_schema->num_fields()); - for (const auto& field : raw_read_schema->fields()) { - auto src_field_idx = value_schema->GetFieldIndex(field->name()); - assert(src_field_idx >= 0); - target_to_src_mapping.push_back(src_field_idx); - } - return target_to_src_mapping; -} - Result> MergeFileSplitRead::CreateReaderForSection( const std::vector& section, const std::string& bucket_path, const BinaryRow& partition, diff --git a/src/paimon/core/operation/merge_file_split_read.h b/src/paimon/core/operation/merge_file_split_read.h index fc20c6ac..483e4742 100644 --- a/src/paimon/core/operation/merge_file_split_read.h +++ b/src/paimon/core/operation/merge_file_split_read.h @@ -154,10 +154,6 @@ class MergeFileSplitRead : public AbstractSplitRead { static Result> GenerateKeyPredicates( const std::shared_ptr& predicate, const TableSchema& table_schema); - static std::vector CreateProjection( - const std::shared_ptr& raw_read_schema, - const std::shared_ptr& value_schema); - private: int32_t key_arity_; // schema of value member in KeyValue object diff --git a/src/paimon/core/operation/merge_file_split_read_test.cpp b/src/paimon/core/operation/merge_file_split_read_test.cpp index 5aafc2f5..09ed81a7 100644 --- a/src/paimon/core/operation/merge_file_split_read_test.cpp +++ b/src/paimon/core/operation/merge_file_split_read_test.cpp @@ -553,28 +553,6 @@ TEST_F(MergeFileSplitReadTest, TestGenerateKeyPredicates2) { ASSERT_FALSE(result); } -TEST_F(MergeFileSplitReadTest, TestCreateProjection) { - std::vector raw_read_fields = {DataField(1, arrow::field("k1", arrow::int32())), - DataField(3, arrow::field("p1", arrow::int32())), - DataField(5, arrow::field("s1", arrow::utf8())), - DataField(6, arrow::field("v0", arrow::float64())), - DataField(7, arrow::field("v1", arrow::boolean()))}; - auto raw_read_schema = DataField::ConvertDataFieldsToArrowSchema(raw_read_fields); - - std::vector value_fields = {DataField(0, arrow::field("k0", arrow::int32())), - DataField(1, arrow::field("k1", arrow::int32())), - DataField(3, arrow::field("p1", arrow::int32())), - DataField(5, arrow::field("s1", arrow::utf8())), - DataField(6, arrow::field("v0", arrow::float64())), - DataField(7, arrow::field("v1", arrow::boolean())), - DataField(4, arrow::field("s0", arrow::utf8()))}; - auto value_schema = DataField::ConvertDataFieldsToArrowSchema(value_fields); - - auto projection = MergeFileSplitRead::CreateProjection(raw_read_schema, value_schema); - std::vector expected_projection = {1, 2, 3, 4, 5}; - ASSERT_EQ(projection, expected_projection); -} - TEST_P(MergeFileSplitReadTest, TestSimple) { std::string path = paimon::test::GetDataDir() + "/parquet/pk_table_with_mor.db/pk_table_with_mor"; diff --git a/src/paimon/format/avro/avro_direct_decoder.cpp b/src/paimon/format/avro/avro_direct_decoder.cpp index cd5a7a44..dc47fe41 100644 --- a/src/paimon/format/avro/avro_direct_decoder.cpp +++ b/src/paimon/format/avro/avro_direct_decoder.cpp @@ -29,23 +29,12 @@ #include "avro/Types.hh" #include "paimon/common/utils/arrow/status_utils.h" #include "paimon/common/utils/date_time_utils.h" +#include "paimon/format/avro/avro_utils.h" namespace paimon::avro { namespace { -std::string ToString(const ::avro::NodePtr& node) { - std::stringstream ss; - ss << *node; - return ss.str(); -} - -bool HasMapLogicalType(const ::avro::NodePtr& node) { - return node->logicalType().type() == ::avro::LogicalType::CUSTOM && - node->logicalType().customLogicalType() != nullptr && - node->logicalType().customLogicalType()->name() == "map"; -} - /// Forward declaration for mutual recursion. Status DecodeFieldToBuilder(const ::avro::NodePtr& avro_node, const std::optional>& projection, @@ -142,8 +131,8 @@ Status SkipAvroValue(const ::avro::NodePtr& avro_node, ::avro::Decoder* decoder) } default: - return Status::Invalid( - fmt::format("Unsupported Avro type for skipping: {}", ToString(avro_node))); + return Status::Invalid(fmt::format("Unsupported Avro type for skipping: {}", + AvroUtils::ToString(avro_node))); } } @@ -154,7 +143,7 @@ Status DecodeStructToBuilder(const ::avro::NodePtr& avro_node, AvroDirectDecoder::DecodeContext* ctx) { if (avro_node->type() != ::avro::AVRO_RECORD) { return Status::Invalid( - fmt::format("Expected Avro record, got type: {}", ToString(avro_node))); + fmt::format("Expected Avro record, got type: {}", AvroUtils::ToString(avro_node))); } auto* struct_builder = arrow::internal::checked_cast(array_builder); @@ -184,7 +173,7 @@ Status DecodeListToBuilder(const ::avro::NodePtr& avro_node, ::avro::Decoder* de AvroDirectDecoder::DecodeContext* ctx) { if (avro_node->type() != ::avro::AVRO_ARRAY) { return Status::Invalid( - fmt::format("Expected Avro array, got type: {}", ToString(avro_node))); + fmt::format("Expected Avro array, got type: {}", AvroUtils::ToString(avro_node))); } auto* list_builder = arrow::internal::checked_cast(array_builder); @@ -233,7 +222,7 @@ Status DecodeMapToBuilder(const ::avro::NodePtr& avro_node, ::avro::Decoder* dec block_count = decoder->mapNext(); } return Status::OK(); - } else if (avro_node->type() == ::avro::AVRO_ARRAY && HasMapLogicalType(avro_node)) { + } else if (avro_node->type() == ::avro::AVRO_ARRAY && AvroUtils::HasMapLogicalType(avro_node)) { // Handle array-based map: list> PAIMON_RETURN_NOT_OK_FROM_ARROW(map_builder->Append()); auto* key_builder = map_builder->key_builder(); @@ -243,7 +232,7 @@ Status DecodeMapToBuilder(const ::avro::NodePtr& avro_node, ::avro::Decoder* dec if (record_node->type() != ::avro::AVRO_RECORD || record_node->leaves() != 2) { return Status::Invalid( fmt::format("Array-based map must contain records with exactly 2 fields, got: {}", - ToString(record_node))); + AvroUtils::ToString(record_node))); } const auto& key_node = record_node->leafAt(0); const auto& value_node = record_node->leafAt(1); @@ -261,8 +250,9 @@ Status DecodeMapToBuilder(const ::avro::NodePtr& avro_node, ::avro::Decoder* dec } return Status::OK(); } else { - return Status::Invalid(fmt::format( - "Expected Avro map or array with map logical type, got: {}", ToString(avro_node))); + return Status::Invalid( + fmt::format("Expected Avro map or array with map logical type, got: {}", + AvroUtils::ToString(avro_node))); } } @@ -307,8 +297,8 @@ Status DecodeAvroValueToBuilder(const ::avro::NodePtr& avro_node, case arrow::Type::DATE32: { if (logical_type.type() != ::avro::LogicalType::Type::DATE) { return Status::TypeError( - fmt::format("Unexpected avro type [{}] with arrow type [{}].", - toString(type), arrow_type->ToString())); + fmt::format("Unexpected avro type [{}] with arrow type [{}].", type, + arrow_type->ToString())); } auto* builder = arrow::internal::checked_cast(array_builder); @@ -317,8 +307,8 @@ Status DecodeAvroValueToBuilder(const ::avro::NodePtr& avro_node, } default: return Status::TypeError( - fmt::format("Unexpected avro type [{}] with arrow type [{}].", - toString(type), arrow_type->ToString())); + fmt::format("Unexpected avro type [{}] with arrow type [{}].", type, + arrow_type->ToString())); } } @@ -350,8 +340,8 @@ Status DecodeAvroValueToBuilder(const ::avro::NodePtr& avro_node, } default: return Status::TypeError( - fmt::format("Unexpected avro type [{}] with arrow type [{}].", - toString(type), array_builder->type()->ToString())); + fmt::format("Unexpected avro type [{}] with arrow type [{}].", type, + array_builder->type()->ToString())); } } @@ -397,8 +387,8 @@ Status DecodeAvroValueToBuilder(const ::avro::NodePtr& avro_node, } default: return Status::TypeError( - fmt::format("Unexpected avro type [{}] with arrow type [{}].", - toString(type), array_builder->type()->ToString())); + fmt::format("Unexpected avro type [{}] with arrow type [{}].", type, + array_builder->type()->ToString())); } } @@ -406,7 +396,7 @@ Status DecodeAvroValueToBuilder(const ::avro::NodePtr& avro_node, return DecodeStructToBuilder(avro_node, projection, decoder, array_builder, ctx); } case ::avro::AVRO_ARRAY: { - if (HasMapLogicalType(avro_node)) { + if (AvroUtils::HasMapLogicalType(avro_node)) { return DecodeMapToBuilder(avro_node, decoder, array_builder, ctx); } else { return DecodeListToBuilder(avro_node, decoder, array_builder, ctx); @@ -416,8 +406,7 @@ Status DecodeAvroValueToBuilder(const ::avro::NodePtr& avro_node, return DecodeMapToBuilder(avro_node, decoder, array_builder, ctx); } default: - return Status::Invalid( - fmt::format("Unsupported avro type: {}", ::avro::toString(type))); + return Status::Invalid(fmt::format("Unsupported avro type: {}", type)); } } diff --git a/src/paimon/format/avro/avro_file_batch_reader.cpp b/src/paimon/format/avro/avro_file_batch_reader.cpp index 51bc2842..3d068d99 100644 --- a/src/paimon/format/avro/avro_file_batch_reader.cpp +++ b/src/paimon/format/avro/avro_file_batch_reader.cpp @@ -22,6 +22,8 @@ #include "arrow/c/bridge.h" #include "fmt/format.h" #include "paimon/common/metrics/metrics_impl.h" +#include "paimon/common/utils/arrow/arrow_utils.h" +#include "paimon/common/utils/arrow/mem_utils.h" #include "paimon/common/utils/arrow/status_utils.h" #include "paimon/format/avro/avro_input_stream_impl.h" #include "paimon/format/avro/avro_schema_converter.h" @@ -33,9 +35,11 @@ AvroFileBatchReader::AvroFileBatchReader(const std::shared_ptr& inp const std::shared_ptr<::arrow::DataType>& file_data_type, std::unique_ptr<::avro::DataFileReaderBase>&& reader, std::unique_ptr&& array_builder, + std::unique_ptr&& arrow_pool, int32_t batch_size, const std::shared_ptr& pool) : pool_(pool), + arrow_pool_(std::move(arrow_pool)), input_stream_(input_stream), file_data_type_(file_data_type), reader_(std::move(reader)), @@ -66,11 +70,12 @@ Result> AvroFileBatchReader::Create( const auto& avro_file_schema = reader->dataSchema(); PAIMON_ASSIGN_OR_RAISE(std::shared_ptr<::arrow::DataType> file_data_type, AvroSchemaConverter::AvroSchemaToArrowDataType(avro_file_schema)); + auto arrow_pool = GetArrowPool(pool); PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(std::unique_ptr array_builder, - arrow::MakeBuilder(file_data_type)); + arrow::MakeBuilder(file_data_type, arrow_pool.get())); return std::unique_ptr( new AvroFileBatchReader(input_stream, file_data_type, std::move(reader), - std::move(array_builder), batch_size, pool)); + std::move(array_builder), std::move(arrow_pool), batch_size, pool)); } Result> AvroFileBatchReader::CreateDataFileReader( @@ -139,32 +144,31 @@ Status AvroFileBatchReader::SetReadSchema(::ArrowSchema* read_schema, next_row_to_read_ = std::numeric_limits::max(); PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(std::shared_ptr arrow_read_schema, arrow::ImportSchema(read_schema)); - std::shared_ptr<::arrow::DataType> read_data_type = arrow::struct_(arrow_read_schema->fields()); + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr file_schema, + ArrowUtils::DataTypeToSchema(file_data_type_)); PAIMON_ASSIGN_OR_RAISE(read_fields_projection_, - CalculateReadFieldsProjection(file_data_type_, read_data_type)); + CalculateReadFieldsProjection(file_schema, arrow_read_schema->fields())); array_builder_->Reset(); - PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(array_builder_, arrow::MakeBuilder(read_data_type)); + std::shared_ptr<::arrow::DataType> read_data_type = arrow::struct_(arrow_read_schema->fields()); + PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(array_builder_, + arrow::MakeBuilder(read_data_type, arrow_pool_.get())); return Status::OK(); } Result> AvroFileBatchReader::CalculateReadFieldsProjection( - const std::shared_ptr<::arrow::DataType>& file_data_type, - const std::shared_ptr<::arrow::DataType>& read_data_type) { - if (file_data_type->id() != arrow::Type::STRUCT || - read_data_type->id() != arrow::Type::STRUCT) { - return Status::Invalid( - fmt::format("Expected struct data type, file data type: {}, read data type: {}", - file_data_type->ToString(), read_data_type->ToString())); - } - const auto& file_struct_type = std::static_pointer_cast(file_data_type); - const auto& read_struct_type = std::static_pointer_cast(read_data_type); - std::set projection; - for (const auto& field : read_struct_type->fields()) { - auto field_index = file_struct_type->GetFieldIndex(field->name()); - assert(field_index != -1); - projection.insert(field_index); + const std::shared_ptr<::arrow::Schema>& file_schema, const arrow::FieldVector& read_fields) { + std::set projection_set; + auto projection = ArrowUtils::CreateProjection(file_schema, read_fields); + int32_t prev_index = -1; + for (auto& index : projection) { + if (index <= prev_index) { + return Status::Invalid( + "SetReadSchema failed: read schema fields order is different from file schema"); + } + prev_index = index; + projection_set.insert(index); } - return projection; + return projection_set; } Result> AvroFileBatchReader::GetFileSchema() const { diff --git a/src/paimon/format/avro/avro_file_batch_reader.h b/src/paimon/format/avro/avro_file_batch_reader.h index 8e23050b..495c4a2a 100644 --- a/src/paimon/format/avro/avro_file_batch_reader.h +++ b/src/paimon/format/avro/avro_file_batch_reader.h @@ -73,18 +73,19 @@ class AvroFileBatchReader : public FileBatchReader { const std::shared_ptr& input_stream, const std::shared_ptr& pool); static Result> CalculateReadFieldsProjection( - const std::shared_ptr<::arrow::DataType>& file_data_type, - const std::shared_ptr<::arrow::DataType>& read_data_type); + const std::shared_ptr<::arrow::Schema>& file_schema, const arrow::FieldVector& read_fields); AvroFileBatchReader(const std::shared_ptr& input_stream, const std::shared_ptr<::arrow::DataType>& file_data_type, std::unique_ptr<::avro::DataFileReaderBase>&& reader, - std::unique_ptr&& array_builder, int32_t batch_size, + std::unique_ptr&& array_builder, + std::unique_ptr&& arrow_pool, int32_t batch_size, const std::shared_ptr& pool); static constexpr size_t BUFFER_SIZE = 1024 * 1024; // 1M std::shared_ptr pool_; + std::unique_ptr arrow_pool_; std::shared_ptr input_stream_; std::shared_ptr<::arrow::DataType> file_data_type_; std::unique_ptr<::avro::DataFileReaderBase> reader_; diff --git a/src/paimon/format/avro/avro_output_stream_impl.cpp b/src/paimon/format/avro/avro_output_stream_impl.cpp index 4a0d8819..98fcc9f8 100644 --- a/src/paimon/format/avro/avro_output_stream_impl.cpp +++ b/src/paimon/format/avro/avro_output_stream_impl.cpp @@ -78,11 +78,12 @@ void AvroOutputStreamImpl::FlushBuffer() { } void AvroOutputStreamImpl::flush() { - // ::avro::OutputStream's flush do nothing, because in the avro-cpp impl, calling flush() too - // frequently generates many small I/O operations, affecting write performance. - // - // And In avro-java impl, there is an option to control flush frequency. + // In avro-java impl, there is an option to control flush frequency. // See: https://github.com/apache/avro/commit/35750393891c40f0ceb925a852162ec764bcae6c + // + // However, in the avro-cpp impl, there is no such option. Calling flush() too frequently + // generates many small I/O operations, affecting write performance, so we make + // ::avro::OutputStream's flush() do nothing } } // namespace paimon::avro diff --git a/src/paimon/format/avro/avro_schema_converter.cpp b/src/paimon/format/avro/avro_schema_converter.cpp index b1889482..99a1e05a 100644 --- a/src/paimon/format/avro/avro_schema_converter.cpp +++ b/src/paimon/format/avro/avro_schema_converter.cpp @@ -31,6 +31,7 @@ #include "fmt/format.h" #include "paimon/common/utils/date_time_utils.h" #include "paimon/format/avro/avro_file_format_factory.h" +#include "paimon/format/avro/avro_utils.h" #include "paimon/macros.h" #include "paimon/status.h" namespace paimon::avro { @@ -155,7 +156,7 @@ Result> AvroSchemaConverter::GetArrowType( return arrow::timestamp(arrow::TimeUnit::NANO, timezone); } case ::avro::LogicalType::Type::CUSTOM: { - if (!HasMapLogicalType(avro_node)) { + if (!AvroUtils::HasMapLogicalType(avro_node)) { return Status::TypeError("invalid avro logical map type"); } if (type != ::avro::AVRO_ARRAY) { @@ -186,9 +187,7 @@ Result> AvroSchemaConverter::GetArrowType( return std::make_shared(std::move(key_field), std::move(value_field)); } default: - std::stringstream logical_type_str; - logical_type.printJson(logical_type_str); - return Status::NotImplemented("not support logical type: ", logical_type_str.str()); + return Status::Invalid("not support logical type: ", AvroUtils::ToString(logical_type)); } size_t subtype_count = avro_node->leaves(); @@ -376,20 +375,17 @@ Result<::avro::Schema> AvroSchemaConverter::ArrowTypeToAvroSchema( } Result<::avro::ValidSchema> AvroSchemaConverter::ArrowSchemaToAvroSchema( - const std::shared_ptr& arrow_schema, const std::string& row_name) { - ::avro::RecordSchema record_schema(row_name); + const std::shared_ptr& arrow_schema) { + // top level row name of avro record, the same as java paimon + static const std::string kTopLevelRowName = "org.apache.paimon.avro.generated.record"; + ::avro::RecordSchema record_schema(kTopLevelRowName); for (const auto& field : arrow_schema->fields()) { - PAIMON_ASSIGN_OR_RAISE(::avro::Schema field_schema, - ArrowTypeToAvroSchema(field, row_name + "_" + field->name())); + PAIMON_ASSIGN_OR_RAISE( + ::avro::Schema field_schema, + ArrowTypeToAvroSchema(field, kTopLevelRowName + "_" + field->name())); AddRecordField(&record_schema, field->name(), field_schema); } return ::avro::ValidSchema(record_schema); } -bool AvroSchemaConverter::HasMapLogicalType(const ::avro::NodePtr& node) { - return node->logicalType().type() == ::avro::LogicalType::CUSTOM && - node->logicalType().customLogicalType() != nullptr && - node->logicalType().customLogicalType()->name() == "map"; -} - } // namespace paimon::avro diff --git a/src/paimon/format/avro/avro_schema_converter.h b/src/paimon/format/avro/avro_schema_converter.h index 30afd93e..92bd6185 100644 --- a/src/paimon/format/avro/avro_schema_converter.h +++ b/src/paimon/format/avro/avro_schema_converter.h @@ -34,8 +34,7 @@ class AvroSchemaConverter { // TODO(menglingda.mld): add field id for avro static Result<::avro::ValidSchema> ArrowSchemaToAvroSchema( - const std::shared_ptr& arrow_schema, - const std::string& row_name = "org.apache.paimon.avro.generated.record"); + const std::shared_ptr& arrow_schema); static Result> AvroSchemaToArrowDataType( const ::avro::ValidSchema& avro_schema); @@ -56,8 +55,6 @@ class AvroSchemaConverter { static Result> GetArrowField(const std::string& name, const ::avro::NodePtr& avro_node); - - static bool HasMapLogicalType(const ::avro::NodePtr& node); }; } // namespace paimon::avro diff --git a/src/paimon/format/avro/avro_utils.h b/src/paimon/format/avro/avro_utils.h new file mode 100644 index 00000000..6a79292b --- /dev/null +++ b/src/paimon/format/avro/avro_utils.h @@ -0,0 +1,50 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +#include "avro/Node.hh" + +namespace paimon::avro { + +class AvroUtils { + public: + AvroUtils() = delete; + ~AvroUtils() = delete; + + static std::string ToString(const ::avro::NodePtr& node) { + std::stringstream ss; + ss << *node; + return ss.str(); + } + + static std::string ToString(const ::avro::LogicalType& type) { + std::stringstream ss; + type.printJson(ss); + return ss.str(); + } + + static bool HasMapLogicalType(const ::avro::NodePtr& node) { + return node->logicalType().type() == ::avro::LogicalType::CUSTOM && + node->logicalType().customLogicalType() != nullptr && + node->logicalType().customLogicalType()->name() == "map"; + } +}; + +} // namespace paimon::avro diff --git a/test/inte/scan_and_read_inte_test.cpp b/test/inte/scan_and_read_inte_test.cpp index cea16518..25819a4c 100644 --- a/test/inte/scan_and_read_inte_test.cpp +++ b/test/inte/scan_and_read_inte_test.cpp @@ -2566,235 +2566,144 @@ TEST_P(ScanAndReadInteTest, TestCastTimestampType) { ASSERT_TRUE(expected->Equals(read_result)) << read_result->ToString(); } -TEST_P(ScanAndReadInteTest, TestAvroWithAppendSnapshot1) { +TEST_P(ScanAndReadInteTest, TestAvroWithAppendTable) { auto [file_format, enable_prefetch] = GetParam(); if (file_format != "avro") { return; } - std::string table_path = GetDataDir() + "/avro/append_multiple.db/append_multiple"; - // scan - ScanContextBuilder scan_context_builder(table_path); - scan_context_builder.AddOption(Options::SCAN_SNAPSHOT_ID, "1"); - ASSERT_OK_AND_ASSIGN(auto scan_context, scan_context_builder.Finish()); - ASSERT_OK_AND_ASSIGN(auto table_scan, TableScan::Create(std::move(scan_context))); - ASSERT_OK_AND_ASSIGN(auto result_plan, table_scan->CreatePlan()); - ASSERT_EQ(result_plan->SnapshotId().value(), 1); - - auto splits = result_plan->Splits(); - ASSERT_EQ(3, splits.size()); + auto read_data = [](int64_t snapshot_id, const std::string& result_json) { + std::string table_path = GetDataDir() + "/avro/append_multiple.db/append_multiple"; + // scan + ScanContextBuilder scan_context_builder(table_path); + scan_context_builder.AddOption(Options::SCAN_SNAPSHOT_ID, std::to_string(snapshot_id)); + ASSERT_OK_AND_ASSIGN(auto scan_context, scan_context_builder.Finish()); + ASSERT_OK_AND_ASSIGN(auto table_scan, TableScan::Create(std::move(scan_context))); + ASSERT_OK_AND_ASSIGN(auto result_plan, table_scan->CreatePlan()); + ASSERT_EQ(result_plan->SnapshotId().value(), snapshot_id); + auto splits = result_plan->Splits(); + ASSERT_EQ(3, splits.size()); - // read - ReadContextBuilder read_context_builder(table_path); - ASSERT_OK_AND_ASSIGN(std::unique_ptr read_context, read_context_builder.Finish()); - ASSERT_OK_AND_ASSIGN(auto table_read, TableRead::Create(std::move(read_context))); - ASSERT_OK_AND_ASSIGN(auto batch_reader, table_read->CreateReader(splits)); - ASSERT_OK_AND_ASSIGN(auto read_result, ReadResultCollector::CollectResult(batch_reader.get())); + // read + ReadContextBuilder read_context_builder(table_path); + ASSERT_OK_AND_ASSIGN(std::unique_ptr read_context, + read_context_builder.Finish()); + ASSERT_OK_AND_ASSIGN(auto table_read, TableRead::Create(std::move(read_context))); + ASSERT_OK_AND_ASSIGN(auto batch_reader, table_read->CreateReader(splits)); + ASSERT_OK_AND_ASSIGN(auto read_result, + ReadResultCollector::CollectResult(batch_reader.get())); - // check result - auto timezone = DateTimeUtils::GetLocalTimezoneName(); - arrow::FieldVector fields = { - arrow::field("_VALUE_KIND", arrow::int8()), - arrow::field("f0", arrow::int8()), - arrow::field("f1", arrow::int16()), - arrow::field("f2", arrow::int32()), - arrow::field("f3", arrow::int64()), - arrow::field("f4", arrow::float32()), - arrow::field("f5", arrow::float64()), - arrow::field("f6", arrow::utf8()), - arrow::field("f7", arrow::binary()), - arrow::field("f8", arrow::date32()), - arrow::field("f9", arrow::decimal128(5, 2)), - arrow::field("f10", arrow::timestamp(arrow::TimeUnit::SECOND)), - arrow::field("f11", arrow::timestamp(arrow::TimeUnit::MILLI)), - arrow::field("f12", arrow::timestamp(arrow::TimeUnit::MICRO)), - arrow::field("f13", arrow::timestamp(arrow::TimeUnit::SECOND, timezone)), - arrow::field("f14", arrow::timestamp(arrow::TimeUnit::MILLI, timezone)), - arrow::field("f15", arrow::timestamp(arrow::TimeUnit::MICRO, timezone)), - arrow::field("f16", - arrow::struct_({arrow::field("f0", arrow::map(arrow::utf8(), arrow::int32())), + // check result + auto timezone = DateTimeUtils::GetLocalTimezoneName(); + arrow::FieldVector fields = { + arrow::field("_VALUE_KIND", arrow::int8()), + arrow::field("f0", arrow::int8()), + arrow::field("f1", arrow::int16()), + arrow::field("f2", arrow::int32()), + arrow::field("f3", arrow::int64()), + arrow::field("f4", arrow::float32()), + arrow::field("f5", arrow::float64()), + arrow::field("f6", arrow::utf8()), + arrow::field("f7", arrow::binary()), + arrow::field("f8", arrow::date32()), + arrow::field("f9", arrow::decimal128(5, 2)), + arrow::field("f10", arrow::timestamp(arrow::TimeUnit::SECOND)), + arrow::field("f11", arrow::timestamp(arrow::TimeUnit::MILLI)), + arrow::field("f12", arrow::timestamp(arrow::TimeUnit::MICRO)), + arrow::field("f13", arrow::timestamp(arrow::TimeUnit::SECOND, timezone)), + arrow::field("f14", arrow::timestamp(arrow::TimeUnit::MILLI, timezone)), + arrow::field("f15", arrow::timestamp(arrow::TimeUnit::MICRO, timezone)), + arrow::field("f16", arrow::struct_( + {arrow::field("f0", arrow::map(arrow::utf8(), arrow::int32())), arrow::field("f1", arrow::list(arrow::int32()))})), + }; + auto expected = std::make_shared( + arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(fields), result_json) + .ValueOrDie()); + ASSERT_TRUE(expected); + ASSERT_TRUE(expected->Equals(read_result)) << read_result->ToString(); }; - auto expected = std::make_shared( - arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(fields), R"([ + + read_data(1, R"([ [0, 2, 10, 1, 100, 2.0, 2.0, "two", "bbb", 123, "123.45", "1970-01-02 00:00:00", "1970-01-02 00:00:00.000", "1970-01-02 00:00:00.000000", "1970-01-02 00:00:00", "1970-01-02 00:00:00.000", "1970-01-02 00:00:00.000000",[[["key",123]],[1,2,3]]], [0, 1, 10, 0, 100, 1.0, 1.0, "one", "aaa", 123, "123.45", "1970-01-01 00:00:00", "1970-01-01 00:00:00.000", "1970-01-01 00:00:00.000000", "1970-01-01 00:00:00", "1970-01-01 00:00:00.000", "1970-01-01 00:00:00.000000",[[["key",123]],[1,2,3]]], [0, 3, 11, 0, 100, null, 3.0, "three", "ccc", 123, "123.45", "1970-01-03 00:00:00", "1970-01-03 00:00:00.000", "1970-01-03 00:00:00.000000", "1970-01-03 00:00:00", "1970-01-03 00:00:00.000", "1970-01-03 00:00:00.000000",[[["key",123]],[1,2,3]]], [0, 4, 11, 0, 100, 4.0, null, "four", "ddd", 123, "123.45", "1970-01-04 00:00:00", "1970-01-04 00:00:00.000", "1970-01-04 00:00:00.000000", "1970-01-04 00:00:00", "1970-01-04 00:00:00.000", "1970-01-04 00:00:00.000000",[[["key",123]],[1,2,3]]] -])") - .ValueOrDie()); - ASSERT_TRUE(expected); - ASSERT_TRUE(expected->Equals(read_result)) << read_result->ToString(); -} - -TEST_P(ScanAndReadInteTest, TestAvroWithAppendSnapshot2) { - auto [file_format, enable_prefetch] = GetParam(); - if (file_format != "avro") { - return; - } - std::string table_path = GetDataDir() + "/avro/append_multiple.db/append_multiple"; - - // scan - ScanContextBuilder scan_context_builder(table_path); - scan_context_builder.AddOption(Options::SCAN_SNAPSHOT_ID, "2"); - ASSERT_OK_AND_ASSIGN(auto scan_context, scan_context_builder.Finish()); - ASSERT_OK_AND_ASSIGN(auto table_scan, TableScan::Create(std::move(scan_context))); - ASSERT_OK_AND_ASSIGN(auto result_plan, table_scan->CreatePlan()); - ASSERT_EQ(result_plan->SnapshotId().value(), 2); - - auto splits = result_plan->Splits(); - ASSERT_EQ(3, splits.size()); - - // read - ReadContextBuilder read_context_builder(table_path); - ASSERT_OK_AND_ASSIGN(std::unique_ptr read_context, read_context_builder.Finish()); - ASSERT_OK_AND_ASSIGN(auto table_read, TableRead::Create(std::move(read_context))); - ASSERT_OK_AND_ASSIGN(auto batch_reader, table_read->CreateReader(splits)); - ASSERT_OK_AND_ASSIGN(auto read_result, ReadResultCollector::CollectResult(batch_reader.get())); +])"); - // check result - auto timezone = DateTimeUtils::GetLocalTimezoneName(); - arrow::FieldVector fields = { - arrow::field("_VALUE_KIND", arrow::int8()), - arrow::field("f0", arrow::int8()), - arrow::field("f1", arrow::int16()), - arrow::field("f2", arrow::int32()), - arrow::field("f3", arrow::int64()), - arrow::field("f4", arrow::float32()), - arrow::field("f5", arrow::float64()), - arrow::field("f6", arrow::utf8()), - arrow::field("f7", arrow::binary()), - arrow::field("f8", arrow::date32()), - arrow::field("f9", arrow::decimal128(5, 2)), - arrow::field("f10", arrow::timestamp(arrow::TimeUnit::SECOND)), - arrow::field("f11", arrow::timestamp(arrow::TimeUnit::MILLI)), - arrow::field("f12", arrow::timestamp(arrow::TimeUnit::MICRO)), - arrow::field("f13", arrow::timestamp(arrow::TimeUnit::SECOND, timezone)), - arrow::field("f14", arrow::timestamp(arrow::TimeUnit::MILLI, timezone)), - arrow::field("f15", arrow::timestamp(arrow::TimeUnit::MICRO, timezone)), - arrow::field("f16", - arrow::struct_({arrow::field("f0", arrow::map(arrow::utf8(), arrow::int32())), - arrow::field("f1", arrow::list(arrow::int32()))})), - }; - auto expected = std::make_shared( - arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(fields), R"([ + read_data(2, R"([ [0, 6, 10, 1, 100, 6.0, 4.0, "six", "fff", 123, "123.45", "1970-01-02 00:00:00", "1970-01-06 00:00:00.000", "1970-01-06 00:00:00.000000", "1970-01-06 00:00:00", "1970-01-06 00:00:00.000", "1970-01-06 00:00:00.000000",[[["key",123]],[1,2,3]]], [0, 5, 10, 0, 100, 5.0, 2.0, null, "eee", 123, "123.45", "1970-01-01 00:00:00", "1970-01-05 00:00:00.000", "1970-01-05 00:00:00.000000", "1970-01-05 00:00:00", "1970-01-05 00:00:00.000", "1970-01-05 00:00:00.000000",[[["key",123]],[1,2,3]]], [0, 7, 11, 0, 100, 7.0, 6.0, "seven", "ggg", 123, "123.45", "1970-01-03 00:00:00", "1970-01-07 00:00:00.000", "1970-01-07 00:00:00.000000", "1970-01-07 00:00:00", "1970-01-07 00:00:00.000", "1970-01-07 00:00:00.000000",[[["key",123]],[1,2,3]]] -])") - .ValueOrDie()); - ASSERT_TRUE(expected); - ASSERT_TRUE(expected->Equals(read_result)) << read_result->ToString(); +])"); } -TEST_P(ScanAndReadInteTest, TestAvroWithPkSnapshot1) { +TEST_P(ScanAndReadInteTest, TestAvroWithPkTable) { auto [file_format, enable_prefetch] = GetParam(); if (file_format != "avro") { return; } - std::string table_path = GetDataDir() + "/avro/pk_with_multiple_type.db/pk_with_multiple_type"; - // scan - ScanContextBuilder scan_context_builder(table_path); - scan_context_builder.AddOption(Options::SCAN_SNAPSHOT_ID, "1"); - ASSERT_OK_AND_ASSIGN(auto scan_context, scan_context_builder.Finish()); - ASSERT_OK_AND_ASSIGN(auto table_scan, TableScan::Create(std::move(scan_context))); - ASSERT_OK_AND_ASSIGN(auto result_plan, table_scan->CreatePlan()); - ASSERT_EQ(result_plan->SnapshotId().value(), 1); + auto read_data = [](int64_t snapshot_id, const std::string& result_json) { + std::string table_path = + GetDataDir() + "/avro/pk_with_multiple_type.db/pk_with_multiple_type"; + // scan + ScanContextBuilder scan_context_builder(table_path); + scan_context_builder.AddOption(Options::SCAN_SNAPSHOT_ID, std::to_string(snapshot_id)); + ASSERT_OK_AND_ASSIGN(auto scan_context, scan_context_builder.Finish()); + ASSERT_OK_AND_ASSIGN(auto table_scan, TableScan::Create(std::move(scan_context))); + ASSERT_OK_AND_ASSIGN(auto result_plan, table_scan->CreatePlan()); + ASSERT_EQ(result_plan->SnapshotId().value(), snapshot_id); - auto splits = result_plan->Splits(); - ASSERT_EQ(1, splits.size()); + auto splits = result_plan->Splits(); + ASSERT_EQ(1, splits.size()); - // read - ReadContextBuilder read_context_builder(table_path); - ASSERT_OK_AND_ASSIGN(std::unique_ptr read_context, read_context_builder.Finish()); - ASSERT_OK_AND_ASSIGN(auto table_read, TableRead::Create(std::move(read_context))); - ASSERT_OK_AND_ASSIGN(auto batch_reader, table_read->CreateReader(splits)); - ASSERT_OK_AND_ASSIGN(auto read_result, ReadResultCollector::CollectResult(batch_reader.get())); + // read + ReadContextBuilder read_context_builder(table_path); + ASSERT_OK_AND_ASSIGN(std::unique_ptr read_context, + read_context_builder.Finish()); + ASSERT_OK_AND_ASSIGN(auto table_read, TableRead::Create(std::move(read_context))); + ASSERT_OK_AND_ASSIGN(auto batch_reader, table_read->CreateReader(splits)); + ASSERT_OK_AND_ASSIGN(auto read_result, + ReadResultCollector::CollectResult(batch_reader.get())); - // check result - arrow::FieldVector fields = { - arrow::field("_VALUE_KIND", arrow::int8()), - arrow::field("f0", arrow::boolean()), - arrow::field("f1", arrow::int8()), - arrow::field("f2", arrow::int16()), - arrow::field("f3", arrow::int32()), - arrow::field("f4", arrow::int64()), - arrow::field("f5", arrow::float32()), - arrow::field("f6", arrow::float64()), - arrow::field("f7", arrow::utf8()), - arrow::field("f8", arrow::binary()), - arrow::field("f9", arrow::date32()), - arrow::field("f10", arrow::decimal128(5, 2)), - arrow::field("f11", - arrow::struct_({arrow::field("f0", arrow::map(arrow::utf8(), arrow::int32())), + // check result + arrow::FieldVector fields = { + arrow::field("_VALUE_KIND", arrow::int8()), + arrow::field("f0", arrow::boolean()), + arrow::field("f1", arrow::int8()), + arrow::field("f2", arrow::int16()), + arrow::field("f3", arrow::int32()), + arrow::field("f4", arrow::int64()), + arrow::field("f5", arrow::float32()), + arrow::field("f6", arrow::float64()), + arrow::field("f7", arrow::utf8()), + arrow::field("f8", arrow::binary()), + arrow::field("f9", arrow::date32()), + arrow::field("f10", arrow::decimal128(5, 2)), + arrow::field("f11", arrow::struct_( + {arrow::field("f0", arrow::map(arrow::utf8(), arrow::int32())), arrow::field("f1", arrow::list(arrow::int32()))})), + }; + auto expected = std::make_shared( + arrow::ipc::internal::json::ArrayFromJSON(struct_(fields), result_json).ValueOrDie()); + ASSERT_TRUE(expected); + ASSERT_TRUE(expected->Equals(read_result)) << read_result->ToString(); }; - auto expected = std::make_shared( - arrow::ipc::internal::json::ArrayFromJSON(struct_(fields), R"([ + + read_data(1, R"([ [0, false, 10, 1, 1, 1000, 1.5, 2.5, "Alice", "abcdef", 100, "123.45", [[["key",123]],[1,2,3]]], [0, false, 10, 1, 1, 1000, 1.5, 2.5, "Bob", "abcdef", 100, "123.45", [[["key",123]],[1,2,3]]], [0, true, 10, 1, 1, 1000, 1.5, 2.5, "Emily", "abcdef", 100, "123.45", [[["key",123]],[1,2,3]]], [0, true, 10, 1, 1, 1000, 1.5, 2.5, "Tony", "abcdef", 100, "123.45", [[["key",123]],[1,2,3]]] - ])") - .ValueOrDie()); - ASSERT_TRUE(expected); - ASSERT_TRUE(expected->Equals(read_result)) << read_result->ToString(); -} - -TEST_P(ScanAndReadInteTest, TestAvroWithPkSnapshot2) { - auto [file_format, enable_prefetch] = GetParam(); - if (file_format != "avro") { - return; - } - std::string table_path = GetDataDir() + "/avro/pk_with_multiple_type.db/pk_with_multiple_type"; - - // scan - ScanContextBuilder scan_context_builder(table_path); - scan_context_builder.AddOption(Options::SCAN_SNAPSHOT_ID, "2"); - ASSERT_OK_AND_ASSIGN(auto scan_context, scan_context_builder.Finish()); - ASSERT_OK_AND_ASSIGN(auto table_scan, TableScan::Create(std::move(scan_context))); - ASSERT_OK_AND_ASSIGN(auto result_plan, table_scan->CreatePlan()); - ASSERT_EQ(result_plan->SnapshotId().value(), 2); +])"); - auto splits = result_plan->Splits(); - ASSERT_EQ(1, splits.size()); - - // read - ReadContextBuilder read_context_builder(table_path); - ASSERT_OK_AND_ASSIGN(std::unique_ptr read_context, read_context_builder.Finish()); - ASSERT_OK_AND_ASSIGN(auto table_read, TableRead::Create(std::move(read_context))); - ASSERT_OK_AND_ASSIGN(auto batch_reader, table_read->CreateReader(splits)); - ASSERT_OK_AND_ASSIGN(auto read_result, ReadResultCollector::CollectResult(batch_reader.get())); - - // check result - arrow::FieldVector fields = { - arrow::field("_VALUE_KIND", arrow::int8()), - arrow::field("f0", arrow::boolean()), - arrow::field("f1", arrow::int8()), - arrow::field("f2", arrow::int16()), - arrow::field("f3", arrow::int32()), - arrow::field("f4", arrow::int64()), - arrow::field("f5", arrow::float32()), - arrow::field("f6", arrow::float64()), - arrow::field("f7", arrow::utf8()), - arrow::field("f8", arrow::binary()), - arrow::field("f9", arrow::date32()), - arrow::field("f10", arrow::decimal128(5, 2)), - arrow::field("f11", - arrow::struct_({arrow::field("f0", arrow::map(arrow::utf8(), arrow::int32())), - arrow::field("f1", arrow::list(arrow::int32()))})), - }; - auto expected = std::make_shared( - arrow::ipc::internal::json::ArrayFromJSON(struct_(fields), R"([ + read_data(2, R"([ [0, false, 10, 1, 1, 1000, 1.5, 2.5, "Alice", "abcdef", 100, "123.45", [[["key",123]],[1,2,3]]], [0, false, 10, 1, 1, 1000, 1.5, 2.5, "Bob", "abcdef", 100, "123.45", [[["key",123]],[1,2,3]]], [0, true, 10, 1, 1, 1000, 1.5, 2.5, "Lucy", "abcdef", 100, "123.45", [[["key",123]],[1,2,3]]], [0, true, 10, 1, 1, 1000, 1.5, 2.5, "Tony", "abcdef", 100, "123.45", [[["key",123]],[1,2,3]]] - ])") - .ValueOrDie()); - ASSERT_TRUE(expected); - ASSERT_TRUE(expected->Equals(read_result)) << read_result->ToString(); +])"); } std::vector> GetTestValuesForScanAndReadInteTest() { From 90a43f6210b10e4b1769e8c8ac5b32d07b729cc8 Mon Sep 17 00:00:00 2001 From: "jinli.zjw" Date: Wed, 28 Jan 2026 16:56:23 +0800 Subject: [PATCH 07/12] fix review 2 --- .../format/avro/avro_file_format_factory.cpp | 3 +-- .../format/avro/avro_file_format_factory.h | 3 --- src/paimon/format/avro/avro_format_writer.cpp | 16 ++++++++-------- src/paimon/format/avro/avro_format_writer.h | 8 ++++---- .../format/avro/avro_output_stream_impl.cpp | 1 - src/paimon/format/avro/avro_output_stream_impl.h | 2 +- src/paimon/format/avro/avro_schema_converter.cpp | 6 +++--- src/paimon/format/orc/orc_adapter.cpp | 2 +- 8 files changed, 18 insertions(+), 23 deletions(-) diff --git a/src/paimon/format/avro/avro_file_format_factory.cpp b/src/paimon/format/avro/avro_file_format_factory.cpp index d1c8c0cd..e11be40a 100644 --- a/src/paimon/format/avro/avro_file_format_factory.cpp +++ b/src/paimon/format/avro/avro_file_format_factory.cpp @@ -27,11 +27,10 @@ const char AvroFileFormatFactory::IDENTIFIER[] = "avro"; Result> AvroFileFormatFactory::Create( const std::map& options) const { - RegisterLogicalTypes(); return std::make_unique(options); } -void AvroFileFormatFactory::RegisterLogicalTypes() { +static __attribute__((constructor)) void AvroFileFormatFactoryRegisterLogicalTypes() { ::avro::CustomLogicalTypeRegistry::instance().registerType( "map", [](const std::string&) { return std::make_shared(); }); } diff --git a/src/paimon/format/avro/avro_file_format_factory.h b/src/paimon/format/avro/avro_file_format_factory.h index 3a3ecfd1..c01360fa 100644 --- a/src/paimon/format/avro/avro_file_format_factory.h +++ b/src/paimon/format/avro/avro_file_format_factory.h @@ -41,9 +41,6 @@ class AvroFileFormatFactory : public FileFormatFactory { Result> Create( const std::map& options) const override; - - private: - static void RegisterLogicalTypes(); }; } // namespace paimon::avro diff --git a/src/paimon/format/avro/avro_format_writer.cpp b/src/paimon/format/avro/avro_format_writer.cpp index 7b041e31..8b913914 100644 --- a/src/paimon/format/avro/avro_format_writer.cpp +++ b/src/paimon/format/avro/avro_format_writer.cpp @@ -39,9 +39,6 @@ namespace arrow { class Array; } // namespace arrow -namespace avro { -class OutputStream; -} // namespace avro struct ArrowArray; namespace paimon::avro { @@ -49,24 +46,26 @@ namespace paimon::avro { AvroFormatWriter::AvroFormatWriter( const std::shared_ptr<::avro::DataFileWriter<::avro::GenericDatum>>& file_writer, const ::avro::ValidSchema& avro_schema, const std::shared_ptr& data_type, - std::unique_ptr adaptor) + std::unique_ptr adaptor, AvroOutputStreamImpl* avro_output_stream) : writer_(file_writer), avro_schema_(avro_schema), data_type_(data_type), - adaptor_(std::move(adaptor)) {} + adaptor_(std::move(adaptor)), + avro_output_stream_(avro_output_stream) {} Result> AvroFormatWriter::Create( - std::unique_ptr<::avro::OutputStream> out, const std::shared_ptr& schema, + std::unique_ptr out, const std::shared_ptr& schema, const ::avro::Codec codec) { try { PAIMON_ASSIGN_OR_RAISE(::avro::ValidSchema avro_schema, AvroSchemaConverter::ArrowSchemaToAvroSchema(schema)); + AvroOutputStreamImpl* avro_output_stream = out.get(); auto writer = std::make_shared<::avro::DataFileWriter<::avro::GenericDatum>>( std::move(out), avro_schema, DEFAULT_SYNC_INTERVAL, codec); auto data_type = arrow::struct_(schema->fields()); auto adaptor = std::make_unique(data_type); - return std::unique_ptr( - new AvroFormatWriter(writer, avro_schema, data_type, std::move(adaptor))); + return std::unique_ptr(new AvroFormatWriter( + writer, avro_schema, data_type, std::move(adaptor), avro_output_stream)); } catch (const ::avro::Exception& e) { return Status::Invalid(fmt::format("avro format writer create failed. {}", e.what())); } catch (const std::exception& e) { @@ -92,6 +91,7 @@ Status AvroFormatWriter::Flush() { Status AvroFormatWriter::Finish() { try { + avro_output_stream_->FlushBuffer(); // we need flush buffer before close writer writer_->close(); } catch (const ::avro::Exception& e) { return Status::Invalid(fmt::format("avro writer close failed. {}", e.what())); diff --git a/src/paimon/format/avro/avro_format_writer.h b/src/paimon/format/avro/avro_format_writer.h index 2701c230..17f39a8b 100644 --- a/src/paimon/format/avro/avro_format_writer.h +++ b/src/paimon/format/avro/avro_format_writer.h @@ -22,9 +22,9 @@ #include "arrow/api.h" #include "avro/DataFile.hh" -#include "avro/Stream.hh" #include "avro/ValidSchema.hh" #include "paimon/format/avro/avro_adaptor.h" +#include "paimon/format/avro/avro_output_stream_impl.h" #include "paimon/format/format_writer.h" #include "paimon/metrics.h" #include "paimon/result.h" @@ -36,7 +36,6 @@ class Schema; } // namespace arrow namespace avro { class GenericDatum; -class OutputStream; } // namespace avro namespace paimon { class Metrics; @@ -49,7 +48,7 @@ namespace paimon::avro { class AvroFormatWriter : public FormatWriter { public: static Result> Create( - std::unique_ptr<::avro::OutputStream> out, const std::shared_ptr& schema, + std::unique_ptr out, const std::shared_ptr& schema, const ::avro::Codec codec); Status AddBatch(ArrowArray* batch) override; @@ -70,13 +69,14 @@ class AvroFormatWriter : public FormatWriter { AvroFormatWriter( const std::shared_ptr<::avro::DataFileWriter<::avro::GenericDatum>>& file_writer, const ::avro::ValidSchema& avro_schema, const std::shared_ptr& data_type, - std::unique_ptr adaptor); + std::unique_ptr adaptor, AvroOutputStreamImpl* avro_output_stream); std::shared_ptr<::avro::DataFileWriter<::avro::GenericDatum>> writer_; ::avro::ValidSchema avro_schema_; std::shared_ptr data_type_; std::shared_ptr metrics_; std::unique_ptr adaptor_; + AvroOutputStreamImpl* avro_output_stream_; }; } // namespace paimon::avro diff --git a/src/paimon/format/avro/avro_output_stream_impl.cpp b/src/paimon/format/avro/avro_output_stream_impl.cpp index 98fcc9f8..91e6c741 100644 --- a/src/paimon/format/avro/avro_output_stream_impl.cpp +++ b/src/paimon/format/avro/avro_output_stream_impl.cpp @@ -39,7 +39,6 @@ AvroOutputStreamImpl::AvroOutputStreamImpl(const std::shared_ptrFree(buffer_, buffer_size_); } diff --git a/src/paimon/format/avro/avro_output_stream_impl.h b/src/paimon/format/avro/avro_output_stream_impl.h index 349c0c86..d0dfe919 100644 --- a/src/paimon/format/avro/avro_output_stream_impl.h +++ b/src/paimon/format/avro/avro_output_stream_impl.h @@ -44,9 +44,9 @@ class AvroOutputStreamImpl : public ::avro::OutputStream { return byte_count_; } - private: void FlushBuffer(); + private: std::shared_ptr pool_; const size_t buffer_size_; uint8_t* const buffer_; diff --git a/src/paimon/format/avro/avro_schema_converter.cpp b/src/paimon/format/avro/avro_schema_converter.cpp index 99a1e05a..e5daa4e2 100644 --- a/src/paimon/format/avro/avro_schema_converter.cpp +++ b/src/paimon/format/avro/avro_schema_converter.cpp @@ -178,8 +178,7 @@ Result> AvroSchemaConverter::GetArrowType( if (fields.size() != 2) { return Status::TypeError("invalid avro logical map struct fields size"); } - auto key_field = fields[0]; - key_field = key_field->WithNullable(false); + auto key_field = fields[0]->WithNullable(false); auto value_field = fields[1]; if (key_field->name() != "key" || value_field->name() != "value") { return Status::TypeError("invalid avro logical map struct field names"); @@ -187,7 +186,8 @@ Result> AvroSchemaConverter::GetArrowType( return std::make_shared(std::move(key_field), std::move(value_field)); } default: - return Status::Invalid("not support logical type: ", AvroUtils::ToString(logical_type)); + return Status::Invalid("invalid avro logical type: ", + AvroUtils::ToString(logical_type)); } size_t subtype_count = avro_node->leaves(); diff --git a/src/paimon/format/orc/orc_adapter.cpp b/src/paimon/format/orc/orc_adapter.cpp index e986c797..cff1702c 100644 --- a/src/paimon/format/orc/orc_adapter.cpp +++ b/src/paimon/format/orc/orc_adapter.cpp @@ -856,7 +856,7 @@ Result> MakeArrowBuilder( arrow::MemoryPool* pool) { if (column_vector_batch->numElements == 0) { PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(std::shared_ptr builder, - arrow::MakeBuilder(type)); + arrow::MakeBuilder(type, pool)); return builder; } arrow::Type::type kind = type->id(); From 83934d6a8d5c966047b9a61cd52ced62ae0169a1 Mon Sep 17 00:00:00 2001 From: Zhang Jiawei <30893610+zjw1111@users.noreply.github.com> Date: Wed, 28 Jan 2026 17:49:37 +0800 Subject: [PATCH 08/12] fix3 --- src/paimon/format/orc/orc_adapter.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/paimon/format/orc/orc_adapter.cpp b/src/paimon/format/orc/orc_adapter.cpp index cff1702c..18672bab 100644 --- a/src/paimon/format/orc/orc_adapter.cpp +++ b/src/paimon/format/orc/orc_adapter.cpp @@ -481,7 +481,7 @@ class UnPooledStringDictionaryBuilder : public EmptyBuilder { dict_builder->IncreaseLength(dict_offset.size() - 1); std::shared_ptr dictionary; ARROW_RETURN_NOT_OK(dict_builder->Finish(&dictionary)); - dictionary_.reset(new OrcStringDictionary(dictionary->data(), orc_dictionary)); + dictionary_ = std::make_shared(dictionary->data(), orc_dictionary); return arrow::Status::OK(); } From 152003ae8422ed9047c16121d9365ec980aa04f7 Mon Sep 17 00:00:00 2001 From: Zhang Jiawei <30893610+zjw1111@users.noreply.github.com> Date: Wed, 28 Jan 2026 17:54:47 +0800 Subject: [PATCH 09/12] fix4 --- src/paimon/format/avro/avro_format_writer.cpp | 8 ++++---- test/inte/scan_and_read_inte_test.cpp | 4 ++++ 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/src/paimon/format/avro/avro_format_writer.cpp b/src/paimon/format/avro/avro_format_writer.cpp index 8b913914..b19f3881 100644 --- a/src/paimon/format/avro/avro_format_writer.cpp +++ b/src/paimon/format/avro/avro_format_writer.cpp @@ -69,7 +69,7 @@ Result> AvroFormatWriter::Create( } catch (const ::avro::Exception& e) { return Status::Invalid(fmt::format("avro format writer create failed. {}", e.what())); } catch (const std::exception& e) { - return Status::Invalid("avro format writer create failed: {}", e.what()); + return Status::Invalid(fmt::format("avro format writer create failed: {}", e.what())); } catch (...) { return Status::Invalid("avro format writer create failed: unknown exception"); } @@ -81,7 +81,7 @@ Status AvroFormatWriter::Flush() { } catch (const ::avro::Exception& e) { return Status::Invalid(fmt::format("avro writer flush failed. {}", e.what())); } catch (const std::exception& e) { - return Status::Invalid("avro writer flush failed: {}", e.what()); + return Status::Invalid(fmt::format("avro writer flush failed: {}", e.what())); } catch (...) { return Status::Invalid("avro writer flush failed: unknown exception"); } @@ -96,7 +96,7 @@ Status AvroFormatWriter::Finish() { } catch (const ::avro::Exception& e) { return Status::Invalid(fmt::format("avro writer close failed. {}", e.what())); } catch (const std::exception& e) { - return Status::Invalid("avro writer close failed: {}", e.what()); + return Status::Invalid(fmt::format("avro writer close failed: {}", e.what())); } catch (...) { return Status::Invalid("avro writer close failed: unknown exception"); } @@ -120,7 +120,7 @@ Status AvroFormatWriter::AddBatch(ArrowArray* batch) { } catch (const ::avro::Exception& e) { return Status::Invalid(fmt::format("avro writer add batch failed. {}", e.what())); } catch (const std::exception& e) { - return Status::Invalid("avro writer add batch failed: {}", e.what()); + return Status::Invalid(fmt::format("avro writer add batch failed: {}", e.what())); } catch (...) { return Status::Invalid("avro writer add batch failed: unknown exception"); } diff --git a/test/inte/scan_and_read_inte_test.cpp b/test/inte/scan_and_read_inte_test.cpp index 25819a4c..4c843591 100644 --- a/test/inte/scan_and_read_inte_test.cpp +++ b/test/inte/scan_and_read_inte_test.cpp @@ -2711,6 +2711,10 @@ std::vector> GetTestValuesForScanAndReadInteTest() #ifdef PAIMON_ENABLE_ORC values.emplace_back("orc", false); values.emplace_back("orc", true); +#endif +#ifdef PAIMON_ENABLE_AVRO + values.emplace_back("avro", false); + values.emplace_back("avro", true); #endif return values; } From 8690d65a58b265463f86b3b58aa141b51c31a143 Mon Sep 17 00:00:00 2001 From: Zhang Jiawei <30893610+zjw1111@users.noreply.github.com> Date: Thu, 29 Jan 2026 19:54:55 +0800 Subject: [PATCH 10/12] fix --- test/inte/scan_and_read_inte_test.cpp | 22 ++++++---------------- 1 file changed, 6 insertions(+), 16 deletions(-) diff --git a/test/inte/scan_and_read_inte_test.cpp b/test/inte/scan_and_read_inte_test.cpp index 4c843591..51ce8c8f 100644 --- a/test/inte/scan_and_read_inte_test.cpp +++ b/test/inte/scan_and_read_inte_test.cpp @@ -2566,12 +2566,8 @@ TEST_P(ScanAndReadInteTest, TestCastTimestampType) { ASSERT_TRUE(expected->Equals(read_result)) << read_result->ToString(); } -TEST_P(ScanAndReadInteTest, TestAvroWithAppendTable) { - auto [file_format, enable_prefetch] = GetParam(); - if (file_format != "avro") { - return; - } - +#ifdef PAIMON_ENABLE_AVRO +TEST_F(ScanAndReadInteTest, TestAvroWithAppendTable) { auto read_data = [](int64_t snapshot_id, const std::string& result_json) { std::string table_path = GetDataDir() + "/avro/append_multiple.db/append_multiple"; // scan @@ -2586,6 +2582,8 @@ TEST_P(ScanAndReadInteTest, TestAvroWithAppendTable) { // read ReadContextBuilder read_context_builder(table_path); + read_context_builder->AddOption("test.enable-adaptive-prefetch-strategy", "false"); + read_context_builder->EnablePrefetch(true).SetPrefetchBatchCount(3); ASSERT_OK_AND_ASSIGN(std::unique_ptr read_context, read_context_builder.Finish()); ASSERT_OK_AND_ASSIGN(auto table_read, TableRead::Create(std::move(read_context))); @@ -2638,12 +2636,7 @@ TEST_P(ScanAndReadInteTest, TestAvroWithAppendTable) { ])"); } -TEST_P(ScanAndReadInteTest, TestAvroWithPkTable) { - auto [file_format, enable_prefetch] = GetParam(); - if (file_format != "avro") { - return; - } - +TEST_F(ScanAndReadInteTest, TestAvroWithPkTable) { auto read_data = [](int64_t snapshot_id, const std::string& result_json) { std::string table_path = GetDataDir() + "/avro/pk_with_multiple_type.db/pk_with_multiple_type"; @@ -2705,16 +2698,13 @@ TEST_P(ScanAndReadInteTest, TestAvroWithPkTable) { [0, true, 10, 1, 1, 1000, 1.5, 2.5, "Tony", "abcdef", 100, "123.45", [[["key",123]],[1,2,3]]] ])"); } +#endif std::vector> GetTestValuesForScanAndReadInteTest() { std::vector> values = {{"parquet", false}, {"parquet", true}}; #ifdef PAIMON_ENABLE_ORC values.emplace_back("orc", false); values.emplace_back("orc", true); -#endif -#ifdef PAIMON_ENABLE_AVRO - values.emplace_back("avro", false); - values.emplace_back("avro", true); #endif return values; } From a88e1b99d7cf717001ed989d5efb822a8f7c3ab6 Mon Sep 17 00:00:00 2001 From: Zhang Jiawei <30893610+zjw1111@users.noreply.github.com> Date: Thu, 29 Jan 2026 19:57:24 +0800 Subject: [PATCH 11/12] fix --- test/inte/scan_and_read_inte_test.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/inte/scan_and_read_inte_test.cpp b/test/inte/scan_and_read_inte_test.cpp index 51ce8c8f..e0ad436a 100644 --- a/test/inte/scan_and_read_inte_test.cpp +++ b/test/inte/scan_and_read_inte_test.cpp @@ -2582,8 +2582,8 @@ TEST_F(ScanAndReadInteTest, TestAvroWithAppendTable) { // read ReadContextBuilder read_context_builder(table_path); - read_context_builder->AddOption("test.enable-adaptive-prefetch-strategy", "false"); - read_context_builder->EnablePrefetch(true).SetPrefetchBatchCount(3); + read_context_builder.AddOption("test.enable-adaptive-prefetch-strategy", "false"); + read_context_builder.EnablePrefetch(true).SetPrefetchBatchCount(3); ASSERT_OK_AND_ASSIGN(std::unique_ptr read_context, read_context_builder.Finish()); ASSERT_OK_AND_ASSIGN(auto table_read, TableRead::Create(std::move(read_context))); From 07ce3a340eba8f337773f4cb284ef2f9c043a3e6 Mon Sep 17 00:00:00 2001 From: Zhang Jiawei <30893610+zjw1111@users.noreply.github.com> Date: Fri, 30 Jan 2026 10:06:37 +0800 Subject: [PATCH 12/12] fix --- test/inte/scan_and_read_inte_test.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/test/inte/scan_and_read_inte_test.cpp b/test/inte/scan_and_read_inte_test.cpp index e0ad436a..8c38c28b 100644 --- a/test/inte/scan_and_read_inte_test.cpp +++ b/test/inte/scan_and_read_inte_test.cpp @@ -2567,7 +2567,8 @@ TEST_P(ScanAndReadInteTest, TestCastTimestampType) { } #ifdef PAIMON_ENABLE_AVRO -TEST_F(ScanAndReadInteTest, TestAvroWithAppendTable) { +// TODO(zjw): remove DISABLED_ when avro write is ready +TEST_F(ScanAndReadInteTest, DISABLED_TestAvroWithAppendTable) { auto read_data = [](int64_t snapshot_id, const std::string& result_json) { std::string table_path = GetDataDir() + "/avro/append_multiple.db/append_multiple"; // scan @@ -2636,7 +2637,7 @@ TEST_F(ScanAndReadInteTest, TestAvroWithAppendTable) { ])"); } -TEST_F(ScanAndReadInteTest, TestAvroWithPkTable) { +TEST_F(ScanAndReadInteTest, DISABLED_TestAvroWithPkTable) { auto read_data = [](int64_t snapshot_id, const std::string& result_json) { std::string table_path = GetDataDir() + "/avro/pk_with_multiple_type.db/pk_with_multiple_type";