From 28ae789f0161f162af0fa29b32661054ee46a372 Mon Sep 17 00:00:00 2001 From: daidai <2017501503@qq.com> Date: Tue, 24 Sep 2024 16:07:33 +0800 Subject: [PATCH 1/4] [fix](serde)fix the bug in DataTypeNullableSerDe.deserialize_column_from_fixed_json2 --- .../data_types/serde/data_type_datetimev2_serde.cpp | 6 ++++++ be/src/vec/data_types/serde/data_type_datev2_serde.cpp | 6 ++++++ .../vec/data_types/serde/data_type_decimal_serde.cpp | 6 ++++++ .../vec/data_types/serde/data_type_nullable_serde.cpp | 5 ++++- be/src/vec/data_types/serde/data_type_number_serde.cpp | 6 ++++++ be/src/vec/data_types/serde/data_type_serde.h | 10 +++++++++- be/src/vec/data_types/serde/data_type_string_serde.h | 6 ++++++ 7 files changed, 43 insertions(+), 2 deletions(-) diff --git a/be/src/vec/data_types/serde/data_type_datetimev2_serde.cpp b/be/src/vec/data_types/serde/data_type_datetimev2_serde.cpp index e57af914d43e04..e8238af4eee0ea 100644 --- a/be/src/vec/data_types/serde/data_type_datetimev2_serde.cpp +++ b/be/src/vec/data_types/serde/data_type_datetimev2_serde.cpp @@ -252,6 +252,9 @@ Status DataTypeDateTimeV2SerDe::write_column_to_orc(const std::string& timezone, Status DataTypeDateTimeV2SerDe::deserialize_column_from_fixed_json( IColumn& column, Slice& slice, int rows, int* num_deserialized, const FormatOptions& options) const { + if (rows < 1) [[unlikely]] { + return Status::OK(); + } Status st = deserialize_one_cell_from_json(column, slice, options); if (!st.ok()) { return st; @@ -264,6 +267,9 @@ Status DataTypeDateTimeV2SerDe::deserialize_column_from_fixed_json( void DataTypeDateTimeV2SerDe::insert_column_last_value_multiple_times(IColumn& column, int times) const { + if (times < 1) [[unlikely]] { + return; + } auto& col = static_cast&>(column); auto sz = col.size(); UInt64 val = col.get_element(sz - 1); diff --git a/be/src/vec/data_types/serde/data_type_datev2_serde.cpp b/be/src/vec/data_types/serde/data_type_datev2_serde.cpp index f2d595b87c452f..95109ee408caee 100644 --- a/be/src/vec/data_types/serde/data_type_datev2_serde.cpp +++ b/be/src/vec/data_types/serde/data_type_datev2_serde.cpp @@ -178,6 +178,9 @@ Status DataTypeDateV2SerDe::write_column_to_orc(const std::string& timezone, con Status DataTypeDateV2SerDe::deserialize_column_from_fixed_json(IColumn& column, Slice& slice, int rows, int* num_deserialized, const FormatOptions& options) const { + if (rows < 1) [[unlikely]] { + return Status::OK(); + } Status st = deserialize_one_cell_from_json(column, slice, options); if (!st.ok()) { return st; @@ -189,6 +192,9 @@ Status DataTypeDateV2SerDe::deserialize_column_from_fixed_json(IColumn& column, void DataTypeDateV2SerDe::insert_column_last_value_multiple_times(IColumn& column, int times) const { + if (times < 1) [[unlikely]] { + return; + } auto& col = static_cast&>(column); auto sz = col.size(); UInt32 val = col.get_element(sz - 1); diff --git a/be/src/vec/data_types/serde/data_type_decimal_serde.cpp b/be/src/vec/data_types/serde/data_type_decimal_serde.cpp index e979211d6d720b..acb09ee773ec62 100644 --- a/be/src/vec/data_types/serde/data_type_decimal_serde.cpp +++ b/be/src/vec/data_types/serde/data_type_decimal_serde.cpp @@ -280,6 +280,9 @@ template Status DataTypeDecimalSerDe::deserialize_column_from_fixed_json( IColumn& column, Slice& slice, int rows, int* num_deserialized, const FormatOptions& options) const { + if (rows < 1) [[unlikely]] { + return Status::OK(); + } Status st = deserialize_one_cell_from_json(column, slice, options); if (!st.ok()) { return st; @@ -293,6 +296,9 @@ Status DataTypeDecimalSerDe::deserialize_column_from_fixed_json( template void DataTypeDecimalSerDe::insert_column_last_value_multiple_times(IColumn& column, int times) const { + if (times < 1) [[unlikely]] { + return; + } auto& col = static_cast&>(column); auto sz = col.size(); diff --git a/be/src/vec/data_types/serde/data_type_nullable_serde.cpp b/be/src/vec/data_types/serde/data_type_nullable_serde.cpp index 1af85bd040d1e2..3b46e0e784f8f3 100644 --- a/be/src/vec/data_types/serde/data_type_nullable_serde.cpp +++ b/be/src/vec/data_types/serde/data_type_nullable_serde.cpp @@ -131,12 +131,15 @@ Status DataTypeNullableSerDe::deserialize_column_from_hive_text_vector( Status DataTypeNullableSerDe::deserialize_column_from_fixed_json( IColumn& column, Slice& slice, int rows, int* num_deserialized, const FormatOptions& options) const { + if (rows < 1) [[unlikely]] { + return Status::OK(); + } auto& col = static_cast(column); Status st = deserialize_one_cell_from_json(column, slice, options); if (!st.ok()) { return st; } - if (rows - 1 != 0) { + if (rows > 1) { auto& null_map = col.get_null_map_data(); auto& nested_column = col.get_nested_column(); diff --git a/be/src/vec/data_types/serde/data_type_number_serde.cpp b/be/src/vec/data_types/serde/data_type_number_serde.cpp index 299779ea267961..efa41e346bfa6e 100644 --- a/be/src/vec/data_types/serde/data_type_number_serde.cpp +++ b/be/src/vec/data_types/serde/data_type_number_serde.cpp @@ -228,6 +228,9 @@ template Status DataTypeNumberSerDe::deserialize_column_from_fixed_json( IColumn& column, Slice& slice, int rows, int* num_deserialized, const FormatOptions& options) const { + if (rows < 1) [[unlikely]] { + return Status::OK(); + } Status st = deserialize_one_cell_from_json(column, slice, options); if (!st.ok()) { return st; @@ -241,6 +244,9 @@ Status DataTypeNumberSerDe::deserialize_column_from_fixed_json( template void DataTypeNumberSerDe::insert_column_last_value_multiple_times(IColumn& column, int times) const { + if (times < 1) [[unlikely]] { + return; + } auto& col = static_cast&>(column); auto sz = col.size(); T val = col.get_element(sz - 1); diff --git a/be/src/vec/data_types/serde/data_type_serde.h b/be/src/vec/data_types/serde/data_type_serde.h index 262f9cae6a8a62..1cdc2f7c3ff7e5 100644 --- a/be/src/vec/data_types/serde/data_type_serde.h +++ b/be/src/vec/data_types/serde/data_type_serde.h @@ -243,17 +243,25 @@ class DataTypeSerDe { virtual Status deserialize_column_from_fixed_json(IColumn& column, Slice& slice, int rows, int* num_deserialized, const FormatOptions& options) const { + if (rows < 1) [[unlikely]] { + return Status::OK(); + } Status st = deserialize_one_cell_from_json(column, slice, options); if (!st.ok()) { *num_deserialized = 0; return st; } - insert_column_last_value_multiple_times(column, rows - 1); + if (rows > 1) [[likely]] { + insert_column_last_value_multiple_times(column, rows - 1); + } *num_deserialized = rows; return Status::OK(); } // Insert the last value to the end of this column multiple times. virtual void insert_column_last_value_multiple_times(IColumn& column, int times) const { + if (times < 1) [[unlikely]] { + return; + } //If you try to simplify this operation by using `column.insert_many_from(column, column.size() - 1, rows - 1);` // you are likely to get incorrect data results. MutableColumnPtr dum_col = column.clone_empty(); diff --git a/be/src/vec/data_types/serde/data_type_string_serde.h b/be/src/vec/data_types/serde/data_type_string_serde.h index fe09ff615f4742..583772c582530f 100644 --- a/be/src/vec/data_types/serde/data_type_string_serde.h +++ b/be/src/vec/data_types/serde/data_type_string_serde.h @@ -218,6 +218,9 @@ class DataTypeStringSerDeBase : public DataTypeSerDe { Status deserialize_column_from_fixed_json(IColumn& column, Slice& slice, int rows, int* num_deserialized, const FormatOptions& options) const override { + if (rows < 1) [[unlikely]] { + return Status::OK(); + } Status st = deserialize_one_cell_from_json(column, slice, options); if (!st.ok()) { return st; @@ -229,6 +232,9 @@ class DataTypeStringSerDeBase : public DataTypeSerDe { } void insert_column_last_value_multiple_times(IColumn& column, int times) const override { + if (times < 1) [[unlikely]] { + return; + } auto& col = static_cast(column); auto sz = col.size(); From 7e5d2ad0a997c68ede806d3fb6afb4e747df74b2 Mon Sep 17 00:00:00 2001 From: daidai <2017501503@qq.com> Date: Tue, 24 Sep 2024 16:35:29 +0800 Subject: [PATCH 2/4] add log --- be/src/vec/exec/format/orc/vorc_reader.cpp | 1 + be/src/vec/exec/format/parquet/vparquet_group_reader.cpp | 2 ++ be/src/vec/exec/scan/vfile_scanner.cpp | 1 + 3 files changed, 4 insertions(+) diff --git a/be/src/vec/exec/format/orc/vorc_reader.cpp b/be/src/vec/exec/format/orc/vorc_reader.cpp index 16a3c1254c62eb..94c120e3786db8 100644 --- a/be/src/vec/exec/format/orc/vorc_reader.cpp +++ b/be/src/vec/exec/format/orc/vorc_reader.cpp @@ -944,6 +944,7 @@ Status OrcReader::_fill_partition_columns( auto _text_serde = slot_desc->get_data_type_ptr()->get_serde(); Slice slice(value.data(), value.size()); int num_deserialized = 0; + LOG(INFO) << "OPT[fill_partition_columns] rows = " << rows << " value = " << value; if (_text_serde->deserialize_column_from_fixed_json(*col_ptr, slice, rows, &num_deserialized, _text_formatOptions) != Status::OK()) { diff --git a/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp b/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp index 08ecb601f39941..e642f549c1bd9f 100644 --- a/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp +++ b/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -654,6 +655,7 @@ Status RowGroupReader::_fill_partition_columns( auto _text_serde = slot_desc->get_data_type_ptr()->get_serde(); Slice slice(value.data(), value.size()); int num_deserialized = 0; + LOG(INFO) << "OPT[fill_partition_columns] rows = " << rows << " value = " << value; if (_text_serde->deserialize_column_from_fixed_json(*col_ptr, slice, rows, &num_deserialized, _text_formatOptions) != Status::OK()) { diff --git a/be/src/vec/exec/scan/vfile_scanner.cpp b/be/src/vec/exec/scan/vfile_scanner.cpp index ffc88d07cab2f0..e99c611757d5a5 100644 --- a/be/src/vec/exec/scan/vfile_scanner.cpp +++ b/be/src/vec/exec/scan/vfile_scanner.cpp @@ -451,6 +451,7 @@ Status VFileScanner::_fill_columns_from_path(size_t rows) { auto _text_serde = slot_desc->get_data_type_ptr()->get_serde(); Slice slice(value.data(), value.size()); int num_deserialized = 0; + LOG(INFO) << "OPT[fill_partition_columns] rows = " << rows << " value = " << value; if (_text_serde->deserialize_column_from_fixed_json(*col_ptr, slice, rows, &num_deserialized, _text_formatOptions) != Status::OK()) { From bf28c07e80e872cae1188f522bbd691acde12a85 Mon Sep 17 00:00:00 2001 From: daidai <2017501503@qq.com> Date: Tue, 8 Oct 2024 18:38:21 +0800 Subject: [PATCH 3/4] Revert "add log" This reverts commit bb7d7e90c953efa7872f960f0df083a82d604466. --- be/src/vec/exec/format/orc/vorc_reader.cpp | 1 - be/src/vec/exec/format/parquet/vparquet_group_reader.cpp | 2 -- be/src/vec/exec/scan/vfile_scanner.cpp | 1 - 3 files changed, 4 deletions(-) diff --git a/be/src/vec/exec/format/orc/vorc_reader.cpp b/be/src/vec/exec/format/orc/vorc_reader.cpp index 94c120e3786db8..16a3c1254c62eb 100644 --- a/be/src/vec/exec/format/orc/vorc_reader.cpp +++ b/be/src/vec/exec/format/orc/vorc_reader.cpp @@ -944,7 +944,6 @@ Status OrcReader::_fill_partition_columns( auto _text_serde = slot_desc->get_data_type_ptr()->get_serde(); Slice slice(value.data(), value.size()); int num_deserialized = 0; - LOG(INFO) << "OPT[fill_partition_columns] rows = " << rows << " value = " << value; if (_text_serde->deserialize_column_from_fixed_json(*col_ptr, slice, rows, &num_deserialized, _text_formatOptions) != Status::OK()) { diff --git a/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp b/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp index e642f549c1bd9f..08ecb601f39941 100644 --- a/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp +++ b/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp @@ -21,7 +21,6 @@ #include #include #include -#include #include #include @@ -655,7 +654,6 @@ Status RowGroupReader::_fill_partition_columns( auto _text_serde = slot_desc->get_data_type_ptr()->get_serde(); Slice slice(value.data(), value.size()); int num_deserialized = 0; - LOG(INFO) << "OPT[fill_partition_columns] rows = " << rows << " value = " << value; if (_text_serde->deserialize_column_from_fixed_json(*col_ptr, slice, rows, &num_deserialized, _text_formatOptions) != Status::OK()) { diff --git a/be/src/vec/exec/scan/vfile_scanner.cpp b/be/src/vec/exec/scan/vfile_scanner.cpp index e99c611757d5a5..ffc88d07cab2f0 100644 --- a/be/src/vec/exec/scan/vfile_scanner.cpp +++ b/be/src/vec/exec/scan/vfile_scanner.cpp @@ -451,7 +451,6 @@ Status VFileScanner::_fill_columns_from_path(size_t rows) { auto _text_serde = slot_desc->get_data_type_ptr()->get_serde(); Slice slice(value.data(), value.size()); int num_deserialized = 0; - LOG(INFO) << "OPT[fill_partition_columns] rows = " << rows << " value = " << value; if (_text_serde->deserialize_column_from_fixed_json(*col_ptr, slice, rows, &num_deserialized, _text_formatOptions) != Status::OK()) { From 04d7fe40684d45f8ff3d6caa4b79a042f405dcf6 Mon Sep 17 00:00:00 2001 From: daidai <2017501503@qq.com> Date: Tue, 8 Oct 2024 18:45:50 +0800 Subject: [PATCH 4/4] add comments --- be/src/vec/data_types/serde/data_type_serde.h | 1 + be/src/vec/exec/format/parquet/vparquet_group_reader.cpp | 1 + 2 files changed, 2 insertions(+) diff --git a/be/src/vec/data_types/serde/data_type_serde.h b/be/src/vec/data_types/serde/data_type_serde.h index 1cdc2f7c3ff7e5..46236faa926c6f 100644 --- a/be/src/vec/data_types/serde/data_type_serde.h +++ b/be/src/vec/data_types/serde/data_type_serde.h @@ -243,6 +243,7 @@ class DataTypeSerDe { virtual Status deserialize_column_from_fixed_json(IColumn& column, Slice& slice, int rows, int* num_deserialized, const FormatOptions& options) const { + //In this function implementation, we need to consider the case where rows is 0, 1, and other larger integers. if (rows < 1) [[unlikely]] { return Status::OK(); } diff --git a/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp b/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp index 08ecb601f39941..37e82774c39ee4 100644 --- a/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp +++ b/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp @@ -654,6 +654,7 @@ Status RowGroupReader::_fill_partition_columns( auto _text_serde = slot_desc->get_data_type_ptr()->get_serde(); Slice slice(value.data(), value.size()); int num_deserialized = 0; + // Be careful when reading empty rows from parquet row groups. if (_text_serde->deserialize_column_from_fixed_json(*col_ptr, slice, rows, &num_deserialized, _text_formatOptions) != Status::OK()) {