From 5dfa577126f4fa06df4e764e8936b8de66cc6d0c Mon Sep 17 00:00:00 2001 From: changyuwei <2017501503@qq.com> Date: Sun, 8 Oct 2023 21:42:47 +0800 Subject: [PATCH 01/21] reserve decode_value. --- be/src/vec/common/cow.h | 2 +- be/src/vec/exec/format/convert.h | 528 ++++++++++++++++++ be/src/vec/exec/format/parquet/decoder.h | 2 +- .../parquet/fix_length_dict_decoder.hpp | 21 +- .../vec/exec/format/parquet/parquet_common.h | 8 +- .../parquet/vparquet_column_chunk_reader.cpp | 3 +- .../format/parquet/vparquet_column_reader.cpp | 12 +- .../format/parquet/vparquet_column_reader.h | 2 +- .../format/parquet/vparquet_group_reader.cpp | 15 +- .../exec/format/parquet/vparquet_reader.cpp | 90 ++- be/src/vec/exec/scan/scanner_context.cpp | 13 +- be/src/vec/exec/scan/scanner_scheduler.cpp | 6 +- be/src/vec/exec/scan/vfile_scanner.cpp | 14 +- be/src/vec/exprs/vectorized_fn_call.cpp | 3 +- 14 files changed, 669 insertions(+), 50 deletions(-) create mode 100644 be/src/vec/exec/format/convert.h diff --git a/be/src/vec/common/cow.h b/be/src/vec/common/cow.h index d3ab0ebd681af2..5cd701e6badd45 100644 --- a/be/src/vec/common/cow.h +++ b/be/src/vec/common/cow.h @@ -410,4 +410,4 @@ class COWHelper : public Base { MutablePtr shallow_mutate() const { return MutablePtr(static_cast(Base::shallow_mutate().get())); } -}; +}; \ No newline at end of file diff --git a/be/src/vec/exec/format/convert.h b/be/src/vec/exec/format/convert.h new file mode 100644 index 00000000000000..f294dc3ee6095e --- /dev/null +++ b/be/src/vec/exec/format/convert.h @@ -0,0 +1,528 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "common/status.h" +#include "io/file_factory.h" +#include "vec/data_types/data_type.h" +#include "vec/data_types/data_type_factory.hpp" +#include "vec/exec/format/parquet/parquet_common.h" +#include "gen_cpp/descriptors.pb.h" +#include "olap/olap_common.h" +#include "vec/columns/column_string.h" +#include "vec/columns/column_vector.h" +#include "vec/core/types.h" +#include "vec/data_types/data_type_nullable.h" +#include "vec/data_types/data_type_number.h" +#include "vec/data_types/data_type_string.h" + +namespace doris::vectorized { + +namespace convert { + +class DocTime { + +public: + std::unique_ptr _decode_params; + const FieldSchema * _field_schema; + void init_time(const FieldSchema *field_schema, cctz::time_zone* ctz) { + if (_decode_params == nullptr) { + _decode_params.reset(new DecodeParams()); + } + if (ctz != nullptr) { + _decode_params->ctz = ctz; + } + + _field_schema = field_schema; + const auto& schema = field_schema->parquet_schema; + if (schema.__isset.logicalType && schema.logicalType.__isset.TIMESTAMP) { + const auto& timestamp_info = schema.logicalType.TIMESTAMP; + if (!timestamp_info.isAdjustedToUTC) { + // should set timezone to utc+0 + _decode_params->ctz = const_cast(&_decode_params->utc0); + } + const auto& time_unit = timestamp_info.unit; + if (time_unit.__isset.MILLIS) { + _decode_params->second_mask = 1000; + _decode_params->scale_to_nano_factor = 1000000; + } else if (time_unit.__isset.MICROS) { + 
_decode_params->second_mask = 1000000; + _decode_params->scale_to_nano_factor = 1000; + } else if (time_unit.__isset.NANOS) { + _decode_params->second_mask = 1000000000; + _decode_params->scale_to_nano_factor = 1; + } + } else if (schema.__isset.converted_type) { + const auto& converted_type = schema.converted_type; + if (converted_type == tparquet::ConvertedType::TIMESTAMP_MILLIS) { + _decode_params->second_mask = 1000; + _decode_params->scale_to_nano_factor = 1000000; + } else if (converted_type == tparquet::ConvertedType::TIMESTAMP_MICROS) { + _decode_params->second_mask = 1000000; + _decode_params->scale_to_nano_factor = 1000; + } + } + + if (_decode_params->ctz) { + VecDateTimeValue t; + t.from_unixtime(0, *_decode_params->ctz); + _decode_params->offset_days = t.day() == 31 ? 0 : 1; + } + } + template + void init_decimal_converter(DataTypePtr& data_type) { + if (_decode_params == nullptr || _field_schema == nullptr || + _decode_params->decimal_scale.scale_type != DecimalScaleParams::NOT_INIT) { + return; + } + auto scale = _field_schema->parquet_schema.scale; + auto* decimal_type = reinterpret_cast*>( + const_cast(remove_nullable(data_type).get())); + auto dest_scale = decimal_type->get_scale(); + if (dest_scale > scale) { + _decode_params->decimal_scale.scale_type = DecimalScaleParams::SCALE_UP; + _decode_params->decimal_scale.scale_factor = + DecimalScaleParams::get_scale_factor(dest_scale - scale); + } else if (dest_scale < scale) { + _decode_params->decimal_scale.scale_type = DecimalScaleParams::SCALE_DOWN; + _decode_params->decimal_scale.scale_factor = + DecimalScaleParams::get_scale_factor(scale - dest_scale); + } else { + _decode_params->decimal_scale.scale_type = DecimalScaleParams::NO_SCALE; + _decode_params->decimal_scale.scale_factor = 1; + } + } + +}; + +static Status +convert_data_type_from_parquet(tparquet::Type::type parquet_type , + vectorized::DataTypePtr& ans_data_type , + DataTypePtr& src_type,bool * need_convert) { + std::cout << 
getTypeName(src_type->get_type_id()) <<"\n"; + if (is_complex_type(src_type)){ + *need_convert = false; + return Status::OK(); + } + switch (parquet_type) { + case tparquet::Type::type::BOOLEAN: + ans_data_type = std::make_shared(); + break; + case tparquet::Type::type::INT32: + ans_data_type = std::make_shared(); + break; + case tparquet::Type::type::INT64: + ans_data_type = std::make_shared(); + break; + case tparquet::Type::type::FLOAT: + ans_data_type = std::make_shared(); + break; + case tparquet::Type::type::DOUBLE: + ans_data_type = std::make_shared(); + break; + case tparquet::Type::type::BYTE_ARRAY: + case tparquet::Type::type::FIXED_LEN_BYTE_ARRAY: + ans_data_type = std::make_shared(); + break; + case tparquet::Type::type::INT96: + ans_data_type = std::make_shared(); + break; + default: + std::cout <<"--->"<get_type_id() == src_type->get_type_id()){ + *need_convert = false; + return Status::OK(); + } + if (src_type->is_nullable()){ + auto& nested_src_type= reinterpret_cast(src_type.get())->get_nested_type(); + std::cout << getTypeName(nested_src_type->get_type_id()) <<"\n"; + auto sub = ans_data_type; + ans_data_type = std::make_shared(ans_data_type); + + if (nested_src_type->get_type_id() == sub ->get_type_id()){ + *need_convert = false; + return Status::OK(); + } + } + + *need_convert = true; + return Status::OK(); +} + + + + +struct ColumnConvert { + Status virtual convert(const IColumn* src_col , IColumn* dst_col ){ + return Status::OK(); + } + virtual ~ColumnConvert() = default; +}; + +template +struct NumberColumnConvert : public ColumnConvert { + virtual Status convert(const IColumn* src_col , IColumn* dst_col ) override; + +}; +void +convert_null(const IColumn **src_col, IColumn **dst_col){ + size_t rows = (*src_col)->size(); + if ((*src_col)->is_nullable()) { + auto src_nullable_column = reinterpret_cast(*src_col); + auto dst_nullable_column = reinterpret_cast(*dst_col); + auto& dst_null_col = dst_nullable_column->get_null_map_column(); + + 
for(auto j =0;jget_null_map_column()[j]); + } + + *src_col = &src_nullable_column->get_nested_column(); + *dst_col = &dst_nullable_column->get_nested_column(); + } +} + + + +template +Status NumberColumnConvert::convert(const IColumn *src_col, IColumn *dst_col) { + size_t rows = src_col->size(); + if constexpr (is_nullable){ + convert_null(&src_col,&dst_col); + } + + + for(int i =0;i*>(src_col)->get_data()[i]; + + dst_type value = static_cast( + reinterpret_cast*>(src_col)->get_data()[i]); + + reinterpret_cast*>(dst_col)->insert(value); + + } + + + return Status::OK(); +} +template +struct NumberColumnToStringConvert : public ColumnConvert { + virtual Status convert(const IColumn* src_col , IColumn* dst_col ) override; +}; + +template +Status NumberColumnToStringConvert::convert(const IColumn *src_col, IColumn *dst_col) { + size_t rows = src_col->size(); + if constexpr (is_nullable){ + convert_null(&src_col,&dst_col); + } + + for(int i =0;i*>(src_col)->get_data()[i]); + reinterpret_cast(dst_col)->insert_data(value.data(),value.size()); + + } + return Status::OK(); +} + +template +struct int128totimestamp: public ColumnConvert { + int128totimestamp(DocTime *pTime) { + doc = pTime; + } + + inline uint64_t to_timestamp_micros(uint32_t hi , uint64_t lo) const { + return (hi - ParquetInt96::JULIAN_EPOCH_OFFSET_DAYS) * ParquetInt96::MICROS_IN_DAY + lo / ParquetInt96::NANOS_PER_MICROSECOND; + } + Status convert(const IColumn* src_col , IColumn* dst_col ) { + size_t rows = src_col->size(); + if constexpr (is_nullable){ + convert_null(&src_col,&dst_col); + } + + for(int i =0;i*>(src_col)->get_data()[i]; + uint32_t hi = x>>64; uint64_t lo = (x<<64)>>64 ; + dst_col = static_cast*>(dst_col); + reinterpret_cast*>(dst_col)->insert(0); + auto& num = static_cast*>(dst_col)->get_data()[i]; + auto &value = reinterpret_cast&>(num); + int64_t micros = to_timestamp_micros(hi,lo); + value.from_unixtime(micros / 1000000, *doc->_decode_params->ctz); + value.set_microsecond(micros % 
1000000); + std::cout << "value = " << value <<"\n"; + } + return Status::OK(); + } + DocTime *doc; +}; + +template +struct int64totimestamp: public ColumnConvert { + +public: + int64totimestamp(DocTime *pTime) { + doc = pTime; + } + + Status convert(const IColumn *src_col, IColumn *dst_col) { + size_t rows = src_col->size(); + if constexpr (is_nullable) { + convert_null(&src_col, &dst_col); + } + dst_col->resize(rows); + for (int i = 0; i < rows; i++) { + int64 x = reinterpret_cast *>(src_col)->get_data()[i]; + dst_col = static_cast *>(dst_col); +// reinterpret_cast*>(dst_col)->insert(0); + auto &num = static_cast *>(dst_col)->get_data()[i]; + auto &value = reinterpret_cast &>(num); + value.from_unixtime(x / doc->_decode_params->second_mask, *doc->_decode_params->ctz); + value.set_microsecond((x % doc->_decode_params->second_mask) * + doc->_decode_params->scale_to_nano_factor / 1000); + std::cout << "value = " << value << "\n"; + } + return Status::OK(); + } + + DocTime *doc; +}; + + +template +class int32todate : public ColumnConvert { +public: + DocTime *doc; + int32todate(DocTime *pTime) { + doc = pTime; + } + Status convert(const IColumn* src_col , IColumn* dst_col ) { + size_t rows = src_col->size(); + if constexpr (is_nullable){ + convert_null(&src_col,&dst_col); + } + dst_col->resize(rows); + for(int i = 0;i< rows;i++) { + +// auto& value = reinterpret_cast*>(src_col)->get_data()[i]; +// reinterpret_cast>(); + auto& value = reinterpret_cast &>(reinterpret_cast(dst_col)->get_data()[i]); +// value = reinterpret_cast*>(src_col)->get_data()[i]; + int64_t date_value = reinterpret_cast*>(src_col)->get_data()[i] + doc ->_decode_params->offset_days; + date_day_offset_dict& date_dict = date_day_offset_dict::get(); + value = date_dict[date_value]; + } + + return Status::OK(); + } +}; + +template< typename DecimalType , bool is_nullable> +class stringtodecimal : public ColumnConvert { +public: + DocTime *doc; + stringtodecimal(DocTime *pTime) { + doc = pTime; + } + 
Status convert(const IColumn* src_col , IColumn* dst_col ) { + size_t rows = src_col->size(); + if constexpr (is_nullable){ + convert_null(&src_col,&dst_col); + } + DecimalScaleParams& scale_params = doc->_decode_params->decimal_scale; + auto buf = static_cast(src_col)->get_chars().data(); + auto& offset = static_cast(src_col)->get_offsets(); + dst_col->resize(rows); + auto& data = static_cast*>(dst_col)->get_data(); + for(int i = 0;i< rows;i++) { + int len = offset[i] - offset[i-1]; + Int128 value = buf[offset[i-1]] & 0x80 ? -1 : 0; + memcpy(reinterpret_cast(&value) + sizeof(Int128) - len, buf+offset[i-1], + len); + value = BigEndian::ToHost128(value); + if (scale_params.scale_type == DecimalScaleParams::SCALE_UP) { + value *= scale_params.scale_factor; + } else if (scale_params.scale_type == DecimalScaleParams::SCALE_DOWN) { + value /= scale_params.scale_factor; + } + auto& v = reinterpret_cast(data[i]); + v = (DecimalType)value; + } + + return Status::OK(); + } +}; +template< typename NumberType, typename DecimalPhysicalType, bool is_nullable> +class numbertodecimal: public ColumnConvert { + + DocTime *doc; +public: + Status convert(const IColumn* src_col , IColumn* dst_col ) { + size_t rows = src_col->size(); + if constexpr (is_nullable){ + convert_null(&src_col,&dst_col); + } + auto* src_data = static_cast*>(src_col)->get_data().data(); + dst_col->resize(rows); + DecimalScaleParams& scale_params = doc->_decode_params->decimal_scale; + auto* data = static_cast< ColumnDecimal>*>(dst_col)->get_data().data(); + + for(int i = 0;i < rows;i++) { + Int128 value = src_data[i]; + if (scale_params.scale_type == DecimalScaleParams::SCALE_UP) { + value *= scale_params.scale_factor; + } else if (scale_params.scale_type == DecimalScaleParams::SCALE_DOWN) { + value /= scale_params.scale_factor; + } + data[i] = (DecimalPhysicalType)value; + } + return Status::OK(); + } + +public: + numbertodecimal(DocTime *pTime) { + doc = pTime; + } +}; +/* + * Int128 value = 
*reinterpret_cast(buf_start); + if (scale_params.scale_type == DecimalScaleParams::SCALE_UP) { + value *= scale_params.scale_factor; + } else if (scale_params.scale_type == DecimalScaleParams::SCALE_DOWN) { + value /= scale_params.scale_factor; + } + auto& v = reinterpret_cast(column_data[data_index++]); + v = (DecimalPrimitiveType)value; + * + * + */ + +template +static +Status get_converter_impl(std::shared_ptr src_data_type , std::shared_ptr dst_data_type, + std::unique_ptr * converter, + DocTime& doc[[maybe_unused]] ){ + auto src_type = src_data_type -> get_type_id(); + auto dst_type = dst_data_type -> get_type_id(); + + switch (dst_type) { +#define DISPATCH(NUMERIC_TYPE, CPP_NUMERIC_TYPE, PHYSICAL_TYPE) \ + case NUMERIC_TYPE: \ + switch(src_type){ \ + case TypeIndex::UInt8: \ + *converter = std::make_unique>(); \ + break; \ + case TypeIndex::Int32: \ + *converter = std::make_unique>(); \ + break; \ + case TypeIndex::Int64: \ + *converter = std::make_unique>(); \ + break; \ + case TypeIndex::Float32: \ + *converter = std::make_unique>(); \ + break; \ + case TypeIndex::Float64: \ + *converter = std::make_unique>(); \ + break; \ + case TypeIndex::Int128: \ + *converter = std::make_unique>(); \ + break; \ + default: \ + break; \ + } \ + break; + FOR_LOGICAL_NUMERIC_TYPES(DISPATCH) +#undef DISPATCH + + case TypeIndex::String: + switch (src_type) { +#define DISPATCH1(NUMERIC_TYPE, CPP_NUMERIC_TYPE, PHYSICAL_TYPE) \ + case NUMERIC_TYPE: \ + *converter = std::make_unique>(); \ + break; + FOR_LOGICAL_NUMERIC_TYPES(DISPATCH1) +#undef DISPATCH1 + default: + break; + } + break; + case TypeIndex::DateV2: + if (src_type == TypeIndex::Int32) { + *converter = std::make_unique>(&doc); + } + break; + case TypeIndex::DateTimeV2: + if (src_type == TypeIndex::Int128) { + *converter = std::make_unique>(&doc); + }else if (src_type == TypeIndex::Int64) { + *converter = std::make_unique>(&doc); + } + break; + case TypeIndex::Decimal64: + if (src_type == TypeIndex::Int128) { + 
*converter = std::make_unique>(&doc); + } else if (src_type == TypeIndex::String) { + doc.init_decimal_converter(dst_data_type); + *converter = std::make_unique>(&doc); + } else if (src_type == TypeIndex::Int32) { + *converter = std::make_unique>(&doc); + } else if (src_type == TypeIndex::Int64) { + *converter = std::make_unique>(&doc); + } + break; + default: + break; + } + + if (converter->get() == nullptr){ + return Status::NotSupported("Can't cast type {} to type {}", + getTypeName(src_type ), getTypeName(dst_type)); + } + + return Status::OK(); +} + +static +Status get_converter(std::shared_ptr src_type , std::shared_ptr dst_type , + std::unique_ptr * converter ,DocTime&doc){ + + if (src_type->is_nullable()){ + + auto src = reinterpret_cast (src_type.get())->get_nested_type(); + auto dst = reinterpret_cast (dst_type.get())->get_nested_type(); + + return get_converter_impl(src,dst,converter,doc); + }else { + return get_converter_impl(src_type, dst_type,converter,doc); + } + return Status::OK(); +} +}; + + +} \ No newline at end of file diff --git a/be/src/vec/exec/format/parquet/decoder.h b/be/src/vec/exec/format/parquet/decoder.h index acd9965bad8b7a..02440a2484d1bf 100644 --- a/be/src/vec/exec/format/parquet/decoder.h +++ b/be/src/vec/exec/format/parquet/decoder.h @@ -220,4 +220,4 @@ class BaseDictDecoder : public Decoder { std::vector _indexes; }; -} // namespace doris::vectorized +} // namespace doris::vectorized \ No newline at end of file diff --git a/be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp b/be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp index 35880cfcdd3080..dad33664252d0f 100644 --- a/be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp +++ b/be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp @@ -82,6 +82,9 @@ class FixLengthDictDecoder final : public BaseDictDecoder { } FOR_LOGICAL_NUMERIC_TYPES(DISPATCH) #undef DISPATCH + case TypeIndex::Int128: + return _decode_numeric< Int128 ,T, 
has_filter>(doris_column, select_vector); + break; case TypeIndex::Date: if constexpr (std::is_same_v) { return _decode_date(doris_column, @@ -158,10 +161,11 @@ class FixLengthDictDecoder final : public BaseDictDecoder { default: break; } - return Status::InvalidArgument( "Can't decode parquet physical type {} to doris logical type {}", tparquet::to_string(_physical_type), getTypeName(logical_type)); + + return Status::OK(); } Status set_dict(std::unique_ptr& dict, int32_t length, size_t num_values) override { @@ -190,9 +194,16 @@ class FixLengthDictDecoder final : public BaseDictDecoder { switch (read_type) { case ColumnSelectVector::CONTENT: { for (size_t i = 0; i < run_length; ++i) { - column_data[data_index++] = - static_cast(_dict_items[_indexes[dict_index++]]); - } + if constexpr (std::is_same_v ){ + ParquetInt96 value = static_cast(_dict_items[_indexes[dict_index++]]); + column_data[data_index++] = value.to_int128(); + + } else { + column_data[data_index++] = + static_cast(_dict_items[_indexes[dict_index++]]); + + } + } break; } case ColumnSelectVector::NULL_DATA: { @@ -739,4 +750,4 @@ class FixLengthDictDecoder final : public BaseDictDecoder { } }; -} // namespace doris::vectorized +} // namespace doris::vectorized \ No newline at end of file diff --git a/be/src/vec/exec/format/parquet/parquet_common.h b/be/src/vec/exec/format/parquet/parquet_common.h index 0a4278ae67804a..1b7b66be07af1c 100644 --- a/be/src/vec/exec/format/parquet/parquet_common.h +++ b/be/src/vec/exec/format/parquet/parquet_common.h @@ -54,6 +54,12 @@ struct ParquetInt96 { inline uint64_t to_timestamp_micros() const { return (hi - JULIAN_EPOCH_OFFSET_DAYS) * MICROS_IN_DAY + lo / NANOS_PER_MICROSECOND; } + inline __int128 to_int128() const { + __int128 ans = 0 ; + std::cout <<"before ""hi = "<(encoding)].get(); } else { std::unique_ptr page_decoder; +// std::cout <<"type = "<<_metadata.type <<" "<< encoding <<"\n"; RETURN_IF_ERROR(Decoder::get_decoder(_metadata.type, encoding, 
page_decoder)); // Set type length page_decoder->set_type_length(_get_type_length()); @@ -323,4 +324,4 @@ int32_t ColumnChunkReader::_get_type_length() { return -1; } } -} // namespace doris::vectorized +} // namespace doris::vectorized \ No newline at end of file diff --git a/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp b/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp index 4143a5e0797951..f66e648690342b 100644 --- a/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp +++ b/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp @@ -121,6 +121,8 @@ Status ParquetColumnReader::create(io::FileReaderSPtr file, FieldSchema* field, io::IOContext* io_ctx, std::unique_ptr& reader, size_t max_buf_size) { + + if (field->type.type == TYPE_ARRAY) { std::unique_ptr element_reader; RETURN_IF_ERROR(create(file, &field->children[0], row_group, row_ranges, ctz, io_ctx, @@ -252,8 +254,10 @@ Status ScalarColumnReader::_read_values(size_t num_values, ColumnPtr& doris_colu NullMap* map_data_column = nullptr; if (doris_column->is_nullable()) { SCOPED_RAW_TIMER(&_decode_null_map_time); - auto* nullable_column = reinterpret_cast( - (*std::move(doris_column)).mutate().get()); +// auto* nullable_column = reinterpret_cast( +// (*std::move(doris_column)).mutate().get()); + auto* nullable_column = const_cast(reinterpret_cast(doris_column.get())); + data_column = nullable_column->get_nested_column_ptr(); map_data_column = &(nullable_column->get_null_map_data()); if (_chunk_reader->max_def_level() > 0) { @@ -442,7 +446,7 @@ Status ScalarColumnReader::read_dict_values_to_column(MutableColumnPtr& doris_co bool* has_dict) { bool loaded; RETURN_IF_ERROR(_try_load_dict_page(&loaded, has_dict)); - if (loaded && has_dict) { + if (loaded && *has_dict) {//todo(cyw) has_dist ???? 
return _chunk_reader->read_dict_values_to_column(doris_column); } return Status::OK(); @@ -732,4 +736,4 @@ Status StructColumnReader::read_column_data(ColumnPtr& doris_column, DataTypePtr return Status::OK(); } -}; // namespace doris::vectorized +}; // namespace doris::vectorized \ No newline at end of file diff --git a/be/src/vec/exec/format/parquet/vparquet_column_reader.h b/be/src/vec/exec/format/parquet/vparquet_column_reader.h index f4973b4b4e2efd..f8061d04857887 100644 --- a/be/src/vec/exec/format/parquet/vparquet_column_reader.h +++ b/be/src/vec/exec/format/parquet/vparquet_column_reader.h @@ -288,4 +288,4 @@ class StructColumnReader : public ParquetColumnReader { std::vector> _child_readers; }; -}; // namespace doris::vectorized +}; // namespace doris::vectorized \ No newline at end of file diff --git a/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp b/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp index 14d6e00dbd9fa6..1ded5e4a779459 100644 --- a/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp +++ b/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp @@ -129,7 +129,7 @@ Status RowGroupReader::init( std::unique_ptr reader; RETURN_IF_ERROR(ParquetColumnReader::create(_file_reader, field, _row_group_meta, _read_ranges, _ctz, _io_ctx, reader, - max_buf_size)); + max_buf_size));//create column reader ..... 
if (reader == nullptr) { VLOG_DEBUG << "Init row group(" << _row_group_id << ") reader failed"; return Status::Corruption("Init row group reader failed"); @@ -183,7 +183,8 @@ bool RowGroupReader::_can_filter_by_dict(int slot_id, break; } } - if (!slot->type().is_string_type()) { + if (slot != nullptr){ +// if (!slot->type().is_string_type()) {//TODO(CYW) : check use file metadata column_metadata.type return false; } @@ -320,7 +321,7 @@ Status RowGroupReader::next_batch(Block* block, size_t batch_size, size_t* read_ } RETURN_IF_ERROR(_build_pos_delete_filter(*read_rows)); - +/* std::vector columns_to_filter; int column_to_keep = block->columns(); columns_to_filter.resize(column_to_keep); @@ -336,6 +337,9 @@ Status RowGroupReader::next_batch(Block* block, size_t batch_size, size_t* read_ bool can_filter_all = false; RETURN_IF_ERROR_OR_CATCH_EXCEPTION(VExprContext::execute_conjuncts( _filter_conjuncts, &filters, block, &result_filter, &can_filter_all)); + // => select col where col = '1' => col1 ,converted col , '1' col + + //filter all data if (can_filter_all) { for (auto& col : columns_to_filter) { std::move(*block->get_by_position(col).column).assume_mutable()->clear(); @@ -344,6 +348,7 @@ Status RowGroupReader::next_batch(Block* block, size_t batch_size, size_t* read_ _convert_dict_cols_to_string_cols(block); return Status::OK(); } + _pre_conjunct_ctxs if (!_not_single_slot_filter_conjuncts.empty()) { _convert_dict_cols_to_string_cols(block); std::vector merged_filters; @@ -362,7 +367,7 @@ Status RowGroupReader::next_batch(Block* block, size_t batch_size, size_t* read_ RETURN_IF_CATCH_EXCEPTION( RETURN_IF_ERROR(_filter_block(block, column_to_keep, columns_to_filter))); } - +*/ *read_rows = block->rows(); return Status::OK(); } @@ -1008,4 +1013,4 @@ ParquetColumnReader::Statistics RowGroupReader::statistics() { return st; } -} // namespace doris::vectorized +} // namespace doris::vectorized \ No newline at end of file diff --git 
a/be/src/vec/exec/format/parquet/vparquet_reader.cpp b/be/src/vec/exec/format/parquet/vparquet_reader.cpp index 25c1f46aed6e1b..3a52e4d2146bd0 100644 --- a/be/src/vec/exec/format/parquet/vparquet_reader.cpp +++ b/be/src/vec/exec/format/parquet/vparquet_reader.cpp @@ -43,10 +43,34 @@ #include "vec/exec/format/parquet/vparquet_page_index.h" #include "vec/exprs/vbloom_predicate.h" #include "vec/exprs/vexpr.h" -#include "vec/exprs/vexpr_context.h" #include "vec/exprs/vin_predicate.h" #include "vec/exprs/vruntimefilter_wrapper.h" #include "vec/exprs/vslot_ref.h" +#include "exec/schema_scanner.h" +#include "gtest/gtest_pred_impl.h" +#include "io/fs/buffered_reader.h" +#include "io/fs/file_reader.h" +#include "io/fs/file_reader_writer_fwd.h" +#include "runtime/descriptors.h" +#include "util/timezone_utils.h" +#include "vec/aggregate_functions/aggregate_function.h" +#include "vec/columns/column.h" +#include "vec/columns/column_nullable.h" +#include "vec/common/string_ref.h" +#include "vec/core/block.h" +#include "vec/core/column_with_type_and_name.h" +#include "vec/data_types/data_type.h" +#include "vec/data_types/data_type_factory.hpp" +#include "vec/exec/format/parquet/parquet_common.h" +#include "gen_cpp/descriptors.pb.h" +#include "olap/olap_common.h" +#include "vec/columns/column_string.h" +#include "vec/columns/column_vector.h" +#include "vec/core/types.h" +#include "vec/data_types/data_type_nullable.h" +#include "vec/data_types/data_type_number.h" +#include "vec/data_types/data_type_string.h" +#include "vec/exec/format/convert.h" namespace cctz { class time_zone; @@ -514,33 +538,61 @@ Status ParquetReader::get_next_block(Block* block, size_t* read_rows, bool* eof) } } DCHECK(_current_group_reader != nullptr); - if (_push_down_agg_type == TPushAggOp::type::COUNT) { - auto rows = std::min(_current_group_reader->get_remaining_rows(), (int64_t)_batch_size); - _current_group_reader->set_remaining_rows(_current_group_reader->get_remaining_rows() - - rows); - - for (auto& 
col : block->mutate_columns()) { - col->resize(rows); - } - - *read_rows = rows; - if (_current_group_reader->get_remaining_rows() == 0) { - _current_group_reader.reset(nullptr); - } - - return Status::OK(); - } { + BlockUPtr src_block ; + std::map need_convert; + { +// std::cout <<"->"; +// for(auto i =0; i < block->columns();i++ ){ +// std::cout << block->get_columns()[i]->get_name()<<" "; +// } +// std::cout <<"\n"; + + vector v; + for (auto &col_name: block->get_names()) { + vectorized::DataTypePtr data_type; + tparquet::Type::type parquet_type = _file_metadata->schema().get_column(col_name)->physical_type; + bool conv = false; + convert::convert_data_type_from_parquet(parquet_type, + data_type,block->get_by_name(col_name).type,&conv); + std::cout << col_name <<"->"<get_by_name(col_name).column)).mutate(),data_type,col_name ); + v.emplace_back( + block->get_by_name(col_name).column->assume_mutable(), + data_type, col_name); + } + } + src_block = vectorized::Block::create_unique(v); + } SCOPED_RAW_TIMER(&_statistics.column_read_time); Status batch_st = - _current_group_reader->next_batch(block, _batch_size, read_rows, &_row_group_eof); + _current_group_reader->next_batch(src_block.get(), _batch_size, read_rows, &_row_group_eof); if (!batch_st.ok()) { return Status::InternalError("Read parquet file {} failed, reason = {}", _scan_range.path, batch_st.to_string()); } + + //convert + for(auto i =0; i < block->columns();i++ ){ + std::cout <<"colname = " << block->get_names()[i] <<" "<get_names()[i]] <<"\n"; + if (need_convert[block->get_names()[i]]){ + std::unique_ptr converter(nullptr); + convert::DocTime doc; +// auto x = + doc.init_time(_file_metadata->schema().get_column(i),_ctz); + RETURN_IF_ERROR(convert::get_converter(src_block->get_data_type(i),block->get_data_type(i),&converter,doc)); +// block->get_columns()[i]=src_block->get_columns()[i]; + converter->convert(src_block->get_columns()[i].get(), const_cast(block->get_columns()[i].get())); + } + } } + if 
(_row_group_eof) { auto column_st = _current_group_reader->statistics(); _column_statistics.merge(column_st); @@ -909,4 +961,4 @@ int64_t ParquetReader::_get_column_start_offset(const tparquet::ColumnMetaData& } return column.data_page_offset; } -} // namespace doris::vectorized +} // namespace doris::vectorized \ No newline at end of file diff --git a/be/src/vec/exec/scan/scanner_context.cpp b/be/src/vec/exec/scan/scanner_context.cpp index e1c29d569aa27f..63028537e11555 100644 --- a/be/src/vec/exec/scan/scanner_context.cpp +++ b/be/src/vec/exec/scan/scanner_context.cpp @@ -168,6 +168,17 @@ vectorized::BlockUPtr ScannerContext::get_free_block() { block = vectorized::Block::create_unique(_output_tuple_desc->slots(), _batch_size, true /*ignore invalid slots*/); + + +// vector v; +// vectorized::DataTypePtr a = std::make_shared(std::make_shared()); +// v.push_back( ColumnWithTypeAndName(a,"id")); +// vectorized::DataTypePtr b = std::make_shared(std::make_shared()); +// v.push_back( ColumnWithTypeAndName(b,"age")); +// block = vectorized::Block::create_unique(v); +// block->set_num_rows(_batch_size); + + COUNTER_UPDATE(_newly_create_free_blocks_num, 1); _serving_blocks_num++; @@ -521,4 +532,4 @@ template void ScannerContext::clear_and_join(pipeline::ScanLocalStateBase* paren RuntimeState* state); template void ScannerContext::clear_and_join(VScanNode* parent, RuntimeState* state); -} // namespace doris::vectorized +} // namespace doris::vectorized \ No newline at end of file diff --git a/be/src/vec/exec/scan/scanner_scheduler.cpp b/be/src/vec/exec/scan/scanner_scheduler.cpp index 8ebb6405bd8521..144454dda00778 100644 --- a/be/src/vec/exec/scan/scanner_scheduler.cpp +++ b/be/src/vec/exec/scan/scanner_scheduler.cpp @@ -389,9 +389,9 @@ void ScannerScheduler::_scanner_scan(ScannerScheduler* scheduler, ScannerContext break; } - BlockUPtr block = ctx->get_free_block(); + BlockUPtr block = ctx->get_free_block();//create block <- _output_tuple_desc / 想要的结果 - status = 
scanner->get_block(state, block.get(), &eos); + status = scanner->get_block(state, block.get(), &eos);//init reader ,read data VLOG_ROW << "VScanNode input rows: " << block->rows() << ", eos: " << eos; // The VFileScanner for external table may try to open not exist files, // Because FE file cache for external table may out of date. @@ -464,4 +464,4 @@ void ScannerScheduler::_task_group_scanner_scan(ScannerScheduler* scheduler, } } -} // namespace doris::vectorized +} // namespace doris::vectorized \ No newline at end of file diff --git a/be/src/vec/exec/scan/vfile_scanner.cpp b/be/src/vec/exec/scan/vfile_scanner.cpp index 1da53c114e90c8..055e57148ccc35 100644 --- a/be/src/vec/exec/scan/vfile_scanner.cpp +++ b/be/src/vec/exec/scan/vfile_scanner.cpp @@ -289,7 +289,7 @@ Status VFileScanner::open(RuntimeState* state) { Status VFileScanner::_get_block_impl(RuntimeState* state, Block* block, bool* eof) { do { if (_cur_reader == nullptr || _cur_reader_eof) { - RETURN_IF_ERROR(_get_next_reader()); + RETURN_IF_ERROR(_get_next_reader());//init parquet reader } if (_scanner_eof) { @@ -577,7 +577,7 @@ Status VFileScanner::_convert_to_output_block(Block* block) { const ColumnNullable* nullable_column = reinterpret_cast(column_ptr.get()); for (int i = 0; i < rows; ++i) { - if (filter_map[i] && nullable_column->is_null_at(i)) { + if (filter_map[i] && nullable_column->is_null_at(i)) {//in load , error case if (_strict_mode && (_src_slot_descs_order_by_dest[dest_index]) && !_src_block_ptr->get_by_position(_dest_slot_to_src_slot_index[dest_index]) .column->is_null_at(i)) { @@ -706,7 +706,7 @@ Status VFileScanner::_get_next_reader() { _state->update_num_finished_scan_range(1); return Status::OK(); } - if (_next_range != 0) { + if (_next_range != 0) { _state->update_num_finished_scan_range(1); } @@ -764,14 +764,14 @@ Status VFileScanner::_get_next_reader() { _state->query_options().enable_parquet_lazy_mat); { SCOPED_TIMER(_open_reader_timer); - 
RETURN_IF_ERROR(parquet_reader->open()); + RETURN_IF_ERROR(parquet_reader->open());//read file_schema } if (push_down_predicates && _push_down_conjuncts.empty() && !_conjuncts.empty()) { _push_down_conjuncts.resize(_conjuncts.size()); for (size_t i = 0; i != _conjuncts.size(); ++i) { RETURN_IF_ERROR(_conjuncts[i]->clone(_state, _push_down_conjuncts[i])); } - _discard_conjuncts(); + //_discard_conjuncts(); } if (range.__isset.table_format_params && range.table_format_params.table_format_type == "iceberg") { @@ -792,7 +792,7 @@ Status VFileScanner::_get_next_reader() { _file_col_names, place_holder, _colname_to_value_range, _push_down_conjuncts, _real_tuple_desc, _default_val_row_desc.get(), _col_name_to_slot_id, &_not_single_slot_filter_conjuncts, - &_slot_id_to_filter_conjuncts); + &_slot_id_to_filter_conjuncts);//init parquet reader <- select column / filter / value_range min max _cur_reader = std::move(parquet_reader); } need_to_get_parsed_schema = true; @@ -1105,4 +1105,4 @@ Status VFileScanner::close(RuntimeState* state) { return Status::OK(); } -} // namespace doris::vectorized +} // namespace doris::vectorized \ No newline at end of file diff --git a/be/src/vec/exprs/vectorized_fn_call.cpp b/be/src/vec/exprs/vectorized_fn_call.cpp index 58083a486f60e5..3db8e47e8c53f9 100644 --- a/be/src/vec/exprs/vectorized_fn_call.cpp +++ b/be/src/vec/exprs/vectorized_fn_call.cpp @@ -155,6 +155,7 @@ Status VectorizedFnCall::execute(VExprContext* context, vectorized::Block* block // if not find fast execute result column, means do not need check fast execute again _can_fast_execute = fast_execute(context->fn_context(_fn_context_index), *block, arguments, num_columns_without_result, block->rows()); + // insert be converted column to block 向block插入转换后的列 if (_can_fast_execute) { *result_column_id = num_columns_without_result; return Status::OK(); @@ -225,4 +226,4 @@ std::string VectorizedFnCall::debug_string(const std::vector& out << "]"; return out.str(); } -} // namespace 
doris::vectorized +} // namespace doris::vectorized \ No newline at end of file From 632ce64ccb508050ec952be4f985f95344d62ddc Mon Sep 17 00:00:00 2001 From: changyuwei <2017501503@qq.com> Date: Sun, 8 Oct 2023 21:44:29 +0800 Subject: [PATCH 02/21] fix format. --- be/src/vec/exec/format/convert.h | 402 +++++++++--------- .../parquet/fix_length_dict_decoder.hpp | 10 +- .../vec/exec/format/parquet/parquet_common.h | 6 +- .../parquet/vparquet_column_chunk_reader.cpp | 2 +- .../format/parquet/vparquet_column_reader.cpp | 11 +- .../format/parquet/vparquet_group_reader.cpp | 8 +- .../exec/format/parquet/vparquet_reader.cpp | 108 ++--- be/src/vec/exec/scan/scanner_context.cpp | 16 +- be/src/vec/exec/scan/scanner_scheduler.cpp | 4 +- be/src/vec/exec/scan/vfile_scanner.cpp | 10 +- 10 files changed, 281 insertions(+), 296 deletions(-) diff --git a/be/src/vec/exec/format/convert.h b/be/src/vec/exec/format/convert.h index f294dc3ee6095e..b4d1ab46b928da 100644 --- a/be/src/vec/exec/format/convert.h +++ b/be/src/vec/exec/format/convert.h @@ -27,29 +27,28 @@ #include #include "common/status.h" -#include "io/file_factory.h" -#include "vec/data_types/data_type.h" -#include "vec/data_types/data_type_factory.hpp" -#include "vec/exec/format/parquet/parquet_common.h" #include "gen_cpp/descriptors.pb.h" +#include "io/file_factory.h" #include "olap/olap_common.h" #include "vec/columns/column_string.h" #include "vec/columns/column_vector.h" #include "vec/core/types.h" +#include "vec/data_types/data_type.h" +#include "vec/data_types/data_type_factory.hpp" #include "vec/data_types/data_type_nullable.h" #include "vec/data_types/data_type_number.h" #include "vec/data_types/data_type_string.h" +#include "vec/exec/format/parquet/parquet_common.h" namespace doris::vectorized { namespace convert { class DocTime { - public: std::unique_ptr _decode_params; - const FieldSchema * _field_schema; - void init_time(const FieldSchema *field_schema, cctz::time_zone* ctz) { + const FieldSchema* 
_field_schema; + void init_time(const FieldSchema* field_schema, cctz::time_zone* ctz) { if (_decode_params == nullptr) { _decode_params.reset(new DecodeParams()); } @@ -57,7 +56,7 @@ class DocTime { _decode_params->ctz = ctz; } - _field_schema = field_schema; + _field_schema = field_schema; const auto& schema = field_schema->parquet_schema; if (schema.__isset.logicalType && schema.logicalType.__isset.TIMESTAMP) { const auto& timestamp_info = schema.logicalType.TIMESTAMP; @@ -93,7 +92,7 @@ class DocTime { _decode_params->offset_days = t.day() == 31 ? 0 : 1; } } - template + template void init_decimal_converter(DataTypePtr& data_type) { if (_decode_params == nullptr || _field_schema == nullptr || _decode_params->decimal_scale.scale_type != DecimalScaleParams::NOT_INIT) { @@ -116,56 +115,55 @@ class DocTime { _decode_params->decimal_scale.scale_factor = 1; } } - }; -static Status -convert_data_type_from_parquet(tparquet::Type::type parquet_type , - vectorized::DataTypePtr& ans_data_type , - DataTypePtr& src_type,bool * need_convert) { - std::cout << getTypeName(src_type->get_type_id()) <<"\n"; - if (is_complex_type(src_type)){ +static Status convert_data_type_from_parquet(tparquet::Type::type parquet_type, + vectorized::DataTypePtr& ans_data_type, + DataTypePtr& src_type, bool* need_convert) { + std::cout << getTypeName(src_type->get_type_id()) << "\n"; + if (is_complex_type(src_type)) { *need_convert = false; return Status::OK(); } switch (parquet_type) { - case tparquet::Type::type::BOOLEAN: - ans_data_type = std::make_shared(); - break; - case tparquet::Type::type::INT32: - ans_data_type = std::make_shared(); - break; - case tparquet::Type::type::INT64: - ans_data_type = std::make_shared(); - break; - case tparquet::Type::type::FLOAT: - ans_data_type = std::make_shared(); - break; - case tparquet::Type::type::DOUBLE: - ans_data_type = std::make_shared(); - break; - case tparquet::Type::type::BYTE_ARRAY: - case tparquet::Type::type::FIXED_LEN_BYTE_ARRAY: - 
ans_data_type = std::make_shared(); - break; - case tparquet::Type::type::INT96: - ans_data_type = std::make_shared(); - break; - default: - std::cout <<"--->"<(); + break; + case tparquet::Type::type::INT32: + ans_data_type = std::make_shared(); + break; + case tparquet::Type::type::INT64: + ans_data_type = std::make_shared(); + break; + case tparquet::Type::type::FLOAT: + ans_data_type = std::make_shared(); + break; + case tparquet::Type::type::DOUBLE: + ans_data_type = std::make_shared(); + break; + case tparquet::Type::type::BYTE_ARRAY: + case tparquet::Type::type::FIXED_LEN_BYTE_ARRAY: + ans_data_type = std::make_shared(); + break; + case tparquet::Type::type::INT96: + ans_data_type = std::make_shared(); + break; + default: + std::cout << "--->" << parquet_type << "\n"; + break; } - if (ans_data_type->get_type_id() == src_type->get_type_id()){ + if (ans_data_type->get_type_id() == src_type->get_type_id()) { *need_convert = false; return Status::OK(); } - if (src_type->is_nullable()){ - auto& nested_src_type= reinterpret_cast(src_type.get())->get_nested_type(); - std::cout << getTypeName(nested_src_type->get_type_id()) <<"\n"; + if (src_type->is_nullable()) { + auto& nested_src_type = + reinterpret_cast(src_type.get())->get_nested_type(); + std::cout << getTypeName(nested_src_type->get_type_id()) << "\n"; auto sub = ans_data_type; ans_data_type = std::make_shared(ans_data_type); - if (nested_src_type->get_type_id() == sub ->get_type_id()){ + if (nested_src_type->get_type_id() == sub->get_type_id()) { *need_convert = false; return Status::OK(); } @@ -175,30 +173,23 @@ convert_data_type_from_parquet(tparquet::Type::type parquet_type , return Status::OK(); } - - - struct ColumnConvert { - Status virtual convert(const IColumn* src_col , IColumn* dst_col ){ - return Status::OK(); - } + Status virtual convert(const IColumn* src_col, IColumn* dst_col) { return Status::OK(); } virtual ~ColumnConvert() = default; }; -template +template struct NumberColumnConvert : 
public ColumnConvert { - virtual Status convert(const IColumn* src_col , IColumn* dst_col ) override; - + virtual Status convert(const IColumn* src_col, IColumn* dst_col) override; }; -void -convert_null(const IColumn **src_col, IColumn **dst_col){ +void convert_null(const IColumn** src_col, IColumn** dst_col) { size_t rows = (*src_col)->size(); if ((*src_col)->is_nullable()) { - auto src_nullable_column = reinterpret_cast(*src_col); - auto dst_nullable_column = reinterpret_cast(*dst_col); + auto src_nullable_column = reinterpret_cast(*src_col); + auto dst_nullable_column = reinterpret_cast(*dst_col); auto& dst_null_col = dst_nullable_column->get_null_map_column(); - for(auto j =0;jget_null_map_column()[j]); } @@ -207,101 +198,95 @@ convert_null(const IColumn **src_col, IColumn **dst_col){ } } - - -template -Status NumberColumnConvert::convert(const IColumn *src_col, IColumn *dst_col) { +template +Status NumberColumnConvert::convert(const IColumn* src_col, + IColumn* dst_col) { size_t rows = src_col->size(); - if constexpr (is_nullable){ - convert_null(&src_col,&dst_col); + if constexpr (is_nullable) { + convert_null(&src_col, &dst_col); } - - for(int i =0;i*>(src_col)->get_data()[i]; + for (int i = 0; i < rows; i++) { + // src_type src_value = reinterpret_cast*>(src_col)->get_data()[i]; dst_type value = static_cast( reinterpret_cast*>(src_col)->get_data()[i]); reinterpret_cast*>(dst_col)->insert(value); - } - return Status::OK(); } -template +template struct NumberColumnToStringConvert : public ColumnConvert { - virtual Status convert(const IColumn* src_col , IColumn* dst_col ) override; + virtual Status convert(const IColumn* src_col, IColumn* dst_col) override; }; -template -Status NumberColumnToStringConvert::convert(const IColumn *src_col, IColumn *dst_col) { +template +Status NumberColumnToStringConvert::convert(const IColumn* src_col, + IColumn* dst_col) { size_t rows = src_col->size(); - if constexpr (is_nullable){ - convert_null(&src_col,&dst_col); + if 
constexpr (is_nullable) { + convert_null(&src_col, &dst_col); } - for(int i =0;i*>(src_col)->get_data()[i]); - reinterpret_cast(dst_col)->insert_data(value.data(),value.size()); - + for (int i = 0; i < rows; i++) { + std::string value = std::to_string( + reinterpret_cast*>(src_col)->get_data()[i]); + reinterpret_cast(dst_col)->insert_data(value.data(), value.size()); } return Status::OK(); } -template -struct int128totimestamp: public ColumnConvert { - int128totimestamp(DocTime *pTime) { - doc = pTime; - } +template +struct int128totimestamp : public ColumnConvert { + int128totimestamp(DocTime* pTime) { doc = pTime; } - inline uint64_t to_timestamp_micros(uint32_t hi , uint64_t lo) const { - return (hi - ParquetInt96::JULIAN_EPOCH_OFFSET_DAYS) * ParquetInt96::MICROS_IN_DAY + lo / ParquetInt96::NANOS_PER_MICROSECOND; + inline uint64_t to_timestamp_micros(uint32_t hi, uint64_t lo) const { + return (hi - ParquetInt96::JULIAN_EPOCH_OFFSET_DAYS) * ParquetInt96::MICROS_IN_DAY + + lo / ParquetInt96::NANOS_PER_MICROSECOND; } - Status convert(const IColumn* src_col , IColumn* dst_col ) { + Status convert(const IColumn* src_col, IColumn* dst_col) { size_t rows = src_col->size(); - if constexpr (is_nullable){ - convert_null(&src_col,&dst_col); + if constexpr (is_nullable) { + convert_null(&src_col, &dst_col); } - for(int i =0;i*>(src_col)->get_data()[i]; - uint32_t hi = x>>64; uint64_t lo = (x<<64)>>64 ; + for (int i = 0; i < rows; i++) { + __int128 x = reinterpret_cast*>(src_col)->get_data()[i]; + uint32_t hi = x >> 64; + uint64_t lo = (x << 64) >> 64; dst_col = static_cast*>(dst_col); reinterpret_cast*>(dst_col)->insert(0); auto& num = static_cast*>(dst_col)->get_data()[i]; - auto &value = reinterpret_cast&>(num); - int64_t micros = to_timestamp_micros(hi,lo); + auto& value = reinterpret_cast&>(num); + int64_t micros = to_timestamp_micros(hi, lo); value.from_unixtime(micros / 1000000, *doc->_decode_params->ctz); value.set_microsecond(micros % 1000000); - std::cout << "value 
= " << value <<"\n"; + std::cout << "value = " << value << "\n"; } return Status::OK(); } - DocTime *doc; + DocTime* doc; }; -template -struct int64totimestamp: public ColumnConvert { - +template +struct int64totimestamp : public ColumnConvert { public: - int64totimestamp(DocTime *pTime) { - doc = pTime; - } + int64totimestamp(DocTime* pTime) { doc = pTime; } - Status convert(const IColumn *src_col, IColumn *dst_col) { + Status convert(const IColumn* src_col, IColumn* dst_col) { size_t rows = src_col->size(); if constexpr (is_nullable) { convert_null(&src_col, &dst_col); } dst_col->resize(rows); for (int i = 0; i < rows; i++) { - int64 x = reinterpret_cast *>(src_col)->get_data()[i]; - dst_col = static_cast *>(dst_col); -// reinterpret_cast*>(dst_col)->insert(0); - auto &num = static_cast *>(dst_col)->get_data()[i]; - auto &value = reinterpret_cast &>(num); + int64 x = reinterpret_cast*>(src_col)->get_data()[i]; + dst_col = static_cast*>(dst_col); + // reinterpret_cast*>(dst_col)->insert(0); + auto& num = static_cast*>(dst_col)->get_data()[i]; + auto& value = reinterpret_cast&>(num); value.from_unixtime(x / doc->_decode_params->second_mask, *doc->_decode_params->ctz); value.set_microsecond((x % doc->_decode_params->second_mask) * doc->_decode_params->scale_to_nano_factor / 1000); @@ -310,30 +295,29 @@ struct int64totimestamp: public ColumnConvert { return Status::OK(); } - DocTime *doc; + DocTime* doc; }; - -template +template class int32todate : public ColumnConvert { public: - DocTime *doc; - int32todate(DocTime *pTime) { - doc = pTime; - } - Status convert(const IColumn* src_col , IColumn* dst_col ) { + DocTime* doc; + int32todate(DocTime* pTime) { doc = pTime; } + Status convert(const IColumn* src_col, IColumn* dst_col) { size_t rows = src_col->size(); - if constexpr (is_nullable){ - convert_null(&src_col,&dst_col); + if constexpr (is_nullable) { + convert_null(&src_col, &dst_col); } dst_col->resize(rows); - for(int i = 0;i< rows;i++) { - -// auto& value = 
reinterpret_cast*>(src_col)->get_data()[i]; -// reinterpret_cast>(); - auto& value = reinterpret_cast &>(reinterpret_cast(dst_col)->get_data()[i]); -// value = reinterpret_cast*>(src_col)->get_data()[i]; - int64_t date_value = reinterpret_cast*>(src_col)->get_data()[i] + doc ->_decode_params->offset_days; + for (int i = 0; i < rows; i++) { + // auto& value = reinterpret_cast*>(src_col)->get_data()[i]; + // reinterpret_cast>(); + auto& value = reinterpret_cast&>( + reinterpret_cast(dst_col)->get_data()[i]); + // value = reinterpret_cast*>(src_col)->get_data()[i]; + int64_t date_value = + reinterpret_cast*>(src_col)->get_data()[i] + + doc->_decode_params->offset_days; date_day_offset_dict& date_dict = date_day_offset_dict::get(); value = date_dict[date_value]; } @@ -342,27 +326,25 @@ class int32todate : public ColumnConvert { } }; -template< typename DecimalType , bool is_nullable> +template class stringtodecimal : public ColumnConvert { public: - DocTime *doc; - stringtodecimal(DocTime *pTime) { - doc = pTime; - } - Status convert(const IColumn* src_col , IColumn* dst_col ) { + DocTime* doc; + stringtodecimal(DocTime* pTime) { doc = pTime; } + Status convert(const IColumn* src_col, IColumn* dst_col) { size_t rows = src_col->size(); - if constexpr (is_nullable){ - convert_null(&src_col,&dst_col); + if constexpr (is_nullable) { + convert_null(&src_col, &dst_col); } DecimalScaleParams& scale_params = doc->_decode_params->decimal_scale; auto buf = static_cast(src_col)->get_chars().data(); auto& offset = static_cast(src_col)->get_offsets(); dst_col->resize(rows); auto& data = static_cast*>(dst_col)->get_data(); - for(int i = 0;i< rows;i++) { - int len = offset[i] - offset[i-1]; - Int128 value = buf[offset[i-1]] & 0x80 ? -1 : 0; - memcpy(reinterpret_cast(&value) + sizeof(Int128) - len, buf+offset[i-1], + for (int i = 0; i < rows; i++) { + int len = offset[i] - offset[i - 1]; + Int128 value = buf[offset[i - 1]] & 0x80 ? 
-1 : 0; + memcpy(reinterpret_cast(&value) + sizeof(Int128) - len, buf + offset[i - 1], len); value = BigEndian::ToHost128(value); if (scale_params.scale_type == DecimalScaleParams::SCALE_UP) { @@ -377,22 +359,22 @@ class stringtodecimal : public ColumnConvert { return Status::OK(); } }; -template< typename NumberType, typename DecimalPhysicalType, bool is_nullable> -class numbertodecimal: public ColumnConvert { +template +class numbertodecimal : public ColumnConvert { + DocTime* doc; - DocTime *doc; public: - Status convert(const IColumn* src_col , IColumn* dst_col ) { + Status convert(const IColumn* src_col, IColumn* dst_col) { size_t rows = src_col->size(); - if constexpr (is_nullable){ - convert_null(&src_col,&dst_col); + if constexpr (is_nullable) { + convert_null(&src_col, &dst_col); } auto* src_data = static_cast*>(src_col)->get_data().data(); dst_col->resize(rows); DecimalScaleParams& scale_params = doc->_decode_params->decimal_scale; - auto* data = static_cast< ColumnDecimal>*>(dst_col)->get_data().data(); + auto* data = static_cast>*>(dst_col)->get_data().data(); - for(int i = 0;i < rows;i++) { + for (int i = 0; i < rows; i++) { Int128 value = src_data[i]; if (scale_params.scale_type == DecimalScaleParams::SCALE_UP) { value *= scale_params.scale_factor; @@ -405,9 +387,7 @@ class numbertodecimal: public ColumnConvert { } public: - numbertodecimal(DocTime *pTime) { - doc = pTime; - } + numbertodecimal(DocTime* pTime) { doc = pTime; } }; /* * Int128 value = *reinterpret_cast(buf_start); @@ -422,49 +402,56 @@ class numbertodecimal: public ColumnConvert { * */ -template -static -Status get_converter_impl(std::shared_ptr src_data_type , std::shared_ptr dst_data_type, - std::unique_ptr * converter, - DocTime& doc[[maybe_unused]] ){ - auto src_type = src_data_type -> get_type_id(); - auto dst_type = dst_data_type -> get_type_id(); +template +static Status get_converter_impl(std::shared_ptr src_data_type, + std::shared_ptr dst_data_type, + std::unique_ptr* 
converter, + DocTime& doc [[maybe_unused]]) { + auto src_type = src_data_type->get_type_id(); + auto dst_type = dst_data_type->get_type_id(); switch (dst_type) { -#define DISPATCH(NUMERIC_TYPE, CPP_NUMERIC_TYPE, PHYSICAL_TYPE) \ - case NUMERIC_TYPE: \ - switch(src_type){ \ - case TypeIndex::UInt8: \ - *converter = std::make_unique>(); \ - break; \ - case TypeIndex::Int32: \ - *converter = std::make_unique>(); \ - break; \ - case TypeIndex::Int64: \ - *converter = std::make_unique>(); \ - break; \ - case TypeIndex::Float32: \ - *converter = std::make_unique>(); \ - break; \ - case TypeIndex::Float64: \ - *converter = std::make_unique>(); \ - break; \ - case TypeIndex::Int128: \ - *converter = std::make_unique>(); \ - break; \ - default: \ - break; \ - } \ - break; - FOR_LOGICAL_NUMERIC_TYPES(DISPATCH) +#define DISPATCH(NUMERIC_TYPE, CPP_NUMERIC_TYPE, PHYSICAL_TYPE) \ + case NUMERIC_TYPE: \ + switch (src_type) { \ + case TypeIndex::UInt8: \ + *converter = \ + std::make_unique>(); \ + break; \ + case TypeIndex::Int32: \ + *converter = \ + std::make_unique>(); \ + break; \ + case TypeIndex::Int64: \ + *converter = \ + std::make_unique>(); \ + break; \ + case TypeIndex::Float32: \ + *converter = std::make_unique< \ + NumberColumnConvert>(); \ + break; \ + case TypeIndex::Float64: \ + *converter = std::make_unique< \ + NumberColumnConvert>(); \ + break; \ + case TypeIndex::Int128: \ + *converter = std::make_unique< \ + NumberColumnConvert>(); \ + break; \ + default: \ + break; \ + } \ + break; + FOR_LOGICAL_NUMERIC_TYPES(DISPATCH) #undef DISPATCH case TypeIndex::String: switch (src_type) { -#define DISPATCH1(NUMERIC_TYPE, CPP_NUMERIC_TYPE, PHYSICAL_TYPE) \ - case NUMERIC_TYPE: \ - *converter = std::make_unique>(); \ - break; +#define DISPATCH1(NUMERIC_TYPE, CPP_NUMERIC_TYPE, PHYSICAL_TYPE) \ + case NUMERIC_TYPE: \ + *converter = \ + std::make_unique>(); \ + break; FOR_LOGICAL_NUMERIC_TYPES(DISPATCH1) #undef DISPATCH1 default: @@ -479,50 +466,47 @@ Status 
get_converter_impl(std::shared_ptr src_data_type , std:: case TypeIndex::DateTimeV2: if (src_type == TypeIndex::Int128) { *converter = std::make_unique>(&doc); - }else if (src_type == TypeIndex::Int64) { + } else if (src_type == TypeIndex::Int64) { *converter = std::make_unique>(&doc); } break; case TypeIndex::Decimal64: if (src_type == TypeIndex::Int128) { - *converter = std::make_unique>(&doc); + *converter = std::make_unique>(&doc); } else if (src_type == TypeIndex::String) { doc.init_decimal_converter(dst_data_type); - *converter = std::make_unique>(&doc); + *converter = std::make_unique>(&doc); } else if (src_type == TypeIndex::Int32) { - *converter = std::make_unique>(&doc); + *converter = std::make_unique>(&doc); } else if (src_type == TypeIndex::Int64) { - *converter = std::make_unique>(&doc); + *converter = std::make_unique>(&doc); } break; default: break; } - if (converter->get() == nullptr){ - return Status::NotSupported("Can't cast type {} to type {}", - getTypeName(src_type ), getTypeName(dst_type)); + if (converter->get() == nullptr) { + return Status::NotSupported("Can't cast type {} to type {}", getTypeName(src_type), + getTypeName(dst_type)); } return Status::OK(); } -static -Status get_converter(std::shared_ptr src_type , std::shared_ptr dst_type , - std::unique_ptr * converter ,DocTime&doc){ +static Status get_converter(std::shared_ptr src_type, + std::shared_ptr dst_type, + std::unique_ptr* converter, DocTime& doc) { + if (src_type->is_nullable()) { + auto src = reinterpret_cast(src_type.get())->get_nested_type(); + auto dst = reinterpret_cast(dst_type.get())->get_nested_type(); - if (src_type->is_nullable()){ - - auto src = reinterpret_cast (src_type.get())->get_nested_type(); - auto dst = reinterpret_cast (dst_type.get())->get_nested_type(); - - return get_converter_impl(src,dst,converter,doc); - }else { - return get_converter_impl(src_type, dst_type,converter,doc); + return get_converter_impl(src, dst, converter, doc); + } else { + return 
get_converter_impl(src_type, dst_type, converter, doc); } return Status::OK(); } -}; - +}; // namespace convert -} \ No newline at end of file +} // namespace doris::vectorized \ No newline at end of file diff --git a/be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp b/be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp index dad33664252d0f..0c573b6db62dac 100644 --- a/be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp +++ b/be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp @@ -83,7 +83,7 @@ class FixLengthDictDecoder final : public BaseDictDecoder { FOR_LOGICAL_NUMERIC_TYPES(DISPATCH) #undef DISPATCH case TypeIndex::Int128: - return _decode_numeric< Int128 ,T, has_filter>(doris_column, select_vector); + return _decode_numeric(doris_column, select_vector); break; case TypeIndex::Date: if constexpr (std::is_same_v) { @@ -194,16 +194,16 @@ class FixLengthDictDecoder final : public BaseDictDecoder { switch (read_type) { case ColumnSelectVector::CONTENT: { for (size_t i = 0; i < run_length; ++i) { - if constexpr (std::is_same_v ){ - ParquetInt96 value = static_cast(_dict_items[_indexes[dict_index++]]); + if constexpr (std::is_same_v) { + ParquetInt96 value = + static_cast(_dict_items[_indexes[dict_index++]]); column_data[data_index++] = value.to_int128(); } else { column_data[data_index++] = static_cast(_dict_items[_indexes[dict_index++]]); - - } } + } break; } case ColumnSelectVector::NULL_DATA: { diff --git a/be/src/vec/exec/format/parquet/parquet_common.h b/be/src/vec/exec/format/parquet/parquet_common.h index 1b7b66be07af1c..424e398887244e 100644 --- a/be/src/vec/exec/format/parquet/parquet_common.h +++ b/be/src/vec/exec/format/parquet/parquet_common.h @@ -55,8 +55,10 @@ struct ParquetInt96 { return (hi - JULIAN_EPOCH_OFFSET_DAYS) * MICROS_IN_DAY + lo / NANOS_PER_MICROSECOND; } inline __int128 to_int128() const { - __int128 ans = 0 ; - std::cout <<"before ""hi = "<(encoding)].get(); } else { std::unique_ptr page_decoder; -// 
std::cout <<"type = "<<_metadata.type <<" "<< encoding <<"\n"; + // std::cout <<"type = "<<_metadata.type <<" "<< encoding <<"\n"; RETURN_IF_ERROR(Decoder::get_decoder(_metadata.type, encoding, page_decoder)); // Set type length page_decoder->set_type_length(_get_type_length()); diff --git a/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp b/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp index f66e648690342b..794dfe7a39a0c6 100644 --- a/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp +++ b/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp @@ -121,8 +121,6 @@ Status ParquetColumnReader::create(io::FileReaderSPtr file, FieldSchema* field, io::IOContext* io_ctx, std::unique_ptr& reader, size_t max_buf_size) { - - if (field->type.type == TYPE_ARRAY) { std::unique_ptr element_reader; RETURN_IF_ERROR(create(file, &field->children[0], row_group, row_ranges, ctz, io_ctx, @@ -254,9 +252,10 @@ Status ScalarColumnReader::_read_values(size_t num_values, ColumnPtr& doris_colu NullMap* map_data_column = nullptr; if (doris_column->is_nullable()) { SCOPED_RAW_TIMER(&_decode_null_map_time); -// auto* nullable_column = reinterpret_cast( -// (*std::move(doris_column)).mutate().get()); - auto* nullable_column = const_cast(reinterpret_cast(doris_column.get())); + // auto* nullable_column = reinterpret_cast( + // (*std::move(doris_column)).mutate().get()); + auto* nullable_column = const_cast( + reinterpret_cast(doris_column.get())); data_column = nullable_column->get_nested_column_ptr(); map_data_column = &(nullable_column->get_null_map_data()); @@ -446,7 +445,7 @@ Status ScalarColumnReader::read_dict_values_to_column(MutableColumnPtr& doris_co bool* has_dict) { bool loaded; RETURN_IF_ERROR(_try_load_dict_page(&loaded, has_dict)); - if (loaded && *has_dict) {//todo(cyw) has_dist ???? + if (loaded && *has_dict) { //todo(cyw) has_dist ???? 
return _chunk_reader->read_dict_values_to_column(doris_column); } return Status::OK(); diff --git a/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp b/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp index 1ded5e4a779459..6f6d5c4dc70c21 100644 --- a/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp +++ b/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp @@ -129,7 +129,7 @@ Status RowGroupReader::init( std::unique_ptr reader; RETURN_IF_ERROR(ParquetColumnReader::create(_file_reader, field, _row_group_meta, _read_ranges, _ctz, _io_ctx, reader, - max_buf_size));//create column reader ..... + max_buf_size)); //create column reader ..... if (reader == nullptr) { VLOG_DEBUG << "Init row group(" << _row_group_id << ") reader failed"; return Status::Corruption("Init row group reader failed"); @@ -183,8 +183,8 @@ bool RowGroupReader::_can_filter_by_dict(int slot_id, break; } } - if (slot != nullptr){ -// if (!slot->type().is_string_type()) {//TODO(CYW) : check use file metadata column_metadata.type + if (slot != nullptr) { + // if (!slot->type().is_string_type()) {//TODO(CYW) : check use file metadata column_metadata.type return false; } @@ -321,7 +321,7 @@ Status RowGroupReader::next_batch(Block* block, size_t batch_size, size_t* read_ } RETURN_IF_ERROR(_build_pos_delete_filter(*read_rows)); -/* + /* std::vector columns_to_filter; int column_to_keep = block->columns(); columns_to_filter.resize(column_to_keep); diff --git a/be/src/vec/exec/format/parquet/vparquet_reader.cpp b/be/src/vec/exec/format/parquet/vparquet_reader.cpp index 3a52e4d2146bd0..6e7b4f0e5b314e 100644 --- a/be/src/vec/exec/format/parquet/vparquet_reader.cpp +++ b/be/src/vec/exec/format/parquet/vparquet_reader.cpp @@ -29,48 +29,48 @@ #include #include "common/status.h" -#include "io/file_factory.h" -#include "parquet_pred_cmp.h" -#include "parquet_thrift_util.h" -#include "runtime/define_primitive_type.h" -#include "runtime/types.h" -#include "util/slice.h" -#include 
"vec/common/typeid_cast.h" -#include "vec/exec/format/format_common.h" -#include "vec/exec/format/parquet/schema_desc.h" -#include "vec/exec/format/parquet/vparquet_file_metadata.h" -#include "vec/exec/format/parquet/vparquet_group_reader.h" -#include "vec/exec/format/parquet/vparquet_page_index.h" -#include "vec/exprs/vbloom_predicate.h" -#include "vec/exprs/vexpr.h" -#include "vec/exprs/vin_predicate.h" -#include "vec/exprs/vruntimefilter_wrapper.h" -#include "vec/exprs/vslot_ref.h" #include "exec/schema_scanner.h" +#include "gen_cpp/descriptors.pb.h" #include "gtest/gtest_pred_impl.h" +#include "io/file_factory.h" #include "io/fs/buffered_reader.h" #include "io/fs/file_reader.h" #include "io/fs/file_reader_writer_fwd.h" +#include "olap/olap_common.h" +#include "parquet_pred_cmp.h" +#include "parquet_thrift_util.h" +#include "runtime/define_primitive_type.h" #include "runtime/descriptors.h" +#include "runtime/types.h" +#include "util/slice.h" #include "util/timezone_utils.h" #include "vec/aggregate_functions/aggregate_function.h" #include "vec/columns/column.h" #include "vec/columns/column_nullable.h" +#include "vec/columns/column_string.h" +#include "vec/columns/column_vector.h" #include "vec/common/string_ref.h" +#include "vec/common/typeid_cast.h" #include "vec/core/block.h" #include "vec/core/column_with_type_and_name.h" +#include "vec/core/types.h" #include "vec/data_types/data_type.h" #include "vec/data_types/data_type_factory.hpp" -#include "vec/exec/format/parquet/parquet_common.h" -#include "gen_cpp/descriptors.pb.h" -#include "olap/olap_common.h" -#include "vec/columns/column_string.h" -#include "vec/columns/column_vector.h" -#include "vec/core/types.h" #include "vec/data_types/data_type_nullable.h" #include "vec/data_types/data_type_number.h" #include "vec/data_types/data_type_string.h" #include "vec/exec/format/convert.h" +#include "vec/exec/format/format_common.h" +#include "vec/exec/format/parquet/parquet_common.h" +#include 
"vec/exec/format/parquet/schema_desc.h" +#include "vec/exec/format/parquet/vparquet_file_metadata.h" +#include "vec/exec/format/parquet/vparquet_group_reader.h" +#include "vec/exec/format/parquet/vparquet_page_index.h" +#include "vec/exprs/vbloom_predicate.h" +#include "vec/exprs/vexpr.h" +#include "vec/exprs/vin_predicate.h" +#include "vec/exprs/vruntimefilter_wrapper.h" +#include "vec/exprs/vslot_ref.h" namespace cctz { class time_zone; @@ -539,56 +539,58 @@ Status ParquetReader::get_next_block(Block* block, size_t* read_rows, bool* eof) } DCHECK(_current_group_reader != nullptr); - { - BlockUPtr src_block ; - std::map need_convert; + BlockUPtr src_block; + std::map need_convert; { -// std::cout <<"->"; -// for(auto i =0; i < block->columns();i++ ){ -// std::cout << block->get_columns()[i]->get_name()<<" "; -// } -// std::cout <<"\n"; + // std::cout <<"->"; + // for(auto i =0; i < block->columns();i++ ){ + // std::cout << block->get_columns()[i]->get_name()<<" "; + // } + // std::cout <<"\n"; vector v; - for (auto &col_name: block->get_names()) { + for (auto& col_name : block->get_names()) { vectorized::DataTypePtr data_type; - tparquet::Type::type parquet_type = _file_metadata->schema().get_column(col_name)->physical_type; + tparquet::Type::type parquet_type = + _file_metadata->schema().get_column(col_name)->physical_type; bool conv = false; - convert::convert_data_type_from_parquet(parquet_type, - data_type,block->get_by_name(col_name).type,&conv); - std::cout << col_name <<"->"<get_by_name(col_name).type, &conv); + std::cout << col_name << "->" << conv << "\n"; need_convert[col_name] = conv; - if (conv){ + if (conv) { v.emplace_back(data_type, col_name); - }else { -// v.emplace_back( (*std::move(block->get_by_name(col_name).column)).mutate(),data_type,col_name ); - v.emplace_back( - block->get_by_name(col_name).column->assume_mutable(), - data_type, col_name); + } else { + // v.emplace_back( 
(*std::move(block->get_by_name(col_name).column)).mutate(),data_type,col_name ); + v.emplace_back(block->get_by_name(col_name).column->assume_mutable(), data_type, + col_name); } } src_block = vectorized::Block::create_unique(v); } SCOPED_RAW_TIMER(&_statistics.column_read_time); - Status batch_st = - _current_group_reader->next_batch(src_block.get(), _batch_size, read_rows, &_row_group_eof); + Status batch_st = _current_group_reader->next_batch(src_block.get(), _batch_size, read_rows, + &_row_group_eof); if (!batch_st.ok()) { return Status::InternalError("Read parquet file {} failed, reason = {}", _scan_range.path, batch_st.to_string()); } //convert - for(auto i =0; i < block->columns();i++ ){ - std::cout <<"colname = " << block->get_names()[i] <<" "<get_names()[i]] <<"\n"; - if (need_convert[block->get_names()[i]]){ - std::unique_ptr converter(nullptr); + for (auto i = 0; i < block->columns(); i++) { + std::cout << "colname = " << block->get_names()[i] << " " + << need_convert[block->get_names()[i]] << "\n"; + if (need_convert[block->get_names()[i]]) { + std::unique_ptr converter(nullptr); convert::DocTime doc; -// auto x = - doc.init_time(_file_metadata->schema().get_column(i),_ctz); - RETURN_IF_ERROR(convert::get_converter(src_block->get_data_type(i),block->get_data_type(i),&converter,doc)); -// block->get_columns()[i]=src_block->get_columns()[i]; - converter->convert(src_block->get_columns()[i].get(), const_cast(block->get_columns()[i].get())); + // auto x = + doc.init_time(_file_metadata->schema().get_column(i), _ctz); + RETURN_IF_ERROR(convert::get_converter(src_block->get_data_type(i), + block->get_data_type(i), &converter, doc)); + // block->get_columns()[i]=src_block->get_columns()[i]; + converter->convert(src_block->get_columns()[i].get(), + const_cast(block->get_columns()[i].get())); } } } diff --git a/be/src/vec/exec/scan/scanner_context.cpp b/be/src/vec/exec/scan/scanner_context.cpp index 63028537e11555..a8ab6d4ec6d6c9 100644 --- 
a/be/src/vec/exec/scan/scanner_context.cpp +++ b/be/src/vec/exec/scan/scanner_context.cpp @@ -169,15 +169,13 @@ vectorized::BlockUPtr ScannerContext::get_free_block() { block = vectorized::Block::create_unique(_output_tuple_desc->slots(), _batch_size, true /*ignore invalid slots*/); - -// vector v; -// vectorized::DataTypePtr a = std::make_shared(std::make_shared()); -// v.push_back( ColumnWithTypeAndName(a,"id")); -// vectorized::DataTypePtr b = std::make_shared(std::make_shared()); -// v.push_back( ColumnWithTypeAndName(b,"age")); -// block = vectorized::Block::create_unique(v); -// block->set_num_rows(_batch_size); - + // vector v; + // vectorized::DataTypePtr a = std::make_shared(std::make_shared()); + // v.push_back( ColumnWithTypeAndName(a,"id")); + // vectorized::DataTypePtr b = std::make_shared(std::make_shared()); + // v.push_back( ColumnWithTypeAndName(b,"age")); + // block = vectorized::Block::create_unique(v); + // block->set_num_rows(_batch_size); COUNTER_UPDATE(_newly_create_free_blocks_num, 1); diff --git a/be/src/vec/exec/scan/scanner_scheduler.cpp b/be/src/vec/exec/scan/scanner_scheduler.cpp index 144454dda00778..513f1ed995f0db 100644 --- a/be/src/vec/exec/scan/scanner_scheduler.cpp +++ b/be/src/vec/exec/scan/scanner_scheduler.cpp @@ -389,9 +389,9 @@ void ScannerScheduler::_scanner_scan(ScannerScheduler* scheduler, ScannerContext break; } - BlockUPtr block = ctx->get_free_block();//create block <- _output_tuple_desc / 想要的结果 + BlockUPtr block = ctx->get_free_block(); //create block <- _output_tuple_desc / 想要的结果 - status = scanner->get_block(state, block.get(), &eos);//init reader ,read data + status = scanner->get_block(state, block.get(), &eos); //init reader ,read data VLOG_ROW << "VScanNode input rows: " << block->rows() << ", eos: " << eos; // The VFileScanner for external table may try to open not exist files, // Because FE file cache for external table may out of date. 
diff --git a/be/src/vec/exec/scan/vfile_scanner.cpp b/be/src/vec/exec/scan/vfile_scanner.cpp index 055e57148ccc35..77e9aeea77820a 100644 --- a/be/src/vec/exec/scan/vfile_scanner.cpp +++ b/be/src/vec/exec/scan/vfile_scanner.cpp @@ -289,7 +289,7 @@ Status VFileScanner::open(RuntimeState* state) { Status VFileScanner::_get_block_impl(RuntimeState* state, Block* block, bool* eof) { do { if (_cur_reader == nullptr || _cur_reader_eof) { - RETURN_IF_ERROR(_get_next_reader());//init parquet reader + RETURN_IF_ERROR(_get_next_reader()); //init parquet reader } if (_scanner_eof) { @@ -577,7 +577,7 @@ Status VFileScanner::_convert_to_output_block(Block* block) { const ColumnNullable* nullable_column = reinterpret_cast(column_ptr.get()); for (int i = 0; i < rows; ++i) { - if (filter_map[i] && nullable_column->is_null_at(i)) {//in load , error case + if (filter_map[i] && nullable_column->is_null_at(i)) { //in load , error case if (_strict_mode && (_src_slot_descs_order_by_dest[dest_index]) && !_src_block_ptr->get_by_position(_dest_slot_to_src_slot_index[dest_index]) .column->is_null_at(i)) { @@ -706,7 +706,7 @@ Status VFileScanner::_get_next_reader() { _state->update_num_finished_scan_range(1); return Status::OK(); } - if (_next_range != 0) { + if (_next_range != 0) { _state->update_num_finished_scan_range(1); } @@ -764,7 +764,7 @@ Status VFileScanner::_get_next_reader() { _state->query_options().enable_parquet_lazy_mat); { SCOPED_TIMER(_open_reader_timer); - RETURN_IF_ERROR(parquet_reader->open());//read file_schema + RETURN_IF_ERROR(parquet_reader->open()); //read file_schema } if (push_down_predicates && _push_down_conjuncts.empty() && !_conjuncts.empty()) { _push_down_conjuncts.resize(_conjuncts.size()); @@ -792,7 +792,7 @@ Status VFileScanner::_get_next_reader() { _file_col_names, place_holder, _colname_to_value_range, _push_down_conjuncts, _real_tuple_desc, _default_val_row_desc.get(), _col_name_to_slot_id, &_not_single_slot_filter_conjuncts, - 
&_slot_id_to_filter_conjuncts);//init parquet reader <- select column / filter / value_range min max + &_slot_id_to_filter_conjuncts); //init parquet reader <- select column / filter / value_range min max _cur_reader = std::move(parquet_reader); } need_to_get_parsed_schema = true; From 22df535f30a96f4e3ab73f786b206d3d80ae9f4b Mon Sep 17 00:00:00 2001 From: changyuwei <2017501503@qq.com> Date: Tue, 10 Oct 2023 20:54:21 +0800 Subject: [PATCH 03/21] fix format. --- be/src/vec/exec/format/convert.h | 26 ++++++++----------- .../exec/format/parquet/vparquet_reader.cpp | 23 +++++----------- 2 files changed, 17 insertions(+), 32 deletions(-) diff --git a/be/src/vec/exec/format/convert.h b/be/src/vec/exec/format/convert.h index b4d1ab46b928da..ee0c9a2581e863 100644 --- a/be/src/vec/exec/format/convert.h +++ b/be/src/vec/exec/format/convert.h @@ -15,11 +15,9 @@ // specific language governing permissions and limitations // under the License. -#include #include #include #include -#include #include #include @@ -149,7 +147,7 @@ static Status convert_data_type_from_parquet(tparquet::Type::type parquet_type, ans_data_type = std::make_shared(); break; default: - std::cout << "--->" << parquet_type << "\n"; + // std::cout << "--->" << parquet_type << "\n"; break; } if (ans_data_type->get_type_id() == src_type->get_type_id()) { @@ -174,13 +172,13 @@ static Status convert_data_type_from_parquet(tparquet::Type::type parquet_type, } struct ColumnConvert { - Status virtual convert(const IColumn* src_col, IColumn* dst_col) { return Status::OK(); } + virtual Status convert(const IColumn* src_col, IColumn* dst_col) { return Status::OK(); } virtual ~ColumnConvert() = default; }; template struct NumberColumnConvert : public ColumnConvert { - virtual Status convert(const IColumn* src_col, IColumn* dst_col) override; + Status convert(const IColumn* src_col, IColumn* dst_col) override; }; void convert_null(const IColumn** src_col, IColumn** dst_col) { size_t rows = (*src_col)->size(); @@ -219,7 
+217,7 @@ Status NumberColumnConvert::convert(const IColu } template struct NumberColumnToStringConvert : public ColumnConvert { - virtual Status convert(const IColumn* src_col, IColumn* dst_col) override; + Status convert(const IColumn* src_col, IColumn* dst_col) override; }; template @@ -242,11 +240,11 @@ template struct int128totimestamp : public ColumnConvert { int128totimestamp(DocTime* pTime) { doc = pTime; } - inline uint64_t to_timestamp_micros(uint32_t hi, uint64_t lo) const { + [[nodiscard]] inline uint64_t to_timestamp_micros(uint32_t hi, uint64_t lo) const { return (hi - ParquetInt96::JULIAN_EPOCH_OFFSET_DAYS) * ParquetInt96::MICROS_IN_DAY + lo / ParquetInt96::NANOS_PER_MICROSECOND; } - Status convert(const IColumn* src_col, IColumn* dst_col) { + Status convert(const IColumn* src_col, IColumn* dst_col) override { size_t rows = src_col->size(); if constexpr (is_nullable) { convert_null(&src_col, &dst_col); @@ -275,7 +273,7 @@ struct int64totimestamp : public ColumnConvert { public: int64totimestamp(DocTime* pTime) { doc = pTime; } - Status convert(const IColumn* src_col, IColumn* dst_col) { + Status convert(const IColumn* src_col, IColumn* dst_col) override { size_t rows = src_col->size(); if constexpr (is_nullable) { convert_null(&src_col, &dst_col); @@ -303,7 +301,7 @@ class int32todate : public ColumnConvert { public: DocTime* doc; int32todate(DocTime* pTime) { doc = pTime; } - Status convert(const IColumn* src_col, IColumn* dst_col) { + Status convert(const IColumn* src_col, IColumn* dst_col) override { size_t rows = src_col->size(); if constexpr (is_nullable) { convert_null(&src_col, &dst_col); @@ -331,7 +329,7 @@ class stringtodecimal : public ColumnConvert { public: DocTime* doc; stringtodecimal(DocTime* pTime) { doc = pTime; } - Status convert(const IColumn* src_col, IColumn* dst_col) { + Status convert(const IColumn* src_col, IColumn* dst_col) override { size_t rows = src_col->size(); if constexpr (is_nullable) { convert_null(&src_col, 
&dst_col); @@ -364,7 +362,7 @@ class numbertodecimal : public ColumnConvert { DocTime* doc; public: - Status convert(const IColumn* src_col, IColumn* dst_col) { + Status convert(const IColumn* src_col, IColumn* dst_col) override { size_t rows = src_col->size(); if constexpr (is_nullable) { convert_null(&src_col, &dst_col); @@ -386,7 +384,6 @@ class numbertodecimal : public ColumnConvert { return Status::OK(); } -public: numbertodecimal(DocTime* pTime) { doc = pTime; } }; /* @@ -486,7 +483,7 @@ static Status get_converter_impl(std::shared_ptr src_data_type, break; } - if (converter->get() == nullptr) { + if (*converter == nullptr) { return Status::NotSupported("Can't cast type {} to type {}", getTypeName(src_type), getTypeName(dst_type)); } @@ -505,7 +502,6 @@ static Status get_converter(std::shared_ptr src_type, } else { return get_converter_impl(src_type, dst_type, converter, doc); } - return Status::OK(); } }; // namespace convert diff --git a/be/src/vec/exec/format/parquet/vparquet_reader.cpp b/be/src/vec/exec/format/parquet/vparquet_reader.cpp index 6e7b4f0e5b314e..cf7955a91b80ca 100644 --- a/be/src/vec/exec/format/parquet/vparquet_reader.cpp +++ b/be/src/vec/exec/format/parquet/vparquet_reader.cpp @@ -19,11 +19,9 @@ #include #include -#include #include #include -#include #include #include #include @@ -44,23 +42,12 @@ #include "runtime/types.h" #include "util/slice.h" #include "util/timezone_utils.h" -#include "vec/aggregate_functions/aggregate_function.h" #include "vec/columns/column.h" -#include "vec/columns/column_nullable.h" -#include "vec/columns/column_string.h" -#include "vec/columns/column_vector.h" -#include "vec/common/string_ref.h" #include "vec/common/typeid_cast.h" #include "vec/core/block.h" #include "vec/core/column_with_type_and_name.h" #include "vec/core/types.h" -#include "vec/data_types/data_type.h" -#include "vec/data_types/data_type_factory.hpp" -#include "vec/data_types/data_type_nullable.h" -#include "vec/data_types/data_type_number.h" 
-#include "vec/data_types/data_type_string.h" #include "vec/exec/format/convert.h" -#include "vec/exec/format/format_common.h" #include "vec/exec/format/parquet/parquet_common.h" #include "vec/exec/format/parquet/schema_desc.h" #include "vec/exec/format/parquet/vparquet_file_metadata.h" @@ -555,8 +542,8 @@ Status ParquetReader::get_next_block(Block* block, size_t* read_rows, bool* eof) tparquet::Type::type parquet_type = _file_metadata->schema().get_column(col_name)->physical_type; bool conv = false; - convert::convert_data_type_from_parquet(parquet_type, data_type, - block->get_by_name(col_name).type, &conv); + RETURN_IF_ERROR(convert::convert_data_type_from_parquet( + parquet_type, data_type, block->get_by_name(col_name).type, &conv)); std::cout << col_name << "->" << conv << "\n"; need_convert[col_name] = conv; if (conv) { @@ -589,8 +576,10 @@ Status ParquetReader::get_next_block(Block* block, size_t* read_rows, bool* eof) RETURN_IF_ERROR(convert::get_converter(src_block->get_data_type(i), block->get_data_type(i), &converter, doc)); // block->get_columns()[i]=src_block->get_columns()[i]; - converter->convert(src_block->get_columns()[i].get(), - const_cast(block->get_columns()[i].get())); + RETURN_IF_ERROR( + + converter->convert(src_block->get_columns()[i].get(), + const_cast(block->get_columns()[i].get()))); } } } From eb8263a8ea2ebb32fd2ee99c00b73ae1237e4355 Mon Sep 17 00:00:00 2001 From: changyuwei <2017501503@qq.com> Date: Wed, 11 Oct 2023 14:25:10 +0800 Subject: [PATCH 04/21] modify decode --- be/src/vec/exec/format/convert.h | 13 +- .../parquet/byte_array_plain_decoder.cpp | 41 +- .../parquet/fix_length_dict_decoder.hpp | 212 +++--- .../parquet/fix_length_plain_decoder.cpp | 609 ------------------ .../parquet/vparquet_column_chunk_reader.cpp | 2 +- .../format/parquet/vparquet_column_reader.cpp | 23 +- .../exec/format/parquet/vparquet_reader.cpp | 99 ++- 7 files changed, 228 insertions(+), 771 deletions(-) delete mode 100644 
be/src/vec/exec/format/parquet/fix_length_plain_decoder.cpp diff --git a/be/src/vec/exec/format/convert.h b/be/src/vec/exec/format/convert.h index ee0c9a2581e863..1a30c3719a685a 100644 --- a/be/src/vec/exec/format/convert.h +++ b/be/src/vec/exec/format/convert.h @@ -37,10 +37,21 @@ #include "vec/data_types/data_type_number.h" #include "vec/data_types/data_type_string.h" #include "vec/exec/format/parquet/parquet_common.h" +#include "common/compiler_util.h" // IWYU pragma: keep +#include "common/status.h" +#include "gutil/endian.h" +#include "util/coding.h" +#include "util/slice.h" +#include "vec/core/types.h" +#include "vec/data_types/data_type.h" +#include "vec/exec/format/format_common.h" +#include "vec/exec/format/parquet/decoder.h" +#include "vec/exec/format/parquet/parquet_common.h" + namespace doris::vectorized { -namespace convert { +namespace ParquetConvert { class DocTime { public: diff --git a/be/src/vec/exec/format/parquet/byte_array_plain_decoder.cpp b/be/src/vec/exec/format/parquet/byte_array_plain_decoder.cpp index e91f9f1db94ce2..911f090b1ce642 100644 --- a/be/src/vec/exec/format/parquet/byte_array_plain_decoder.cpp +++ b/be/src/vec/exec/format/parquet/byte_array_plain_decoder.cpp @@ -56,11 +56,11 @@ template Status ByteArrayPlainDecoder::_decode_values(MutableColumnPtr& doris_column, DataTypePtr& data_type, ColumnSelectVector& select_vector, bool is_dict_filter) { - TypeIndex logical_type = remove_nullable(data_type)->get_type_id(); - switch (logical_type) { - case TypeIndex::String: - [[fallthrough]]; - case TypeIndex::FixedString: { +// TypeIndex logical_type = remove_nullable(data_type)->get_type_id(); +// switch (logical_type) { +// case TypeIndex::String: +// [[fallthrough]]; +// case TypeIndex::FixedString: { ColumnSelectVector::DataReadType read_type; while (size_t run_length = select_vector.get_next_run(&read_type)) { switch (read_type) { @@ -109,21 +109,20 @@ Status ByteArrayPlainDecoder::_decode_values(MutableColumnPtr& doris_column, Dat } 
} return Status::OK(); - } - case TypeIndex::Decimal32: - return _decode_binary_decimal(doris_column, data_type, select_vector); - case TypeIndex::Decimal64: - return _decode_binary_decimal(doris_column, data_type, select_vector); - case TypeIndex::Decimal128: - return _decode_binary_decimal(doris_column, data_type, select_vector); - case TypeIndex::Decimal128I: - return _decode_binary_decimal(doris_column, data_type, select_vector); - // TODO: decimal256 - default: - break; - } - return Status::InvalidArgument( - "Can't decode parquet physical type BYTE_ARRAY to doris logical type {}", - getTypeName(logical_type)); +// } +// case TypeIndex::Decimal32: +// return _decode_binary_decimal(doris_column, data_type, select_vector); +// case TypeIndex::Decimal64: +// return _decode_binary_decimal(doris_column, data_type, select_vector); +// case TypeIndex::Decimal128: +// return _decode_binary_decimal(doris_column, data_type, select_vector); +// case TypeIndex::Decimal128I: +// return _decode_binary_decimal(doris_column, data_type, select_vector); +// default: +// break; +// } +// return Status::InvalidArgument( +// "Can't decode parquet physical type BYTE_ARRAY to doris logical type {}", +// getTypeName(logical_type)); } } // namespace doris::vectorized diff --git a/be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp b/be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp index 0c573b6db62dac..2bf19250d51ed5 100644 --- a/be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp +++ b/be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp @@ -72,100 +72,138 @@ class FixLengthDictDecoder final : public BaseDictDecoder { if (doris_column->is_column_dictionary() || is_dict_filter) { return _decode_dict_values(doris_column, select_vector, is_dict_filter); } - - TypeIndex logical_type = remove_nullable(data_type)->get_type_id(); - switch (logical_type) { -#define DISPATCH(NUMERIC_TYPE, CPP_NUMERIC_TYPE, PHYSICAL_TYPE) \ - case NUMERIC_TYPE: \ - if constexpr 
(!std::is_same_v) { \ - return _decode_numeric(doris_column, select_vector); \ - } - FOR_LOGICAL_NUMERIC_TYPES(DISPATCH) -#undef DISPATCH - case TypeIndex::Int128: - return _decode_numeric(doris_column, select_vector); - break; - case TypeIndex::Date: - if constexpr (std::is_same_v) { - return _decode_date(doris_column, - select_vector); - } - break; - case TypeIndex::DateV2: - if constexpr (std::is_same_v) { - return _decode_date, UInt32, has_filter>( - doris_column, select_vector); - } - break; - case TypeIndex::DateTime: - if constexpr (std::is_same_v) { - return _decode_datetime96(doris_column, - select_vector); - } else if constexpr (std::is_same_v) { - return _decode_datetime64(doris_column, - select_vector); - } - break; - case TypeIndex::DateTimeV2: - // Spark can set the timestamp precision by the following configuration: - // spark.sql.parquet.outputTimestampType = INT96(NANOS), TIMESTAMP_MICROS, TIMESTAMP_MILLIS - if constexpr (std::is_same_v) { - return _decode_datetime96, UInt64, has_filter>( - doris_column, select_vector); - } else if constexpr (std::is_same_v) { - return _decode_datetime64, UInt64, has_filter>( - doris_column, select_vector); - } - break; - case TypeIndex::Decimal32: - if constexpr (std::is_same_v) { - return _decode_primitive_decimal(doris_column, data_type, - select_vector); - } else if constexpr (std::is_same_v) { - return _decode_primitive_decimal(doris_column, data_type, - select_vector); - } - break; - case TypeIndex::Decimal64: - if constexpr (std::is_same_v) { - return _decode_primitive_decimal(doris_column, data_type, - select_vector); - } else if constexpr (std::is_same_v) { - return _decode_primitive_decimal(doris_column, data_type, - select_vector); - } + /* + * decoder.reset(new FixLengthDictDecoder(type)); break; - case TypeIndex::Decimal128: - if constexpr (std::is_same_v) { - return _decode_primitive_decimal(doris_column, data_type, - select_vector); - } else if constexpr (std::is_same_v) { - return 
_decode_primitive_decimal(doris_column, data_type, - select_vector); - } + case tparquet::Type::INT64: + decoder.reset(new FixLengthDictDecoder(type)); break; - case TypeIndex::Decimal128I: - if constexpr (std::is_same_v) { - return _decode_primitive_decimal(doris_column, data_type, - select_vector); - } else if constexpr (std::is_same_v) { - return _decode_primitive_decimal(doris_column, data_type, - select_vector); - } + case tparquet::Type::INT96: + decoder.reset(new FixLengthDictDecoder(type)); break; - // TODO: decimal256 - case TypeIndex::String: - [[fallthrough]]; - case TypeIndex::FixedString: + case tparquet::Type::FLOAT: + decoder.reset(new FixLengthDictDecoder(type)); break; - default: + case tparquet::Type::DOUBLE: + decoder.reset(new FixLengthDictDecoder(type)); break; + case tparquet::Type::FIXED_LEN_BYTE_ARRAY: + decoder.reset(new FixLengthDictDecoder(type)); + * + */ + + if constexpr (std::is_same_v){ + return _decode_numeric(doris_column, select_vector); + }else if constexpr (std::is_same_v){ + return _decode_numeric(doris_column, select_vector); + }else if constexpr (std::is_same_v){ + return _decode_numeric(doris_column, select_vector); + }else if constexpr (std::is_same_v){ + return _decode_numeric(doris_column, select_vector); + }else if constexpr (std::is_same_v){ + return _decode_numeric(doris_column, select_vector); } - return Status::InvalidArgument( - "Can't decode parquet physical type {} to doris logical type {}", - tparquet::to_string(_physical_type), getTypeName(logical_type)); +// else if constexpr (std::is_same_v) { +// return _decode_numeric(doris_column, select_vector); +// +// } return Status::OK(); + +// TypeIndex logical_type = remove_nullable(data_type)->get_type_id(); +// switch (logical_type) { +// +// +//#define DISPATCH(NUMERIC_TYPE, CPP_NUMERIC_TYPE, PHYSICAL_TYPE) \ +// case NUMERIC_TYPE: \ +// if constexpr (!std::is_same_v) { \ +// return _decode_numeric(doris_column, select_vector); \ +// } +// 
FOR_LOGICAL_NUMERIC_TYPES(DISPATCH) +//#undef DISPATCH +// case TypeIndex::Int128: +// return _decode_numeric(doris_column, select_vector); +// break; +// case TypeIndex::Date: +// if constexpr (std::is_same_v) { +// return _decode_date(doris_column, +// select_vector); +// } +// break; +// case TypeIndex::DateV2: +// if constexpr (std::is_same_v) { +// return _decode_date, UInt32, has_filter>( +// doris_column, select_vector); +// } +// break; +// case TypeIndex::DateTime: +// if constexpr (std::is_same_v) { +// return _decode_datetime96(doris_column, +// select_vector); +// } else if constexpr (std::is_same_v) { +// return _decode_datetime64(doris_column, +// select_vector); +// } +// break; +// case TypeIndex::DateTimeV2: +// // Spark can set the timestamp precision by the following configuration: +// // spark.sql.parquet.outputTimestampType = INT96(NANOS), TIMESTAMP_MICROS, TIMESTAMP_MILLIS +// if constexpr (std::is_same_v) { +// return _decode_datetime96, UInt64, has_filter>( +// doris_column, select_vector); +// } else if constexpr (std::is_same_v) { +// return _decode_datetime64, UInt64, has_filter>( +// doris_column, select_vector); +// } +// break; +// case TypeIndex::Decimal32: +// if constexpr (std::is_same_v) { +// return _decode_primitive_decimal(doris_column, data_type, +// select_vector); +// } else if constexpr (std::is_same_v) { +// return _decode_primitive_decimal(doris_column, data_type, +// select_vector); +// } +// break; +// case TypeIndex::Decimal64: +// if constexpr (std::is_same_v) { +// return _decode_primitive_decimal(doris_column, data_type, +// select_vector); +// } else if constexpr (std::is_same_v) { +// return _decode_primitive_decimal(doris_column, data_type, +// select_vector); +// } +// break; +// case TypeIndex::Decimal128: +// if constexpr (std::is_same_v) { +// return _decode_primitive_decimal(doris_column, data_type, +// select_vector); +// } else if constexpr (std::is_same_v) { +// return 
_decode_primitive_decimal(doris_column, data_type, +// select_vector); +// } +// break; +// case TypeIndex::Decimal128I: +// if constexpr (std::is_same_v) { +// return _decode_primitive_decimal(doris_column, data_type, +// select_vector); +// } else if constexpr (std::is_same_v) { +// return _decode_primitive_decimal(doris_column, data_type, +// select_vector); +// } +// break; +// case TypeIndex::String: +// [[fallthrough]]; +// case TypeIndex::FixedString: +// break; +// default: +// break; +// } +// return Status::InvalidArgument( +// "Can't decode parquet physical type {} to doris logical type {}", +// tparquet::to_string(_physical_type), getTypeName(logical_type)); +// +// return Status::OK(); } Status set_dict(std::unique_ptr& dict, int32_t length, size_t num_values) override { @@ -750,4 +788,4 @@ class FixLengthDictDecoder final : public BaseDictDecoder { } }; -} // namespace doris::vectorized \ No newline at end of file +} // namespace doris::vectorized diff --git a/be/src/vec/exec/format/parquet/fix_length_plain_decoder.cpp b/be/src/vec/exec/format/parquet/fix_length_plain_decoder.cpp deleted file mode 100644 index 8e6f6ebb67ff04..00000000000000 --- a/be/src/vec/exec/format/parquet/fix_length_plain_decoder.cpp +++ /dev/null @@ -1,609 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. 
See the License for the -// specific language governing permissions and limitations -// under the License. - -#include "vec/exec/format/parquet/fix_length_plain_decoder.h" - -#include -#include -#include - -#include -#include - -// IWYU pragma: no_include -#include "common/compiler_util.h" // IWYU pragma: keep -#include "util/bit_util.h" -#include "util/slice.h" -#include "vec/columns/column.h" -#include "vec/common/string_ref.h" -#include "vec/core/types.h" -#include "vec/data_types/data_type_nullable.h" -#include "vec/exec/format/format_common.h" -#include "vec/exec/format/parquet/parquet_common.h" -#include "vec/runtime/vdatetime_value.h" - -namespace doris { -namespace vectorized { -template -class ColumnDecimal; -template -class ColumnVector; -} // namespace vectorized -} // namespace doris - -namespace doris::vectorized { - -Status FixLengthPlainDecoder::skip_values(size_t num_values) { - _offset += _type_length * num_values; - if (UNLIKELY(_offset > _data->size)) { - return Status::IOError("Out-of-bounds access in parquet data decoder"); - } - return Status::OK(); -} - -Status FixLengthPlainDecoder::decode_values(MutableColumnPtr& doris_column, DataTypePtr& data_type, - ColumnSelectVector& select_vector, - bool is_dict_filter) { - if (select_vector.has_filter()) { - return _decode_values(doris_column, data_type, select_vector, is_dict_filter); - } else { - return _decode_values(doris_column, data_type, select_vector, is_dict_filter); - } -} - -template -Status FixLengthPlainDecoder::_decode_values(MutableColumnPtr& doris_column, DataTypePtr& data_type, - ColumnSelectVector& select_vector, - bool is_dict_filter) { - size_t non_null_size = select_vector.num_values() - select_vector.num_nulls(); - if (UNLIKELY(_offset + _type_length * non_null_size > _data->size)) { - return Status::IOError("Out-of-bounds access in parquet data decoder"); - } - TypeIndex logical_type = remove_nullable(data_type)->get_type_id(); - switch (logical_type) { -#define 
DISPATCH(NUMERIC_TYPE, CPP_NUMERIC_TYPE, PHYSICAL_TYPE) \ - case NUMERIC_TYPE: \ - if (_physical_type == tparquet::Type::INT32) { \ - return _decode_numeric(doris_column, \ - select_vector); \ - } else if (_physical_type == tparquet::Type::INT64) { \ - return _decode_numeric(doris_column, \ - select_vector); \ - } else if (_physical_type == tparquet::Type::FLOAT) { \ - return _decode_numeric(doris_column, \ - select_vector); \ - } else if (_physical_type == tparquet::Type::DOUBLE) { \ - return _decode_numeric(doris_column, \ - select_vector); \ - } else { \ - break; \ - } - FOR_LOGICAL_NUMERIC_TYPES(DISPATCH) -#undef DISPATCH - case TypeIndex::Date: - if (_physical_type == tparquet::Type::INT32) { - return _decode_date(doris_column, select_vector); - } - break; - case TypeIndex::DateV2: - if (_physical_type == tparquet::Type::INT32) { - return _decode_date, UInt32, has_filter>(doris_column, - select_vector); - } - break; - case TypeIndex::DateTime: - if (_physical_type == tparquet::Type::INT96) { - return _decode_datetime96(doris_column, - select_vector); - } else if (_physical_type == tparquet::Type::INT64) { - return _decode_datetime64(doris_column, - select_vector); - } - break; - case TypeIndex::DateTimeV2: - // Spark can set the timestamp precision by the following configuration: - // spark.sql.parquet.outputTimestampType = INT96(NANOS), TIMESTAMP_MICROS, TIMESTAMP_MILLIS - if (_physical_type == tparquet::Type::INT96) { - return _decode_datetime96, UInt64, has_filter>( - doris_column, select_vector); - } else if (_physical_type == tparquet::Type::INT64) { - return _decode_datetime64, UInt64, has_filter>( - doris_column, select_vector); - } - break; - case TypeIndex::Decimal32: - if (_physical_type == tparquet::Type::FIXED_LEN_BYTE_ARRAY) { - return _decode_binary_decimal(doris_column, data_type, - select_vector); - } else if (_physical_type == tparquet::Type::INT32) { - return _decode_primitive_decimal(doris_column, data_type, - select_vector); - } else if 
(_physical_type == tparquet::Type::INT64) { - return _decode_primitive_decimal(doris_column, data_type, - select_vector); - } - break; - case TypeIndex::Decimal64: - if (_physical_type == tparquet::Type::FIXED_LEN_BYTE_ARRAY) { - return _decode_binary_decimal(doris_column, data_type, - select_vector); - } else if (_physical_type == tparquet::Type::INT32) { - return _decode_primitive_decimal(doris_column, data_type, - select_vector); - } else if (_physical_type == tparquet::Type::INT64) { - return _decode_primitive_decimal(doris_column, data_type, - select_vector); - } - break; - case TypeIndex::Decimal128: - if (_physical_type == tparquet::Type::FIXED_LEN_BYTE_ARRAY) { - return _decode_binary_decimal(doris_column, data_type, - select_vector); - } else if (_physical_type == tparquet::Type::INT32) { - return _decode_primitive_decimal(doris_column, data_type, - select_vector); - } else if (_physical_type == tparquet::Type::INT64) { - return _decode_primitive_decimal(doris_column, data_type, - select_vector); - } - break; - case TypeIndex::Decimal128I: - if (_physical_type == tparquet::Type::FIXED_LEN_BYTE_ARRAY) { - return _decode_binary_decimal(doris_column, data_type, - select_vector); - } else if (_physical_type == tparquet::Type::INT32) { - return _decode_primitive_decimal(doris_column, data_type, - select_vector); - } else if (_physical_type == tparquet::Type::INT64) { - return _decode_primitive_decimal(doris_column, data_type, - select_vector); - } - break; - // TODO: decimal256 - case TypeIndex::String: - [[fallthrough]]; - case TypeIndex::FixedString: - if (_physical_type == tparquet::Type::FIXED_LEN_BYTE_ARRAY) { - return _decode_string(doris_column, select_vector); - } - break; - default: - break; - } - - return Status::InvalidArgument("Can't decode parquet physical type {} to doris logical type {}", - tparquet::to_string(_physical_type), getTypeName(logical_type)); -} - -template -Status FixLengthPlainDecoder::_decode_string(MutableColumnPtr& doris_column, 
- ColumnSelectVector& select_vector) { - ColumnSelectVector::DataReadType read_type; - while (size_t run_length = select_vector.get_next_run(&read_type)) { - switch (read_type) { - case ColumnSelectVector::CONTENT: { - std::vector string_values; - string_values.reserve(run_length); - for (size_t i = 0; i < run_length; ++i) { - char* buf_start = _data->data + _offset; - string_values.emplace_back(buf_start, _type_length); - _offset += _type_length; - } - doris_column->insert_many_strings(&string_values[0], run_length); - break; - } - case ColumnSelectVector::NULL_DATA: { - doris_column->insert_many_defaults(run_length); - break; - } - case ColumnSelectVector::FILTERED_CONTENT: { - _offset += _type_length * run_length; - break; - } - case ColumnSelectVector::FILTERED_NULL: { - // do nothing - break; - } - } - } - return Status::OK(); -} -template -Status FixLengthPlainDecoder::_decode_numeric(MutableColumnPtr& doris_column, - ColumnSelectVector& select_vector) { - auto& column_data = static_cast&>(*doris_column).get_data(); - size_t data_index = column_data.size(); - column_data.resize(data_index + select_vector.num_values() - select_vector.num_filtered()); - ColumnSelectVector::DataReadType read_type; - while (size_t run_length = select_vector.get_next_run(&read_type)) { - switch (read_type) { - case ColumnSelectVector::CONTENT: { - for (size_t i = 0; i < run_length; ++i) { - char* buf_start = _data->data + _offset; - column_data[data_index++] = *(PhysicalType*)buf_start; - _offset += _type_length; - } - break; - } - case ColumnSelectVector::NULL_DATA: { - data_index += run_length; - break; - } - case ColumnSelectVector::FILTERED_CONTENT: { - _offset += _type_length * run_length; - break; - } - case ColumnSelectVector::FILTERED_NULL: { - // do nothing - break; - } - } - } - return Status::OK(); -} - -template -Status FixLengthPlainDecoder::_decode_date(MutableColumnPtr& doris_column, - ColumnSelectVector& select_vector) { - auto& column_data = 
static_cast&>(*doris_column).get_data(); - size_t data_index = column_data.size(); - column_data.resize(data_index + select_vector.num_values() - select_vector.num_filtered()); - ColumnSelectVector::DataReadType read_type; - date_day_offset_dict& date_dict = date_day_offset_dict::get(); - - while (size_t run_length = select_vector.get_next_run(&read_type)) { - switch (read_type) { - case ColumnSelectVector::CONTENT: { - for (size_t i = 0; i < run_length; ++i) { - char* buf_start = _data->data + _offset; - int64_t date_value = static_cast(*reinterpret_cast(buf_start)) + - _decode_params->offset_days; - if constexpr (std::is_same_v) { - auto& v = reinterpret_cast(column_data[data_index++]); - v.create_from_date_v2(date_dict[date_value], TIME_DATE); - // we should cast to date if using date v1. - v.cast_to_date(); - } else { - reinterpret_cast(column_data[data_index++]) = date_dict[date_value]; - } - _offset += _type_length; - } - break; - } - case ColumnSelectVector::NULL_DATA: { - data_index += run_length; - break; - } - case ColumnSelectVector::FILTERED_CONTENT: { - _offset += _type_length * run_length; - break; - } - case ColumnSelectVector::FILTERED_NULL: { - // do nothing - break; - } - } - } - return Status::OK(); -} - -template -Status FixLengthPlainDecoder::_decode_datetime64(MutableColumnPtr& doris_column, - ColumnSelectVector& select_vector) { - auto& column_data = static_cast&>(*doris_column).get_data(); - size_t data_index = column_data.size(); - column_data.resize(data_index + select_vector.num_values() - select_vector.num_filtered()); - ColumnSelectVector::DataReadType read_type; - while (size_t run_length = select_vector.get_next_run(&read_type)) { - switch (read_type) { - case ColumnSelectVector::CONTENT: { - for (size_t i = 0; i < run_length; ++i) { - char* buf_start = _data->data + _offset; - int64_t& date_value = *reinterpret_cast(buf_start); - auto& v = reinterpret_cast(column_data[data_index++]); - v.from_unixtime(date_value / 
_decode_params->second_mask, *_decode_params->ctz); - if constexpr (std::is_same_v>) { - // nanoseconds will be ignored. - v.set_microsecond((date_value % _decode_params->second_mask) * - _decode_params->scale_to_nano_factor / 1000); - // TODO: the precision of datetime v1 - } - _offset += _type_length; - } - break; - } - case ColumnSelectVector::NULL_DATA: { - data_index += run_length; - break; - } - case ColumnSelectVector::FILTERED_CONTENT: { - _offset += _type_length * run_length; - break; - } - case ColumnSelectVector::FILTERED_NULL: { - // do nothing - break; - } - } - } - return Status::OK(); -} - -template -Status FixLengthPlainDecoder::_decode_datetime96(MutableColumnPtr& doris_column, - ColumnSelectVector& select_vector) { - auto& column_data = static_cast&>(*doris_column).get_data(); - size_t data_index = column_data.size(); - column_data.resize(data_index + select_vector.num_values() - select_vector.num_filtered()); - ColumnSelectVector::DataReadType read_type; - while (size_t run_length = select_vector.get_next_run(&read_type)) { - switch (read_type) { - case ColumnSelectVector::CONTENT: { - for (size_t i = 0; i < run_length; ++i) { - char* buf_start = _data->data + _offset; - ParquetInt96& datetime96 = *reinterpret_cast(buf_start); - auto& v = reinterpret_cast(column_data[data_index++]); - int64_t micros = datetime96.to_timestamp_micros(); - v.from_unixtime(micros / 1000000, *_decode_params->ctz); - if constexpr (std::is_same_v>) { - // spark.sql.parquet.outputTimestampType = INT96(NANOS) will lost precision. - // only keep microseconds. 
- v.set_microsecond(micros % 1000000); - } - _offset += _type_length; - } - break; - } - case ColumnSelectVector::NULL_DATA: { - data_index += run_length; - break; - } - case ColumnSelectVector::FILTERED_CONTENT: { - _offset += _type_length * run_length; - break; - } - case ColumnSelectVector::FILTERED_NULL: { - // do nothing - break; - } - } - } - return Status::OK(); -} - -template -Status FixLengthPlainDecoder::_decode_binary_decimal(MutableColumnPtr& doris_column, - DataTypePtr& data_type, - ColumnSelectVector& select_vector) { - init_decimal_converter(data_type); - DecimalScaleParams& scale_params = _decode_params->decimal_scale; -#define M(FixedTypeLength, ValueCopyType, ScaleType) \ - case FixedTypeLength: \ - return _decode_binary_decimal_internal(doris_column, data_type, \ - select_vector); - -#define APPLY_FOR_DECIMALS(ScaleType) \ - M(1, int64_t, ScaleType) \ - M(2, int64_t, ScaleType) \ - M(3, int64_t, ScaleType) \ - M(4, int64_t, ScaleType) \ - M(5, int64_t, ScaleType) \ - M(6, int64_t, ScaleType) \ - M(7, int64_t, ScaleType) \ - M(8, int64_t, ScaleType) \ - M(9, int128_t, ScaleType) \ - M(10, int128_t, ScaleType) \ - M(11, int128_t, ScaleType) \ - M(12, int128_t, ScaleType) \ - M(13, int128_t, ScaleType) \ - M(14, int128_t, ScaleType) \ - M(15, int128_t, ScaleType) \ - M(16, int128_t, ScaleType) - - if (scale_params.scale_type == DecimalScaleParams::SCALE_UP) { - switch (_type_length) { - APPLY_FOR_DECIMALS(DecimalScaleParams::SCALE_UP) - default: - LOG(FATAL) << "__builtin_unreachable"; - __builtin_unreachable(); - } - } else if (scale_params.scale_type == DecimalScaleParams::SCALE_DOWN) { - switch (_type_length) { - APPLY_FOR_DECIMALS(DecimalScaleParams::SCALE_DOWN) - default: - LOG(FATAL) << "__builtin_unreachable"; - __builtin_unreachable(); - } - } else { - switch (_type_length) { - APPLY_FOR_DECIMALS(DecimalScaleParams::NO_SCALE) - default: - LOG(FATAL) << "__builtin_unreachable"; - __builtin_unreachable(); - } - } - return Status::OK(); -#undef 
APPLY_FOR_DECIMALS -#undef M -} - -template -Status FixLengthPlainDecoder::_decode_binary_decimal_internal(MutableColumnPtr& doris_column, - DataTypePtr& data_type, - ColumnSelectVector& select_vector) { - auto& column_data = - static_cast>&>(*doris_column).get_data(); - size_t data_index = column_data.size(); - column_data.resize(data_index + select_vector.num_values() - select_vector.num_filtered()); - DecimalScaleParams& scale_params = _decode_params->decimal_scale; - - ColumnSelectVector::DataReadType read_type; - while (size_t run_length = select_vector.get_next_run(&read_type)) { - switch (read_type) { - case ColumnSelectVector::CONTENT: { - for (size_t i = 0; i < run_length; ++i) { - char* buf_start = _data->data + _offset; - // When Decimal in parquet is stored in byte arrays, binary and fixed, - // the unscaled number must be encoded as two's complement using big-endian byte order. - DecimalPrimitiveType result_value = 0; - ValueCopyType value = 0; - memcpy(reinterpret_cast(&value), buf_start, fixed_type_length); - value = BitUtil::big_endian_to_host(value); - value = value >> ((sizeof(value) - fixed_type_length) * 8); - result_value = value; - if constexpr (ScaleType == DecimalScaleParams::SCALE_UP) { - result_value *= scale_params.scale_factor; - } else if constexpr (ScaleType == DecimalScaleParams::SCALE_DOWN) { - result_value /= scale_params.scale_factor; - } else if constexpr (ScaleType == DecimalScaleParams::NO_SCALE) { - // do nothing - } else { - LOG(FATAL) << "__builtin_unreachable"; - __builtin_unreachable(); - } - auto& v = reinterpret_cast(column_data[data_index++]); - v = (DecimalPrimitiveType)result_value; - _offset += fixed_type_length; - } - break; - } - case ColumnSelectVector::NULL_DATA: { - data_index += run_length; - break; - } - case ColumnSelectVector::FILTERED_CONTENT: { - _offset += _type_length * run_length; - break; - } - case ColumnSelectVector::FILTERED_NULL: { - // do nothing - break; - } - } - } - return Status::OK(); -} - 
-template -Status FixLengthPlainDecoder::_decode_primitive_decimal(MutableColumnPtr& doris_column, - DataTypePtr& data_type, - ColumnSelectVector& select_vector) { - init_decimal_converter(data_type); - DecimalScaleParams& scale_params = _decode_params->decimal_scale; -#define M(FixedTypeLength, T, ScaleType) \ - case FixedTypeLength: \ - return _decode_primitive_decimal_internal( \ - doris_column, data_type, select_vector); - -#define APPLY_FOR_DECIMALS(ScaleType) \ - M(1, int64_t, ScaleType) \ - M(2, int64_t, ScaleType) \ - M(3, int64_t, ScaleType) \ - M(4, int64_t, ScaleType) \ - M(5, int64_t, ScaleType) \ - M(6, int64_t, ScaleType) \ - M(7, int64_t, ScaleType) \ - M(8, int64_t, ScaleType) \ - M(9, int128_t, ScaleType) \ - M(10, int128_t, ScaleType) \ - M(11, int128_t, ScaleType) \ - M(12, int128_t, ScaleType) \ - M(13, int128_t, ScaleType) \ - M(14, int128_t, ScaleType) \ - M(15, int128_t, ScaleType) \ - M(16, int128_t, ScaleType) - - if (scale_params.scale_type == DecimalScaleParams::SCALE_UP) { - switch (_type_length) { - APPLY_FOR_DECIMALS(DecimalScaleParams::SCALE_UP) - default: - LOG(FATAL) << "__builtin_unreachable"; - __builtin_unreachable(); - } - } else if (scale_params.scale_type == DecimalScaleParams::SCALE_DOWN) { - switch (_type_length) { - APPLY_FOR_DECIMALS(DecimalScaleParams::SCALE_DOWN) - default: - LOG(FATAL) << "__builtin_unreachable"; - __builtin_unreachable(); - } - } else { - switch (_type_length) { - APPLY_FOR_DECIMALS(DecimalScaleParams::NO_SCALE) - default: - LOG(FATAL) << "__builtin_unreachable"; - __builtin_unreachable(); - } - } - return Status::OK(); -#undef APPLY_FOR_DECIMALS -#undef M -} - -template -Status FixLengthPlainDecoder::_decode_primitive_decimal_internal( - MutableColumnPtr& doris_column, DataTypePtr& data_type, ColumnSelectVector& select_vector) { - auto& column_data = - static_cast>&>(*doris_column).get_data(); - size_t data_index = column_data.size(); - column_data.resize(data_index + select_vector.num_values() - 
select_vector.num_filtered()); - DecimalScaleParams& scale_params = _decode_params->decimal_scale; - - ColumnSelectVector::DataReadType read_type; - while (size_t run_length = select_vector.get_next_run(&read_type)) { - switch (read_type) { - case ColumnSelectVector::CONTENT: { - for (size_t i = 0; i < run_length; ++i) { - char* buf_start = _data->data + _offset; - ValueCopyType value = *reinterpret_cast(buf_start); - if constexpr (ScaleType == DecimalScaleParams::SCALE_UP) { - value *= scale_params.scale_factor; - } else if constexpr (ScaleType == DecimalScaleParams::SCALE_DOWN) { - value /= scale_params.scale_factor; - } else if constexpr (ScaleType == DecimalScaleParams::NO_SCALE) { - // do nothing - } else { - LOG(FATAL) << "__builtin_unreachable"; - __builtin_unreachable(); - } - auto& v = reinterpret_cast(column_data[data_index++]); - v = (DecimalPrimitiveType)value; - _offset += _type_length; - } - break; - } - case ColumnSelectVector::NULL_DATA: { - data_index += run_length; - break; - } - case ColumnSelectVector::FILTERED_CONTENT: { - _offset += _type_length * run_length; - break; - } - case ColumnSelectVector::FILTERED_NULL: { - // do nothing - break; - } - } - } - return Status::OK(); -} -} // namespace doris::vectorized diff --git a/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.cpp b/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.cpp index 8333a783a1c4fa..240e537ee05e59 100644 --- a/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.cpp +++ b/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.cpp @@ -190,7 +190,7 @@ Status ColumnChunkReader::load_page_data() { _page_decoder = _decoders[static_cast(encoding)].get(); } else { std::unique_ptr page_decoder; - // std::cout <<"type = "<<_metadata.type <<" "<< encoding <<"\n"; + std::cout <<"type = "<<_metadata.type <<" "<< encoding <<"\n"; RETURN_IF_ERROR(Decoder::get_decoder(_metadata.type, encoding, page_decoder)); // Set type length 
page_decoder->set_type_length(_get_type_length()); diff --git a/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp b/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp index 794dfe7a39a0c6..7ad67cdc80ec78 100644 --- a/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp +++ b/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp @@ -39,6 +39,7 @@ #include "vec/data_types/data_type_struct.h" #include "vec/exec/format/parquet/level_decoder.h" #include "vparquet_column_chunk_reader.h" +#include "vec/exec/format/convert.h" namespace cctz { class time_zone; @@ -479,6 +480,16 @@ Status ScalarColumnReader::_try_load_dict_page(bool* loaded, bool* has_dict) { Status ScalarColumnReader::read_column_data(ColumnPtr& doris_column, DataTypePtr& type, ColumnSelectVector& select_vector, size_t batch_size, size_t* read_rows, bool* eof, bool is_dict_filter) { + bool need_convert = false; + auto & physical_type = _chunk_meta.meta_data.type; + DataTypePtr src_type; + ParquetConvert::convert_data_type_from_parquet(physical_type, src_type,type,&need_convert); + + ColumnPtr src_column = doris_column; + if (need_convert ){ + src_column = src_type->create_column(); + } + if (_chunk_reader->remaining_num_values() == 0) { if (!_chunk_reader->has_next_page()) { *eof = true; @@ -489,7 +500,7 @@ Status ScalarColumnReader::read_column_data(ColumnPtr& doris_column, DataTypePtr } if (_nested_column) { RETURN_IF_ERROR(_chunk_reader->load_page_data_idempotent()); - return _read_nested_column(doris_column, type, select_vector, batch_size, read_rows, eof, + return _read_nested_column(src_column, type, select_vector, batch_size, read_rows, eof, is_dict_filter); } @@ -544,7 +555,7 @@ Status ScalarColumnReader::read_column_data(ColumnPtr& doris_column, DataTypePtr if (skip_whole_batch) { RETURN_IF_ERROR(_skip_values(read_values)); } else { - RETURN_IF_ERROR(_read_values(read_values, doris_column, type, select_vector, + RETURN_IF_ERROR(_read_values(read_values, src_column, type, 
select_vector, is_dict_filter)); } has_read += read_values; @@ -559,6 +570,14 @@ Status ScalarColumnReader::read_column_data(ColumnPtr& doris_column, DataTypePtr if (_chunk_reader->remaining_num_values() == 0 && !_chunk_reader->has_next_page()) { *eof = true; } + if ( need_convert ){ + std::unique_ptr converter; + ParquetConvert::DocTime doc; + doc.init_time( _field_schema , _ctz); + ParquetConvert::get_converter(src_type,type,&converter, doc); + converter->convert(src_column,const_cast(doris_column.get())); + } + return Status::OK(); } diff --git a/be/src/vec/exec/format/parquet/vparquet_reader.cpp b/be/src/vec/exec/format/parquet/vparquet_reader.cpp index cf7955a91b80ca..e4e2e538fb9e91 100644 --- a/be/src/vec/exec/format/parquet/vparquet_reader.cpp +++ b/be/src/vec/exec/format/parquet/vparquet_reader.cpp @@ -47,7 +47,6 @@ #include "vec/core/block.h" #include "vec/core/column_with_type_and_name.h" #include "vec/core/types.h" -#include "vec/exec/format/convert.h" #include "vec/exec/format/parquet/parquet_common.h" #include "vec/exec/format/parquet/schema_desc.h" #include "vec/exec/format/parquet/vparquet_file_metadata.h" @@ -526,38 +525,38 @@ Status ParquetReader::get_next_block(Block* block, size_t* read_rows, bool* eof) } DCHECK(_current_group_reader != nullptr); - { - BlockUPtr src_block; - std::map need_convert; - { - // std::cout <<"->"; - // for(auto i =0; i < block->columns();i++ ){ - // std::cout << block->get_columns()[i]->get_name()<<" "; - // } - // std::cout <<"\n"; - - vector v; - for (auto& col_name : block->get_names()) { - vectorized::DataTypePtr data_type; - tparquet::Type::type parquet_type = - _file_metadata->schema().get_column(col_name)->physical_type; - bool conv = false; - RETURN_IF_ERROR(convert::convert_data_type_from_parquet( - parquet_type, data_type, block->get_by_name(col_name).type, &conv)); - std::cout << col_name << "->" << conv << "\n"; - need_convert[col_name] = conv; - if (conv) { - v.emplace_back(data_type, col_name); - } else { 
- // v.emplace_back( (*std::move(block->get_by_name(col_name).column)).mutate(),data_type,col_name ); - v.emplace_back(block->get_by_name(col_name).column->assume_mutable(), data_type, - col_name); - } - } - src_block = vectorized::Block::create_unique(v); - } +// { +// BlockUPtr src_block; +// std::map need_convert; +// { +// // std::cout <<"->"; +// // for(auto i =0; i < block->columns();i++ ){ +// // std::cout << block->get_columns()[i]->get_name()<<" "; +// // } +// // std::cout <<"\n"; +// +// vector v; +// for (auto& col_name : block->get_names()) { +// vectorized::DataTypePtr data_type; +// tparquet::Type::type parquet_type = +// _file_metadata->schema().get_column(col_name)->physical_type; +// bool conv = false; +// RETURN_IF_ERROR(convert::convert_data_type_from_parquet( +// parquet_type, data_type, block->get_by_name(col_name).type, &conv)); +// std::cout << col_name << "->" << conv << "\n"; +// need_convert[col_name] = conv; +// if (conv) { +// v.emplace_back(data_type, col_name); +// } else { +// // v.emplace_back( (*std::move(block->get_by_name(col_name).column)).mutate(),data_type,col_name ); +// v.emplace_back(block->get_by_name(col_name).column->assume_mutable(), data_type, +// col_name); +// } +// } +// src_block = vectorized::Block::create_unique(v); +// } SCOPED_RAW_TIMER(&_statistics.column_read_time); - Status batch_st = _current_group_reader->next_batch(src_block.get(), _batch_size, read_rows, + Status batch_st = _current_group_reader->next_batch(block, _batch_size, read_rows, &_row_group_eof); if (!batch_st.ok()) { return Status::InternalError("Read parquet file {} failed, reason = {}", @@ -565,24 +564,24 @@ Status ParquetReader::get_next_block(Block* block, size_t* read_rows, bool* eof) } //convert - for (auto i = 0; i < block->columns(); i++) { - std::cout << "colname = " << block->get_names()[i] << " " - << need_convert[block->get_names()[i]] << "\n"; - if (need_convert[block->get_names()[i]]) { - std::unique_ptr converter(nullptr); - 
convert::DocTime doc; - // auto x = - doc.init_time(_file_metadata->schema().get_column(i), _ctz); - RETURN_IF_ERROR(convert::get_converter(src_block->get_data_type(i), - block->get_data_type(i), &converter, doc)); - // block->get_columns()[i]=src_block->get_columns()[i]; - RETURN_IF_ERROR( - - converter->convert(src_block->get_columns()[i].get(), - const_cast(block->get_columns()[i].get()))); - } - } - } +// for (auto i = 0; i < block->columns(); i++) { +// std::cout << "colname = " << block->get_names()[i] << " " +// << need_convert[block->get_names()[i]] << "\n"; +// if (need_convert[block->get_names()[i]]) { +// std::unique_ptr converter(nullptr); +// convert::DocTime doc; +// // auto x = +// doc.init_time(_file_metadata->schema().get_column(i), _ctz); +// RETURN_IF_ERROR(convert::get_converter(src_block->get_data_type(i), +// block->get_data_type(i), &converter, doc)); +// // block->get_columns()[i]=src_block->get_columns()[i]; +// RETURN_IF_ERROR( +// +// converter->convert(src_block->get_columns()[i].get(), +// const_cast(block->get_columns()[i].get()))); +// } +// } +// } if (_row_group_eof) { auto column_st = _current_group_reader->statistics(); From f83da192e00a57050b58bc8ba2cfee93f589b0fb Mon Sep 17 00:00:00 2001 From: changyuwei <2017501503@qq.com> Date: Wed, 11 Oct 2023 23:50:55 +0800 Subject: [PATCH 05/21] remote decode case judge. 
--- .../parquet/byte_array_dict_decoder.cpp | 73 +-- .../format/parquet/byte_array_dict_decoder.h | 87 --- .../parquet/byte_array_plain_decoder.cpp | 106 ++- .../format/parquet/byte_array_plain_decoder.h | 92 --- be/src/vec/exec/format/parquet/decoder.cpp | 82 +-- be/src/vec/exec/format/parquet/decoder.h | 56 +- .../format/parquet/delta_bit_pack_decoder.cpp | 283 -------- .../format/parquet/delta_bit_pack_decoder.h | 314 +++++++-- .../parquet/fix_length_dict_decoder.hpp | 609 +----------------- .../format/parquet/fix_length_plain_decoder.h | 157 ++++- .../parquet_column_convert.h} | 372 ++++++----- .../parquet/vparquet_column_chunk_reader.cpp | 8 +- .../parquet/vparquet_column_chunk_reader.h | 2 +- .../format/parquet/vparquet_column_reader.cpp | 53 +- .../format/parquet/vparquet_group_reader.cpp | 5 +- .../exec/format/parquet/vparquet_reader.cpp | 64 +- be/src/vec/exec/scan/scanner_context.cpp | 8 - be/src/vec/exec/scan/vfile_scanner.cpp | 2 +- 18 files changed, 786 insertions(+), 1587 deletions(-) delete mode 100644 be/src/vec/exec/format/parquet/delta_bit_pack_decoder.cpp rename be/src/vec/exec/format/{convert.h => parquet/parquet_column_convert.h} (61%) diff --git a/be/src/vec/exec/format/parquet/byte_array_dict_decoder.cpp b/be/src/vec/exec/format/parquet/byte_array_dict_decoder.cpp index 6f5f36a33a972d..b6a614831a3f32 100644 --- a/be/src/vec/exec/format/parquet/byte_array_dict_decoder.cpp +++ b/be/src/vec/exec/format/parquet/byte_array_dict_decoder.cpp @@ -125,56 +125,35 @@ Status ByteArrayDictDecoder::_decode_values(MutableColumnPtr& doris_column, Data return _decode_dict_values(doris_column, select_vector, is_dict_filter); } - TypeIndex logical_type = remove_nullable(data_type)->get_type_id(); - switch (logical_type) { - case TypeIndex::String: - [[fallthrough]]; - case TypeIndex::FixedString: { - size_t dict_index = 0; + size_t dict_index = 0; - ColumnSelectVector::DataReadType read_type; - while (size_t run_length = select_vector.get_next_run(&read_type)) 
{ - switch (read_type) { - case ColumnSelectVector::CONTENT: { - std::vector string_values; - string_values.reserve(run_length); - for (size_t i = 0; i < run_length; ++i) { - string_values.emplace_back(_dict_items[_indexes[dict_index++]]); - } - doris_column->insert_many_strings_overflow(&string_values[0], run_length, - _max_value_length); - break; - } - case ColumnSelectVector::NULL_DATA: { - doris_column->insert_many_defaults(run_length); - break; - } - case ColumnSelectVector::FILTERED_CONTENT: { - dict_index += run_length; - break; - } - case ColumnSelectVector::FILTERED_NULL: { - // do nothing - break; - } + ColumnSelectVector::DataReadType read_type; + while (size_t run_length = select_vector.get_next_run(&read_type)) { + switch (read_type) { + case ColumnSelectVector::CONTENT: { + std::vector string_values; + string_values.reserve(run_length); + for (size_t i = 0; i < run_length; ++i) { + string_values.emplace_back(_dict_items[_indexes[dict_index++]]); } + doris_column->insert_many_strings_overflow(&string_values[0], run_length, + _max_value_length); + break; + } + case ColumnSelectVector::NULL_DATA: { + doris_column->insert_many_defaults(run_length); + break; + } + case ColumnSelectVector::FILTERED_CONTENT: { + dict_index += run_length; + break; + } + case ColumnSelectVector::FILTERED_NULL: { + // do nothing + break; + } } - return Status::OK(); - } - case TypeIndex::Decimal32: - return _decode_binary_decimal(doris_column, data_type, select_vector); - case TypeIndex::Decimal64: - return _decode_binary_decimal(doris_column, data_type, select_vector); - case TypeIndex::Decimal128: - return _decode_binary_decimal(doris_column, data_type, select_vector); - case TypeIndex::Decimal128I: - return _decode_binary_decimal(doris_column, data_type, select_vector); - // TODO: decimal256 - default: - break; } - return Status::InvalidArgument( - "Can't decode parquet physical type BYTE_ARRAY to doris logical type {}", - getTypeName(logical_type)); + return Status::OK(); } 
} // namespace doris::vectorized diff --git a/be/src/vec/exec/format/parquet/byte_array_dict_decoder.h b/be/src/vec/exec/format/parquet/byte_array_dict_decoder.h index 2f90ada4282a14..0267cf17f755d9 100644 --- a/be/src/vec/exec/format/parquet/byte_array_dict_decoder.h +++ b/be/src/vec/exec/format/parquet/byte_array_dict_decoder.h @@ -66,97 +66,10 @@ class ByteArrayDictDecoder final : public BaseDictDecoder { MutableColumnPtr convert_dict_column_to_string_column(const ColumnInt32* dict_column) override; protected: - template - Status _decode_binary_decimal(MutableColumnPtr& doris_column, DataTypePtr& data_type, - ColumnSelectVector& select_vector); - // For dictionary encoding std::vector _dict_items; std::vector _dict_data; size_t _max_value_length; std::unordered_map _dict_value_to_code; - -private: - template - Status _decode_binary_decimal_internal(MutableColumnPtr& doris_column, DataTypePtr& data_type, - ColumnSelectVector& select_vector); }; - -template -Status ByteArrayDictDecoder::_decode_binary_decimal(MutableColumnPtr& doris_column, - DataTypePtr& data_type, - ColumnSelectVector& select_vector) { - init_decimal_converter(data_type); - DecimalScaleParams& scale_params = _decode_params->decimal_scale; - if (scale_params.scale_type == DecimalScaleParams::SCALE_UP) { - return _decode_binary_decimal_internal( - doris_column, data_type, select_vector); - } else if (scale_params.scale_type == DecimalScaleParams::SCALE_DOWN) { - return _decode_binary_decimal_internal( - doris_column, data_type, select_vector); - } else { - return _decode_binary_decimal_internal( - doris_column, data_type, select_vector); - } -} - -template -Status ByteArrayDictDecoder::_decode_binary_decimal_internal(MutableColumnPtr& doris_column, - DataTypePtr& data_type, - ColumnSelectVector& select_vector) { - auto& column_data = - static_cast>&>(*doris_column).get_data(); - size_t data_index = column_data.size(); - column_data.resize(data_index + select_vector.num_values() - 
select_vector.num_filtered()); - size_t dict_index = 0; - DecimalScaleParams& scale_params = _decode_params->decimal_scale; - ColumnSelectVector::DataReadType read_type; - while (size_t run_length = select_vector.get_next_run(&read_type)) { - switch (read_type) { - case ColumnSelectVector::CONTENT: { - for (size_t i = 0; i < run_length; ++i) { - StringRef& slice = _dict_items[_indexes[dict_index++]]; - char* buf_start = const_cast(slice.data); - uint32_t length = (uint32_t)slice.size; - // When Decimal in parquet is stored in byte arrays, binary and fixed, - // the unscaled number must be encoded as two's complement using big-endian byte order. - DecimalPrimitiveType value = 0; - memcpy(reinterpret_cast(&value), buf_start, length); - value = BitUtil::big_endian_to_host(value); - value = value >> ((sizeof(value) - length) * 8); - if constexpr (ScaleType == DecimalScaleParams::SCALE_UP) { - value *= scale_params.scale_factor; - } else if constexpr (ScaleType == DecimalScaleParams::SCALE_DOWN) { - value /= scale_params.scale_factor; - } else if constexpr (ScaleType == DecimalScaleParams::NO_SCALE) { - // do nothing - } else { - LOG(FATAL) << "__builtin_unreachable"; - __builtin_unreachable(); - } - auto& v = reinterpret_cast(column_data[data_index++]); - v = (DecimalPrimitiveType)value; - } - break; - } - case ColumnSelectVector::NULL_DATA: { - data_index += run_length; - break; - } - case ColumnSelectVector::FILTERED_CONTENT: { - dict_index += run_length; - break; - } - case ColumnSelectVector::FILTERED_NULL: { - // do nothing - break; - } - } - } - return Status::OK(); -} } // namespace doris::vectorized diff --git a/be/src/vec/exec/format/parquet/byte_array_plain_decoder.cpp b/be/src/vec/exec/format/parquet/byte_array_plain_decoder.cpp index 911f090b1ce642..4dde378dc8eb2c 100644 --- a/be/src/vec/exec/format/parquet/byte_array_plain_decoder.cpp +++ b/be/src/vec/exec/format/parquet/byte_array_plain_decoder.cpp @@ -56,73 +56,53 @@ template Status 
ByteArrayPlainDecoder::_decode_values(MutableColumnPtr& doris_column, DataTypePtr& data_type, ColumnSelectVector& select_vector, bool is_dict_filter) { -// TypeIndex logical_type = remove_nullable(data_type)->get_type_id(); -// switch (logical_type) { -// case TypeIndex::String: -// [[fallthrough]]; -// case TypeIndex::FixedString: { - ColumnSelectVector::DataReadType read_type; - while (size_t run_length = select_vector.get_next_run(&read_type)) { - switch (read_type) { - case ColumnSelectVector::CONTENT: { - std::vector string_values; - string_values.reserve(run_length); - for (size_t i = 0; i < run_length; ++i) { - if (UNLIKELY(_offset + 4 > _data->size)) { - return Status::IOError("Can't read byte array length from plain decoder"); - } - uint32_t length = decode_fixed32_le( - reinterpret_cast(_data->data) + _offset); - _offset += 4; - if (UNLIKELY(_offset + length) > _data->size) { - return Status::IOError("Can't read enough bytes in plain decoder"); - } - string_values.emplace_back(_data->data + _offset, length); - _offset += length; + ColumnSelectVector::DataReadType read_type; + while (size_t run_length = select_vector.get_next_run(&read_type)) { + switch (read_type) { + case ColumnSelectVector::CONTENT: { + std::vector string_values; + string_values.reserve(run_length); + for (size_t i = 0; i < run_length; ++i) { + if (UNLIKELY(_offset + 4 > _data->size)) { + return Status::IOError("Can't read byte array length from plain decoder"); } - doris_column->insert_many_strings(&string_values[0], run_length); - break; - } - case ColumnSelectVector::NULL_DATA: { - doris_column->insert_many_defaults(run_length); - break; - } - case ColumnSelectVector::FILTERED_CONTENT: { - for (int i = 0; i < run_length; ++i) { - if (UNLIKELY(_offset + 4 > _data->size)) { - return Status::IOError("Can't read byte array length from plain decoder"); - } - uint32_t length = decode_fixed32_le( - reinterpret_cast(_data->data) + _offset); - _offset += 4; - if (UNLIKELY(_offset + length) > 
_data->size) { - return Status::IOError("Can't read enough bytes in plain decoder"); - } - _offset += length; + uint32_t length = + decode_fixed32_le(reinterpret_cast(_data->data) + _offset); + _offset += 4; + if (UNLIKELY(_offset + length) > _data->size) { + return Status::IOError("Can't read enough bytes in plain decoder"); } - break; - } - case ColumnSelectVector::FILTERED_NULL: { - // do nothing - break; + string_values.emplace_back(_data->data + _offset, length); + _offset += length; } + doris_column->insert_many_strings(&string_values[0], run_length); + break; + } + case ColumnSelectVector::NULL_DATA: { + doris_column->insert_many_defaults(run_length); + break; + } + case ColumnSelectVector::FILTERED_CONTENT: { + for (int i = 0; i < run_length; ++i) { + if (UNLIKELY(_offset + 4 > _data->size)) { + return Status::IOError("Can't read byte array length from plain decoder"); + } + uint32_t length = + decode_fixed32_le(reinterpret_cast(_data->data) + _offset); + _offset += 4; + if (UNLIKELY(_offset + length) > _data->size) { + return Status::IOError("Can't read enough bytes in plain decoder"); + } + _offset += length; } + break; + } + case ColumnSelectVector::FILTERED_NULL: { + // do nothing + break; + } } - return Status::OK(); -// } -// case TypeIndex::Decimal32: -// return _decode_binary_decimal(doris_column, data_type, select_vector); -// case TypeIndex::Decimal64: -// return _decode_binary_decimal(doris_column, data_type, select_vector); -// case TypeIndex::Decimal128: -// return _decode_binary_decimal(doris_column, data_type, select_vector); -// case TypeIndex::Decimal128I: -// return _decode_binary_decimal(doris_column, data_type, select_vector); -// default: -// break; -// } -// return Status::InvalidArgument( -// "Can't decode parquet physical type BYTE_ARRAY to doris logical type {}", -// getTypeName(logical_type)); + } + return Status::OK(); } } // namespace doris::vectorized diff --git a/be/src/vec/exec/format/parquet/byte_array_plain_decoder.h 
b/be/src/vec/exec/format/parquet/byte_array_plain_decoder.h index 5d5d23db603f6a..5fb8a9622c095c 100644 --- a/be/src/vec/exec/format/parquet/byte_array_plain_decoder.h +++ b/be/src/vec/exec/format/parquet/byte_array_plain_decoder.h @@ -56,97 +56,5 @@ class ByteArrayPlainDecoder final : public Decoder { ColumnSelectVector& select_vector, bool is_dict_filter); Status skip_values(size_t num_values) override; - -protected: - template - Status _decode_binary_decimal(MutableColumnPtr& doris_column, DataTypePtr& data_type, - ColumnSelectVector& select_vector); - -private: - template - Status _decode_binary_decimal_internal(MutableColumnPtr& doris_column, DataTypePtr& data_type, - ColumnSelectVector& select_vector); }; - -template -Status ByteArrayPlainDecoder::_decode_binary_decimal(MutableColumnPtr& doris_column, - DataTypePtr& data_type, - ColumnSelectVector& select_vector) { - init_decimal_converter(data_type); - DecimalScaleParams& scale_params = _decode_params->decimal_scale; - if (scale_params.scale_type == DecimalScaleParams::SCALE_UP) { - return _decode_binary_decimal_internal( - doris_column, data_type, select_vector); - } else if (scale_params.scale_type == DecimalScaleParams::SCALE_DOWN) { - return _decode_binary_decimal_internal( - doris_column, data_type, select_vector); - } else { - return _decode_binary_decimal_internal( - doris_column, data_type, select_vector); - } -} - -template -Status ByteArrayPlainDecoder::_decode_binary_decimal_internal(MutableColumnPtr& doris_column, - DataTypePtr& data_type, - ColumnSelectVector& select_vector) { - auto& column_data = - static_cast>&>(*doris_column).get_data(); - size_t data_index = column_data.size(); - column_data.resize(data_index + select_vector.num_values() - select_vector.num_filtered()); - DecimalScaleParams& scale_params = _decode_params->decimal_scale; - ColumnSelectVector::DataReadType read_type; - while (size_t run_length = select_vector.get_next_run(&read_type)) { - switch (read_type) { - case 
ColumnSelectVector::CONTENT: { - for (size_t i = 0; i < run_length; ++i) { - if (UNLIKELY(_offset + 4 > _data->size)) { - return Status::IOError("Can't read byte array length from plain decoder"); - } - uint32_t length = - decode_fixed32_le(reinterpret_cast(_data->data) + _offset); - _offset += 4; - char* buf_start = _data->data + _offset; - _offset += length; - // When Decimal in parquet is stored in byte arrays, binary and fixed, - // the unscaled number must be encoded as two's complement using big-endian byte order. - DecimalPrimitiveType value = 0; - memcpy(reinterpret_cast(&value), buf_start, length); - value = BitUtil::big_endian_to_host(value); - value = value >> ((sizeof(value) - length) * 8); - if constexpr (ScaleType == DecimalScaleParams::SCALE_UP) { - value *= scale_params.scale_factor; - } else if constexpr (ScaleType == DecimalScaleParams::SCALE_DOWN) { - value /= scale_params.scale_factor; - } else if constexpr (ScaleType == DecimalScaleParams::NO_SCALE) { - // do nothing - } else { - LOG(FATAL) << "__builtin_unreachable"; - __builtin_unreachable(); - } - auto& v = reinterpret_cast(column_data[data_index++]); - v = (DecimalPrimitiveType)value; - } - break; - } - case ColumnSelectVector::NULL_DATA: { - data_index += run_length; - break; - } - case ColumnSelectVector::FILTERED_CONTENT: { - _offset += _type_length * run_length; - break; - } - case ColumnSelectVector::FILTERED_NULL: { - // do nothing - break; - } - } - } - return Status::OK(); -} } // namespace doris::vectorized diff --git a/be/src/vec/exec/format/parquet/decoder.cpp b/be/src/vec/exec/format/parquet/decoder.cpp index 0a158176091511..952d226af24e4e 100644 --- a/be/src/vec/exec/format/parquet/decoder.cpp +++ b/be/src/vec/exec/format/parquet/decoder.cpp @@ -31,8 +31,6 @@ namespace doris::vectorized { -const cctz::time_zone DecodeParams::utc0 = cctz::utc_time_zone(); - Status Decoder::get_decoder(tparquet::Type::type type, tparquet::Encoding::type encoding, std::unique_ptr& decoder) { 
switch (encoding) { @@ -45,17 +43,22 @@ Status Decoder::get_decoder(tparquet::Type::type type, tparquet::Encoding::type decoder.reset(new ByteArrayPlainDecoder()); break; case tparquet::Type::INT32: - [[fallthrough]]; + decoder.reset(new FixLengthPlainDecoder()); + break; case tparquet::Type::INT64: - [[fallthrough]]; + decoder.reset(new FixLengthPlainDecoder()); + break; case tparquet::Type::INT96: - [[fallthrough]]; + decoder.reset(new FixLengthPlainDecoder()); + break; case tparquet::Type::FLOAT: - [[fallthrough]]; + decoder.reset(new FixLengthPlainDecoder()); + break; case tparquet::Type::DOUBLE: - [[fallthrough]]; + decoder.reset(new FixLengthPlainDecoder()); + break; case tparquet::Type::FIXED_LEN_BYTE_ARRAY: - decoder.reset(new FixLengthPlainDecoder(type)); + decoder.reset(new FixLengthPlainDecoder()); break; default: return Status::InternalError("Unsupported type {}(encoding={}) in parquet decoder", @@ -70,22 +73,22 @@ Status Decoder::get_decoder(tparquet::Type::type type, tparquet::Encoding::type decoder.reset(new ByteArrayDictDecoder()); break; case tparquet::Type::INT32: - decoder.reset(new FixLengthDictDecoder(type)); + decoder.reset(new FixLengthDictDecoder()); break; case tparquet::Type::INT64: - decoder.reset(new FixLengthDictDecoder(type)); + decoder.reset(new FixLengthDictDecoder()); break; case tparquet::Type::INT96: - decoder.reset(new FixLengthDictDecoder(type)); + decoder.reset(new FixLengthDictDecoder()); break; case tparquet::Type::FLOAT: - decoder.reset(new FixLengthDictDecoder(type)); + decoder.reset(new FixLengthDictDecoder()); break; case tparquet::Type::DOUBLE: - decoder.reset(new FixLengthDictDecoder(type)); + decoder.reset(new FixLengthDictDecoder()); break; case tparquet::Type::FIXED_LEN_BYTE_ARRAY: - decoder.reset(new FixLengthDictDecoder(type)); + decoder.reset(new FixLengthDictDecoder()); break; default: return Status::InternalError("Unsupported type {}(encoding={}) in parquet decoder", @@ -106,10 +109,10 @@ Status 
Decoder::get_decoder(tparquet::Type::type type, tparquet::Encoding::type // Supports only INT32 and INT64. switch (type) { case tparquet::Type::INT32: - decoder.reset(new DeltaBitPackDecoder(type)); + decoder.reset(new DeltaBitPackDecoder()); break; case tparquet::Type::INT64: - decoder.reset(new DeltaBitPackDecoder(type)); + decoder.reset(new DeltaBitPackDecoder()); break; default: return Status::InternalError("DELTA_BINARY_PACKED only supports INT32 and INT64"); @@ -118,7 +121,7 @@ Status Decoder::get_decoder(tparquet::Type::type type, tparquet::Encoding::type case tparquet::Encoding::DELTA_BYTE_ARRAY: switch (type) { case tparquet::Type::BYTE_ARRAY: - decoder.reset(new DeltaByteArrayDecoder(type)); + decoder.reset(new DeltaByteArrayDecoder()); break; default: return Status::InternalError("DELTA_BYTE_ARRAY only supports BYTE_ARRAY."); @@ -127,7 +130,7 @@ Status Decoder::get_decoder(tparquet::Type::type type, tparquet::Encoding::type case tparquet::Encoding::DELTA_LENGTH_BYTE_ARRAY: switch (type) { case tparquet::Type::FIXED_LEN_BYTE_ARRAY: - decoder.reset(new DeltaLengthByteArrayDecoder(type)); + decoder.reset(new DeltaLengthByteArrayDecoder()); break; default: return Status::InternalError( @@ -141,47 +144,4 @@ Status Decoder::get_decoder(tparquet::Type::type type, tparquet::Encoding::type return Status::OK(); } -void Decoder::init(FieldSchema* field_schema, cctz::time_zone* ctz) { - _field_schema = field_schema; - if (_decode_params == nullptr) { - _decode_params.reset(new DecodeParams()); - } - if (ctz != nullptr) { - _decode_params->ctz = ctz; - } - const auto& schema = field_schema->parquet_schema; - if (schema.__isset.logicalType && schema.logicalType.__isset.TIMESTAMP) { - const auto& timestamp_info = schema.logicalType.TIMESTAMP; - if (!timestamp_info.isAdjustedToUTC) { - // should set timezone to utc+0 - _decode_params->ctz = const_cast(&_decode_params->utc0); - } - const auto& time_unit = timestamp_info.unit; - if (time_unit.__isset.MILLIS) { - 
_decode_params->second_mask = 1000; - _decode_params->scale_to_nano_factor = 1000000; - } else if (time_unit.__isset.MICROS) { - _decode_params->second_mask = 1000000; - _decode_params->scale_to_nano_factor = 1000; - } else if (time_unit.__isset.NANOS) { - _decode_params->second_mask = 1000000000; - _decode_params->scale_to_nano_factor = 1; - } - } else if (schema.__isset.converted_type) { - const auto& converted_type = schema.converted_type; - if (converted_type == tparquet::ConvertedType::TIMESTAMP_MILLIS) { - _decode_params->second_mask = 1000; - _decode_params->scale_to_nano_factor = 1000000; - } else if (converted_type == tparquet::ConvertedType::TIMESTAMP_MICROS) { - _decode_params->second_mask = 1000000; - _decode_params->scale_to_nano_factor = 1000; - } - } - - if (_decode_params->ctz) { - VecDateTimeValue t; - t.from_unixtime(0, *_decode_params->ctz); - _decode_params->offset_days = t.day() == 31 ? -1 : 0; // If 1969-12-31, then returns -1. - } -} } // namespace doris::vectorized diff --git a/be/src/vec/exec/format/parquet/decoder.h b/be/src/vec/exec/format/parquet/decoder.h index 02440a2484d1bf..4e56dea3d106f5 100644 --- a/be/src/vec/exec/format/parquet/decoder.h +++ b/be/src/vec/exec/format/parquet/decoder.h @@ -54,29 +54,6 @@ class ColumnString; namespace doris::vectorized { -#define FOR_LOGICAL_NUMERIC_TYPES(M) \ - M(TypeIndex::Int8, Int8, Int32) \ - M(TypeIndex::UInt8, UInt8, Int32) \ - M(TypeIndex::Int16, Int16, Int32) \ - M(TypeIndex::UInt16, UInt16, Int32) \ - M(TypeIndex::Int32, Int32, Int32) \ - M(TypeIndex::UInt32, UInt32, Int32) \ - M(TypeIndex::Int64, Int64, Int64) \ - M(TypeIndex::UInt64, UInt64, Int64) \ - M(TypeIndex::Float32, Float32, Float32) \ - M(TypeIndex::Float64, Float64, Float64) - -struct DecodeParams { - // schema.logicalType.TIMESTAMP.isAdjustedToUTC == false - static const cctz::time_zone utc0; - // schema.logicalType.TIMESTAMP.isAdjustedToUTC == true, we should set the time zone - cctz::time_zone* ctz = nullptr; - int32_t 
offset_days = 0; - int64_t second_mask = 1; - int64_t scale_to_nano_factor = 1; - DecimalScaleParams decimal_scale; -}; - class Decoder { public: Decoder() = default; @@ -94,11 +71,6 @@ class Decoder { _offset = 0; } - void init(FieldSchema* field_schema, cctz::time_zone* ctz); - - template - void init_decimal_converter(DataTypePtr& data_type); - // Write the decoded values batch to doris's column virtual Status decode_values(MutableColumnPtr& doris_column, DataTypePtr& data_type, ColumnSelectVector& select_vector, bool is_dict_filter) = 0; @@ -126,34 +98,8 @@ class Decoder { int32_t _type_length; Slice* _data = nullptr; uint32_t _offset = 0; - FieldSchema* _field_schema = nullptr; - std::unique_ptr _decode_params = nullptr; }; -template -void Decoder::init_decimal_converter(DataTypePtr& data_type) { - if (_decode_params == nullptr || _field_schema == nullptr || - _decode_params->decimal_scale.scale_type != DecimalScaleParams::NOT_INIT) { - return; - } - auto scale = _field_schema->parquet_schema.scale; - auto* decimal_type = reinterpret_cast>*>( - const_cast(remove_nullable(data_type).get())); - auto dest_scale = decimal_type->get_scale(); - if (dest_scale > scale) { - _decode_params->decimal_scale.scale_type = DecimalScaleParams::SCALE_UP; - _decode_params->decimal_scale.scale_factor = - DecimalScaleParams::get_scale_factor(dest_scale - scale); - } else if (dest_scale < scale) { - _decode_params->decimal_scale.scale_type = DecimalScaleParams::SCALE_DOWN; - _decode_params->decimal_scale.scale_factor = - DecimalScaleParams::get_scale_factor(scale - dest_scale); - } else { - _decode_params->decimal_scale.scale_type = DecimalScaleParams::NO_SCALE; - _decode_params->decimal_scale.scale_factor = 1; - } -} - class BaseDictDecoder : public Decoder { public: BaseDictDecoder() = default; @@ -220,4 +166,4 @@ class BaseDictDecoder : public Decoder { std::vector _indexes; }; -} // namespace doris::vectorized \ No newline at end of file +} // namespace doris::vectorized diff 
--git a/be/src/vec/exec/format/parquet/delta_bit_pack_decoder.cpp b/be/src/vec/exec/format/parquet/delta_bit_pack_decoder.cpp deleted file mode 100644 index f734f3012c11d3..00000000000000 --- a/be/src/vec/exec/format/parquet/delta_bit_pack_decoder.cpp +++ /dev/null @@ -1,283 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#include "delta_bit_pack_decoder.h" - -#include - -#include -#include - -#include "vec/columns/column.h" -#include "vec/common/arithmetic_overflow.h" -#include "vec/common/string_ref.h" -#include "vec/core/types.h" -#include "vec/data_types/data_type_nullable.h" - -namespace doris::vectorized { - -template -Status DeltaBitPackDecoder::_init_header() { - if (!_bit_reader->GetVlqInt(&_values_per_block) || - !_bit_reader->GetVlqInt(&_mini_blocks_per_block) || - !_bit_reader->GetVlqInt(&_total_value_count) || - !_bit_reader->GetZigZagVlqInt(&_last_value)) { - return Status::IOError("Init header eof"); - } - if (_values_per_block == 0) { - return Status::InvalidArgument("Cannot have zero value per block"); - } - if (_values_per_block % 128 != 0) { - return Status::InvalidArgument( - "the number of values in a block must be multiple of 128, but it's " + - std::to_string(_values_per_block)); - } - if (_mini_blocks_per_block == 0) { - return Status::InvalidArgument("Cannot have zero miniblock per block"); - } - _values_per_mini_block = _values_per_block / _mini_blocks_per_block; - if (_values_per_mini_block == 0) { - return Status::InvalidArgument("Cannot have zero value per miniblock"); - } - if (_values_per_mini_block % 32 != 0) { - return Status::InvalidArgument( - "The number of values in a miniblock must be multiple of 32, but it's " + - std::to_string(_values_per_mini_block)); - } - _total_values_remaining = _total_value_count; - _delta_bit_widths.resize(_mini_blocks_per_block); - // init as empty property - _block_initialized = false; - _values_remaining_current_mini_block = 0; - return Status::OK(); -} - -template -Status DeltaBitPackDecoder::_init_block() { - DCHECK_GT(_total_values_remaining, 0) << "InitBlock called at EOF"; - if (!_bit_reader->GetZigZagVlqInt(&_min_delta)) { - return Status::IOError("Init block eof"); - } - - // read the bitwidth of each miniblock - uint8_t* bit_width_data = _delta_bit_widths.data(); - for (uint32_t i = 0; i < 
_mini_blocks_per_block; ++i) { - if (!_bit_reader->GetAligned(1, bit_width_data + i)) { - return Status::IOError("Decode bit-width EOF"); - } - // Note that non-conformant bitwidth entries are allowed by the Parquet spec - // for extraneous miniblocks in the last block (GH-14923), so we check - // the bitwidths when actually using them (see InitMiniBlock()). - } - _mini_block_idx = 0; - _block_initialized = true; - RETURN_IF_ERROR(_init_mini_block(bit_width_data[0])); - return Status::OK(); -} - -template -Status DeltaBitPackDecoder::_init_mini_block(int bit_width) { - if (PREDICT_FALSE(bit_width > kMaxDeltaBitWidth)) { - return Status::InvalidArgument("delta bit width larger than integer bit width"); - } - _delta_bit_width = bit_width; - _values_remaining_current_mini_block = _values_per_mini_block; - return Status::OK(); -} - -template -Status DeltaBitPackDecoder::_get_internal(T* buffer, int num_values, int* out_num_values) { - num_values = static_cast(std::min(num_values, _total_values_remaining)); - if (num_values == 0) { - *out_num_values = 0; - return Status::OK(); - } - int i = 0; - while (i < num_values) { - if (PREDICT_FALSE(_values_remaining_current_mini_block == 0)) { - if (PREDICT_FALSE(!_block_initialized)) { - buffer[i++] = _last_value; - DCHECK_EQ(i, 1); // we're at the beginning of the page - if (i == num_values) { - // When block is uninitialized and i reaches num_values we have two - // different possibilities: - // 1. _total_value_count == 1, which means that the page may have only - // one value (encoded in the header), and we should not initialize - // any block. - // 2. _total_value_count != 1, which means we should initialize the - // incoming block for subsequent reads. 
- if (_total_value_count != 1) { - RETURN_IF_ERROR(_init_block()); - } - break; - } - RETURN_IF_ERROR(_init_block()); - } else { - ++_mini_block_idx; - if (_mini_block_idx < _mini_blocks_per_block) { - RETURN_IF_ERROR(_init_mini_block(_delta_bit_widths.data()[_mini_block_idx])); - } else { - RETURN_IF_ERROR(_init_block()); - } - } - } - - int values_decode = std::min(_values_remaining_current_mini_block, - static_cast(num_values - i)); - for (int j = 0; j < values_decode; ++j) { - if (!_bit_reader->GetValue(_delta_bit_width, buffer + i + j)) { - return Status::IOError("Get batch EOF"); - } - } - for (int j = 0; j < values_decode; ++j) { - // Addition between min_delta, packed int and last_value should be treated as - // unsigned addition. Overflow is as expected. - buffer[i + j] = static_cast(_min_delta) + static_cast(buffer[i + j]) + - static_cast(_last_value); - _last_value = buffer[i + j]; - } - _values_remaining_current_mini_block -= values_decode; - i += values_decode; - } - _total_values_remaining -= num_values; - - if (PREDICT_FALSE(_total_values_remaining == 0)) { - if (!_bit_reader->Advance(_delta_bit_width * _values_remaining_current_mini_block)) { - return Status::IOError("Skip padding EOF"); - } - _values_remaining_current_mini_block = 0; - } - *out_num_values = num_values; - return Status::OK(); -} - -void DeltaLengthByteArrayDecoder::_decode_lengths() { - _len_decoder.set_bit_reader(_bit_reader); - // get the number of encoded lengths - int num_length = _len_decoder.valid_values_count(); - _buffered_length.resize(num_length); - - // decode all the lengths. all the lengths are buffered in buffered_length_. 
- int ret; - Status st = _len_decoder.decode(_buffered_length.data(), num_length, &ret); - if (!st.ok()) { - LOG(FATAL) << "Fail to decode delta length, status: " << st; - } - DCHECK_EQ(ret, num_length); - _length_idx = 0; - _num_valid_values = num_length; -} - -Status DeltaLengthByteArrayDecoder::_get_internal(Slice* buffer, int max_values, - int* out_num_values) { - // Decode up to `max_values` strings into an internal buffer - // and reference them into `buffer`. - max_values = std::min(max_values, _num_valid_values); - if (max_values == 0) { - *out_num_values = 0; - return Status::OK(); - } - - int32_t data_size = 0; - const int32_t* length_ptr = _buffered_length.data() + _length_idx; - for (int i = 0; i < max_values; ++i) { - int32_t len = length_ptr[i]; - if (PREDICT_FALSE(len < 0)) { - return Status::InvalidArgument("Negative string delta length"); - } - buffer[i].size = len; - if (common::add_overflow(data_size, len, data_size)) { - return Status::InvalidArgument("Excess expansion in DELTA_(LENGTH_)BYTE_ARRAY"); - } - } - _length_idx += max_values; - - _buffered_data.resize(data_size); - char* data_ptr = _buffered_data.data(); - for (int j = 0; j < data_size; j++) { - if (!_bit_reader->GetValue(8, data_ptr + j)) { - return Status::IOError("Get length bytes EOF"); - } - } - - for (int i = 0; i < max_values; ++i) { - buffer[i].data = data_ptr; - data_ptr += buffer[i].size; - } - // this->num_values_ -= max_values; - _num_valid_values -= max_values; - *out_num_values = max_values; - return Status::OK(); -} - -Status DeltaByteArrayDecoder::_get_internal(Slice* buffer, int max_values, int* out_num_values) { - // Decode up to `max_values` strings into an internal buffer - // and reference them into `buffer`. 
- max_values = std::min(max_values, _num_valid_values); - if (max_values == 0) { - *out_num_values = max_values; - return Status::OK(); - } - - int suffix_read; - RETURN_IF_ERROR(_suffix_decoder.decode(buffer, max_values, &suffix_read)); - if (PREDICT_FALSE(suffix_read != max_values)) { - return Status::IOError("Read {}, expecting {} from suffix decoder", - std::to_string(suffix_read), std::to_string(max_values)); - } - - int64_t data_size = 0; - const int32_t* prefix_len_ptr = _buffered_prefix_length.data() + _prefix_len_offset; - for (int i = 0; i < max_values; ++i) { - if (PREDICT_FALSE(prefix_len_ptr[i] < 0)) { - return Status::InvalidArgument("negative prefix length in DELTA_BYTE_ARRAY"); - } - if (PREDICT_FALSE(common::add_overflow(data_size, static_cast(prefix_len_ptr[i]), - data_size) || - common::add_overflow(data_size, static_cast(buffer[i].size), - data_size))) { - return Status::InvalidArgument("excess expansion in DELTA_BYTE_ARRAY"); - } - } - _buffered_data.resize(data_size); - - std::string_view prefix {_last_value}; - - char* data_ptr = _buffered_data.data(); - for (int i = 0; i < max_values; ++i) { - if (PREDICT_FALSE(static_cast(prefix_len_ptr[i]) > prefix.length())) { - return Status::InvalidArgument("prefix length too large in DELTA_BYTE_ARRAY"); - } - memcpy(data_ptr, prefix.data(), prefix_len_ptr[i]); - // buffer[i] currently points to the string suffix - memcpy(data_ptr + prefix_len_ptr[i], buffer[i].data, buffer[i].size); - buffer[i].data = data_ptr; - buffer[i].size += prefix_len_ptr[i]; - data_ptr += buffer[i].size; - prefix = std::string_view {buffer[i].data, buffer[i].size}; - } - _prefix_len_offset += max_values; - _num_valid_values -= max_values; - _last_value = std::string {prefix}; - - if (_num_valid_values == 0) { - _last_value_in_previous_page = _last_value; - } - *out_num_values = max_values; - return Status::OK(); -} -} // namespace doris::vectorized diff --git a/be/src/vec/exec/format/parquet/delta_bit_pack_decoder.h 
b/be/src/vec/exec/format/parquet/delta_bit_pack_decoder.h index 464229cda9cab4..6f893d5db80fa4 100644 --- a/be/src/vec/exec/format/parquet/delta_bit_pack_decoder.h +++ b/be/src/vec/exec/format/parquet/delta_bit_pack_decoder.h @@ -51,14 +51,11 @@ class DeltaDecoder : public Decoder { return _type_converted_decoder->skip_values(num_values); } - template + template Status decode_byte_array(const std::vector& decoded_vals, MutableColumnPtr& doris_column, DataTypePtr& data_type, ColumnSelectVector& select_vector) { - TypeIndex logical_type = remove_nullable(data_type)->get_type_id(); - switch (logical_type) { - case TypeIndex::String: - [[fallthrough]]; - case TypeIndex::FixedString: { + if constexpr (PhysicalType == tparquet::Type::BYTE_ARRAY || // either one qualifies + PhysicalType == tparquet::Type::FIXED_LEN_BYTE_ARRAY) { ColumnSelectVector::DataReadType read_type; while (size_t run_length = select_vector.get_next_run(&read_type)) { switch (read_type) { @@ -88,21 +85,14 @@ class DeltaDecoder : public Decoder { } } _current_value_idx = 0; - return Status::OK(); - } - default: - break; } - return Status::InvalidArgument( - "Can't decode parquet physical type BYTE_ARRAY to doris logical type {}", - getTypeName(logical_type)); + return Status::OK(); } protected: void init_values_converter() { _type_converted_decoder->set_data(_data); _type_converted_decoder->set_type_length(_type_length); - _type_converted_decoder->init(_field_schema, _decode_params->ctz); } // Convert decoded value to doris type value. 
std::unique_ptr _type_converted_decoder; @@ -117,13 +107,12 @@ class DeltaDecoder : public Decoder { * Block * [min delta] [list of bitwidths of the mini blocks] [miniblocks] */ -template +template class DeltaBitPackDecoder final : public DeltaDecoder { public: using UT = std::make_unsigned_t; - DeltaBitPackDecoder(const tparquet::Type::type& physical_type) - : DeltaDecoder(new FixLengthPlainDecoder(physical_type)) {} + DeltaBitPackDecoder() : DeltaDecoder(new FixLengthPlainDecoder()) {} ~DeltaBitPackDecoder() override = default; Status decode_values(MutableColumnPtr& doris_column, DataTypePtr& data_type, ColumnSelectVector& select_vector, bool is_dict_filter) override { @@ -200,16 +189,13 @@ class DeltaBitPackDecoder final : public DeltaDecoder { // _values_remaining_current_mini_block may greater than _total_values_remaining. uint32_t _values_remaining_current_mini_block; }; -template class DeltaBitPackDecoder; -template class DeltaBitPackDecoder; - +//template class DeltaBitPackDecoder; +//template class DeltaBitPackDecoder; +template class DeltaLengthByteArrayDecoder final : public DeltaDecoder { public: - explicit DeltaLengthByteArrayDecoder(const tparquet::Type::type& physical_type) - : DeltaDecoder(nullptr), - _len_decoder(physical_type), - _buffered_length(0), - _buffered_data(0) {} + explicit DeltaLengthByteArrayDecoder() + : DeltaDecoder(nullptr), _len_decoder(), _buffered_length(0), _buffered_data(0) {} Status skip_values(size_t num_values) override { _current_value_idx += num_values; @@ -240,7 +226,8 @@ class DeltaLengthByteArrayDecoder final : public DeltaDecoder { return Status::IOError("Expected to decode {} values, but decoded {} values.", num_values - null_count, num_valid_values); } - return decode_byte_array(_values, doris_column, data_type, select_vector); + return decode_byte_array(_values, doris_column, data_type, + select_vector); } Status decode(Slice* buffer, int num_values, int* out_num_values) { @@ -270,7 +257,7 @@ class 
DeltaLengthByteArrayDecoder final : public DeltaDecoder { std::vector _values; std::shared_ptr _bit_reader; - DeltaBitPackDecoder _len_decoder; + DeltaBitPackDecoder _len_decoder; int _num_valid_values; uint32_t _length_idx; @@ -278,14 +265,11 @@ class DeltaLengthByteArrayDecoder final : public DeltaDecoder { std::vector _buffered_data; }; +template class DeltaByteArrayDecoder : public DeltaDecoder { public: - explicit DeltaByteArrayDecoder(const tparquet::Type::type& physical_type) - : DeltaDecoder(nullptr), - _prefix_len_decoder(physical_type), - _suffix_decoder(physical_type), - _buffered_prefix_length(0), - _buffered_data(0) {} + explicit DeltaByteArrayDecoder() + : DeltaDecoder(nullptr), _buffered_prefix_length(0), _buffered_data(0) {} Status skip_values(size_t num_values) override { _current_value_idx += num_values; @@ -312,7 +296,8 @@ class DeltaByteArrayDecoder : public DeltaDecoder { int num_valid_values; RETURN_IF_ERROR(_get_internal(_values.data(), num_values - null_count, &num_valid_values)); DCHECK_EQ(num_values - null_count, num_valid_values); - return decode_byte_array(_values, doris_column, data_type, select_vector); + return decode_byte_array(_values, doris_column, data_type, + select_vector); } void set_data(Slice* slice) override { @@ -350,8 +335,8 @@ class DeltaByteArrayDecoder : public DeltaDecoder { std::vector _values; std::shared_ptr _bit_reader; - DeltaBitPackDecoder _prefix_len_decoder; - DeltaLengthByteArrayDecoder _suffix_decoder; + DeltaBitPackDecoder _prefix_len_decoder; + DeltaLengthByteArrayDecoder _suffix_decoder; std::string _last_value; // string buffer for last value in previous page std::string _last_value_in_previous_page; @@ -361,3 +346,260 @@ class DeltaByteArrayDecoder : public DeltaDecoder { std::vector _buffered_data; }; } // namespace doris::vectorized + +namespace doris::vectorized { + +template +Status DeltaBitPackDecoder::_init_header() { + if (!_bit_reader->GetVlqInt(&_values_per_block) || + 
!_bit_reader->GetVlqInt(&_mini_blocks_per_block) || + !_bit_reader->GetVlqInt(&_total_value_count) || + !_bit_reader->GetZigZagVlqInt(&_last_value)) { + return Status::IOError("Init header eof"); + } + if (_values_per_block == 0) { + return Status::InvalidArgument("Cannot have zero value per block"); + } + if (_values_per_block % 128 != 0) { + return Status::InvalidArgument( + "the number of values in a block must be multiple of 128, but it's " + + std::to_string(_values_per_block)); + } + if (_mini_blocks_per_block == 0) { + return Status::InvalidArgument("Cannot have zero miniblock per block"); + } + _values_per_mini_block = _values_per_block / _mini_blocks_per_block; + if (_values_per_mini_block == 0) { + return Status::InvalidArgument("Cannot have zero value per miniblock"); + } + if (_values_per_mini_block % 32 != 0) { + return Status::InvalidArgument( + "The number of values in a miniblock must be multiple of 32, but it's " + + std::to_string(_values_per_mini_block)); + } + _total_values_remaining = _total_value_count; + _delta_bit_widths.resize(_mini_blocks_per_block); + // init as empty property + _block_initialized = false; + _values_remaining_current_mini_block = 0; + return Status::OK(); +} + +template +Status DeltaBitPackDecoder::_init_block() { + DCHECK_GT(_total_values_remaining, 0) << "InitBlock called at EOF"; + if (!_bit_reader->GetZigZagVlqInt(&_min_delta)) { + return Status::IOError("Init block eof"); + } + + // read the bitwidth of each miniblock + uint8_t* bit_width_data = _delta_bit_widths.data(); + for (uint32_t i = 0; i < _mini_blocks_per_block; ++i) { + if (!_bit_reader->GetAligned(1, bit_width_data + i)) { + return Status::IOError("Decode bit-width EOF"); + } + // Note that non-conformant bitwidth entries are allowed by the Parquet spec + // for extraneous miniblocks in the last block (GH-14923), so we check + // the bitwidths when actually using them (see InitMiniBlock()). 
+ } + _mini_block_idx = 0; + _block_initialized = true; + RETURN_IF_ERROR(_init_mini_block(bit_width_data[0])); + return Status::OK(); +} + +template +Status DeltaBitPackDecoder::_init_mini_block(int bit_width) { + if (PREDICT_FALSE(bit_width > kMaxDeltaBitWidth)) { + return Status::InvalidArgument("delta bit width larger than integer bit width"); + } + _delta_bit_width = bit_width; + _values_remaining_current_mini_block = _values_per_mini_block; + return Status::OK(); +} + +template +Status DeltaBitPackDecoder::_get_internal(T* buffer, int num_values, + int* out_num_values) { + num_values = static_cast(std::min(num_values, _total_values_remaining)); + if (num_values == 0) { + *out_num_values = 0; + return Status::OK(); + } + int i = 0; + while (i < num_values) { + if (PREDICT_FALSE(_values_remaining_current_mini_block == 0)) { + if (PREDICT_FALSE(!_block_initialized)) { + buffer[i++] = _last_value; + DCHECK_EQ(i, 1); // we're at the beginning of the page + if (i == num_values) { + // When block is uninitialized and i reaches num_values we have two + // different possibilities: + // 1. _total_value_count == 1, which means that the page may have only + // one value (encoded in the header), and we should not initialize + // any block. + // 2. _total_value_count != 1, which means we should initialize the + // incoming block for subsequent reads. 
+ if (_total_value_count != 1) { + RETURN_IF_ERROR(_init_block()); + } + break; + } + RETURN_IF_ERROR(_init_block()); + } else { + ++_mini_block_idx; + if (_mini_block_idx < _mini_blocks_per_block) { + RETURN_IF_ERROR(_init_mini_block(_delta_bit_widths.data()[_mini_block_idx])); + } else { + RETURN_IF_ERROR(_init_block()); + } + } + } + + int values_decode = std::min(_values_remaining_current_mini_block, + static_cast(num_values - i)); + for (int j = 0; j < values_decode; ++j) { + if (!_bit_reader->GetValue(_delta_bit_width, buffer + i + j)) { + return Status::IOError("Get batch EOF"); + } + } + for (int j = 0; j < values_decode; ++j) { + // Addition between min_delta, packed int and last_value should be treated as + // unsigned addition. Overflow is as expected. + buffer[i + j] = static_cast(_min_delta) + static_cast(buffer[i + j]) + + static_cast(_last_value); + _last_value = buffer[i + j]; + } + _values_remaining_current_mini_block -= values_decode; + i += values_decode; + } + _total_values_remaining -= num_values; + + if (PREDICT_FALSE(_total_values_remaining == 0)) { + if (!_bit_reader->Advance(_delta_bit_width * _values_remaining_current_mini_block)) { + return Status::IOError("Skip padding EOF"); + } + _values_remaining_current_mini_block = 0; + } + *out_num_values = num_values; + return Status::OK(); +} +template +void DeltaLengthByteArrayDecoder::_decode_lengths() { + _len_decoder.set_bit_reader(_bit_reader); + // get the number of encoded lengths + int num_length = _len_decoder.valid_values_count(); + _buffered_length.resize(num_length); + + // decode all the lengths. all the lengths are buffered in buffered_length_. 
+ int ret; + Status st = _len_decoder.decode(_buffered_length.data(), num_length, &ret); + if (!st.ok()) { + LOG(FATAL) << "Fail to decode delta length, status: " << st; + } + DCHECK_EQ(ret, num_length); + _length_idx = 0; + _num_valid_values = num_length; +} +template +Status DeltaLengthByteArrayDecoder::_get_internal(Slice* buffer, int max_values, + int* out_num_values) { + // Decode up to `max_values` strings into an internal buffer + // and reference them into `buffer`. + max_values = std::min(max_values, _num_valid_values); + if (max_values == 0) { + *out_num_values = 0; + return Status::OK(); + } + + int32_t data_size = 0; + const int32_t* length_ptr = _buffered_length.data() + _length_idx; + for (int i = 0; i < max_values; ++i) { + int32_t len = length_ptr[i]; + if (PREDICT_FALSE(len < 0)) { + return Status::InvalidArgument("Negative string delta length"); + } + buffer[i].size = len; + if (common::add_overflow(data_size, len, data_size)) { + return Status::InvalidArgument("Excess expansion in DELTA_(LENGTH_)BYTE_ARRAY"); + } + } + _length_idx += max_values; + + _buffered_data.resize(data_size); + char* data_ptr = _buffered_data.data(); + for (int j = 0; j < data_size; j++) { + if (!_bit_reader->GetValue(8, data_ptr + j)) { + return Status::IOError("Get length bytes EOF"); + } + } + + for (int i = 0; i < max_values; ++i) { + buffer[i].data = data_ptr; + data_ptr += buffer[i].size; + } + // this->num_values_ -= max_values; + _num_valid_values -= max_values; + *out_num_values = max_values; + return Status::OK(); +} + +template +Status DeltaByteArrayDecoder::_get_internal(Slice* buffer, int max_values, + int* out_num_values) { + // Decode up to `max_values` strings into an internal buffer + // and reference them into `buffer`. 
+ max_values = std::min(max_values, _num_valid_values); + if (max_values == 0) { + *out_num_values = max_values; + return Status::OK(); + } + + int suffix_read; + RETURN_IF_ERROR(_suffix_decoder.decode(buffer, max_values, &suffix_read)); + if (PREDICT_FALSE(suffix_read != max_values)) { + return Status::IOError("Read {}, expecting {} from suffix decoder", + std::to_string(suffix_read), std::to_string(max_values)); + } + + int64_t data_size = 0; + const int32_t* prefix_len_ptr = _buffered_prefix_length.data() + _prefix_len_offset; + for (int i = 0; i < max_values; ++i) { + if (PREDICT_FALSE(prefix_len_ptr[i] < 0)) { + return Status::InvalidArgument("negative prefix length in DELTA_BYTE_ARRAY"); + } + if (PREDICT_FALSE(common::add_overflow(data_size, static_cast(prefix_len_ptr[i]), + data_size) || + common::add_overflow(data_size, static_cast(buffer[i].size), + data_size))) { + return Status::InvalidArgument("excess expansion in DELTA_BYTE_ARRAY"); + } + } + _buffered_data.resize(data_size); + + std::string_view prefix {_last_value}; + + char* data_ptr = _buffered_data.data(); + for (int i = 0; i < max_values; ++i) { + if (PREDICT_FALSE(static_cast(prefix_len_ptr[i]) > prefix.length())) { + return Status::InvalidArgument("prefix length too large in DELTA_BYTE_ARRAY"); + } + memcpy(data_ptr, prefix.data(), prefix_len_ptr[i]); + // buffer[i] currently points to the string suffix + memcpy(data_ptr + prefix_len_ptr[i], buffer[i].data, buffer[i].size); + buffer[i].data = data_ptr; + buffer[i].size += prefix_len_ptr[i]; + data_ptr += buffer[i].size; + prefix = std::string_view {buffer[i].data, buffer[i].size}; + } + _prefix_len_offset += max_values; + _num_valid_values -= max_values; + _last_value = std::string {prefix}; + + if (_num_valid_values == 0) { + _last_value_in_previous_page = _last_value; + } + *out_num_values = max_values; + return Status::OK(); +} +} // namespace doris::vectorized diff --git a/be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp 
b/be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp index 2bf19250d51ed5..df66e633a0270d 100644 --- a/be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp +++ b/be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp @@ -25,11 +25,10 @@ namespace doris::vectorized { -template +template class FixLengthDictDecoder final : public BaseDictDecoder { public: - FixLengthDictDecoder(tparquet::Type::type physical_type) - : BaseDictDecoder(), _physical_type(physical_type) {}; + FixLengthDictDecoder() : BaseDictDecoder() {}; ~FixLengthDictDecoder() override = default; Status decode_values(MutableColumnPtr& doris_column, DataTypePtr& data_type, @@ -72,138 +71,10 @@ class FixLengthDictDecoder final : public BaseDictDecoder { if (doris_column->is_column_dictionary() || is_dict_filter) { return _decode_dict_values(doris_column, select_vector, is_dict_filter); } - /* - * decoder.reset(new FixLengthDictDecoder(type)); - break; - case tparquet::Type::INT64: - decoder.reset(new FixLengthDictDecoder(type)); - break; - case tparquet::Type::INT96: - decoder.reset(new FixLengthDictDecoder(type)); - break; - case tparquet::Type::FLOAT: - decoder.reset(new FixLengthDictDecoder(type)); - break; - case tparquet::Type::DOUBLE: - decoder.reset(new FixLengthDictDecoder(type)); - break; - case tparquet::Type::FIXED_LEN_BYTE_ARRAY: - decoder.reset(new FixLengthDictDecoder(type)); - * - */ - if constexpr (std::is_same_v){ - return _decode_numeric(doris_column, select_vector); - }else if constexpr (std::is_same_v){ - return _decode_numeric(doris_column, select_vector); - }else if constexpr (std::is_same_v){ - return _decode_numeric(doris_column, select_vector); - }else if constexpr (std::is_same_v){ - return _decode_numeric(doris_column, select_vector); - }else if constexpr (std::is_same_v){ - return _decode_numeric(doris_column, select_vector); - } -// else if constexpr (std::is_same_v) { -// return _decode_numeric(doris_column, select_vector); -// -// } + 
_decode_numeric(doris_column, select_vector); return Status::OK(); - -// TypeIndex logical_type = remove_nullable(data_type)->get_type_id(); -// switch (logical_type) { -// -// -//#define DISPATCH(NUMERIC_TYPE, CPP_NUMERIC_TYPE, PHYSICAL_TYPE) \ -// case NUMERIC_TYPE: \ -// if constexpr (!std::is_same_v) { \ -// return _decode_numeric(doris_column, select_vector); \ -// } -// FOR_LOGICAL_NUMERIC_TYPES(DISPATCH) -//#undef DISPATCH -// case TypeIndex::Int128: -// return _decode_numeric(doris_column, select_vector); -// break; -// case TypeIndex::Date: -// if constexpr (std::is_same_v) { -// return _decode_date(doris_column, -// select_vector); -// } -// break; -// case TypeIndex::DateV2: -// if constexpr (std::is_same_v) { -// return _decode_date, UInt32, has_filter>( -// doris_column, select_vector); -// } -// break; -// case TypeIndex::DateTime: -// if constexpr (std::is_same_v) { -// return _decode_datetime96(doris_column, -// select_vector); -// } else if constexpr (std::is_same_v) { -// return _decode_datetime64(doris_column, -// select_vector); -// } -// break; -// case TypeIndex::DateTimeV2: -// // Spark can set the timestamp precision by the following configuration: -// // spark.sql.parquet.outputTimestampType = INT96(NANOS), TIMESTAMP_MICROS, TIMESTAMP_MILLIS -// if constexpr (std::is_same_v) { -// return _decode_datetime96, UInt64, has_filter>( -// doris_column, select_vector); -// } else if constexpr (std::is_same_v) { -// return _decode_datetime64, UInt64, has_filter>( -// doris_column, select_vector); -// } -// break; -// case TypeIndex::Decimal32: -// if constexpr (std::is_same_v) { -// return _decode_primitive_decimal(doris_column, data_type, -// select_vector); -// } else if constexpr (std::is_same_v) { -// return _decode_primitive_decimal(doris_column, data_type, -// select_vector); -// } -// break; -// case TypeIndex::Decimal64: -// if constexpr (std::is_same_v) { -// return _decode_primitive_decimal(doris_column, data_type, -// select_vector); -// 
} else if constexpr (std::is_same_v) { -// return _decode_primitive_decimal(doris_column, data_type, -// select_vector); -// } -// break; -// case TypeIndex::Decimal128: -// if constexpr (std::is_same_v) { -// return _decode_primitive_decimal(doris_column, data_type, -// select_vector); -// } else if constexpr (std::is_same_v) { -// return _decode_primitive_decimal(doris_column, data_type, -// select_vector); -// } -// break; -// case TypeIndex::Decimal128I: -// if constexpr (std::is_same_v) { -// return _decode_primitive_decimal(doris_column, data_type, -// select_vector); -// } else if constexpr (std::is_same_v) { -// return _decode_primitive_decimal(doris_column, data_type, -// select_vector); -// } -// break; -// case TypeIndex::String: -// [[fallthrough]]; -// case TypeIndex::FixedString: -// break; -// default: -// break; -// } -// return Status::InvalidArgument( -// "Can't decode parquet physical type {} to doris logical type {}", -// tparquet::to_string(_physical_type), getTypeName(logical_type)); -// -// return Status::OK(); } Status set_dict(std::unique_ptr& dict, int32_t length, size_t num_values) override { @@ -214,265 +85,26 @@ class FixLengthDictDecoder final : public BaseDictDecoder { char* dict_item_address = reinterpret_cast(_dict.get()); _dict_items.resize(num_values); for (size_t i = 0; i < num_values; ++i) { - _dict_items[i] = *(T*)dict_item_address; + _dict_items[i] = *(DataType*)dict_item_address; dict_item_address += _type_length; } return Status::OK(); } protected: - template + template Status _decode_numeric(MutableColumnPtr& doris_column, ColumnSelectVector& select_vector) { - auto& column_data = static_cast&>(*doris_column).get_data(); - size_t data_index = column_data.size(); - column_data.resize(data_index + select_vector.num_values() - select_vector.num_filtered()); - size_t dict_index = 0; - ColumnSelectVector::DataReadType read_type; - while (size_t run_length = select_vector.get_next_run(&read_type)) { - switch (read_type) { - case 
ColumnSelectVector::CONTENT: { - for (size_t i = 0; i < run_length; ++i) { - if constexpr (std::is_same_v) { - ParquetInt96 value = - static_cast(_dict_items[_indexes[dict_index++]]); - column_data[data_index++] = value.to_int128(); - - } else { - column_data[data_index++] = - static_cast(_dict_items[_indexes[dict_index++]]); - } - } - break; - } - case ColumnSelectVector::NULL_DATA: { - data_index += run_length; - break; - } - case ColumnSelectVector::FILTERED_CONTENT: { - dict_index += run_length; - break; - } - case ColumnSelectVector::FILTERED_NULL: { - // do nothing - break; - } - } - } - return Status::OK(); - } - - template - Status _decode_date(MutableColumnPtr& doris_column, ColumnSelectVector& select_vector) { - auto& column_data = static_cast&>(*doris_column).get_data(); - size_t data_index = column_data.size(); - column_data.resize(data_index + select_vector.num_values() - select_vector.num_filtered()); - size_t dict_index = 0; - date_day_offset_dict& date_dict = date_day_offset_dict::get(); - ColumnSelectVector::DataReadType read_type; - while (size_t run_length = select_vector.get_next_run(&read_type)) { - switch (read_type) { - case ColumnSelectVector::CONTENT: { - for (size_t i = 0; i < run_length; ++i) { - int64_t date_value = - _dict_items[_indexes[dict_index++]] + _decode_params->offset_days; - if constexpr (std::is_same_v) { - auto& v = reinterpret_cast(column_data[data_index++]); - v.create_from_date_v2(date_dict[date_value], TIME_DATE); - // we should cast to date if using date v1. 
- v.cast_to_date(); - } else { - reinterpret_cast(column_data[data_index++]) = - date_dict[date_value]; - } - } - break; - } - case ColumnSelectVector::NULL_DATA: { - data_index += run_length; - break; - } - case ColumnSelectVector::FILTERED_CONTENT: { - dict_index += run_length; - break; - } - case ColumnSelectVector::FILTERED_NULL: { - // do nothing - break; - } - } - } - return Status::OK(); - } - - template - Status _decode_datetime64(MutableColumnPtr& doris_column, ColumnSelectVector& select_vector) { - auto& column_data = static_cast&>(*doris_column).get_data(); - size_t data_index = column_data.size(); - column_data.resize(data_index + select_vector.num_values() - select_vector.num_filtered()); - size_t dict_index = 0; - ColumnSelectVector::DataReadType read_type; - while (size_t run_length = select_vector.get_next_run(&read_type)) { - switch (read_type) { - case ColumnSelectVector::CONTENT: { - for (size_t i = 0; i < run_length; ++i) { - int64_t date_value = _dict_items[_indexes[dict_index++]]; - auto& v = reinterpret_cast(column_data[data_index++]); - v.from_unixtime(date_value / _decode_params->second_mask, *_decode_params->ctz); - if constexpr (std::is_same_v>) { - // nanoseconds will be ignored. 
- v.set_microsecond((date_value % _decode_params->second_mask) * - _decode_params->scale_to_nano_factor / 1000); - // TODO: the precision of datetime v1 - } - } - break; - } - case ColumnSelectVector::NULL_DATA: { - data_index += run_length; - break; - } - case ColumnSelectVector::FILTERED_CONTENT: { - dict_index += run_length; - break; - } - case ColumnSelectVector::FILTERED_NULL: { - // do nothing - break; - } - } - } - return Status::OK(); - } - - template - Status _decode_datetime96(MutableColumnPtr& doris_column, ColumnSelectVector& select_vector) { - auto& column_data = static_cast&>(*doris_column).get_data(); - size_t data_index = column_data.size(); - column_data.resize(data_index + select_vector.num_values() - select_vector.num_filtered()); - size_t dict_index = 0; - ColumnSelectVector::DataReadType read_type; - while (size_t run_length = select_vector.get_next_run(&read_type)) { - switch (read_type) { - case ColumnSelectVector::CONTENT: { - for (size_t i = 0; i < run_length; ++i) { - ParquetInt96& datetime96 = _dict_items[_indexes[dict_index++]]; - auto& v = reinterpret_cast(column_data[data_index++]); - int64_t micros = datetime96.to_timestamp_micros(); - v.from_unixtime(micros / 1000000, *_decode_params->ctz); - if constexpr (std::is_same_v>) { - // spark.sql.parquet.outputTimestampType = INT96(NANOS) will lost precision. - // only keep microseconds. 
- v.set_microsecond(micros % 1000000); - } - } - break; - } - case ColumnSelectVector::NULL_DATA: { - data_index += run_length; - break; - } - case ColumnSelectVector::FILTERED_CONTENT: { - dict_index += run_length; - break; - } - case ColumnSelectVector::FILTERED_NULL: { - // do nothing - break; - } - } - } - return Status::OK(); - } - - template - Status _decode_primitive_decimal(MutableColumnPtr& doris_column, DataTypePtr& data_type, - ColumnSelectVector& select_vector) { - init_decimal_converter(data_type); - DecimalScaleParams& scale_params = _decode_params->decimal_scale; -#define M(FixedTypeLength, ValueCopyType, ScaleType) \ - case FixedTypeLength: \ - return _decode_primitive_decimal_internal(doris_column, data_type, \ - select_vector); - -#define APPLY_FOR_DECIMALS(ScaleType) \ - M(1, int64_t, ScaleType) \ - M(2, int64_t, ScaleType) \ - M(3, int64_t, ScaleType) \ - M(4, int64_t, ScaleType) \ - M(5, int64_t, ScaleType) \ - M(6, int64_t, ScaleType) \ - M(7, int64_t, ScaleType) \ - M(8, int64_t, ScaleType) \ - M(9, int128_t, ScaleType) \ - M(10, int128_t, ScaleType) \ - M(11, int128_t, ScaleType) \ - M(12, int128_t, ScaleType) \ - M(13, int128_t, ScaleType) \ - M(14, int128_t, ScaleType) \ - M(15, int128_t, ScaleType) \ - M(16, int128_t, ScaleType) - - if (scale_params.scale_type == DecimalScaleParams::SCALE_UP) { - switch (_type_length) { - APPLY_FOR_DECIMALS(DecimalScaleParams::SCALE_UP) - default: - LOG(FATAL) << "__builtin_unreachable"; - __builtin_unreachable(); - } - } else if (scale_params.scale_type == DecimalScaleParams::SCALE_DOWN) { - switch (_type_length) { - APPLY_FOR_DECIMALS(DecimalScaleParams::SCALE_DOWN) - default: - LOG(FATAL) << "__builtin_unreachable"; - __builtin_unreachable(); - } - } else { - switch (_type_length) { - APPLY_FOR_DECIMALS(DecimalScaleParams::NO_SCALE) - default: - LOG(FATAL) << "__builtin_unreachable"; - __builtin_unreachable(); - } - } - return Status::OK(); -#undef APPLY_FOR_DECIMALS -#undef M - } - - template - Status 
_decode_primitive_decimal_internal(MutableColumnPtr& doris_column, - DataTypePtr& data_type, - ColumnSelectVector& select_vector) { - auto& column_data = - static_cast>&>(*doris_column) - .get_data(); + auto& column_data = static_cast(*doris_column).get_data(); size_t data_index = column_data.size(); column_data.resize(data_index + select_vector.num_values() - select_vector.num_filtered()); size_t dict_index = 0; - DecimalScaleParams& scale_params = _decode_params->decimal_scale; - ColumnSelectVector::DataReadType read_type; while (size_t run_length = select_vector.get_next_run(&read_type)) { switch (read_type) { case ColumnSelectVector::CONTENT: { for (size_t i = 0; i < run_length; ++i) { - ValueCopyType value = static_cast(_dict_items[_indexes[dict_index++]]); - if constexpr (ScaleType == DecimalScaleParams::SCALE_UP) { - value *= scale_params.scale_factor; - } else if constexpr (ScaleType == DecimalScaleParams::SCALE_DOWN) { - value /= scale_params.scale_factor; - } else if constexpr (ScaleType == DecimalScaleParams::NO_SCALE) { - // do nothing - } else { - LOG(FATAL) << "__builtin_unreachable"; - __builtin_unreachable(); - } - auto& v = reinterpret_cast(column_data[data_index++]); - v = (DecimalPrimitiveType)value; + column_data[data_index++] = + static_cast(_dict_items[_indexes[dict_index++]]); } break; } @@ -492,18 +124,17 @@ class FixLengthDictDecoder final : public BaseDictDecoder { } return Status::OK(); } - - tparquet::Type::type _physical_type; + using ColumnType = ParquetConvert::PhysicalTypeTraits::ColumnType; + using DataType = ParquetConvert::PhysicalTypeTraits::DataType; // For dictionary encoding - std::vector _dict_items; + std::vector _dict_items; }; template <> -class FixLengthDictDecoder final : public BaseDictDecoder { +class FixLengthDictDecoder final : public BaseDictDecoder { public: - FixLengthDictDecoder(tparquet::Type::type physical_type) - : BaseDictDecoder(), _physical_type(physical_type) {}; + FixLengthDictDecoder() : 
BaseDictDecoder() {}; ~FixLengthDictDecoder() override = default; Status decode_values(MutableColumnPtr& doris_column, DataTypePtr& data_type, @@ -536,160 +167,10 @@ class FixLengthDictDecoder final : public BaseDictDecoder { return _decode_dict_values(doris_column, select_vector, is_dict_filter); } - TypeIndex logical_type = remove_nullable(data_type)->get_type_id(); - switch (logical_type) { - case TypeIndex::Decimal32: - if (_physical_type == tparquet::Type::FIXED_LEN_BYTE_ARRAY) { - return _decode_binary_decimal(doris_column, data_type, - select_vector); - } - break; - case TypeIndex::Decimal64: - if (_physical_type == tparquet::Type::FIXED_LEN_BYTE_ARRAY) { - return _decode_binary_decimal(doris_column, data_type, - select_vector); - } - break; - case TypeIndex::Decimal128: - if (_physical_type == tparquet::Type::FIXED_LEN_BYTE_ARRAY) { - return _decode_binary_decimal(doris_column, data_type, - select_vector); - } - break; - case TypeIndex::Decimal128I: - if (_physical_type == tparquet::Type::FIXED_LEN_BYTE_ARRAY) { - return _decode_binary_decimal(doris_column, data_type, - select_vector); - } - break; - // TODO: decimal256 - case TypeIndex::String: - [[fallthrough]]; - case TypeIndex::FixedString: - if (_physical_type == tparquet::Type::FIXED_LEN_BYTE_ARRAY) { - return _decode_string(doris_column, select_vector); - } - break; - default: - break; - } - - return Status::InvalidArgument( - "Can't decode parquet physical type {} to doris logical type {}", - tparquet::to_string(_physical_type), getTypeName(logical_type)); - } - - Status skip_values(size_t num_values) override { - _indexes.resize(num_values); - _index_batch_decoder->GetBatch(&_indexes[0], num_values); - return Status::OK(); - } - - Status set_dict(std::unique_ptr& dict, int32_t length, size_t num_values) override { - if (num_values * _type_length != length) { - return Status::Corruption("Wrong dictionary data for fixed length type"); - } - _dict = std::move(dict); - char* dict_item_address = 
reinterpret_cast(_dict.get()); - _dict_items.resize(num_values); - _dict_value_to_code.reserve(num_values); - for (size_t i = 0; i < num_values; ++i) { - _dict_items[i] = dict_item_address; - _dict_value_to_code[StringRef(_dict_items[i], _type_length)] = i; - dict_item_address += _type_length; - } - return Status::OK(); - } - - Status read_dict_values_to_column(MutableColumnPtr& doris_column) override { - size_t dict_items_size = _dict_items.size(); - std::vector dict_values(dict_items_size); - for (size_t i = 0; i < dict_items_size; ++i) { - dict_values.emplace_back(_dict_items[i], _type_length); - } - doris_column->insert_many_strings(&dict_values[0], dict_items_size); - return Status::OK(); - } - - Status get_dict_codes(const ColumnString* string_column, - std::vector* dict_codes) override { - size_t size = string_column->size(); - dict_codes->reserve(size); - for (int i = 0; i < size; ++i) { - StringRef dict_value = string_column->get_data_at(i); - dict_codes->emplace_back(_dict_value_to_code[dict_value]); - } - return Status::OK(); - } - - MutableColumnPtr convert_dict_column_to_string_column(const ColumnInt32* dict_column) override { - auto res = ColumnString::create(); - std::vector dict_values(dict_column->size()); - const auto& data = dict_column->get_data(); - for (size_t i = 0; i < dict_column->size(); ++i) { - dict_values.emplace_back(_dict_items[data[i]], _type_length); - } - res->insert_many_strings(&dict_values[0], dict_values.size()); - return res; + return _decode_string(doris_column, select_vector); } protected: - template - Status _decode_binary_decimal(MutableColumnPtr& doris_column, DataTypePtr& data_type, - ColumnSelectVector& select_vector) { - init_decimal_converter(data_type); - DecimalScaleParams& scale_params = _decode_params->decimal_scale; -#define M(FixedTypeLength, ValueCopyType, ScaleType) \ - case FixedTypeLength: \ - return _decode_binary_decimal_internal(doris_column, data_type, \ - select_vector); - -#define 
APPLY_FOR_DECIMALS(ScaleType) \ - M(1, int64_t, ScaleType) \ - M(2, int64_t, ScaleType) \ - M(3, int64_t, ScaleType) \ - M(4, int64_t, ScaleType) \ - M(5, int64_t, ScaleType) \ - M(6, int64_t, ScaleType) \ - M(7, int64_t, ScaleType) \ - M(8, int64_t, ScaleType) \ - M(9, int128_t, ScaleType) \ - M(10, int128_t, ScaleType) \ - M(11, int128_t, ScaleType) \ - M(12, int128_t, ScaleType) \ - M(13, int128_t, ScaleType) \ - M(14, int128_t, ScaleType) \ - M(15, int128_t, ScaleType) \ - M(16, int128_t, ScaleType) - - if (scale_params.scale_type == DecimalScaleParams::SCALE_UP) { - switch (_type_length) { - APPLY_FOR_DECIMALS(DecimalScaleParams::SCALE_UP) - default: - LOG(FATAL) << "__builtin_unreachable"; - __builtin_unreachable(); - } - } else if (scale_params.scale_type == DecimalScaleParams::SCALE_DOWN) { - switch (_type_length) { - APPLY_FOR_DECIMALS(DecimalScaleParams::SCALE_DOWN) - default: - LOG(FATAL) << "__builtin_unreachable"; - __builtin_unreachable(); - } - } else { - switch (_type_length) { - APPLY_FOR_DECIMALS(DecimalScaleParams::NO_SCALE) - default: - LOG(FATAL) << "__builtin_unreachable"; - __builtin_unreachable(); - } - } - return Status::OK(); -#undef APPLY_FOR_DECIMALS -#undef M - } - template Status _decode_string(MutableColumnPtr& doris_column, ColumnSelectVector& select_vector) { size_t dict_index = 0; @@ -722,70 +203,8 @@ class FixLengthDictDecoder final : public BaseDictDecoder { return Status::OK(); } - tparquet::Type::type _physical_type; - // For dictionary encoding std::vector _dict_items; - std::unordered_map _dict_value_to_code; - -private: - template - Status _decode_binary_decimal_internal(MutableColumnPtr& doris_column, DataTypePtr& data_type, - ColumnSelectVector& select_vector) { - auto& column_data = - static_cast>&>(*doris_column) - .get_data(); - size_t data_index = column_data.size(); - column_data.resize(data_index + select_vector.num_values() - select_vector.num_filtered()); - size_t dict_index = 0; - DecimalScaleParams& scale_params 
= _decode_params->decimal_scale; - - ColumnSelectVector::DataReadType read_type; - while (size_t run_length = select_vector.get_next_run(&read_type)) { - switch (read_type) { - case ColumnSelectVector::CONTENT: { - for (size_t i = 0; i < run_length; ++i) { - char* buf_start = _dict_items[_indexes[dict_index++]]; - // When Decimal in parquet is stored in byte arrays, binary and fixed, - // the unscaled number must be encoded as two's complement using big-endian byte order. - DecimalPrimitiveType result_value = 0; - ValueCopyType value = 0; - memcpy(reinterpret_cast(&value), buf_start, fixed_type_length); - value = BitUtil::big_endian_to_host(value); - value = value >> ((sizeof(value) - fixed_type_length) * 8); - result_value = value; - if constexpr (ScaleType == DecimalScaleParams::SCALE_UP) { - result_value *= scale_params.scale_factor; - } else if constexpr (ScaleType == DecimalScaleParams::SCALE_DOWN) { - result_value /= scale_params.scale_factor; - } else if constexpr (ScaleType == DecimalScaleParams::NO_SCALE) { - // do nothing - } else { - LOG(FATAL) << "__builtin_unreachable"; - __builtin_unreachable(); - } - auto& v = reinterpret_cast(column_data[data_index++]); - v = (DecimalPrimitiveType)result_value; - } - break; - } - case ColumnSelectVector::NULL_DATA: { - data_index += run_length; - break; - } - case ColumnSelectVector::FILTERED_CONTENT: { - dict_index += run_length; - break; - } - case ColumnSelectVector::FILTERED_NULL: { - // do nothing - break; - } - } - } - return Status::OK(); - } }; } // namespace doris::vectorized diff --git a/be/src/vec/exec/format/parquet/fix_length_plain_decoder.h b/be/src/vec/exec/format/parquet/fix_length_plain_decoder.h index 96d674e25888b1..0cc198f8ad5e12 100644 --- a/be/src/vec/exec/format/parquet/fix_length_plain_decoder.h +++ b/be/src/vec/exec/format/parquet/fix_length_plain_decoder.h @@ -23,6 +23,7 @@ #include "common/status.h" #include "vec/data_types/data_type.h" #include "vec/exec/format/parquet/decoder.h" 
+#include "vec/exec/format/parquet/parquet_column_convert.h" namespace doris { namespace vectorized { @@ -32,56 +33,144 @@ class ColumnSelectVector; namespace doris::vectorized { +template class FixLengthPlainDecoder final : public Decoder { public: - FixLengthPlainDecoder(tparquet::Type::type physical_type) : _physical_type(physical_type) {}; + FixLengthPlainDecoder() {}; ~FixLengthPlainDecoder() override = default; Status decode_values(MutableColumnPtr& doris_column, DataTypePtr& data_type, ColumnSelectVector& select_vector, bool is_dict_filter) override; - template + template Status _decode_values(MutableColumnPtr& doris_column, DataTypePtr& data_type, ColumnSelectVector& select_vector, bool is_dict_filter); Status skip_values(size_t num_values) override; protected: - template + template Status _decode_numeric(MutableColumnPtr& doris_column, ColumnSelectVector& select_vector); - template - Status _decode_date(MutableColumnPtr& doris_column, ColumnSelectVector& select_vector); - - template - Status _decode_datetime64(MutableColumnPtr& doris_column, ColumnSelectVector& select_vector); - - template - Status _decode_datetime96(MutableColumnPtr& doris_column, ColumnSelectVector& select_vector); - - template - Status _decode_binary_decimal(MutableColumnPtr& doris_column, DataTypePtr& data_type, - ColumnSelectVector& select_vector); - - template - Status _decode_primitive_decimal(MutableColumnPtr& doris_column, DataTypePtr& data_type, - ColumnSelectVector& select_vector); - template Status _decode_string(MutableColumnPtr& doris_column, ColumnSelectVector& select_vector); - - tparquet::Type::type _physical_type; - -private: - template - Status _decode_binary_decimal_internal(MutableColumnPtr& doris_column, DataTypePtr& data_type, - ColumnSelectVector& select_vector); - template - Status _decode_primitive_decimal_internal(MutableColumnPtr& doris_column, - DataTypePtr& data_type, - ColumnSelectVector& select_vector); }; + +template +Status 
FixLengthPlainDecoder::skip_values(size_t num_values) { + _offset += _type_length * num_values; + if (UNLIKELY(_offset > _data->size)) { + return Status::IOError("Out-of-bounds access in parquet data decoder"); + } + return Status::OK(); +} + +template +Status FixLengthPlainDecoder::decode_values(MutableColumnPtr& doris_column, + DataTypePtr& data_type, + ColumnSelectVector& select_vector, + bool is_dict_filter) { + if (select_vector.has_filter()) { + return _decode_values(doris_column, data_type, select_vector, is_dict_filter); + } else { + return _decode_values(doris_column, data_type, select_vector, is_dict_filter); + } +} + +template +template +Status FixLengthPlainDecoder::_decode_values(MutableColumnPtr& doris_column, + DataTypePtr& data_type, + ColumnSelectVector& select_vector, + bool is_dict_filter) { + size_t non_null_size = select_vector.num_values() - select_vector.num_nulls(); + if (UNLIKELY(_offset + _type_length * non_null_size > _data->size)) { + return Status::IOError("Out-of-bounds access in parquet data decoder"); + } + + if constexpr (PhysicalType == tparquet::Type::FIXED_LEN_BYTE_ARRAY) { + return _decode_string(doris_column, select_vector); + } else { + return _decode_numeric(doris_column, select_vector); + } +} + +template +template +Status FixLengthPlainDecoder::_decode_string(MutableColumnPtr& doris_column, + ColumnSelectVector& select_vector) { + ColumnSelectVector::DataReadType read_type; + while (size_t run_length = select_vector.get_next_run(&read_type)) { + switch (read_type) { + case ColumnSelectVector::CONTENT: { + std::vector string_values; + string_values.reserve(run_length); + for (size_t i = 0; i < run_length; ++i) { + char* buf_start = _data->data + _offset; + string_values.emplace_back(buf_start, _type_length); + _offset += _type_length; + } + doris_column->insert_many_strings(&string_values[0], run_length); + break; + } + case ColumnSelectVector::NULL_DATA: { + doris_column->insert_many_defaults(run_length); + break; + } + 
case ColumnSelectVector::FILTERED_CONTENT: { + _offset += _type_length * run_length; + break; + } + case ColumnSelectVector::FILTERED_NULL: { + // do nothing + break; + } + } + } + return Status::OK(); +} + +template +template +Status FixLengthPlainDecoder::_decode_numeric(MutableColumnPtr& doris_column, + ColumnSelectVector& select_vector) { + if constexpr (PhysicalType == tparquet::Type::FIXED_LEN_BYTE_ARRAY || + PhysicalType == tparquet::Type::BYTE_ARRAY) { + return Status::OK(); + } else { + using ColumnType = ParquetConvert::PhysicalTypeTraits::ColumnType; + using DataType = ParquetConvert::PhysicalTypeTraits::DataType; + + auto& column_data = static_cast(*doris_column).get_data(); + size_t data_index = column_data.size(); + column_data.resize(data_index + select_vector.num_values() - select_vector.num_filtered()); + ColumnSelectVector::DataReadType read_type; + while (size_t run_length = select_vector.get_next_run(&read_type)) { + switch (read_type) { + case ColumnSelectVector::CONTENT: { + for (size_t i = 0; i < run_length; ++i) { + char* buf_start = _data->data + _offset; + column_data[data_index++] = *(DataType*)buf_start; + _offset += _type_length; + } + break; + } + case ColumnSelectVector::NULL_DATA: { + data_index += run_length; + break; + } + case ColumnSelectVector::FILTERED_CONTENT: { + _offset += _type_length * run_length; + break; + } + case ColumnSelectVector::FILTERED_NULL: { + // do nothing + break; + } + } + } + return Status::OK(); + } +} + } // namespace doris::vectorized diff --git a/be/src/vec/exec/format/convert.h b/be/src/vec/exec/format/parquet/parquet_column_convert.h similarity index 61% rename from be/src/vec/exec/format/convert.h rename to be/src/vec/exec/format/parquet/parquet_column_convert.h index 1a30c3719a685a..567941205faebf 100644 --- a/be/src/vec/exec/format/convert.h +++ b/be/src/vec/exec/format/parquet/parquet_column_convert.h @@ -24,10 +24,14 @@ #include #include +#include "common/compiler_util.h" // IWYU pragma: keep 
#include "common/status.h" #include "gen_cpp/descriptors.pb.h" +#include "gutil/endian.h" #include "io/file_factory.h" #include "olap/olap_common.h" +#include "util/coding.h" +#include "util/slice.h" #include "vec/columns/column_string.h" #include "vec/columns/column_vector.h" #include "vec/core/types.h" @@ -36,97 +40,148 @@ #include "vec/data_types/data_type_nullable.h" #include "vec/data_types/data_type_number.h" #include "vec/data_types/data_type_string.h" -#include "vec/exec/format/parquet/parquet_common.h" -#include "common/compiler_util.h" // IWYU pragma: keep -#include "common/status.h" -#include "gutil/endian.h" -#include "util/coding.h" -#include "util/slice.h" -#include "vec/core/types.h" -#include "vec/data_types/data_type.h" #include "vec/exec/format/format_common.h" #include "vec/exec/format/parquet/decoder.h" #include "vec/exec/format/parquet/parquet_common.h" - namespace doris::vectorized { namespace ParquetConvert { -class DocTime { -public: - std::unique_ptr _decode_params; - const FieldSchema* _field_schema; - void init_time(const FieldSchema* field_schema, cctz::time_zone* ctz) { - if (_decode_params == nullptr) { - _decode_params.reset(new DecodeParams()); - } - if (ctz != nullptr) { - _decode_params->ctz = ctz; - } +template +struct PhysicalTypeTraits {}; + +template <> +struct PhysicalTypeTraits { + using DataType = int32_t; + using ColumnType = ColumnVector; +}; - _field_schema = field_schema; +template <> +struct PhysicalTypeTraits { + using DataType = int64_t; + using ColumnType = ColumnVector; +}; + +template <> +struct PhysicalTypeTraits { + using DataType = float; + using ColumnType = ColumnVector; +}; + +template <> +struct PhysicalTypeTraits { + using DataType = double; + using ColumnType = ColumnVector; +}; + +template <> +struct PhysicalTypeTraits { + using DataType = String; + using ColumnType = ColumnString; +}; + +template <> +struct PhysicalTypeTraits { + using DataType = String; + using ColumnType = ColumnString; +}; + +template 
<> +struct PhysicalTypeTraits { + using DataType = Int128; + using ColumnType = ColumnVector; +}; + +#define FOR_LOGICAL_NUMERIC_TYPES(M) \ + M(TypeIndex::Int8, Int8, Int32) \ + M(TypeIndex::UInt8, UInt8, Int32) \ + M(TypeIndex::Int16, Int16, Int32) \ + M(TypeIndex::UInt16, UInt16, Int32) \ + M(TypeIndex::Int32, Int32, Int32) \ + M(TypeIndex::UInt32, UInt32, Int32) \ + M(TypeIndex::Int64, Int64, Int64) \ + M(TypeIndex::UInt64, UInt64, Int64) \ + M(TypeIndex::Float32, Float32, Float32) \ + M(TypeIndex::Float64, Float64, Float64) + +struct ConvertParams { + // schema.logicalType.TIMESTAMP.isAdjustedToUTC == false + static const cctz::time_zone utc0; + // schema.logicalType.TIMESTAMP.isAdjustedToUTC == true, we should set the time zone + cctz::time_zone* ctz = nullptr; + size_t offset_days = 0; + int64_t second_mask = 1; + int64_t scale_to_nano_factor = 1; + DecimalScaleParams decimal_scale; + FieldSchema* field_schema = nullptr; + + void init(FieldSchema* field_schema_, cctz::time_zone* ctz_) { + field_schema = field_schema_; + if (ctz_ != nullptr) { + ctz = ctz_; + } const auto& schema = field_schema->parquet_schema; if (schema.__isset.logicalType && schema.logicalType.__isset.TIMESTAMP) { const auto& timestamp_info = schema.logicalType.TIMESTAMP; if (!timestamp_info.isAdjustedToUTC) { // should set timezone to utc+0 - _decode_params->ctz = const_cast(&_decode_params->utc0); + ctz = const_cast(&utc0); } const auto& time_unit = timestamp_info.unit; if (time_unit.__isset.MILLIS) { - _decode_params->second_mask = 1000; - _decode_params->scale_to_nano_factor = 1000000; + second_mask = 1000; + scale_to_nano_factor = 1000000; } else if (time_unit.__isset.MICROS) { - _decode_params->second_mask = 1000000; - _decode_params->scale_to_nano_factor = 1000; + second_mask = 1000000; + scale_to_nano_factor = 1000; } else if (time_unit.__isset.NANOS) { - _decode_params->second_mask = 1000000000; - _decode_params->scale_to_nano_factor = 1; + second_mask = 1000000000; + 
scale_to_nano_factor = 1; } } else if (schema.__isset.converted_type) { const auto& converted_type = schema.converted_type; if (converted_type == tparquet::ConvertedType::TIMESTAMP_MILLIS) { - _decode_params->second_mask = 1000; - _decode_params->scale_to_nano_factor = 1000000; + second_mask = 1000; + scale_to_nano_factor = 1000000; } else if (converted_type == tparquet::ConvertedType::TIMESTAMP_MICROS) { - _decode_params->second_mask = 1000000; - _decode_params->scale_to_nano_factor = 1000; + second_mask = 1000000; + scale_to_nano_factor = 1000; } } - if (_decode_params->ctz) { + if (ctz) { VecDateTimeValue t; - t.from_unixtime(0, *_decode_params->ctz); - _decode_params->offset_days = t.day() == 31 ? 0 : 1; + t.from_unixtime(0, *ctz); + offset_days = t.day() == 31 ? 0 : 1; } } - template + + template void init_decimal_converter(DataTypePtr& data_type) { - if (_decode_params == nullptr || _field_schema == nullptr || - _decode_params->decimal_scale.scale_type != DecimalScaleParams::NOT_INIT) { + if (field_schema == nullptr || decimal_scale.scale_type != DecimalScaleParams::NOT_INIT) { return; } - auto scale = _field_schema->parquet_schema.scale; - auto* decimal_type = reinterpret_cast*>( + auto scale = field_schema->parquet_schema.scale; + auto* decimal_type = static_cast*>( const_cast(remove_nullable(data_type).get())); auto dest_scale = decimal_type->get_scale(); if (dest_scale > scale) { - _decode_params->decimal_scale.scale_type = DecimalScaleParams::SCALE_UP; - _decode_params->decimal_scale.scale_factor = - DecimalScaleParams::get_scale_factor(dest_scale - scale); + decimal_scale.scale_type = DecimalScaleParams::SCALE_UP; + decimal_scale.scale_factor = + DecimalScaleParams::get_scale_factor(dest_scale - scale); } else if (dest_scale < scale) { - _decode_params->decimal_scale.scale_type = DecimalScaleParams::SCALE_DOWN; - _decode_params->decimal_scale.scale_factor = - DecimalScaleParams::get_scale_factor(scale - dest_scale); + decimal_scale.scale_type = 
DecimalScaleParams::SCALE_DOWN; + decimal_scale.scale_factor = + DecimalScaleParams::get_scale_factor(scale - dest_scale); } else { - _decode_params->decimal_scale.scale_type = DecimalScaleParams::NO_SCALE; - _decode_params->decimal_scale.scale_factor = 1; + decimal_scale.scale_type = DecimalScaleParams::NO_SCALE; + decimal_scale.scale_factor = 1; } } }; - -static Status convert_data_type_from_parquet(tparquet::Type::type parquet_type, +inline const cctz::time_zone ConvertParams::utc0 = cctz::utc_time_zone(); +inline Status convert_data_type_from_parquet(tparquet::Type::type parquet_type, vectorized::DataTypePtr& ans_data_type, DataTypePtr& src_type, bool* need_convert) { std::cout << getTypeName(src_type->get_type_id()) << "\n"; @@ -158,8 +213,7 @@ static Status convert_data_type_from_parquet(tparquet::Type::type parquet_type, ans_data_type = std::make_shared(); break; default: - // std::cout << "--->" << parquet_type << "\n"; - break; + return Status::IOError("Can't read parquet type : {}", parquet_type); } if (ans_data_type->get_type_id() == src_type->get_type_id()) { *need_convert = false; @@ -168,7 +222,6 @@ static Status convert_data_type_from_parquet(tparquet::Type::type parquet_type, if (src_type->is_nullable()) { auto& nested_src_type = reinterpret_cast(src_type.get())->get_nested_type(); - std::cout << getTypeName(nested_src_type->get_type_id()) << "\n"; auto sub = ans_data_type; ans_data_type = std::make_shared(ans_data_type); @@ -184,14 +237,19 @@ static Status convert_data_type_from_parquet(tparquet::Type::type parquet_type, struct ColumnConvert { virtual Status convert(const IColumn* src_col, IColumn* dst_col) { return Status::OK(); } + virtual ~ColumnConvert() = default; + +public: + ConvertParams* _convert_params; }; template struct NumberColumnConvert : public ColumnConvert { Status convert(const IColumn* src_col, IColumn* dst_col) override; }; -void convert_null(const IColumn** src_col, IColumn** dst_col) { + +inline void convert_null(const 
IColumn** src_col, IColumn** dst_col) { size_t rows = (*src_col)->size(); if ((*src_col)->is_nullable()) { auto src_nullable_column = reinterpret_cast(*src_col); @@ -214,14 +272,13 @@ Status NumberColumnConvert::convert(const IColu if constexpr (is_nullable) { convert_null(&src_col, &dst_col); } + auto& src_data = static_cast*>(src_col)->get_data(); + dst_col->resize(rows); + auto& data = static_cast*>(dst_col)->get_data(); for (int i = 0; i < rows; i++) { - // src_type src_value = reinterpret_cast*>(src_col)->get_data()[i]; - - dst_type value = static_cast( - reinterpret_cast*>(src_col)->get_data()[i]); - - reinterpret_cast*>(dst_col)->insert(value); + dst_type value = static_cast(src_data[i]); + data[i] = value; } return Status::OK(); @@ -238,19 +295,17 @@ Status NumberColumnToStringConvert::convert(const IColumn if constexpr (is_nullable) { convert_null(&src_col, &dst_col); } - + auto& src_data = static_cast*>(src_col)->get_data(); + auto str_col = static_cast(dst_col); for (int i = 0; i < rows; i++) { - std::string value = std::to_string( - reinterpret_cast*>(src_col)->get_data()[i]); - reinterpret_cast(dst_col)->insert_data(value.data(), value.size()); + std::string value = std::to_string(src_data[i]); + str_col->insert_data(value.data(), value.size()); } return Status::OK(); } template struct int128totimestamp : public ColumnConvert { - int128totimestamp(DocTime* pTime) { doc = pTime; } - [[nodiscard]] inline uint64_t to_timestamp_micros(uint32_t hi, uint64_t lo) const { return (hi - ParquetInt96::JULIAN_EPOCH_OFFSET_DAYS) * ParquetInt96::MICROS_IN_DAY + lo / ParquetInt96::NANOS_PER_MICROSECOND; @@ -260,74 +315,65 @@ struct int128totimestamp : public ColumnConvert { if constexpr (is_nullable) { convert_null(&src_col, &dst_col); } + auto& src_data = static_cast*>(src_col)->get_data(); + dst_col->resize(rows); + auto& data = static_cast*>(dst_col)->get_data(); for (int i = 0; i < rows; i++) { - __int128 x = reinterpret_cast*>(src_col)->get_data()[i]; + __int128 
x = src_data[i]; uint32_t hi = x >> 64; uint64_t lo = (x << 64) >> 64; - dst_col = static_cast*>(dst_col); - reinterpret_cast*>(dst_col)->insert(0); - auto& num = static_cast*>(dst_col)->get_data()[i]; + auto& num = data[i]; auto& value = reinterpret_cast&>(num); int64_t micros = to_timestamp_micros(hi, lo); - value.from_unixtime(micros / 1000000, *doc->_decode_params->ctz); + value.from_unixtime(micros / 1000000, *_convert_params->ctz); value.set_microsecond(micros % 1000000); - std::cout << "value = " << value << "\n"; } return Status::OK(); } - DocTime* doc; }; template struct int64totimestamp : public ColumnConvert { public: - int64totimestamp(DocTime* pTime) { doc = pTime; } - Status convert(const IColumn* src_col, IColumn* dst_col) override { size_t rows = src_col->size(); if constexpr (is_nullable) { convert_null(&src_col, &dst_col); } dst_col->resize(rows); + auto& src_data = static_cast*>(src_col)->get_data(); + auto& data = static_cast*>(dst_col)->get_data(); for (int i = 0; i < rows; i++) { - int64 x = reinterpret_cast*>(src_col)->get_data()[i]; + int64 x = src_data[i]; dst_col = static_cast*>(dst_col); - // reinterpret_cast*>(dst_col)->insert(0); - auto& num = static_cast*>(dst_col)->get_data()[i]; + auto& num = data[i]; auto& value = reinterpret_cast&>(num); - value.from_unixtime(x / doc->_decode_params->second_mask, *doc->_decode_params->ctz); - value.set_microsecond((x % doc->_decode_params->second_mask) * - doc->_decode_params->scale_to_nano_factor / 1000); + value.from_unixtime(x / _convert_params->second_mask, *_convert_params->ctz); + value.set_microsecond((x % _convert_params->second_mask) * + _convert_params->scale_to_nano_factor / 1000); std::cout << "value = " << value << "\n"; } return Status::OK(); } - - DocTime* doc; }; template class int32todate : public ColumnConvert { public: - DocTime* doc; - int32todate(DocTime* pTime) { doc = pTime; } Status convert(const IColumn* src_col, IColumn* dst_col) override { size_t rows = src_col->size(); 
if constexpr (is_nullable) { convert_null(&src_col, &dst_col); } dst_col->resize(rows); + auto& src_data = reinterpret_cast*>(src_col)->get_data(); + auto& data = static_cast(dst_col)->get_data(); + date_day_offset_dict& date_dict = date_day_offset_dict::get(); + for (int i = 0; i < rows; i++) { - // auto& value = reinterpret_cast*>(src_col)->get_data()[i]; - // reinterpret_cast>(); - auto& value = reinterpret_cast&>( - reinterpret_cast(dst_col)->get_data()[i]); - // value = reinterpret_cast*>(src_col)->get_data()[i]; - int64_t date_value = - reinterpret_cast*>(src_col)->get_data()[i] + - doc->_decode_params->offset_days; - date_day_offset_dict& date_dict = date_day_offset_dict::get(); + auto& value = reinterpret_cast&>(data[i]); + int64_t date_value = src_data[i] + _convert_params->offset_days; value = date_dict[date_value]; } @@ -335,31 +381,36 @@ class int32todate : public ColumnConvert { } }; -template +template class stringtodecimal : public ColumnConvert { public: - DocTime* doc; - stringtodecimal(DocTime* pTime) { doc = pTime; } Status convert(const IColumn* src_col, IColumn* dst_col) override { size_t rows = src_col->size(); if constexpr (is_nullable) { convert_null(&src_col, &dst_col); } - DecimalScaleParams& scale_params = doc->_decode_params->decimal_scale; + DecimalScaleParams& scale_params = _convert_params->decimal_scale; auto buf = static_cast(src_col)->get_chars().data(); auto& offset = static_cast(src_col)->get_offsets(); dst_col->resize(rows); auto& data = static_cast*>(dst_col)->get_data(); for (int i = 0; i < rows; i++) { int len = offset[i] - offset[i - 1]; - Int128 value = buf[offset[i - 1]] & 0x80 ? 
-1 : 0; - memcpy(reinterpret_cast(&value) + sizeof(Int128) - len, buf + offset[i - 1], - len); - value = BigEndian::ToHost128(value); - if (scale_params.scale_type == DecimalScaleParams::SCALE_UP) { + // When Decimal in parquet is stored in byte arrays, binary and fixed, + // the unscaled number must be encoded as two's complement using big-endian byte order. + typename DecimalType::NativeType value = 0; + memcpy(reinterpret_cast(&value), buf + offset[i - 1], len); + value = BitUtil::big_endian_to_host(value); + value = value >> ((sizeof(value) - len) * 8); + if constexpr (ScaleType == DecimalScaleParams::SCALE_UP) { value *= scale_params.scale_factor; - } else if (scale_params.scale_type == DecimalScaleParams::SCALE_DOWN) { + } else if constexpr (ScaleType == DecimalScaleParams::SCALE_DOWN) { value /= scale_params.scale_factor; + } else if constexpr (ScaleType == DecimalScaleParams::NO_SCALE) { + // do nothing + } else { + LOG(FATAL) << "__builtin_unreachable"; + __builtin_unreachable(); } auto& v = reinterpret_cast(data[i]); v = (DecimalType)value; @@ -368,10 +419,9 @@ class stringtodecimal : public ColumnConvert { return Status::OK(); } }; -template +template class numbertodecimal : public ColumnConvert { - DocTime* doc; - public: Status convert(const IColumn* src_col, IColumn* dst_col) override { size_t rows = src_col->size(); @@ -380,41 +430,29 @@ class numbertodecimal : public ColumnConvert { } auto* src_data = static_cast*>(src_col)->get_data().data(); dst_col->resize(rows); - DecimalScaleParams& scale_params = doc->_decode_params->decimal_scale; - auto* data = static_cast>*>(dst_col)->get_data().data(); + DecimalScaleParams& scale_params = _convert_params->decimal_scale; + auto* data = static_cast>*>(dst_col) + ->get_data() + .data(); for (int i = 0; i < rows; i++) { Int128 value = src_data[i]; - if (scale_params.scale_type == DecimalScaleParams::SCALE_UP) { + if constexpr (ScaleType == DecimalScaleParams::SCALE_UP) { value *= scale_params.scale_factor; - } 
else if (scale_params.scale_type == DecimalScaleParams::SCALE_DOWN) { + } else if constexpr (ScaleType == DecimalScaleParams::SCALE_DOWN) { value /= scale_params.scale_factor; } data[i] = (DecimalPhysicalType)value; } return Status::OK(); } - - numbertodecimal(DocTime* pTime) { doc = pTime; } }; -/* - * Int128 value = *reinterpret_cast(buf_start); - if (scale_params.scale_type == DecimalScaleParams::SCALE_UP) { - value *= scale_params.scale_factor; - } else if (scale_params.scale_type == DecimalScaleParams::SCALE_DOWN) { - value /= scale_params.scale_factor; - } - auto& v = reinterpret_cast(column_data[data_index++]); - v = (DecimalPrimitiveType)value; - * - * - */ template -static Status get_converter_impl(std::shared_ptr src_data_type, +inline Status get_converter_impl(std::shared_ptr src_data_type, std::shared_ptr dst_data_type, std::unique_ptr* converter, - DocTime& doc [[maybe_unused]]) { + ConvertParams* convert_params) { auto src_type = src_data_type->get_type_id(); auto dst_type = dst_data_type->get_type_id(); @@ -468,26 +506,71 @@ static Status get_converter_impl(std::shared_ptr src_data_type, break; case TypeIndex::DateV2: if (src_type == TypeIndex::Int32) { - *converter = std::make_unique>(&doc); + *converter = std::make_unique>(); } break; case TypeIndex::DateTimeV2: if (src_type == TypeIndex::Int128) { - *converter = std::make_unique>(&doc); + *converter = std::make_unique>(); } else if (src_type == TypeIndex::Int64) { - *converter = std::make_unique>(&doc); + *converter = std::make_unique>(); } break; case TypeIndex::Decimal64: + convert_params->init_decimal_converter(dst_data_type); + DecimalScaleParams& scale_params = convert_params->decimal_scale; + if (scale_params.scale_type == DecimalScaleParams::SCALE_UP) { + } else if (scale_params.scale_type == DecimalScaleParams::SCALE_DOWN) { + } else { + } + if (src_type == TypeIndex::Int128) { - *converter = std::make_unique>(&doc); + if (scale_params.scale_type == DecimalScaleParams::SCALE_UP) { + 
*converter = std::make_unique>(); + } else if (scale_params.scale_type == DecimalScaleParams::SCALE_DOWN) { + *converter = std::make_unique>(); + } else { + *converter = std::make_unique>(); + } } else if (src_type == TypeIndex::String) { - doc.init_decimal_converter(dst_data_type); - *converter = std::make_unique>(&doc); + if (scale_params.scale_type == DecimalScaleParams::SCALE_UP) { + *converter = std::make_unique< + stringtodecimal>(); + } else if (scale_params.scale_type == DecimalScaleParams::SCALE_DOWN) { + *converter = std::make_unique< + stringtodecimal>(); + + } else { + *converter = std::make_unique< + stringtodecimal>(); + } } else if (src_type == TypeIndex::Int32) { - *converter = std::make_unique>(&doc); + if (scale_params.scale_type == DecimalScaleParams::SCALE_UP) { + *converter = std::make_unique< + numbertodecimal>(); + } else if (scale_params.scale_type == DecimalScaleParams::SCALE_DOWN) { + *converter = std::make_unique>(); + } else { + *converter = std::make_unique< + numbertodecimal>(); + } } else if (src_type == TypeIndex::Int64) { - *converter = std::make_unique>(&doc); + if (scale_params.scale_type == DecimalScaleParams::SCALE_UP) { + *converter = std::make_unique< + numbertodecimal>(); + + } else if (scale_params.scale_type == DecimalScaleParams::SCALE_DOWN) { + *converter = std::make_unique< + numbertodecimal>(); + + } else { + *converter = std::make_unique< + numbertodecimal>(); + } } break; default: @@ -498,22 +581,23 @@ static Status get_converter_impl(std::shared_ptr src_data_type, return Status::NotSupported("Can't cast type {} to type {}", getTypeName(src_type), getTypeName(dst_type)); } - + (*converter)->_convert_params = convert_params; return Status::OK(); } -static Status get_converter(std::shared_ptr src_type, +inline Status get_converter(std::shared_ptr src_type, std::shared_ptr dst_type, - std::unique_ptr* converter, DocTime& doc) { + std::unique_ptr* converter, + ConvertParams* convert_param) { if (src_type->is_nullable()) { 
- auto src = reinterpret_cast(src_type.get())->get_nested_type(); - auto dst = reinterpret_cast(dst_type.get())->get_nested_type(); + auto src = static_cast(src_type.get())->get_nested_type(); + auto dst = static_cast(dst_type.get())->get_nested_type(); - return get_converter_impl(src, dst, converter, doc); + return get_converter_impl(src, dst, converter, convert_param); } else { - return get_converter_impl(src_type, dst_type, converter, doc); + return get_converter_impl(src_type, dst_type, converter, convert_param); } } -}; // namespace convert +}; // namespace ParquetConvert -} // namespace doris::vectorized \ No newline at end of file +}; // namespace doris::vectorized \ No newline at end of file diff --git a/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.cpp b/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.cpp index 240e537ee05e59..d9fe16158e1f46 100644 --- a/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.cpp +++ b/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.cpp @@ -54,7 +54,7 @@ ColumnChunkReader::ColumnChunkReader(io::BufferedStreamReader* reader, _max_def_level(field_schema->definition_level), _stream_reader(reader), _metadata(column_chunk->meta_data), - _ctz(ctz), + // _ctz(ctz), _io_ctx(io_ctx) {} Status ColumnChunkReader::init() { @@ -190,12 +190,12 @@ Status ColumnChunkReader::load_page_data() { _page_decoder = _decoders[static_cast(encoding)].get(); } else { std::unique_ptr page_decoder; - std::cout <<"type = "<<_metadata.type <<" "<< encoding <<"\n"; + std::cout << "type = " << _metadata.type << " " << encoding << "\n"; RETURN_IF_ERROR(Decoder::get_decoder(_metadata.type, encoding, page_decoder)); // Set type length page_decoder->set_type_length(_get_type_length()); // Initialize the time convert context - page_decoder->init(_field_schema, _ctz); + // page_decoder->init(_field_schema, _ctz); _decoders[static_cast(encoding)] = std::move(page_decoder); _page_decoder = 
_decoders[static_cast(encoding)].get(); } @@ -243,7 +243,7 @@ Status ColumnChunkReader::_decode_dict_page() { // Set type length page_decoder->set_type_length(_get_type_length()); // Initialize the time convert context - page_decoder->init(_field_schema, _ctz); + // page_decoder->init(_field_schema, _ctz); // Set the dictionary data RETURN_IF_ERROR(page_decoder->set_dict(dict_data, uncompressed_size, header.dictionary_page_header.num_values)); diff --git a/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.h b/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.h index 24415c9830699a..daf8512b3b2cb1 100644 --- a/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.h +++ b/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.h @@ -193,7 +193,7 @@ class ColumnChunkReader { io::BufferedStreamReader* _stream_reader; tparquet::ColumnMetaData _metadata; - cctz::time_zone* _ctz; + // cctz::time_zone* _ctz; io::IOContext* _io_ctx; std::unique_ptr _page_reader = nullptr; diff --git a/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp b/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp index 7ad67cdc80ec78..ec4f2a1a612ecc 100644 --- a/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp +++ b/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp @@ -25,6 +25,7 @@ #include #include +#include "parquet_column_convert.h" #include "runtime/define_primitive_type.h" #include "schema_desc.h" #include "util/runtime_profile.h" @@ -39,7 +40,6 @@ #include "vec/data_types/data_type_struct.h" #include "vec/exec/format/parquet/level_decoder.h" #include "vparquet_column_chunk_reader.h" -#include "vec/exec/format/convert.h" namespace cctz { class time_zone; @@ -316,6 +316,16 @@ Status ScalarColumnReader::_read_nested_column(ColumnPtr& doris_column, DataType ColumnSelectVector& select_vector, size_t batch_size, size_t* read_rows, bool* eof, bool is_dict_filter, bool align_rows = false) { + bool need_convert = false; + auto& physical_type 
= _chunk_meta.meta_data.type; + DataTypePtr src_type; + ParquetConvert::convert_data_type_from_parquet(physical_type, src_type, type, &need_convert); + + ColumnPtr src_column = doris_column; + if (need_convert) { + src_column = src_type->create_column(); + } + size_t origin_size = 0; if (align_rows) { origin_size = _rep_levels.size(); @@ -362,17 +372,20 @@ Status ScalarColumnReader::_read_nested_column(ColumnPtr& doris_column, DataType MutableColumnPtr data_column; std::vector null_map; NullMap* map_data_column = nullptr; - if (doris_column->is_nullable()) { + if (src_column->is_nullable()) { SCOPED_RAW_TIMER(&_decode_null_map_time); - auto* nullable_column = reinterpret_cast( - (*std::move(doris_column)).mutate().get()); + auto* nullable_column = const_cast( + static_cast(src_column.get())); + + // auto* nullable_column = reinterpret_cast( + // (*std::move(src_column)).mutate().get()); data_column = nullable_column->get_nested_column_ptr(); map_data_column = &(nullable_column->get_null_map_data()); } else { if (_field_schema->is_nullable) { return Status::Corruption("Not nullable column has null values in parquet file"); } - data_column = doris_column->assume_mutable(); + data_column = src_column->assume_mutable(); } size_t has_read = origin_size; size_t ancestor_nulls = 0; @@ -429,7 +442,7 @@ Status ScalarColumnReader::_read_nested_column(ColumnPtr& doris_column, DataType RETURN_IF_ERROR(_chunk_reader->next_page()); RETURN_IF_ERROR(_chunk_reader->load_page_data()); select_vector.reset(); - return _read_nested_column(doris_column, type, select_vector, 0, read_rows, eof, + return _read_nested_column(src_column, type, select_vector, 0, read_rows, eof, is_dict_filter, true); } else { *eof = true; @@ -440,6 +453,14 @@ Status ScalarColumnReader::_read_nested_column(ColumnPtr& doris_column, DataType // so the repetition level of first element should be 0, meaning a new row is started. 
DCHECK_EQ(_rep_levels[0], 0); } + if (need_convert) { + std::unique_ptr converter; + ParquetConvert::ConvertParams convert_params; + convert_params.init(_field_schema, _ctz); + ParquetConvert::get_converter(src_type, type, &converter, &convert_params); + converter->convert(src_column, const_cast(doris_column.get())); + } + return Status::OK(); } Status ScalarColumnReader::read_dict_values_to_column(MutableColumnPtr& doris_column, @@ -481,12 +502,12 @@ Status ScalarColumnReader::read_column_data(ColumnPtr& doris_column, DataTypePtr ColumnSelectVector& select_vector, size_t batch_size, size_t* read_rows, bool* eof, bool is_dict_filter) { bool need_convert = false; - auto & physical_type = _chunk_meta.meta_data.type; + auto& physical_type = _chunk_meta.meta_data.type; DataTypePtr src_type; - ParquetConvert::convert_data_type_from_parquet(physical_type, src_type,type,&need_convert); + ParquetConvert::convert_data_type_from_parquet(physical_type, src_type, type, &need_convert); ColumnPtr src_column = doris_column; - if (need_convert ){ + if (need_convert) { src_column = src_type->create_column(); } @@ -555,8 +576,8 @@ Status ScalarColumnReader::read_column_data(ColumnPtr& doris_column, DataTypePtr if (skip_whole_batch) { RETURN_IF_ERROR(_skip_values(read_values)); } else { - RETURN_IF_ERROR(_read_values(read_values, src_column, type, select_vector, - is_dict_filter)); + RETURN_IF_ERROR( + _read_values(read_values, src_column, type, select_vector, is_dict_filter)); } has_read += read_values; _current_row_index += read_values; @@ -570,12 +591,12 @@ Status ScalarColumnReader::read_column_data(ColumnPtr& doris_column, DataTypePtr if (_chunk_reader->remaining_num_values() == 0 && !_chunk_reader->has_next_page()) { *eof = true; } - if ( need_convert ){ + if (need_convert) { std::unique_ptr converter; - ParquetConvert::DocTime doc; - doc.init_time( _field_schema , _ctz); - ParquetConvert::get_converter(src_type,type,&converter, doc); - 
converter->convert(src_column,const_cast(doris_column.get())); + ParquetConvert::ConvertParams convert_params; + convert_params.init(_field_schema, _ctz); + ParquetConvert::get_converter(src_type, type, &converter, &convert_params); + converter->convert(src_column, const_cast(doris_column.get())); } return Status::OK(); diff --git a/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp b/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp index 6f6d5c4dc70c21..961e8ab2000825 100644 --- a/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp +++ b/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp @@ -321,7 +321,7 @@ Status RowGroupReader::next_batch(Block* block, size_t batch_size, size_t* read_ } RETURN_IF_ERROR(_build_pos_delete_filter(*read_rows)); - /* + std::vector columns_to_filter; int column_to_keep = block->columns(); columns_to_filter.resize(column_to_keep); @@ -348,7 +348,7 @@ Status RowGroupReader::next_batch(Block* block, size_t batch_size, size_t* read_ _convert_dict_cols_to_string_cols(block); return Status::OK(); } - _pre_conjunct_ctxs + if (!_not_single_slot_filter_conjuncts.empty()) { _convert_dict_cols_to_string_cols(block); std::vector merged_filters; @@ -367,7 +367,6 @@ Status RowGroupReader::next_batch(Block* block, size_t batch_size, size_t* read_ RETURN_IF_CATCH_EXCEPTION( RETURN_IF_ERROR(_filter_block(block, column_to_keep, columns_to_filter))); } -*/ *read_rows = block->rows(); return Status::OK(); } diff --git a/be/src/vec/exec/format/parquet/vparquet_reader.cpp b/be/src/vec/exec/format/parquet/vparquet_reader.cpp index e4e2e538fb9e91..41e2cef0866638 100644 --- a/be/src/vec/exec/format/parquet/vparquet_reader.cpp +++ b/be/src/vec/exec/format/parquet/vparquet_reader.cpp @@ -525,63 +525,13 @@ Status ParquetReader::get_next_block(Block* block, size_t* read_rows, bool* eof) } DCHECK(_current_group_reader != nullptr); -// { -// BlockUPtr src_block; -// std::map need_convert; -// { -// // std::cout <<"->"; -// // for(auto i =0; 
i < block->columns();i++ ){ -// // std::cout << block->get_columns()[i]->get_name()<<" "; -// // } -// // std::cout <<"\n"; -// -// vector v; -// for (auto& col_name : block->get_names()) { -// vectorized::DataTypePtr data_type; -// tparquet::Type::type parquet_type = -// _file_metadata->schema().get_column(col_name)->physical_type; -// bool conv = false; -// RETURN_IF_ERROR(convert::convert_data_type_from_parquet( -// parquet_type, data_type, block->get_by_name(col_name).type, &conv)); -// std::cout << col_name << "->" << conv << "\n"; -// need_convert[col_name] = conv; -// if (conv) { -// v.emplace_back(data_type, col_name); -// } else { -// // v.emplace_back( (*std::move(block->get_by_name(col_name).column)).mutate(),data_type,col_name ); -// v.emplace_back(block->get_by_name(col_name).column->assume_mutable(), data_type, -// col_name); -// } -// } -// src_block = vectorized::Block::create_unique(v); -// } - SCOPED_RAW_TIMER(&_statistics.column_read_time); - Status batch_st = _current_group_reader->next_batch(block, _batch_size, read_rows, - &_row_group_eof); - if (!batch_st.ok()) { - return Status::InternalError("Read parquet file {} failed, reason = {}", - _scan_range.path, batch_st.to_string()); - } - - //convert -// for (auto i = 0; i < block->columns(); i++) { -// std::cout << "colname = " << block->get_names()[i] << " " -// << need_convert[block->get_names()[i]] << "\n"; -// if (need_convert[block->get_names()[i]]) { -// std::unique_ptr converter(nullptr); -// convert::DocTime doc; -// // auto x = -// doc.init_time(_file_metadata->schema().get_column(i), _ctz); -// RETURN_IF_ERROR(convert::get_converter(src_block->get_data_type(i), -// block->get_data_type(i), &converter, doc)); -// // block->get_columns()[i]=src_block->get_columns()[i]; -// RETURN_IF_ERROR( -// -// converter->convert(src_block->get_columns()[i].get(), -// const_cast(block->get_columns()[i].get()))); -// } -// } -// } + SCOPED_RAW_TIMER(&_statistics.column_read_time); + Status batch_st = + 
_current_group_reader->next_batch(block, _batch_size, read_rows, &_row_group_eof); + if (!batch_st.ok()) { + return Status::InternalError("Read parquet file {} failed, reason = {}", _scan_range.path, + batch_st.to_string()); + } if (_row_group_eof) { auto column_st = _current_group_reader->statistics(); diff --git a/be/src/vec/exec/scan/scanner_context.cpp b/be/src/vec/exec/scan/scanner_context.cpp index a8ab6d4ec6d6c9..2bbc5a7048d1e9 100644 --- a/be/src/vec/exec/scan/scanner_context.cpp +++ b/be/src/vec/exec/scan/scanner_context.cpp @@ -169,14 +169,6 @@ vectorized::BlockUPtr ScannerContext::get_free_block() { block = vectorized::Block::create_unique(_output_tuple_desc->slots(), _batch_size, true /*ignore invalid slots*/); - // vector v; - // vectorized::DataTypePtr a = std::make_shared(std::make_shared()); - // v.push_back( ColumnWithTypeAndName(a,"id")); - // vectorized::DataTypePtr b = std::make_shared(std::make_shared()); - // v.push_back( ColumnWithTypeAndName(b,"age")); - // block = vectorized::Block::create_unique(v); - // block->set_num_rows(_batch_size); - COUNTER_UPDATE(_newly_create_free_blocks_num, 1); _serving_blocks_num++; diff --git a/be/src/vec/exec/scan/vfile_scanner.cpp b/be/src/vec/exec/scan/vfile_scanner.cpp index 77e9aeea77820a..3e1df37c250b01 100644 --- a/be/src/vec/exec/scan/vfile_scanner.cpp +++ b/be/src/vec/exec/scan/vfile_scanner.cpp @@ -771,7 +771,7 @@ Status VFileScanner::_get_next_reader() { for (size_t i = 0; i != _conjuncts.size(); ++i) { RETURN_IF_ERROR(_conjuncts[i]->clone(_state, _push_down_conjuncts[i])); } - //_discard_conjuncts(); + _discard_conjuncts(); } if (range.__isset.table_format_params && range.table_format_params.table_format_type == "iceberg") { From f4a211a15a33849d092d8198ecc7a12865ef4e99 Mon Sep 17 00:00:00 2001 From: changyuwei <2017501503@qq.com> Date: Thu, 12 Oct 2023 00:30:21 +0800 Subject: [PATCH 06/21] fix compile --- be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp | 4 +--- 1 file changed, 1 
insertion(+), 3 deletions(-) diff --git a/be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp b/be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp index df66e633a0270d..0f8b5cbf04ad2d 100644 --- a/be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp +++ b/be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp @@ -72,9 +72,7 @@ class FixLengthDictDecoder final : public BaseDictDecoder { return _decode_dict_values(doris_column, select_vector, is_dict_filter); } - _decode_numeric(doris_column, select_vector); - - return Status::OK(); + return _decode_numeric(doris_column, select_vector); } Status set_dict(std::unique_ptr& dict, int32_t length, size_t num_values) override { From bdaa92705d088d35b74d8bc59d47b5572205ab48 Mon Sep 17 00:00:00 2001 From: changyuwei <2017501503@qq.com> Date: Thu, 12 Oct 2023 00:38:35 +0800 Subject: [PATCH 07/21] fix compile --- .../format/parquet/vparquet_column_reader.cpp | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp b/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp index ec4f2a1a612ecc..f70e874bc7d22f 100644 --- a/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp +++ b/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp @@ -319,8 +319,8 @@ Status ScalarColumnReader::_read_nested_column(ColumnPtr& doris_column, DataType bool need_convert = false; auto& physical_type = _chunk_meta.meta_data.type; DataTypePtr src_type; - ParquetConvert::convert_data_type_from_parquet(physical_type, src_type, type, &need_convert); - + RETURN_IF_ERROR(ParquetConvert::convert_data_type_from_parquet(physical_type, src_type, type, + &need_convert)); ColumnPtr src_column = doris_column; if (need_convert) { src_column = src_type->create_column(); @@ -457,8 +457,8 @@ Status ScalarColumnReader::_read_nested_column(ColumnPtr& doris_column, DataType std::unique_ptr converter; ParquetConvert::ConvertParams convert_params; 
convert_params.init(_field_schema, _ctz); - ParquetConvert::get_converter(src_type, type, &converter, &convert_params); - converter->convert(src_column, const_cast(doris_column.get())); + RETURN_IF_ERROR(ParquetConvert::get_converter(src_type, type, &converter, &convert_params)); + RETURN_IF_ERROR(converter->convert(src_column, const_cast(doris_column.get()))); } return Status::OK(); @@ -504,7 +504,10 @@ Status ScalarColumnReader::read_column_data(ColumnPtr& doris_column, DataTypePtr bool need_convert = false; auto& physical_type = _chunk_meta.meta_data.type; DataTypePtr src_type; - ParquetConvert::convert_data_type_from_parquet(physical_type, src_type, type, &need_convert); + RETURN_IF_ERROR( + + ParquetConvert::convert_data_type_from_parquet(physical_type, src_type, type, + &need_convert)); ColumnPtr src_column = doris_column; if (need_convert) { @@ -595,8 +598,10 @@ Status ScalarColumnReader::read_column_data(ColumnPtr& doris_column, DataTypePtr std::unique_ptr converter; ParquetConvert::ConvertParams convert_params; convert_params.init(_field_schema, _ctz); - ParquetConvert::get_converter(src_type, type, &converter, &convert_params); - converter->convert(src_column, const_cast(doris_column.get())); + RETURN_IF_ERROR(ParquetConvert::get_converter(src_type, type, &converter, &convert_params)); + RETURN_IF_ERROR( + + converter->convert(src_column, const_cast(doris_column.get()))); } return Status::OK(); From 961174f0fc71d1e69e1035d722d6e67b5b4414c7 Mon Sep 17 00:00:00 2001 From: changyuwei <2017501503@qq.com> Date: Fri, 13 Oct 2023 00:47:11 +0800 Subject: [PATCH 08/21] fix be core --- .../format/parquet/parquet_column_convert.cpp | 91 +++++++++++++++++ .../format/parquet/parquet_column_convert.h | 99 ++++--------------- .../format/parquet/vparquet_column_reader.cpp | 58 +++++------ .../format/parquet/vparquet_group_reader.cpp | 11 +-- be/src/vec/exec/scan/scanner_context.cpp | 2 +- be/src/vec/exec/scan/scanner_scheduler.cpp | 2 +- 
be/src/vec/exec/scan/vfile_scanner.cpp | 8 +- be/src/vec/exprs/vectorized_fn_call.cpp | 3 +- 8 files changed, 142 insertions(+), 132 deletions(-) create mode 100644 be/src/vec/exec/format/parquet/parquet_column_convert.cpp diff --git a/be/src/vec/exec/format/parquet/parquet_column_convert.cpp b/be/src/vec/exec/format/parquet/parquet_column_convert.cpp new file mode 100644 index 00000000000000..e67d2c4436c00a --- /dev/null +++ b/be/src/vec/exec/format/parquet/parquet_column_convert.cpp @@ -0,0 +1,91 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "vec/exec/format/parquet/parquet_column_convert.h" + +namespace doris::vectorized { +namespace ParquetConvert { +const cctz::time_zone ConvertParams::utc0 = cctz::utc_time_zone(); + +Status convert_data_type_from_parquet(tparquet::Type::type parquet_type, + vectorized::DataTypePtr& ans_data_type, DataTypePtr& src_type, + bool* need_convert) { + std::cout << getTypeName(src_type->get_type_id()) << "\n"; + if (is_complex_type(src_type)) { + *need_convert = false; + return Status::OK(); + } + switch (parquet_type) { + case tparquet::Type::type::BOOLEAN: + ans_data_type = std::make_shared(); + break; + case tparquet::Type::type::INT32: + ans_data_type = std::make_shared(); + break; + case tparquet::Type::type::INT64: + ans_data_type = std::make_shared(); + break; + case tparquet::Type::type::FLOAT: + ans_data_type = std::make_shared(); + break; + case tparquet::Type::type::DOUBLE: + ans_data_type = std::make_shared(); + break; + case tparquet::Type::type::BYTE_ARRAY: + case tparquet::Type::type::FIXED_LEN_BYTE_ARRAY: + ans_data_type = std::make_shared(); + break; + case tparquet::Type::type::INT96: + ans_data_type = std::make_shared(); + break; + default: + return Status::IOError("Can't read parquet type : {}", parquet_type); + } + if (ans_data_type->get_type_id() == src_type->get_type_id()) { + *need_convert = false; + return Status::OK(); + } + if (src_type->is_nullable()) { + auto& nested_src_type = + static_cast(src_type.get())->get_nested_type(); + auto sub = ans_data_type; + ans_data_type = std::make_shared(ans_data_type); + + if (nested_src_type->get_type_id() == sub->get_type_id()) { + *need_convert = false; + return Status::OK(); + } + } + + *need_convert = true; + return Status::OK(); +} + +Status get_converter(std::shared_ptr src_type, + std::shared_ptr dst_type, + std::unique_ptr* converter, ConvertParams* convert_param) { + if (src_type->is_nullable()) { + auto src = reinterpret_cast(src_type.get())->get_nested_type(); + auto dst = 
reinterpret_cast(dst_type.get())->get_nested_type(); + + return get_converter_impl(src, dst, converter, convert_param); + } else { + return get_converter_impl(src_type, dst_type, converter, convert_param); + } +} +} // namespace ParquetConvert +} // namespace doris::vectorized diff --git a/be/src/vec/exec/format/parquet/parquet_column_convert.h b/be/src/vec/exec/format/parquet/parquet_column_convert.h index 567941205faebf..dfafa932a21072 100644 --- a/be/src/vec/exec/format/parquet/parquet_column_convert.h +++ b/be/src/vec/exec/format/parquet/parquet_column_convert.h @@ -180,60 +180,10 @@ struct ConvertParams { } } }; -inline const cctz::time_zone ConvertParams::utc0 = cctz::utc_time_zone(); -inline Status convert_data_type_from_parquet(tparquet::Type::type parquet_type, - vectorized::DataTypePtr& ans_data_type, - DataTypePtr& src_type, bool* need_convert) { - std::cout << getTypeName(src_type->get_type_id()) << "\n"; - if (is_complex_type(src_type)) { - *need_convert = false; - return Status::OK(); - } - switch (parquet_type) { - case tparquet::Type::type::BOOLEAN: - ans_data_type = std::make_shared(); - break; - case tparquet::Type::type::INT32: - ans_data_type = std::make_shared(); - break; - case tparquet::Type::type::INT64: - ans_data_type = std::make_shared(); - break; - case tparquet::Type::type::FLOAT: - ans_data_type = std::make_shared(); - break; - case tparquet::Type::type::DOUBLE: - ans_data_type = std::make_shared(); - break; - case tparquet::Type::type::BYTE_ARRAY: - case tparquet::Type::type::FIXED_LEN_BYTE_ARRAY: - ans_data_type = std::make_shared(); - break; - case tparquet::Type::type::INT96: - ans_data_type = std::make_shared(); - break; - default: - return Status::IOError("Can't read parquet type : {}", parquet_type); - } - if (ans_data_type->get_type_id() == src_type->get_type_id()) { - *need_convert = false; - return Status::OK(); - } - if (src_type->is_nullable()) { - auto& nested_src_type = - 
reinterpret_cast(src_type.get())->get_nested_type(); - auto sub = ans_data_type; - ans_data_type = std::make_shared(ans_data_type); - - if (nested_src_type->get_type_id() == sub->get_type_id()) { - *need_convert = false; - return Status::OK(); - } - } - *need_convert = true; - return Status::OK(); -} +Status convert_data_type_from_parquet(tparquet::Type::type parquet_type, + vectorized::DataTypePtr& ans_data_type, DataTypePtr& src_type, + bool* need_convert); struct ColumnConvert { virtual Status convert(const IColumn* src_col, IColumn* dst_col) { return Status::OK(); } @@ -252,12 +202,13 @@ struct NumberColumnConvert : public ColumnConvert { inline void convert_null(const IColumn** src_col, IColumn** dst_col) { size_t rows = (*src_col)->size(); if ((*src_col)->is_nullable()) { - auto src_nullable_column = reinterpret_cast(*src_col); - auto dst_nullable_column = reinterpret_cast(*dst_col); - auto& dst_null_col = dst_nullable_column->get_null_map_column(); - + auto src_nullable_column = static_cast(*src_col); + auto dst_nullable_column = static_cast(*dst_col); + auto& src_null_data = src_nullable_column->get_null_map_column().get_data(); + dst_nullable_column->get_null_map_column().resize(rows); + auto& dst_null_data = dst_nullable_column->get_null_map_column().get_data(); for (auto j = 0; j < rows; j++) { - dst_null_col.insert(src_nullable_column->get_null_map_column()[j]); + dst_null_data[j] = src_null_data[j]; } *src_col = &src_nullable_column->get_nested_column(); @@ -328,6 +279,7 @@ struct int128totimestamp : public ColumnConvert { int64_t micros = to_timestamp_micros(hi, lo); value.from_unixtime(micros / 1000000, *_convert_params->ctz); value.set_microsecond(micros % 1000000); + std::cout << "value = " << value << "\n"; } return Status::OK(); } @@ -367,15 +319,17 @@ class int32todate : public ColumnConvert { convert_null(&src_col, &dst_col); } dst_col->resize(rows); - auto& src_data = reinterpret_cast*>(src_col)->get_data(); + auto& src_data = 
static_cast*>(src_col)->get_data(); auto& data = static_cast(dst_col)->get_data(); date_day_offset_dict& date_dict = date_day_offset_dict::get(); for (int i = 0; i < rows; i++) { auto& value = reinterpret_cast&>(data[i]); - int64_t date_value = src_data[i] + _convert_params->offset_days; + int64_t date_value = (int64_t)src_data[i] + _convert_params->offset_days; value = date_dict[date_value]; + std::cout << "src_data[i] = " << src_data[i] << "datav2 value =" << value << "\n"; } + std::cout << rows << "\n"; return Status::OK(); } @@ -455,7 +409,7 @@ inline Status get_converter_impl(std::shared_ptr src_data_type, ConvertParams* convert_params) { auto src_type = src_data_type->get_type_id(); auto dst_type = dst_data_type->get_type_id(); - + std::cout << getTypeName(src_type) << " -> " << getTypeName(dst_type) << "\n"; switch (dst_type) { #define DISPATCH(NUMERIC_TYPE, CPP_NUMERIC_TYPE, PHYSICAL_TYPE) \ case NUMERIC_TYPE: \ @@ -514,15 +468,13 @@ inline Status get_converter_impl(std::shared_ptr src_data_type, *converter = std::make_unique>(); } else if (src_type == TypeIndex::Int64) { *converter = std::make_unique>(); + } else { + std::cout << "src_type = " << getTypeName(src_type) << "\n"; } break; case TypeIndex::Decimal64: convert_params->init_decimal_converter(dst_data_type); DecimalScaleParams& scale_params = convert_params->decimal_scale; - if (scale_params.scale_type == DecimalScaleParams::SCALE_UP) { - } else if (scale_params.scale_type == DecimalScaleParams::SCALE_DOWN) { - } else { - } if (src_type == TypeIndex::Int128) { if (scale_params.scale_type == DecimalScaleParams::SCALE_UP) { @@ -585,19 +537,10 @@ inline Status get_converter_impl(std::shared_ptr src_data_type, return Status::OK(); } -inline Status get_converter(std::shared_ptr src_type, - std::shared_ptr dst_type, - std::unique_ptr* converter, - ConvertParams* convert_param) { - if (src_type->is_nullable()) { - auto src = static_cast(src_type.get())->get_nested_type(); - auto dst = 
static_cast(dst_type.get())->get_nested_type(); - - return get_converter_impl(src, dst, converter, convert_param); - } else { - return get_converter_impl(src_type, dst_type, converter, convert_param); - } -} +Status get_converter(std::shared_ptr src_type, + std::shared_ptr dst_type, + std::unique_ptr* converter, ConvertParams* convert_param); + }; // namespace ParquetConvert }; // namespace doris::vectorized \ No newline at end of file diff --git a/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp b/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp index f70e874bc7d22f..e115dd42b18810 100644 --- a/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp +++ b/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp @@ -253,10 +253,8 @@ Status ScalarColumnReader::_read_values(size_t num_values, ColumnPtr& doris_colu NullMap* map_data_column = nullptr; if (doris_column->is_nullable()) { SCOPED_RAW_TIMER(&_decode_null_map_time); - // auto* nullable_column = reinterpret_cast( - // (*std::move(doris_column)).mutate().get()); - auto* nullable_column = const_cast( - reinterpret_cast(doris_column.get())); + auto* nullable_column = + static_cast(const_cast(doris_column.get())); data_column = nullable_column->get_nested_column_ptr(); map_data_column = &(nullable_column->get_null_map_data()); @@ -316,16 +314,6 @@ Status ScalarColumnReader::_read_nested_column(ColumnPtr& doris_column, DataType ColumnSelectVector& select_vector, size_t batch_size, size_t* read_rows, bool* eof, bool is_dict_filter, bool align_rows = false) { - bool need_convert = false; - auto& physical_type = _chunk_meta.meta_data.type; - DataTypePtr src_type; - RETURN_IF_ERROR(ParquetConvert::convert_data_type_from_parquet(physical_type, src_type, type, - &need_convert)); - ColumnPtr src_column = doris_column; - if (need_convert) { - src_column = src_type->create_column(); - } - size_t origin_size = 0; if (align_rows) { origin_size = _rep_levels.size(); @@ -372,10 +360,10 @@ Status 
ScalarColumnReader::_read_nested_column(ColumnPtr& doris_column, DataType MutableColumnPtr data_column; std::vector null_map; NullMap* map_data_column = nullptr; - if (src_column->is_nullable()) { + if (doris_column->is_nullable()) { SCOPED_RAW_TIMER(&_decode_null_map_time); auto* nullable_column = const_cast( - static_cast(src_column.get())); + static_cast(doris_column.get())); // auto* nullable_column = reinterpret_cast( // (*std::move(src_column)).mutate().get()); @@ -385,7 +373,7 @@ Status ScalarColumnReader::_read_nested_column(ColumnPtr& doris_column, DataType if (_field_schema->is_nullable) { return Status::Corruption("Not nullable column has null values in parquet file"); } - data_column = src_column->assume_mutable(); + data_column = doris_column->assume_mutable(); } size_t has_read = origin_size; size_t ancestor_nulls = 0; @@ -442,7 +430,7 @@ Status ScalarColumnReader::_read_nested_column(ColumnPtr& doris_column, DataType RETURN_IF_ERROR(_chunk_reader->next_page()); RETURN_IF_ERROR(_chunk_reader->load_page_data()); select_vector.reset(); - return _read_nested_column(src_column, type, select_vector, 0, read_rows, eof, + return _read_nested_column(doris_column, type, select_vector, 0, read_rows, eof, is_dict_filter, true); } else { *eof = true; @@ -453,21 +441,13 @@ Status ScalarColumnReader::_read_nested_column(ColumnPtr& doris_column, DataType // so the repetition level of first element should be 0, meaning a new row is started. 
DCHECK_EQ(_rep_levels[0], 0); } - if (need_convert) { - std::unique_ptr converter; - ParquetConvert::ConvertParams convert_params; - convert_params.init(_field_schema, _ctz); - RETURN_IF_ERROR(ParquetConvert::get_converter(src_type, type, &converter, &convert_params)); - RETURN_IF_ERROR(converter->convert(src_column, const_cast(doris_column.get()))); - } - return Status::OK(); } Status ScalarColumnReader::read_dict_values_to_column(MutableColumnPtr& doris_column, bool* has_dict) { bool loaded; RETURN_IF_ERROR(_try_load_dict_page(&loaded, has_dict)); - if (loaded && *has_dict) { //todo(cyw) has_dist ???? + if (loaded && has_dict) { return _chunk_reader->read_dict_values_to_column(doris_column); } return Status::OK(); @@ -504,10 +484,8 @@ Status ScalarColumnReader::read_column_data(ColumnPtr& doris_column, DataTypePtr bool need_convert = false; auto& physical_type = _chunk_meta.meta_data.type; DataTypePtr src_type; - RETURN_IF_ERROR( - - ParquetConvert::convert_data_type_from_parquet(physical_type, src_type, type, - &need_convert)); + RETURN_IF_ERROR(ParquetConvert::convert_data_type_from_parquet(physical_type, src_type, type, + &need_convert)); ColumnPtr src_column = doris_column; if (need_convert) { @@ -524,8 +502,18 @@ Status ScalarColumnReader::read_column_data(ColumnPtr& doris_column, DataTypePtr } if (_nested_column) { RETURN_IF_ERROR(_chunk_reader->load_page_data_idempotent()); - return _read_nested_column(src_column, type, select_vector, batch_size, read_rows, eof, - is_dict_filter); + RETURN_IF_ERROR(_read_nested_column(src_column, type, select_vector, batch_size, read_rows, + eof, is_dict_filter)); + if (need_convert) { + std::unique_ptr converter; + ParquetConvert::ConvertParams convert_params; + convert_params.init(_field_schema, _ctz); + RETURN_IF_ERROR( + ParquetConvert::get_converter(src_type, type, &converter, &convert_params)); + RETURN_IF_ERROR( + converter->convert(src_column, const_cast(doris_column.get()))); + } + return Status::OK(); } // 
generate the row ranges that should be read @@ -599,9 +587,7 @@ Status ScalarColumnReader::read_column_data(ColumnPtr& doris_column, DataTypePtr ParquetConvert::ConvertParams convert_params; convert_params.init(_field_schema, _ctz); RETURN_IF_ERROR(ParquetConvert::get_converter(src_type, type, &converter, &convert_params)); - RETURN_IF_ERROR( - - converter->convert(src_column, const_cast(doris_column.get()))); + RETURN_IF_ERROR(converter->convert(src_column, const_cast(doris_column.get()))); } return Status::OK(); diff --git a/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp b/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp index 961e8ab2000825..193f6b298665a3 100644 --- a/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp +++ b/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp @@ -175,16 +175,7 @@ Status RowGroupReader::init( bool RowGroupReader::_can_filter_by_dict(int slot_id, const tparquet::ColumnMetaData& column_metadata) { - SlotDescriptor* slot = nullptr; - const std::vector& slots = _tuple_descriptor->slots(); - for (auto each : slots) { - if (each->id() == slot_id) { - slot = each; - break; - } - } - if (slot != nullptr) { - // if (!slot->type().is_string_type()) {//TODO(CYW) : check use file metadata column_metadata.type + if (column_metadata.type != tparquet::Type::BYTE_ARRAY) { return false; } diff --git a/be/src/vec/exec/scan/scanner_context.cpp b/be/src/vec/exec/scan/scanner_context.cpp index 2bbc5a7048d1e9..f551d5c4973c3a 100644 --- a/be/src/vec/exec/scan/scanner_context.cpp +++ b/be/src/vec/exec/scan/scanner_context.cpp @@ -522,4 +522,4 @@ template void ScannerContext::clear_and_join(pipeline::ScanLocalStateBase* paren RuntimeState* state); template void ScannerContext::clear_and_join(VScanNode* parent, RuntimeState* state); -} // namespace doris::vectorized \ No newline at end of file +} // namespace doris::vectorized diff --git a/be/src/vec/exec/scan/scanner_scheduler.cpp b/be/src/vec/exec/scan/scanner_scheduler.cpp 
index 513f1ed995f0db..647b5a103229c4 100644 --- a/be/src/vec/exec/scan/scanner_scheduler.cpp +++ b/be/src/vec/exec/scan/scanner_scheduler.cpp @@ -389,7 +389,7 @@ void ScannerScheduler::_scanner_scan(ScannerScheduler* scheduler, ScannerContext break; } - BlockUPtr block = ctx->get_free_block(); //create block <- _output_tuple_desc / 想要的结果 + BlockUPtr block = ctx->get_free_block(); status = scanner->get_block(state, block.get(), &eos); //init reader ,read data VLOG_ROW << "VScanNode input rows: " << block->rows() << ", eos: " << eos; diff --git a/be/src/vec/exec/scan/vfile_scanner.cpp b/be/src/vec/exec/scan/vfile_scanner.cpp index 3e1df37c250b01..35e1d3dff53f68 100644 --- a/be/src/vec/exec/scan/vfile_scanner.cpp +++ b/be/src/vec/exec/scan/vfile_scanner.cpp @@ -289,7 +289,7 @@ Status VFileScanner::open(RuntimeState* state) { Status VFileScanner::_get_block_impl(RuntimeState* state, Block* block, bool* eof) { do { if (_cur_reader == nullptr || _cur_reader_eof) { - RETURN_IF_ERROR(_get_next_reader()); //init parquet reader + RETURN_IF_ERROR(_get_next_reader()); } if (_scanner_eof) { @@ -577,7 +577,7 @@ Status VFileScanner::_convert_to_output_block(Block* block) { const ColumnNullable* nullable_column = reinterpret_cast(column_ptr.get()); for (int i = 0; i < rows; ++i) { - if (filter_map[i] && nullable_column->is_null_at(i)) { //in load , error case + if (filter_map[i] && nullable_column->is_null_at(i)) { if (_strict_mode && (_src_slot_descs_order_by_dest[dest_index]) && !_src_block_ptr->get_by_position(_dest_slot_to_src_slot_index[dest_index]) .column->is_null_at(i)) { @@ -764,7 +764,7 @@ Status VFileScanner::_get_next_reader() { _state->query_options().enable_parquet_lazy_mat); { SCOPED_TIMER(_open_reader_timer); - RETURN_IF_ERROR(parquet_reader->open()); //read file_schema + RETURN_IF_ERROR(parquet_reader->open()); } if (push_down_predicates && _push_down_conjuncts.empty() && !_conjuncts.empty()) { _push_down_conjuncts.resize(_conjuncts.size()); @@ -792,7 +792,7 @@ 
Status VFileScanner::_get_next_reader() { _file_col_names, place_holder, _colname_to_value_range, _push_down_conjuncts, _real_tuple_desc, _default_val_row_desc.get(), _col_name_to_slot_id, &_not_single_slot_filter_conjuncts, - &_slot_id_to_filter_conjuncts); //init parquet reader <- select column / filter / value_range min max + &_slot_id_to_filter_conjuncts); _cur_reader = std::move(parquet_reader); } need_to_get_parsed_schema = true; diff --git a/be/src/vec/exprs/vectorized_fn_call.cpp b/be/src/vec/exprs/vectorized_fn_call.cpp index 3db8e47e8c53f9..58083a486f60e5 100644 --- a/be/src/vec/exprs/vectorized_fn_call.cpp +++ b/be/src/vec/exprs/vectorized_fn_call.cpp @@ -155,7 +155,6 @@ Status VectorizedFnCall::execute(VExprContext* context, vectorized::Block* block // if not find fast execute result column, means do not need check fast execute again _can_fast_execute = fast_execute(context->fn_context(_fn_context_index), *block, arguments, num_columns_without_result, block->rows()); - // insert be converted column to block 向block插入转换后的列 if (_can_fast_execute) { *result_column_id = num_columns_without_result; return Status::OK(); @@ -226,4 +225,4 @@ std::string VectorizedFnCall::debug_string(const std::vector& out << "]"; return out.str(); } -} // namespace doris::vectorized \ No newline at end of file +} // namespace doris::vectorized From 1d542fb80f0f5edae2d788a0f9402778ff982c8b Mon Sep 17 00:00:00 2001 From: changyuwei <2017501503@qq.com> Date: Fri, 13 Oct 2023 14:14:27 +0800 Subject: [PATCH 09/21] fix compile --- .../format/parquet/parquet_column_convert.cpp | 11 +---------- .../format/parquet/parquet_column_convert.h | 17 +++++++++-------- 2 files changed, 10 insertions(+), 18 deletions(-) diff --git a/be/src/vec/exec/format/parquet/parquet_column_convert.cpp b/be/src/vec/exec/format/parquet/parquet_column_convert.cpp index e67d2c4436c00a..7539ddb42f42fb 100644 --- a/be/src/vec/exec/format/parquet/parquet_column_convert.cpp +++ 
b/be/src/vec/exec/format/parquet/parquet_column_convert.cpp @@ -77,15 +77,6 @@ Status convert_data_type_from_parquet(tparquet::Type::type parquet_type, Status get_converter(std::shared_ptr src_type, std::shared_ptr dst_type, - std::unique_ptr* converter, ConvertParams* convert_param) { - if (src_type->is_nullable()) { - auto src = reinterpret_cast(src_type.get())->get_nested_type(); - auto dst = reinterpret_cast(dst_type.get())->get_nested_type(); - - return get_converter_impl(src, dst, converter, convert_param); - } else { - return get_converter_impl(src_type, dst_type, converter, convert_param); - } -} + std::unique_ptr* converter, ConvertParams* convert_param); } // namespace ParquetConvert } // namespace doris::vectorized diff --git a/be/src/vec/exec/format/parquet/parquet_column_convert.h b/be/src/vec/exec/format/parquet/parquet_column_convert.h index dfafa932a21072..ec9b2e79480c41 100644 --- a/be/src/vec/exec/format/parquet/parquet_column_convert.h +++ b/be/src/vec/exec/format/parquet/parquet_column_convert.h @@ -472,20 +472,20 @@ inline Status get_converter_impl(std::shared_ptr src_data_type, std::cout << "src_type = " << getTypeName(src_type) << "\n"; } break; - case TypeIndex::Decimal64: + case TypeIndex::Decimal64: { convert_params->init_decimal_converter(dst_data_type); - DecimalScaleParams& scale_params = convert_params->decimal_scale; + DecimalScaleParams &scale_params = convert_params->decimal_scale; if (src_type == TypeIndex::Int128) { if (scale_params.scale_type == DecimalScaleParams::SCALE_UP) { *converter = std::make_unique>(); + DecimalScaleParams::SCALE_UP>>(); } else if (scale_params.scale_type == DecimalScaleParams::SCALE_DOWN) { *converter = std::make_unique>(); + DecimalScaleParams::SCALE_DOWN>>(); } else { *converter = std::make_unique>(); + DecimalScaleParams::NO_SCALE>>(); } } else if (src_type == TypeIndex::String) { if (scale_params.scale_type == DecimalScaleParams::SCALE_UP) { @@ -505,7 +505,7 @@ inline Status 
get_converter_impl(std::shared_ptr src_data_type, numbertodecimal>(); } else if (scale_params.scale_type == DecimalScaleParams::SCALE_DOWN) { *converter = std::make_unique>(); + DecimalScaleParams::SCALE_DOWN>>(); } else { *converter = std::make_unique< numbertodecimal>(); @@ -517,14 +517,15 @@ inline Status get_converter_impl(std::shared_ptr src_data_type, } else if (scale_params.scale_type == DecimalScaleParams::SCALE_DOWN) { *converter = std::make_unique< - numbertodecimal>(); + numbertodecimal>(); } else { *converter = std::make_unique< - numbertodecimal>(); + numbertodecimal>(); } } break; + } default: break; } From b97e6f95b260f863c96fde36be32b7121021e60b Mon Sep 17 00:00:00 2001 From: changyuwei <2017501503@qq.com> Date: Fri, 13 Oct 2023 14:29:03 +0800 Subject: [PATCH 10/21] fix compile --- .../exec/format/parquet/parquet_column_convert.cpp | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/be/src/vec/exec/format/parquet/parquet_column_convert.cpp b/be/src/vec/exec/format/parquet/parquet_column_convert.cpp index 7539ddb42f42fb..e67d2c4436c00a 100644 --- a/be/src/vec/exec/format/parquet/parquet_column_convert.cpp +++ b/be/src/vec/exec/format/parquet/parquet_column_convert.cpp @@ -77,6 +77,15 @@ Status convert_data_type_from_parquet(tparquet::Type::type parquet_type, Status get_converter(std::shared_ptr src_type, std::shared_ptr dst_type, - std::unique_ptr* converter, ConvertParams* convert_param); + std::unique_ptr* converter, ConvertParams* convert_param) { + if (src_type->is_nullable()) { + auto src = reinterpret_cast(src_type.get())->get_nested_type(); + auto dst = reinterpret_cast(dst_type.get())->get_nested_type(); + + return get_converter_impl(src, dst, converter, convert_param); + } else { + return get_converter_impl(src_type, dst_type, converter, convert_param); + } +} } // namespace ParquetConvert } // namespace doris::vectorized From 08009dd5795f241f60b875abf83c55a9f4cc0c28 Mon Sep 17 00:00:00 2001 From: changyuwei 
<2017501503@qq.com> Date: Sat, 14 Oct 2023 17:50:16 +0800 Subject: [PATCH 11/21] remove decode judge --- .../format/parquet/parquet_column_convert.cpp | 13 +- .../format/parquet/parquet_column_convert.h | 198 ++- be/src/vec/exec/format/parquet/schema_desc.h | 2 +- .../format/parquet/vparquet_column_reader.cpp | 13 +- .../hive/test_hive_parquet_alter_column.out | 1158 +++++++++++++++++ .../test_hive_parquet_alter_column.groovy | 58 + 6 files changed, 1374 insertions(+), 68 deletions(-) create mode 100644 regression-test/data/external_table_p2/hive/test_hive_parquet_alter_column.out create mode 100644 regression-test/suites/external_table_p2/hive/test_hive_parquet_alter_column.groovy diff --git a/be/src/vec/exec/format/parquet/parquet_column_convert.cpp b/be/src/vec/exec/format/parquet/parquet_column_convert.cpp index e67d2c4436c00a..11dbde5e395284 100644 --- a/be/src/vec/exec/format/parquet/parquet_column_convert.cpp +++ b/be/src/vec/exec/format/parquet/parquet_column_convert.cpp @@ -21,7 +21,7 @@ namespace doris::vectorized { namespace ParquetConvert { const cctz::time_zone ConvertParams::utc0 = cctz::utc_time_zone(); -Status convert_data_type_from_parquet(tparquet::Type::type parquet_type, +Status convert_data_type_from_parquet(tparquet::Type::type parquet_type, PrimitiveType show_type, vectorized::DataTypePtr& ans_data_type, DataTypePtr& src_type, bool* need_convert) { std::cout << getTypeName(src_type->get_type_id()) << "\n"; @@ -66,6 +66,11 @@ Status convert_data_type_from_parquet(tparquet::Type::type parquet_type, ans_data_type = std::make_shared(ans_data_type); if (nested_src_type->get_type_id() == sub->get_type_id()) { + if (sub->get_type_id() == TypeIndex::String && + show_type == PrimitiveType::TYPE_DECIMAL64) { + *need_convert = true; + return Status::OK(); + } *need_convert = false; return Status::OK(); } @@ -75,16 +80,16 @@ Status convert_data_type_from_parquet(tparquet::Type::type parquet_type, return Status::OK(); } -Status 
get_converter(std::shared_ptr src_type, +Status get_converter(std::shared_ptr src_type, PrimitiveType show_type, std::shared_ptr dst_type, std::unique_ptr* converter, ConvertParams* convert_param) { if (src_type->is_nullable()) { auto src = reinterpret_cast(src_type.get())->get_nested_type(); auto dst = reinterpret_cast(dst_type.get())->get_nested_type(); - return get_converter_impl(src, dst, converter, convert_param); + return get_converter_impl(src, show_type, dst, converter, convert_param); } else { - return get_converter_impl(src_type, dst_type, converter, convert_param); + return get_converter_impl(src_type, show_type, dst_type, converter, convert_param); } } } // namespace ParquetConvert diff --git a/be/src/vec/exec/format/parquet/parquet_column_convert.h b/be/src/vec/exec/format/parquet/parquet_column_convert.h index ec9b2e79480c41..c114360b082a65 100644 --- a/be/src/vec/exec/format/parquet/parquet_column_convert.h +++ b/be/src/vec/exec/format/parquet/parquet_column_convert.h @@ -105,6 +105,12 @@ struct PhysicalTypeTraits { M(TypeIndex::Float32, Float32, Float32) \ M(TypeIndex::Float64, Float64, Float64) +#define FOR_LOGICAL_DECIMAL_TYPES(M) \ + M(TypeIndex::Decimal32, Decimal32, Int32) \ + M(TypeIndex::Decimal64, Decimal64, Int64) \ + M(TypeIndex::Decimal128, Decimal128, Int128) \ + M(TypeIndex::Decimal128I, Decimal128, Int128) + struct ConvertParams { // schema.logicalType.TIMESTAMP.isAdjustedToUTC == false static const cctz::time_zone utc0; @@ -163,7 +169,7 @@ struct ConvertParams { return; } auto scale = field_schema->parquet_schema.scale; - auto* decimal_type = static_cast*>( + auto* decimal_type = static_cast>*>( const_cast(remove_nullable(data_type).get())); auto dest_scale = decimal_type->get_scale(); if (dest_scale > scale) { @@ -181,7 +187,7 @@ struct ConvertParams { } }; -Status convert_data_type_from_parquet(tparquet::Type::type parquet_type, +Status convert_data_type_from_parquet(tparquet::Type::type parquet_type, PrimitiveType, 
vectorized::DataTypePtr& ans_data_type, DataTypePtr& src_type, bool* need_convert); @@ -257,7 +263,8 @@ Status NumberColumnToStringConvert::convert(const IColumn template struct int128totimestamp : public ColumnConvert { - [[nodiscard]] inline uint64_t to_timestamp_micros(uint32_t hi, uint64_t lo) const { +public: + [[nodiscard]] static uint64_t to_timestamp_micros(uint32_t hi, uint64_t lo) { return (hi - ParquetInt96::JULIAN_EPOCH_OFFSET_DAYS) * ParquetInt96::MICROS_IN_DAY + lo / ParquetInt96::NANOS_PER_MICROSECOND; } @@ -402,8 +409,72 @@ class numbertodecimal : public ColumnConvert { } }; +template +class stringtodecimalstring : public ColumnConvert { +public: + Status convert(const IColumn* src_col, IColumn* dst_col) override { + size_t rows = src_col->size(); + if constexpr (is_nullable) { + convert_null(&src_col, &dst_col); + } + auto buf = static_cast(src_col)->get_chars().data(); + auto& offset = static_cast(src_col)->get_offsets(); + + auto data = static_cast(dst_col); + for (int i = 0; i < rows; i++) { + int len = offset[i] - offset[i - 1]; + // When Decimal in parquet is stored in byte arrays, binary and fixed, + // the unscaled number must be encoded as two's complement using big-endian byte order. 
+ Int64 value = 0; + memcpy(reinterpret_cast(&value), buf + offset[i - 1], len); + value = BitUtil::big_endian_to_host(value); + value = value >> ((sizeof(value) - len) * 8); + std::cout << "ans =" << value << "\n"; + std::string ans = reinterpret_cast(value).to_string( + _convert_params->field_schema->parquet_schema.scale); + std::cout << "ans = " << ans << "\n"; + data->insert_data(ans.data(), ans.size()); + } + return Status::OK(); + } +}; + +template +class int128totimestampstring : public ColumnConvert { +public: + Status convert(const IColumn* src_col, IColumn* dst_col) override { + size_t rows = src_col->size(); + if constexpr (is_nullable) { + convert_null(&src_col, &dst_col); + } + + auto& src_data = static_cast*>(src_col)->get_data(); + auto data = static_cast(dst_col); + + for (int i = 0; i < rows; i++) { + __int128 x = src_data[i]; + uint32_t hi = x >> 64; + uint64_t lo = (x << 64) >> 64; + uint64_t num = 0; + auto& value = reinterpret_cast&>(num); + int64_t micros = int128totimestamp::to_timestamp_micros(hi, lo); + value.from_unixtime(micros / 1000000, *_convert_params->ctz); + value.set_microsecond(micros % 1000000); + std::string buf; + buf.resize(20); + char* end = value.to_string(buf.data()); + data->insert_data(buf.data(), end - buf.data()); + + std::cout << "value = " << value << "\n"; + } + + return Status::OK(); + } +}; + template inline Status get_converter_impl(std::shared_ptr src_data_type, + PrimitiveType show_type, std::shared_ptr dst_data_type, std::unique_ptr* converter, ConvertParams* convert_params) { @@ -445,7 +516,17 @@ inline Status get_converter_impl(std::shared_ptr src_data_type, FOR_LOGICAL_NUMERIC_TYPES(DISPATCH) #undef DISPATCH - case TypeIndex::String: + case TypeIndex::String: { + if (src_type == TypeIndex::String) { + if (show_type == PrimitiveType::TYPE_DECIMAL64) { + *converter = std::make_unique>(); + break; + } + } else if (src_type == TypeIndex::Int128) { + *converter = std::make_unique>(); + break; + } + switch 
(src_type) { #define DISPATCH1(NUMERIC_TYPE, CPP_NUMERIC_TYPE, PHYSICAL_TYPE) \ case NUMERIC_TYPE: \ @@ -458,6 +539,7 @@ inline Status get_converter_impl(std::shared_ptr src_data_type, break; } break; + } case TypeIndex::DateV2: if (src_type == TypeIndex::Int32) { *converter = std::make_unique>(); @@ -472,60 +554,60 @@ inline Status get_converter_impl(std::shared_ptr src_data_type, std::cout << "src_type = " << getTypeName(src_type) << "\n"; } break; - case TypeIndex::Decimal64: { - convert_params->init_decimal_converter(dst_data_type); - DecimalScaleParams &scale_params = convert_params->decimal_scale; - - if (src_type == TypeIndex::Int128) { - if (scale_params.scale_type == DecimalScaleParams::SCALE_UP) { - *converter = std::make_unique>(); - } else if (scale_params.scale_type == DecimalScaleParams::SCALE_DOWN) { - *converter = std::make_unique>(); - } else { - *converter = std::make_unique>(); - } - } else if (src_type == TypeIndex::String) { - if (scale_params.scale_type == DecimalScaleParams::SCALE_UP) { - *converter = std::make_unique< - stringtodecimal>(); - } else if (scale_params.scale_type == DecimalScaleParams::SCALE_DOWN) { - *converter = std::make_unique< - stringtodecimal>(); - - } else { - *converter = std::make_unique< - stringtodecimal>(); - } - } else if (src_type == TypeIndex::Int32) { - if (scale_params.scale_type == DecimalScaleParams::SCALE_UP) { - *converter = std::make_unique< - numbertodecimal>(); - } else if (scale_params.scale_type == DecimalScaleParams::SCALE_DOWN) { - *converter = std::make_unique>(); - } else { - *converter = std::make_unique< - numbertodecimal>(); - } - } else if (src_type == TypeIndex::Int64) { - if (scale_params.scale_type == DecimalScaleParams::SCALE_UP) { - *converter = std::make_unique< - numbertodecimal>(); - - } else if (scale_params.scale_type == DecimalScaleParams::SCALE_DOWN) { - *converter = std::make_unique< - numbertodecimal>(); - - } else { - *converter = std::make_unique< - numbertodecimal>(); - } - } - 
break; +#define DISPATCH2(TypeIndex_DECIMAL_TYPE, DECIMAL_TYPE, PRIMARY_TYPE) \ + case TypeIndex_DECIMAL_TYPE: { \ + convert_params->init_decimal_converter(dst_data_type); \ + DecimalScaleParams& scale_params = convert_params->decimal_scale; \ + if (src_type == TypeIndex::Int128) { \ + if (scale_params.scale_type == DecimalScaleParams::SCALE_UP) { \ + *converter = std::make_unique>(); \ + } else if (scale_params.scale_type == DecimalScaleParams::SCALE_DOWN) { \ + *converter = std::make_unique>(); \ + } else { \ + *converter = std::make_unique>(); \ + } \ + } else if (src_type == TypeIndex::String) { \ + if (scale_params.scale_type == DecimalScaleParams::SCALE_UP) { \ + *converter = std::make_unique>(); \ + } else if (scale_params.scale_type == DecimalScaleParams::SCALE_DOWN) { \ + *converter = std::make_unique>(); \ + } else { \ + *converter = std::make_unique>(); \ + } \ + } else if (src_type == TypeIndex::Int32) { \ + if (scale_params.scale_type == DecimalScaleParams::SCALE_UP) { \ + *converter = std::make_unique>(); \ + } else if (scale_params.scale_type == DecimalScaleParams::SCALE_DOWN) { \ + *converter = std::make_unique>(); \ + } else { \ + *converter = std::make_unique>(); \ + } \ + } else if (src_type == TypeIndex::Int64) { \ + if (scale_params.scale_type == DecimalScaleParams::SCALE_UP) { \ + *converter = std::make_unique>(); \ + } else if (scale_params.scale_type == DecimalScaleParams::SCALE_DOWN) { \ + *converter = std::make_unique>(); \ + } else { \ + *converter = std::make_unique>(); \ + } \ + } \ + break; \ } + + FOR_LOGICAL_DECIMAL_TYPES(DISPATCH2) +#undef DISPATCH2 default: break; } @@ -538,7 +620,7 @@ inline Status get_converter_impl(std::shared_ptr src_data_type, return Status::OK(); } -Status get_converter(std::shared_ptr src_type, +Status get_converter(std::shared_ptr src_type, PrimitiveType show_type, std::shared_ptr dst_type, std::unique_ptr* converter, ConvertParams* convert_param); diff --git a/be/src/vec/exec/format/parquet/schema_desc.h 
b/be/src/vec/exec/format/parquet/schema_desc.h index fb61ad918a7e91..8e8f7350569631 100644 --- a/be/src/vec/exec/format/parquet/schema_desc.h +++ b/be/src/vec/exec/format/parquet/schema_desc.h @@ -88,9 +88,9 @@ class FieldDescriptor { TypeDescriptor convert_to_doris_type(const tparquet::SchemaElement& physical_schema); +public: TypeDescriptor get_doris_type(const tparquet::SchemaElement& physical_schema); -public: FieldDescriptor() = default; ~FieldDescriptor() = default; diff --git a/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp b/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp index e115dd42b18810..dd893de8fa292c 100644 --- a/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp +++ b/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp @@ -484,9 +484,11 @@ Status ScalarColumnReader::read_column_data(ColumnPtr& doris_column, DataTypePtr bool need_convert = false; auto& physical_type = _chunk_meta.meta_data.type; DataTypePtr src_type; - RETURN_IF_ERROR(ParquetConvert::convert_data_type_from_parquet(physical_type, src_type, type, - &need_convert)); + RETURN_IF_ERROR(ParquetConvert::convert_data_type_from_parquet( + physical_type, _field_schema->type.type, src_type, type, &need_convert)); + std::cout << "need_convert = " << need_convert << "\n"; + //this->_field_schema->type.type ColumnPtr src_column = doris_column; if (need_convert) { src_column = src_type->create_column(); @@ -508,8 +510,8 @@ Status ScalarColumnReader::read_column_data(ColumnPtr& doris_column, DataTypePtr std::unique_ptr converter; ParquetConvert::ConvertParams convert_params; convert_params.init(_field_schema, _ctz); - RETURN_IF_ERROR( - ParquetConvert::get_converter(src_type, type, &converter, &convert_params)); + RETURN_IF_ERROR(ParquetConvert::get_converter(src_type, _field_schema->type.type, type, + &converter, &convert_params)); RETURN_IF_ERROR( converter->convert(src_column, const_cast(doris_column.get()))); } @@ -586,7 +588,8 @@ Status 
ScalarColumnReader::read_column_data(ColumnPtr& doris_column, DataTypePtr std::unique_ptr converter; ParquetConvert::ConvertParams convert_params; convert_params.init(_field_schema, _ctz); - RETURN_IF_ERROR(ParquetConvert::get_converter(src_type, type, &converter, &convert_params)); + RETURN_IF_ERROR(ParquetConvert::get_converter(src_type, _field_schema->type.type, type, + &converter, &convert_params)); RETURN_IF_ERROR(converter->convert(src_column, const_cast(doris_column.get()))); } diff --git a/regression-test/data/external_table_p2/hive/test_hive_parquet_alter_column.out b/regression-test/data/external_table_p2/hive/test_hive_parquet_alter_column.out new file mode 100644 index 00000000000000..0c1ee1531ab4aa --- /dev/null +++ b/regression-test/data/external_table_p2/hive/test_hive_parquet_alter_column.out @@ -0,0 +1,1158 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !desc -- +col_int INT Yes true \N +col_smallint INT Yes true \N +col_tinyint INT Yes true \N +col_bigint BIGINT Yes true \N +col_float FLOAT Yes true \N +col_double DOUBLE Yes true \N +col_boolean BOOLEAN Yes true \N +col_string TEXT Yes true \N +col_char CHAR(10) Yes true \N +col_varchar VARCHAR(255) Yes true \N +col_date DATE Yes true \N +col_timestamp DATETIME(6) Yes true \N +col_decimal DECIMAL(10, 2) Yes true \N + +-- !show -- +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 
2023-10-09T17:15 1238.45 + +-- !order -- +-1 +-1 +-1 + +-- !order -- +-400 +-400 +-400 + +-- !order -- +-20 +-20 +-20 + +-- !order -- +-400000000 +-400000000 +-400000000 + +-- !order -- +10.5 +10.5 +10.5 + +-- !order -- +20.75 +20.75 +20.75 + +-- !order -- +false +false +false + +-- !order -- +Fifth +Fifth +Fifth + +-- !order -- +A +A +A + +-- !order -- +ADC +ADC +ADC + +-- !order -- +2023-10-06 +2023-10-06 +2023-10-06 + +-- !order -- +2023-10-06T14:30 +2023-10-06T14:30 +2023-10-06T14:30 + +-- !order -- +123.45 +123.45 +123.45 + +-- !desc -- +col_int INT Yes true \N +col_smallint SMALLINT Yes true \N +col_tinyint SMALLINT Yes true \N +col_bigint BIGINT Yes true \N +col_float FLOAT Yes true \N +col_double DOUBLE Yes true \N +col_boolean BOOLEAN Yes true \N +col_string TEXT Yes true \N +col_char CHAR(10) Yes true \N +col_varchar VARCHAR(255) Yes true \N +col_date DATE Yes true \N +col_timestamp DATETIME(6) Yes true \N +col_decimal DECIMAL(10, 2) Yes true \N + +-- !show -- +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 + +-- !order -- +-1 +-1 +-1 + +-- !order -- +-400 +-400 +-400 + +-- !order -- +-20 +-20 +-20 + +-- !order -- +-400000000 +-400000000 +-400000000 + +-- !order -- +10.5 +10.5 +10.5 + +-- !order -- +20.75 +20.75 +20.75 + +-- !order -- +false +false +false + +-- !order -- +Fifth +Fifth +Fifth + +-- !order -- +A +A +A + +-- !order -- +ADC +ADC +ADC + +-- !order -- 
+2023-10-06 +2023-10-06 +2023-10-06 + +-- !order -- +2023-10-06T14:30 +2023-10-06T14:30 +2023-10-06T14:30 + +-- !order -- +123.45 +123.45 +123.45 + +-- !desc -- +col_int INT Yes true \N +col_smallint SMALLINT Yes true \N +col_tinyint TINYINT Yes true \N +col_bigint BIGINT Yes true \N +col_float FLOAT Yes true \N +col_double DOUBLE Yes true \N +col_boolean BOOLEAN Yes true \N +col_string TEXT Yes true \N +col_char CHAR(10) Yes true \N +col_varchar VARCHAR(255) Yes true \N +col_date DATE Yes true \N +col_timestamp DATETIME(6) Yes true \N +col_decimal DECIMAL(10, 2) Yes true \N + +-- !show -- +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 + +-- !order -- +-1 +-1 +-1 + +-- !order -- +-400 +-400 +-400 + +-- !order -- +-20 +-20 +-20 + +-- !order -- +-400000000 +-400000000 +-400000000 + +-- !order -- +10.5 +10.5 +10.5 + +-- !order -- +20.75 +20.75 +20.75 + +-- !order -- +false +false +false + +-- !order -- +Fifth +Fifth +Fifth + +-- !order -- +A +A +A + +-- !order -- +ADC +ADC +ADC + +-- !order -- +2023-10-06 +2023-10-06 +2023-10-06 + +-- !order -- +2023-10-06T14:30 +2023-10-06T14:30 +2023-10-06T14:30 + +-- !order -- +123.45 +123.45 +123.45 + +-- !desc -- +col_int BIGINT Yes true \N +col_smallint BIGINT Yes true \N +col_tinyint BIGINT Yes true \N +col_bigint BIGINT Yes true \N +col_float FLOAT Yes true \N +col_double DOUBLE Yes true \N +col_boolean BOOLEAN Yes true \N +col_string TEXT 
Yes true \N +col_char CHAR(10) Yes true \N +col_varchar VARCHAR(255) Yes true \N +col_date DATE Yes true \N +col_timestamp DATETIME(6) Yes true \N +col_decimal DECIMAL(10, 2) Yes true \N + +-- !show -- +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 + +-- !order -- +-1 +-1 +-1 + +-- !order -- +-400 +-400 +-400 + +-- !order -- +-20 +-20 +-20 + +-- !order -- +-400000000 +-400000000 +-400000000 + +-- !order -- +10.5 +10.5 +10.5 + +-- !order -- +20.75 +20.75 +20.75 + +-- !order -- +false +false +false + +-- !order -- +Fifth +Fifth +Fifth + +-- !order -- +A +A +A + +-- !order -- +ADC +ADC +ADC + +-- !order -- +2023-10-06 +2023-10-06 +2023-10-06 + +-- !order -- +2023-10-06T14:30 +2023-10-06T14:30 +2023-10-06T14:30 + +-- !order -- +123.45 +123.45 +123.45 + +-- !desc -- +col_int FLOAT Yes true \N +col_smallint FLOAT Yes true \N +col_tinyint FLOAT Yes true \N +col_bigint FLOAT Yes true \N +col_float FLOAT Yes true \N +col_double DOUBLE Yes true \N +col_boolean BOOLEAN Yes true \N +col_string TEXT Yes true \N +col_char CHAR(10) Yes true \N +col_varchar VARCHAR(255) Yes true \N +col_date DATE Yes true \N +col_timestamp DATETIME(6) Yes true \N +col_decimal DECIMAL(10, 2) Yes true \N + +-- !show -- +-1.0 -400.0 -20.0 -4.0E8 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1.0 -400.0 -20.0 -4.0E8 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1.0 
-400.0 -20.0 -4.0E8 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1.0 -400.0 -20.0 -4.0E8 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1.0 -400.0 -20.0 -4.0E8 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1.0 -400.0 -20.0 -4.0E8 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1.0 -400.0 -20.0 -4.0E8 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 + +-- !order -- +-1.0 +-1.0 +-1.0 + +-- !order -- +-400.0 +-400.0 +-400.0 + +-- !order -- +-20.0 +-20.0 +-20.0 + +-- !order -- +-4.0E8 +-4.0E8 +-4.0E8 + +-- !order -- +10.5 +10.5 +10.5 + +-- !order -- +20.75 +20.75 +20.75 + +-- !order -- +false +false +false + +-- !order -- +Fifth +Fifth +Fifth + +-- !order -- +A +A +A + +-- !order -- +ADC +ADC +ADC + +-- !order -- +2023-10-06 +2023-10-06 +2023-10-06 + +-- !order -- +2023-10-06T14:30 +2023-10-06T14:30 +2023-10-06T14:30 + +-- !order -- +123.45 +123.45 +123.45 + +-- !desc -- +col_int DOUBLE Yes true \N +col_smallint DOUBLE Yes true \N +col_tinyint DOUBLE Yes true \N +col_bigint DOUBLE Yes true \N +col_float DOUBLE Yes true \N +col_double DOUBLE Yes true \N +col_boolean BOOLEAN Yes true \N +col_string TEXT Yes true \N +col_char CHAR(10) Yes true \N +col_varchar VARCHAR(255) Yes true \N +col_date DATE Yes true \N +col_timestamp DATETIME(6) Yes true \N +col_decimal DECIMAL(10, 2) Yes true \N + +-- !show -- +-1.0 -400.0 -20.0 -4.0E8 40.54439926147461 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1.0 -400.0 -20.0 -4.0E8 40.54439926147461 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1.0 -400.0 -20.0 -4.0E8 40.54439926147461 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1.0 -400.0 -20.0 -4.0E8 40.54439926147461 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1.0 -400.0 -20.0 -4.0E8 40.54439926147461 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1.0 -400.0 -20.0 -4.0E8 40.54439926147461 50.75 
false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1.0 -400.0 -20.0 -4.0E8 40.54439926147461 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 + +-- !order -- +-1.0 +-1.0 +-1.0 + +-- !order -- +-400.0 +-400.0 +-400.0 + +-- !order -- +-20.0 +-20.0 +-20.0 + +-- !order -- +-4.0E8 +-4.0E8 +-4.0E8 + +-- !order -- +10.5 +10.5 +10.5 + +-- !order -- +20.75 +20.75 +20.75 + +-- !order -- +false +false +false + +-- !order -- +Fifth +Fifth +Fifth + +-- !order -- +A +A +A + +-- !order -- +ADC +ADC +ADC + +-- !order -- +2023-10-06 +2023-10-06 +2023-10-06 + +-- !order -- +2023-10-06T14:30 +2023-10-06T14:30 +2023-10-06T14:30 + +-- !order -- +123.45 +123.45 +123.45 + +-- !desc -- +col_int INT Yes true \N +col_smallint SMALLINT Yes true \N +col_tinyint TINYINT Yes true \N +col_bigint BIGINT Yes true \N +col_float FLOAT Yes true \N +col_double DOUBLE Yes true \N +col_boolean BOOLEAN Yes true \N +col_string TEXT Yes true \N +col_char CHAR(10) Yes true \N +col_varchar VARCHAR(255) Yes true \N +col_date DATE Yes true \N +col_timestamp DATETIME(6) Yes true \N +col_decimal DECIMAL(10, 2) Yes true \N + +-- !show -- +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 + +-- !order -- +-1 +-1 +-1 + +-- !order -- +-400 +-400 +-400 + +-- !order -- +-20 +-20 +-20 + +-- !order -- +-400000000 +-400000000 +-400000000 + +-- !order -- +10.5 +10.5 +10.5 + +-- !order -- +20.75 +20.75 +20.75 + +-- !order -- 
+false +false +false + +-- !order -- +Fifth +Fifth +Fifth + +-- !order -- +A +A +A + +-- !order -- +ADC +ADC +ADC + +-- !order -- +2023-10-06 +2023-10-06 +2023-10-06 + +-- !order -- +2023-10-06T14:30 +2023-10-06T14:30 +2023-10-06T14:30 + +-- !order -- +123.45 +123.45 +123.45 + +-- !desc -- +col_int TEXT Yes true \N +col_smallint TEXT Yes true \N +col_tinyint TEXT Yes true \N +col_bigint TEXT Yes true \N +col_float TEXT Yes true \N +col_double TEXT Yes true \N +col_boolean BOOLEAN Yes true \N +col_string TEXT Yes true \N +col_char TEXT Yes true \N +col_varchar TEXT Yes true \N +col_date TEXT Yes true \N +col_timestamp TEXT Yes true \N +col_decimal TEXT Yes true \N + +-- !show -- +-1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 19636 2023-10-09 17:15:00 1238.45 +-1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 19636 2023-10-09 17:15:00 1238.45 +-1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 19636 2023-10-09 17:15:00 1238.45 +-1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 19636 2023-10-09 17:15:00 1238.45 +-1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 19636 2023-10-09 17:15:00 1238.45 +-1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 19636 2023-10-09 17:15:00 1238.45 +-1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 19636 2023-10-09 17:15:00 1238.45 + +-- !order -- +-1 +-1 +-1 + +-- !order -- +-200 +-200 +-200 + +-- !order -- +-10 +-10 +-10 + +-- !order -- +-20000000 +-20000000 +-20000000 + +-- !order -- +10.500000 +10.500000 +10.500000 + +-- !order -- +20.750000 +20.750000 +20.750000 + +-- !order -- +false +false +false + +-- !order -- +Fifth +Fifth +Fifth + +-- !order -- +A +A +A + +-- !order -- +ADC +ADC +ADC + +-- !order -- +19636 +19636 +19636 + +-- !order -- +2023-10-06 14:30:00 +2023-10-06 14:30:00 +2023-10-06 14:30:00 + +-- !order -- +123.45 +123.45 +123.45 + +-- !desc -- +col_int CHAR(10) Yes true \N +col_smallint CHAR(10) Yes true \N +col_tinyint CHAR(10) Yes true \N 
+col_bigint CHAR(10) Yes true \N +col_float CHAR(10) Yes true \N +col_double CHAR(10) Yes true \N +col_boolean BOOLEAN Yes true \N +col_string CHAR(10) Yes true \N +col_char CHAR(10) Yes true \N +col_varchar CHAR(10) Yes true \N +col_date CHAR(10) Yes true \N +col_timestamp CHAR(10) Yes true \N +col_decimal CHAR(10) Yes true \N + +-- !show -- +-1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 19636 2023-10-09 17:15:00 1238.45 +-1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 19636 2023-10-09 17:15:00 1238.45 +-1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 19636 2023-10-09 17:15:00 1238.45 +-1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 19636 2023-10-09 17:15:00 1238.45 +-1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 19636 2023-10-09 17:15:00 1238.45 +-1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 19636 2023-10-09 17:15:00 1238.45 +-1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 19636 2023-10-09 17:15:00 1238.45 + +-- !order -- +-1 +-1 +-1 + +-- !order -- +-200 +-200 +-200 + +-- !order -- +-10 +-10 +-10 + +-- !order -- +-20000000 +-20000000 +-20000000 + +-- !order -- +10.500000 +10.500000 +10.500000 + +-- !order -- +20.750000 +20.750000 +20.750000 + +-- !order -- +false +false +false + +-- !order -- +Fifth +Fifth +Fifth + +-- !order -- +A +A +A + +-- !order -- +ADC +ADC +ADC + +-- !order -- +19636 +19636 +19636 + +-- !order -- +2023-10-06 14:30:00 +2023-10-06 14:30:00 +2023-10-06 14:30:00 + +-- !order -- +123.45 +123.45 +123.45 + +-- !desc -- +col_int VARCHAR(20) Yes true \N +col_smallint VARCHAR(20) Yes true \N +col_tinyint VARCHAR(20) Yes true \N +col_bigint VARCHAR(20) Yes true \N +col_float VARCHAR(20) Yes true \N +col_double VARCHAR(20) Yes true \N +col_boolean BOOLEAN Yes true \N +col_string VARCHAR(20) Yes true \N +col_char VARCHAR(20) Yes true \N +col_varchar VARCHAR(20) Yes true \N +col_date VARCHAR(20) Yes true \N +col_timestamp VARCHAR(20) Yes true \N +col_decimal 
VARCHAR(20) Yes true \N + +-- !show -- +-1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 19636 2023-10-09 17:15:00 1238.45 +-1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 19636 2023-10-09 17:15:00 1238.45 +-1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 19636 2023-10-09 17:15:00 1238.45 +-1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 19636 2023-10-09 17:15:00 1238.45 +-1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 19636 2023-10-09 17:15:00 1238.45 +-1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 19636 2023-10-09 17:15:00 1238.45 +-1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 19636 2023-10-09 17:15:00 1238.45 + +-- !order -- +-1 +-1 +-1 + +-- !order -- +-200 +-200 +-200 + +-- !order -- +-10 +-10 +-10 + +-- !order -- +-20000000 +-20000000 +-20000000 + +-- !order -- +10.500000 +10.500000 +10.500000 + +-- !order -- +20.750000 +20.750000 +20.750000 + +-- !order -- +false +false +false + +-- !order -- +Fifth +Fifth +Fifth + +-- !order -- +A +A +A + +-- !order -- +ADC +ADC +ADC + +-- !order -- +19636 +19636 +19636 + +-- !order -- +2023-10-06 14:30:00 +2023-10-06 14:30:00 +2023-10-06 14:30:00 + +-- !order -- +123.45 +123.45 +123.45 + +-- !desc -- +col_int INT Yes true \N +col_smallint SMALLINT Yes true \N +col_tinyint TINYINT Yes true \N +col_bigint BIGINT Yes true \N +col_float FLOAT Yes true \N +col_double DOUBLE Yes true \N +col_boolean BOOLEAN Yes true \N +col_string TEXT Yes true \N +col_char CHAR(10) Yes true \N +col_varchar VARCHAR(255) Yes true \N +col_date DATE Yes true \N +col_timestamp DATETIME(6) Yes true \N +col_decimal DECIMAL(10, 2) Yes true \N + +-- !show -- +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 
false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 + +-- !order -- +-1 +-1 +-1 + +-- !order -- +-400 +-400 +-400 + +-- !order -- +-20 +-20 +-20 + +-- !order -- +-400000000 +-400000000 +-400000000 + +-- !order -- +10.5 +10.5 +10.5 + +-- !order -- +20.75 +20.75 +20.75 + +-- !order -- +false +false +false + +-- !order -- +Fifth +Fifth +Fifth + +-- !order -- +A +A +A + +-- !order -- +ADC +ADC +ADC + +-- !order -- +2023-10-06 +2023-10-06 +2023-10-06 + +-- !order -- +2023-10-06T14:30 +2023-10-06T14:30 +2023-10-06T14:30 + +-- !order -- +123.45 +123.45 +123.45 + +-- !desc -- +col_int INT Yes true \N +col_smallint SMALLINT Yes true \N +col_tinyint TINYINT Yes true \N +col_bigint BIGINT Yes true \N +col_float FLOAT Yes true \N +col_double DOUBLE Yes true \N +col_boolean BOOLEAN Yes true \N +col_string TEXT Yes true \N +col_char CHAR(10) Yes true \N +col_varchar VARCHAR(255) Yes true \N +col_date DATE Yes true \N +col_timestamp DATETIME(6) Yes true \N +col_decimal DECIMAL(10, 2) Yes true \N + +-- !show -- +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 + +-- !order -- +-1 +-1 +-1 + +-- !order -- +-400 +-400 +-400 + 
+-- !order -- +-20 +-20 +-20 + +-- !order -- +-400000000 +-400000000 +-400000000 + +-- !order -- +10.5 +10.5 +10.5 + +-- !order -- +20.75 +20.75 +20.75 + +-- !order -- +false +false +false + +-- !order -- +Fifth +Fifth +Fifth + +-- !order -- +A +A +A + +-- !order -- +ADC +ADC +ADC + +-- !order -- +2023-10-06 +2023-10-06 +2023-10-06 + +-- !order -- +2023-10-06T14:30 +2023-10-06T14:30 +2023-10-06T14:30 + +-- !order -- +123.45 +123.45 +123.45 + +-- !desc -- +col_int DECIMAL(5, 1) Yes true \N +col_smallint DECIMAL(5, 1) Yes true \N +col_tinyint DECIMAL(5, 1) Yes true \N +col_bigint DECIMAL(5, 1) Yes true \N +col_float FLOAT Yes true \N +col_double DOUBLE Yes true \N +col_boolean BOOLEAN Yes true \N +col_string TEXT Yes true \N +col_char CHAR(10) Yes true \N +col_varchar VARCHAR(255) Yes true \N +col_date DATE Yes true \N +col_timestamp DATETIME(6) Yes true \N +col_decimal DECIMAL(5, 1) Yes true \N + +-- !show -- +-1.0 -400.0 -20.0 29496729.6 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 0.0 +-1.0 -400.0 -20.0 29496729.6 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 0.0 +-1.0 -400.0 -20.0 29496729.6 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 0.0 +-1.0 -400.0 -20.0 29496729.6 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 0.0 +-1.0 -400.0 -20.0 29496729.6 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 0.0 +-1.0 -400.0 -20.0 29496729.6 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 0.0 +-1.0 -400.0 -20.0 29496729.6 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 0.0 + +-- !order -- +-1.0 +-1.0 +-1.0 + +-- !order -- +-400.0 +-400.0 +-400.0 + +-- !order -- +-20.0 +-20.0 +-20.0 + +-- !order -- +-153960755.2 +-153960755.2 +-153960755.2 + +-- !order -- +10.5 +10.5 +10.5 + +-- !order -- +20.75 +20.75 +20.75 + +-- !order -- +false +false +false + +-- !order -- +Fifth +Fifth +Fifth + +-- !order -- +A +A +A + +-- !order -- +ADC +ADC +ADC + +-- !order -- +2023-10-06 +2023-10-06 
+2023-10-06 + +-- !order -- +2023-10-06T14:30 +2023-10-06T14:30 +2023-10-06T14:30 + +-- !order -- +0.0 +0.0 +0.0 + diff --git a/regression-test/suites/external_table_p2/hive/test_hive_parquet_alter_column.groovy b/regression-test/suites/external_table_p2/hive/test_hive_parquet_alter_column.groovy new file mode 100644 index 00000000000000..3583d0bec10217 --- /dev/null +++ b/regression-test/suites/external_table_p2/hive/test_hive_parquet_alter_column.groovy @@ -0,0 +1,58 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +suite("test_hive_parquet_alter_column", "p2,external,hive,external_remote,external_remote_hive") { + String enabled = context.config.otherConfigs.get("enableExternalHiveTest") + if (enabled != null && enabled.equalsIgnoreCase("true")) { + String extHiveHmsHost = context.config.otherConfigs.get("extHiveHmsHost") + String extHiveHmsPort = context.config.otherConfigs.get("extHiveHmsPort") + String hms_port = context.config.otherConfigs.get("hms_port") + + String catalog_name = "test_hive_parquet_alter_column" + sql """drop catalog if exists ${catalog_name};""" + sql """ + create catalog if not exists ${catalog_name} properties ( + 'type'='hms', + 'hadoop.username' = 'hadoop', + 'hive.metastore.uris' = 'thrift://${extHiveHmsHost}:${extHiveHmsPort}' + ); + """ + logger.info("catalog " + catalog_name + " created") + sql """switch ${catalog_name};""" + logger.info("switched to catalog " + catalog_name) + String Orderby = """ order by col_int,col_smallint,col_tinyint,col_bigint,col_float,col_double,col_boolean,col_string,col_char,col_varchar,col_decimal,col_date,col_timestamp limit 7 """ + + sql """ use multi_catalog """ + + + + types = ["int","smallint","tinyint","bigint","float","double","boolean","string","char","varchar","date","timestamp","decimal"] + + for( String type1 in types) { + qt_desc """ desc parquet_alter_column_to_${type1} ; """ + + qt_show """ select * from parquet_alter_column_to_${type1} ${Orderby} """ + + for( String type2 in types) { + + qt_order """ select col_${type2} from parquet_alter_column_to_${type1} order by col_${type2} limit 3 """ + + } + } + + } +} From 8cad6140a6166010c6935da7f2a6a2c3d627fb83 Mon Sep 17 00:00:00 2001 From: changyuwei <2017501503@qq.com> Date: Sun, 15 Oct 2023 14:49:34 +0800 Subject: [PATCH 12/21] fix P0 --- .../format/parquet/vparquet_group_reader.cpp | 7 +++---- .../vec/exec/format/parquet/vparquet_reader.cpp | 17 +++++++++++++++++ be/src/vec/exec/scan/scanner_scheduler.cpp | 2 +- 3 files changed, 21 insertions(+), 5 
deletions(-) diff --git a/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp b/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp index 193f6b298665a3..9d8513a68e9002 100644 --- a/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp +++ b/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp @@ -129,7 +129,7 @@ Status RowGroupReader::init( std::unique_ptr reader; RETURN_IF_ERROR(ParquetColumnReader::create(_file_reader, field, _row_group_meta, _read_ranges, _ctz, _io_ctx, reader, - max_buf_size)); //create column reader ..... + max_buf_size)); if (reader == nullptr) { VLOG_DEBUG << "Init row group(" << _row_group_id << ") reader failed"; return Status::Corruption("Init row group reader failed"); @@ -175,7 +175,8 @@ Status RowGroupReader::init( bool RowGroupReader::_can_filter_by_dict(int slot_id, const tparquet::ColumnMetaData& column_metadata) { - if (column_metadata.type != tparquet::Type::BYTE_ARRAY) { + if (column_metadata.encodings[0] != tparquet::Encoding::RLE_DICTIONARY || + column_metadata.type != tparquet::Type::BYTE_ARRAY) { return false; } @@ -328,9 +329,7 @@ Status RowGroupReader::next_batch(Block* block, size_t batch_size, size_t* read_ bool can_filter_all = false; RETURN_IF_ERROR_OR_CATCH_EXCEPTION(VExprContext::execute_conjuncts( _filter_conjuncts, &filters, block, &result_filter, &can_filter_all)); - // => select col where col = '1' => col1 ,converted col , '1' col - //filter all data if (can_filter_all) { for (auto& col : columns_to_filter) { std::move(*block->get_by_position(col).column).assume_mutable()->clear(); diff --git a/be/src/vec/exec/format/parquet/vparquet_reader.cpp b/be/src/vec/exec/format/parquet/vparquet_reader.cpp index 41e2cef0866638..37b3ec5fec18a8 100644 --- a/be/src/vec/exec/format/parquet/vparquet_reader.cpp +++ b/be/src/vec/exec/format/parquet/vparquet_reader.cpp @@ -524,6 +524,23 @@ Status ParquetReader::get_next_block(Block* block, size_t* read_rows, bool* eof) } } DCHECK(_current_group_reader != nullptr); 
+ if (_push_down_agg_type == TPushAggOp::type::COUNT) { + auto rows = std::min(_current_group_reader->get_remaining_rows(), (int64_t)_batch_size); + + _current_group_reader->set_remaining_rows(_current_group_reader->get_remaining_rows() - + rows); + + for (auto& col : block->mutate_columns()) { + col->resize(rows); + } + + *read_rows = rows; + if (_current_group_reader->get_remaining_rows() == 0) { + _current_group_reader.reset(nullptr); + } + + return Status::OK(); + } SCOPED_RAW_TIMER(&_statistics.column_read_time); Status batch_st = diff --git a/be/src/vec/exec/scan/scanner_scheduler.cpp b/be/src/vec/exec/scan/scanner_scheduler.cpp index 647b5a103229c4..3a7b7759bf9533 100644 --- a/be/src/vec/exec/scan/scanner_scheduler.cpp +++ b/be/src/vec/exec/scan/scanner_scheduler.cpp @@ -391,7 +391,7 @@ void ScannerScheduler::_scanner_scan(ScannerScheduler* scheduler, ScannerContext BlockUPtr block = ctx->get_free_block(); - status = scanner->get_block(state, block.get(), &eos); //init reader ,read data + status = scanner->get_block(state, block.get(), &eos); VLOG_ROW << "VScanNode input rows: " << block->rows() << ", eos: " << eos; // The VFileScanner for external table may try to open not exist files, // Because FE file cache for external table may out of date. 
From a6d0f86bb44dcf294e5012bbd4b7451a2c8a9a41 Mon Sep 17 00:00:00 2001 From: changyuwei <2017501503@qq.com> Date: Mon, 16 Oct 2023 01:09:44 +0800 Subject: [PATCH 13/21] fix p0 --- .../format/parquet/delta_bit_pack_decoder.h | 3 +- .../parquet/fix_length_dict_decoder.hpp | 48 ++++++++++++++ .../format/parquet/parquet_column_convert.cpp | 2 + .../format/parquet/parquet_column_convert.h | 64 ++++++++++--------- .../format/parquet/vparquet_column_reader.cpp | 11 ++++ .../format/parquet/vparquet_group_reader.cpp | 4 ++ 6 files changed, 101 insertions(+), 31 deletions(-) diff --git a/be/src/vec/exec/format/parquet/delta_bit_pack_decoder.h b/be/src/vec/exec/format/parquet/delta_bit_pack_decoder.h index 6f893d5db80fa4..ff615aeb969412 100644 --- a/be/src/vec/exec/format/parquet/delta_bit_pack_decoder.h +++ b/be/src/vec/exec/format/parquet/delta_bit_pack_decoder.h @@ -54,8 +54,7 @@ class DeltaDecoder : public Decoder { template Status decode_byte_array(const std::vector& decoded_vals, MutableColumnPtr& doris_column, DataTypePtr& data_type, ColumnSelectVector& select_vector) { - if constexpr (PhysicalType == tparquet::Type::BYTE_ARRAY && - PhysicalType == tparquet::Type::FIXED_LEN_BYTE_ARRAY) { + if constexpr (PhysicalType == tparquet::Type::BYTE_ARRAY) { ColumnSelectVector::DataReadType read_type; while (size_t run_length = select_vector.get_next_run(&read_type)) { switch (read_type) { diff --git a/be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp b/be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp index 0f8b5cbf04ad2d..79eb0a23069e8c 100644 --- a/be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp +++ b/be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp @@ -201,6 +201,54 @@ class FixLengthDictDecoder final : public return Status::OK(); } + Status set_dict(std::unique_ptr& dict, int32_t length, size_t num_values) override { + if (num_values * _type_length != length) { + return Status::Corruption("Wrong dictionary data for fixed length type"); 
+ } + _dict = std::move(dict); + char* dict_item_address = reinterpret_cast(_dict.get()); + _dict_items.resize(num_values); + _dict_value_to_code.reserve(num_values); + for (size_t i = 0; i < num_values; ++i) { + _dict_items[i] = dict_item_address; + _dict_value_to_code[StringRef(_dict_items[i], _type_length)] = i; + dict_item_address += _type_length; + } + return Status::OK(); + } + + Status read_dict_values_to_column(MutableColumnPtr& doris_column) override { + size_t dict_items_size = _dict_items.size(); + std::vector dict_values(dict_items_size); + for (size_t i = 0; i < dict_items_size; ++i) { + dict_values.emplace_back(_dict_items[i], _type_length); + } + doris_column->insert_many_strings(&dict_values[0], dict_items_size); + return Status::OK(); + } + + Status get_dict_codes(const ColumnString* string_column, + std::vector* dict_codes) override { + size_t size = string_column->size(); + dict_codes->reserve(size); + for (int i = 0; i < size; ++i) { + StringRef dict_value = string_column->get_data_at(i); + dict_codes->emplace_back(_dict_value_to_code[dict_value]); + } + return Status::OK(); + } + + MutableColumnPtr convert_dict_column_to_string_column(const ColumnInt32* dict_column) override { + auto res = ColumnString::create(); + std::vector dict_values(dict_column->size()); + const auto& data = dict_column->get_data(); + for (size_t i = 0; i < dict_column->size(); ++i) { + dict_values.emplace_back(_dict_items[data[i]], _type_length); + } + res->insert_many_strings(&dict_values[0], dict_values.size()); + return res; + } + std::unordered_map _dict_value_to_code; // For dictionary encoding std::vector _dict_items; }; diff --git a/be/src/vec/exec/format/parquet/parquet_column_convert.cpp b/be/src/vec/exec/format/parquet/parquet_column_convert.cpp index 11dbde5e395284..fb25ee1817e526 100644 --- a/be/src/vec/exec/format/parquet/parquet_column_convert.cpp +++ b/be/src/vec/exec/format/parquet/parquet_column_convert.cpp @@ -17,6 +17,8 @@ #include 
"vec/exec/format/parquet/parquet_column_convert.h" +#include + namespace doris::vectorized { namespace ParquetConvert { const cctz::time_zone ConvertParams::utc0 = cctz::utc_time_zone(); diff --git a/be/src/vec/exec/format/parquet/parquet_column_convert.h b/be/src/vec/exec/format/parquet/parquet_column_convert.h index c114360b082a65..c07a96c1f16e84 100644 --- a/be/src/vec/exec/format/parquet/parquet_column_convert.h +++ b/be/src/vec/exec/format/parquet/parquet_column_convert.h @@ -121,6 +121,7 @@ struct ConvertParams { int64_t scale_to_nano_factor = 1; DecimalScaleParams decimal_scale; FieldSchema* field_schema = nullptr; + size_t start_idx = 0; void init(FieldSchema* field_schema_, cctz::time_zone* ctz_) { field_schema = field_schema_; @@ -196,6 +197,23 @@ struct ColumnConvert { virtual ~ColumnConvert() = default; + void convert_null(const IColumn** src_col, IColumn** dst_col) const { + size_t rows = (*src_col)->size(); + if ((*src_col)->is_nullable()) { + auto src_nullable_column = static_cast(*src_col); + auto dst_nullable_column = static_cast(*dst_col); + auto& src_null_data = src_nullable_column->get_null_map_column().get_data(); + dst_nullable_column->get_null_map_column().resize(_convert_params->start_idx + rows); + auto& dst_null_data = dst_nullable_column->get_null_map_column().get_data(); + for (auto j = 0; j < rows; j++) { + dst_null_data[_convert_params->start_idx + j] = src_null_data[j]; + } + + *src_col = &src_nullable_column->get_nested_column(); + *dst_col = &dst_nullable_column->get_nested_column(); + } + } + public: ConvertParams* _convert_params; }; @@ -205,23 +223,6 @@ struct NumberColumnConvert : public ColumnConvert { Status convert(const IColumn* src_col, IColumn* dst_col) override; }; -inline void convert_null(const IColumn** src_col, IColumn** dst_col) { - size_t rows = (*src_col)->size(); - if ((*src_col)->is_nullable()) { - auto src_nullable_column = static_cast(*src_col); - auto dst_nullable_column = static_cast(*dst_col); - auto& 
src_null_data = src_nullable_column->get_null_map_column().get_data(); - dst_nullable_column->get_null_map_column().resize(rows); - auto& dst_null_data = dst_nullable_column->get_null_map_column().get_data(); - for (auto j = 0; j < rows; j++) { - dst_null_data[j] = src_null_data[j]; - } - - *src_col = &src_nullable_column->get_nested_column(); - *dst_col = &dst_nullable_column->get_nested_column(); - } -} - template Status NumberColumnConvert::convert(const IColumn* src_col, IColumn* dst_col) { @@ -230,12 +231,12 @@ Status NumberColumnConvert::convert(const IColu convert_null(&src_col, &dst_col); } auto& src_data = static_cast*>(src_col)->get_data(); - dst_col->resize(rows); + dst_col->resize(_convert_params->start_idx + rows); auto& data = static_cast*>(dst_col)->get_data(); for (int i = 0; i < rows; i++) { dst_type value = static_cast(src_data[i]); - data[i] = value; + data[_convert_params->start_idx + i] = value; } return Status::OK(); @@ -274,14 +275,14 @@ struct int128totimestamp : public ColumnConvert { convert_null(&src_col, &dst_col); } auto& src_data = static_cast*>(src_col)->get_data(); - dst_col->resize(rows); + dst_col->resize(_convert_params->start_idx + rows); auto& data = static_cast*>(dst_col)->get_data(); for (int i = 0; i < rows; i++) { __int128 x = src_data[i]; uint32_t hi = x >> 64; uint64_t lo = (x << 64) >> 64; - auto& num = data[i]; + auto& num = data[_convert_params->start_idx + i]; auto& value = reinterpret_cast&>(num); int64_t micros = to_timestamp_micros(hi, lo); value.from_unixtime(micros / 1000000, *_convert_params->ctz); @@ -300,13 +301,14 @@ struct int64totimestamp : public ColumnConvert { if constexpr (is_nullable) { convert_null(&src_col, &dst_col); } - dst_col->resize(rows); + dst_col->resize(_convert_params->start_idx + rows); + auto& src_data = static_cast*>(src_col)->get_data(); auto& data = static_cast*>(dst_col)->get_data(); for (int i = 0; i < rows; i++) { int64 x = src_data[i]; dst_col = static_cast*>(dst_col); - auto& num = 
data[i]; + auto& num = data[_convert_params->start_idx + i]; auto& value = reinterpret_cast&>(num); value.from_unixtime(x / _convert_params->second_mask, *_convert_params->ctz); value.set_microsecond((x % _convert_params->second_mask) * @@ -325,13 +327,15 @@ class int32todate : public ColumnConvert { if constexpr (is_nullable) { convert_null(&src_col, &dst_col); } - dst_col->resize(rows); + dst_col->resize(_convert_params->start_idx + rows); + auto& src_data = static_cast*>(src_col)->get_data(); auto& data = static_cast(dst_col)->get_data(); date_day_offset_dict& date_dict = date_day_offset_dict::get(); for (int i = 0; i < rows; i++) { - auto& value = reinterpret_cast&>(data[i]); + auto& value = reinterpret_cast&>( + data[_convert_params->start_idx + i]); int64_t date_value = (int64_t)src_data[i] + _convert_params->offset_days; value = date_dict[date_value]; std::cout << "src_data[i] = " << src_data[i] << "datav2 value =" << value << "\n"; @@ -353,7 +357,8 @@ class stringtodecimal : public ColumnConvert { DecimalScaleParams& scale_params = _convert_params->decimal_scale; auto buf = static_cast(src_col)->get_chars().data(); auto& offset = static_cast(src_col)->get_offsets(); - dst_col->resize(rows); + dst_col->resize(_convert_params->start_idx + rows); + auto& data = static_cast*>(dst_col)->get_data(); for (int i = 0; i < rows; i++) { int len = offset[i] - offset[i - 1]; @@ -373,7 +378,7 @@ class stringtodecimal : public ColumnConvert { LOG(FATAL) << "__builtin_unreachable"; __builtin_unreachable(); } - auto& v = reinterpret_cast(data[i]); + auto& v = reinterpret_cast(data[_convert_params->start_idx + i]); v = (DecimalType)value; } @@ -390,7 +395,8 @@ class numbertodecimal : public ColumnConvert { convert_null(&src_col, &dst_col); } auto* src_data = static_cast*>(src_col)->get_data().data(); - dst_col->resize(rows); + dst_col->resize(_convert_params->start_idx + rows); + DecimalScaleParams& scale_params = _convert_params->decimal_scale; auto* data = 
static_cast>*>(dst_col) ->get_data() @@ -403,7 +409,7 @@ class numbertodecimal : public ColumnConvert { } else if constexpr (ScaleType == DecimalScaleParams::SCALE_DOWN) { value /= scale_params.scale_factor; } - data[i] = (DecimalPhysicalType)value; + data[_convert_params->start_idx + i] = (DecimalPhysicalType)value; } return Status::OK(); } diff --git a/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp b/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp index dd893de8fa292c..05e02a4129c64c 100644 --- a/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp +++ b/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp @@ -547,6 +547,16 @@ Status ScalarColumnReader::read_column_data(ColumnPtr& doris_column, DataTypePtr if (!_chunk_reader->has_next_page()) { *eof = true; } + if (need_convert) { + std::unique_ptr converter; + ParquetConvert::ConvertParams convert_params; + convert_params.init(_field_schema, _ctz); + RETURN_IF_ERROR(ParquetConvert::get_converter( + src_type, _field_schema->type.type, type, &converter, &convert_params)); + RETURN_IF_ERROR(converter->convert(src_column, + const_cast(doris_column.get()))); + } + return Status::OK(); } skip_whole_batch = @@ -588,6 +598,7 @@ Status ScalarColumnReader::read_column_data(ColumnPtr& doris_column, DataTypePtr std::unique_ptr converter; ParquetConvert::ConvertParams convert_params; convert_params.init(_field_schema, _ctz); + convert_params.start_idx = doris_column->size(); RETURN_IF_ERROR(ParquetConvert::get_converter(src_type, _field_schema->type.type, type, &converter, &convert_params)); RETURN_IF_ERROR(converter->convert(src_column, const_cast(doris_column.get()))); diff --git a/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp b/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp index 9d8513a68e9002..98b435903558da 100644 --- a/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp +++ b/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp @@ -415,8 +415,12 @@ Status 
RowGroupReader::_read_column_data(Block* block, const std::vector Date: Mon, 16 Oct 2023 15:32:30 +0800 Subject: [PATCH 14/21] fix p0 --- .../format/parquet/fix_length_dict_decoder.hpp | 8 +++++++- .../exec/format/parquet/fix_length_plain_decoder.h | 10 +++++++--- .../exec/format/parquet/parquet_column_convert.cpp | 7 ++++++- .../exec/format/parquet/parquet_column_convert.h | 14 +------------- be/src/vec/exec/format/parquet/parquet_common.h | 3 --- .../parquet/vparquet_column_chunk_reader.cpp | 1 - .../exec/format/parquet/vparquet_column_reader.cpp | 1 - .../exec/format/parquet/vparquet_group_reader.cpp | 4 +--- 8 files changed, 22 insertions(+), 26 deletions(-) diff --git a/be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp b/be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp index 79eb0a23069e8c..2e35186cae2ded 100644 --- a/be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp +++ b/be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp @@ -83,7 +83,13 @@ class FixLengthDictDecoder final : public BaseDictDecoder { char* dict_item_address = reinterpret_cast(_dict.get()); _dict_items.resize(num_values); for (size_t i = 0; i < num_values; ++i) { - _dict_items[i] = *(DataType*)dict_item_address; + if (PhysicalType == tparquet::Type::INT96) { + ParquetInt96 value = *(ParquetInt96*)dict_item_address; + _dict_items[i] = value.to_int128(); + + } else { + _dict_items[i] = *(DataType*)dict_item_address; + } dict_item_address += _type_length; } return Status::OK(); diff --git a/be/src/vec/exec/format/parquet/fix_length_plain_decoder.h b/be/src/vec/exec/format/parquet/fix_length_plain_decoder.h index 0cc198f8ad5e12..5d62b14f1b73ec 100644 --- a/be/src/vec/exec/format/parquet/fix_length_plain_decoder.h +++ b/be/src/vec/exec/format/parquet/fix_length_plain_decoder.h @@ -24,7 +24,7 @@ #include "vec/data_types/data_type.h" #include "vec/exec/format/parquet/decoder.h" #include "vec/exec/format/parquet/parquet_column_convert.h" - +#include 
"vec/exec/format/parquet/parquet_common.h" namespace doris { namespace vectorized { class ColumnSelectVector; @@ -150,7 +150,12 @@ Status FixLengthPlainDecoder::_decode_numeric(MutableColumnPtr& do case ColumnSelectVector::CONTENT: { for (size_t i = 0; i < run_length; ++i) { char* buf_start = _data->data + _offset; - column_data[data_index++] = *(DataType*)buf_start; + if constexpr (PhysicalType != tparquet::Type::INT96) { + column_data[data_index++] = *(DataType*)buf_start; + } else { + ParquetInt96 value = *(ParquetInt96*)buf_start; + column_data[data_index++] = value.to_int128(); + } _offset += _type_length; } break; @@ -172,5 +177,4 @@ Status FixLengthPlainDecoder::_decode_numeric(MutableColumnPtr& do return Status::OK(); } } - } // namespace doris::vectorized diff --git a/be/src/vec/exec/format/parquet/parquet_column_convert.cpp b/be/src/vec/exec/format/parquet/parquet_column_convert.cpp index fb25ee1817e526..e6a0c7be54e0f0 100644 --- a/be/src/vec/exec/format/parquet/parquet_column_convert.cpp +++ b/be/src/vec/exec/format/parquet/parquet_column_convert.cpp @@ -26,7 +26,6 @@ const cctz::time_zone ConvertParams::utc0 = cctz::utc_time_zone(); Status convert_data_type_from_parquet(tparquet::Type::type parquet_type, PrimitiveType show_type, vectorized::DataTypePtr& ans_data_type, DataTypePtr& src_type, bool* need_convert) { - std::cout << getTypeName(src_type->get_type_id()) << "\n"; if (is_complex_type(src_type)) { *need_convert = false; return Status::OK(); @@ -58,9 +57,15 @@ Status convert_data_type_from_parquet(tparquet::Type::type parquet_type, Primiti return Status::IOError("Can't read parquet type : {}", parquet_type); } if (ans_data_type->get_type_id() == src_type->get_type_id()) { + if (ans_data_type->get_type_id() == TypeIndex::String && + show_type == PrimitiveType::TYPE_DECIMAL64) { + *need_convert = true; + return Status::OK(); + } *need_convert = false; return Status::OK(); } + if (src_type->is_nullable()) { auto& nested_src_type = 
static_cast(src_type.get())->get_nested_type(); diff --git a/be/src/vec/exec/format/parquet/parquet_column_convert.h b/be/src/vec/exec/format/parquet/parquet_column_convert.h index c07a96c1f16e84..9a177f975edd0e 100644 --- a/be/src/vec/exec/format/parquet/parquet_column_convert.h +++ b/be/src/vec/exec/format/parquet/parquet_column_convert.h @@ -43,7 +43,6 @@ #include "vec/exec/format/format_common.h" #include "vec/exec/format/parquet/decoder.h" #include "vec/exec/format/parquet/parquet_common.h" - namespace doris::vectorized { namespace ParquetConvert { @@ -287,7 +286,6 @@ struct int128totimestamp : public ColumnConvert { int64_t micros = to_timestamp_micros(hi, lo); value.from_unixtime(micros / 1000000, *_convert_params->ctz); value.set_microsecond(micros % 1000000); - std::cout << "value = " << value << "\n"; } return Status::OK(); } @@ -313,7 +311,6 @@ struct int64totimestamp : public ColumnConvert { value.from_unixtime(x / _convert_params->second_mask, *_convert_params->ctz); value.set_microsecond((x % _convert_params->second_mask) * _convert_params->scale_to_nano_factor / 1000); - std::cout << "value = " << value << "\n"; } return Status::OK(); } @@ -338,9 +335,7 @@ class int32todate : public ColumnConvert { data[_convert_params->start_idx + i]); int64_t date_value = (int64_t)src_data[i] + _convert_params->offset_days; value = date_dict[date_value]; - std::cout << "src_data[i] = " << src_data[i] << "datav2 value =" << value << "\n"; } - std::cout << rows << "\n"; return Status::OK(); } @@ -364,7 +359,7 @@ class stringtodecimal : public ColumnConvert { int len = offset[i] - offset[i - 1]; // When Decimal in parquet is stored in byte arrays, binary and fixed, // the unscaled number must be encoded as two's complement using big-endian byte order. 
- typename DecimalType::NativeType value = 0; + Int128 value = 0; memcpy(reinterpret_cast(&value), buf + offset[i - 1], len); value = BitUtil::big_endian_to_host(value); value = value >> ((sizeof(value) - len) * 8); @@ -435,10 +430,8 @@ class stringtodecimalstring : public ColumnConvert { memcpy(reinterpret_cast(&value), buf + offset[i - 1], len); value = BitUtil::big_endian_to_host(value); value = value >> ((sizeof(value) - len) * 8); - std::cout << "ans =" << value << "\n"; std::string ans = reinterpret_cast(value).to_string( _convert_params->field_schema->parquet_schema.scale); - std::cout << "ans = " << ans << "\n"; data->insert_data(ans.data(), ans.size()); } return Status::OK(); @@ -470,8 +463,6 @@ class int128totimestampstring : public ColumnConvert { buf.resize(20); char* end = value.to_string(buf.data()); data->insert_data(buf.data(), end - buf.data()); - - std::cout << "value = " << value << "\n"; } return Status::OK(); @@ -486,7 +477,6 @@ inline Status get_converter_impl(std::shared_ptr src_data_type, ConvertParams* convert_params) { auto src_type = src_data_type->get_type_id(); auto dst_type = dst_data_type->get_type_id(); - std::cout << getTypeName(src_type) << " -> " << getTypeName(dst_type) << "\n"; switch (dst_type) { #define DISPATCH(NUMERIC_TYPE, CPP_NUMERIC_TYPE, PHYSICAL_TYPE) \ case NUMERIC_TYPE: \ @@ -556,8 +546,6 @@ inline Status get_converter_impl(std::shared_ptr src_data_type, *converter = std::make_unique>(); } else if (src_type == TypeIndex::Int64) { *converter = std::make_unique>(); - } else { - std::cout << "src_type = " << getTypeName(src_type) << "\n"; } break; #define DISPATCH2(TypeIndex_DECIMAL_TYPE, DECIMAL_TYPE, PRIMARY_TYPE) \ diff --git a/be/src/vec/exec/format/parquet/parquet_common.h b/be/src/vec/exec/format/parquet/parquet_common.h index 424e398887244e..6667ab2c101637 100644 --- a/be/src/vec/exec/format/parquet/parquet_common.h +++ b/be/src/vec/exec/format/parquet/parquet_common.h @@ -56,9 +56,6 @@ struct ParquetInt96 { } 
inline __int128 to_int128() const { __int128 ans = 0; - std::cout << "before " - "hi = " - << hi << " lo = " << lo << "\n"; ans = (((__int128)hi) << 64) + lo; return ans; } diff --git a/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.cpp b/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.cpp index d9fe16158e1f46..86fbba8b25a30c 100644 --- a/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.cpp +++ b/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.cpp @@ -190,7 +190,6 @@ Status ColumnChunkReader::load_page_data() { _page_decoder = _decoders[static_cast(encoding)].get(); } else { std::unique_ptr page_decoder; - std::cout << "type = " << _metadata.type << " " << encoding << "\n"; RETURN_IF_ERROR(Decoder::get_decoder(_metadata.type, encoding, page_decoder)); // Set type length page_decoder->set_type_length(_get_type_length()); diff --git a/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp b/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp index 05e02a4129c64c..0b07606105efd7 100644 --- a/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp +++ b/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp @@ -487,7 +487,6 @@ Status ScalarColumnReader::read_column_data(ColumnPtr& doris_column, DataTypePtr RETURN_IF_ERROR(ParquetConvert::convert_data_type_from_parquet( physical_type, _field_schema->type.type, src_type, type, &need_convert)); - std::cout << "need_convert = " << need_convert << "\n"; //this->_field_schema->type.type ColumnPtr src_column = doris_column; if (need_convert) { diff --git a/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp b/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp index 98b435903558da..2c23ed100ec832 100644 --- a/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp +++ b/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp @@ -415,11 +415,9 @@ Status RowGroupReader::_read_column_data(Block* block, const std::vector Date: Mon, 16 Oct 2023 21:10:23 
+0800 Subject: [PATCH 15/21] fix p0 --- .../format/parquet/vparquet_column_reader.cpp | 3 +- .../parquet_scanner/dict-decoder.txt | 32 ++-- .../parquet_scanner/type-decoder.txt | 28 ++-- .../vec/exec/parquet/parquet_thrift_test.cpp | 157 ++++-------------- 4 files changed, 66 insertions(+), 154 deletions(-) diff --git a/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp b/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp index 0b07606105efd7..0a56352b9e7cb3 100644 --- a/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp +++ b/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp @@ -487,7 +487,7 @@ Status ScalarColumnReader::read_column_data(ColumnPtr& doris_column, DataTypePtr RETURN_IF_ERROR(ParquetConvert::convert_data_type_from_parquet( physical_type, _field_schema->type.type, src_type, type, &need_convert)); - //this->_field_schema->type.type + ColumnPtr src_column = doris_column; if (need_convert) { src_column = src_type->create_column(); @@ -550,6 +550,7 @@ Status ScalarColumnReader::read_column_data(ColumnPtr& doris_column, DataTypePtr std::unique_ptr converter; ParquetConvert::ConvertParams convert_params; convert_params.init(_field_schema, _ctz); + convert_params.start_idx = doris_column->size(); RETURN_IF_ERROR(ParquetConvert::get_converter( src_type, _field_schema->type.type, type, &converter, &convert_params)); RETURN_IF_ERROR(converter->convert(src_column, diff --git a/be/test/exec/test_data/parquet_scanner/dict-decoder.txt b/be/test/exec/test_data/parquet_scanner/dict-decoder.txt index 35414043ed2858..6dd9a5dfb4b806 100644 --- a/be/test/exec/test_data/parquet_scanner/dict-decoder.txt +++ b/be/test/exec/test_data/parquet_scanner/dict-decoder.txt @@ -1,16 +1,16 @@ 
-+---------------------------+-----------------------------+------------------------+---------------------------+----------------------------+----------------------------+-----------------------------+----------------------------+----------------------------+---------------------------------+-------------------------------------+--------------------------+-----------------------------+------------------------+-----------------------------+--------------------------------------+ -|tinyint_col(Nullable(Int8))|smallint_col(Nullable(Int16))|int_col(Nullable(Int32))|bigint_col(Nullable(Int64))|boolean_col(Nullable(UInt8))|float_col(Nullable(Float32))|double_col(Nullable(Float64))|string_col(Nullable(String))|binary_col(Nullable(String))|timestamp_col(Nullable(DateTime))|decimal_col(Nullable(Decimal(27, 9)))|char_col(Nullable(String))|varchar_col(Nullable(String))|date_col(Nullable(Date))|date_v2_col(Nullable(DateV2))|timestamp_v2_col(Nullable(DateTimeV2))| -+---------------------------+-----------------------------+------------------------+---------------------------+----------------------------+----------------------------+-----------------------------+----------------------------+----------------------------+---------------------------------+-------------------------------------+--------------------------+-----------------------------+------------------------+-----------------------------+--------------------------------------+ -| -1| -1| -1| -1| 0| -1.14| -1.14| s-row0| b-row0| 2022-08-01 07:23:17| -1.140000000| c-row0| vc-row0| 2022-08-01| 2022-08-01| 2022-08-01 07:23:17| -| -1| -1| -1| -1| 0| -1.14| -1.14| s-row0| b-row0| 2022-08-01 07:23:17| -1.140000000| c-row0| vc-row0| 2022-08-01| 2022-08-01| 2022-08-01 07:23:17| -| -1| -1| -1| -1| 0| -1.14| -1.14| s-row0| b-row0| 2022-08-01 07:23:17| -1.140000000| c-row0| vc-row0| 2022-08-01| 2022-08-01| 2022-08-01 07:23:17| -| -1| -1| -1| -1| 0| -1.14| -1.14| s-row0| b-row0| 2022-08-01 07:23:17| -1.140000000| c-row0| vc-row0| 
2022-08-01| 2022-08-01| 2022-08-01 07:23:17| -| -1| -1| -1| -1| 0| -1.14| -1.14| s-row0| b-row0| 2022-08-01 07:23:17| -1.140000000| c-row0| vc-row0| 2022-08-01| 2022-08-01| 2022-08-01 07:23:17| -| -1| -1| -1| -1| 0| -1.14| -1.14| s-row0| b-row0| 2022-08-01 07:23:17| -1.140000000| c-row0| vc-row0| 2022-08-01| 2022-08-01| 2022-08-01 07:23:17| -| -1| -1| -1| -1| 0| -1.14| -1.14| s-row0| b-row0| 2022-08-01 07:23:17| -1.140000000| c-row0| vc-row0| 2022-08-01| 2022-08-01| 2022-08-01 07:23:17| -| -1| -1| -1| -1| 0| -1.14| -1.14| s-row0| b-row0| 2022-08-01 07:23:17| -1.140000000| c-row0| vc-row0| 2022-08-01| 2022-08-01| 2022-08-01 07:23:17| -| -1| -1| -1| -1| 0| -1.14| -1.14| s-row0| b-row0| 2022-08-01 07:23:17| -1.140000000| c-row0| vc-row0| 2022-08-01| 2022-08-01| 2022-08-01 07:23:17| -| -1| -1| -1| -1| 0| -1.14| -1.14| s-row0| b-row0| 2022-08-01 07:23:17| -1.140000000| c-row0| vc-row0| 2022-08-01| 2022-08-01| 2022-08-01 07:23:17| -| -1| -1| -1| -1| 0| -1.14| -1.14| s-row0| b-row0| 2022-08-01 07:23:17| -1.140000000| c-row0| vc-row0| 2022-08-01| 2022-08-01| 2022-08-01 07:23:17| -| -1| -1| -1| -1| 0| -1.14| -1.14| s-row0| b-row0| 2022-08-01 07:23:17| -1.140000000| c-row0| vc-row0| 2022-08-01| 2022-08-01| 2022-08-01 07:23:17| -+---------------------------+-----------------------------+------------------------+---------------------------+----------------------------+----------------------------+-----------------------------+----------------------------+----------------------------+---------------------------------+-------------------------------------+--------------------------+-----------------------------+------------------------+-----------------------------+--------------------------------------+ 
++---------------------------+-----------------------------+------------------------+---------------------------+----------------------------+----------------------------+-----------------------------+----------------------------+----------------------------+-----------------------------------+-------------------------------------+--------------------------+-----------------------------+--------------------------+-----------------------------+--------------------------------------+ +|tinyint_col(Nullable(Int8))|smallint_col(Nullable(Int16))|int_col(Nullable(Int32))|bigint_col(Nullable(Int64))|boolean_col(Nullable(UInt8))|float_col(Nullable(Float32))|double_col(Nullable(Float64))|string_col(Nullable(String))|binary_col(Nullable(String))|timestamp_col(Nullable(DateTimeV2))|decimal_col(Nullable(Decimal(27, 9)))|char_col(Nullable(String))|varchar_col(Nullable(String))|date_col(Nullable(DateV2))|date_v2_col(Nullable(DateV2))|timestamp_v2_col(Nullable(DateTimeV2))| ++---------------------------+-----------------------------+------------------------+---------------------------+----------------------------+----------------------------+-----------------------------+----------------------------+----------------------------+-----------------------------------+-------------------------------------+--------------------------+-----------------------------+--------------------------+-----------------------------+--------------------------------------+ +| -1| -1| -1| -1| 0| -1.14| -1.14| s-row0| b-row0| 2022-08-01 07:23:17.000000| -1.140000000| c-row0| vc-row0| 2022-08-01| 2022-08-01| 2022-08-01 07:23:17| +| -1| -1| -1| -1| 0| -1.14| -1.14| s-row0| b-row0| 2022-08-01 07:23:17.000000| -1.140000000| c-row0| vc-row0| 2022-08-01| 2022-08-01| 2022-08-01 07:23:17| +| -1| -1| -1| -1| 0| -1.14| -1.14| s-row0| b-row0| 2022-08-01 07:23:17.000000| -1.140000000| c-row0| vc-row0| 2022-08-01| 2022-08-01| 2022-08-01 07:23:17| +| -1| -1| -1| -1| 0| -1.14| -1.14| s-row0| b-row0| 2022-08-01 
07:23:17.000000| -1.140000000| c-row0| vc-row0| 2022-08-01| 2022-08-01| 2022-08-01 07:23:17| +| -1| -1| -1| -1| 0| -1.14| -1.14| s-row0| b-row0| 2022-08-01 07:23:17.000000| -1.140000000| c-row0| vc-row0| 2022-08-01| 2022-08-01| 2022-08-01 07:23:17| +| -1| -1| -1| -1| 0| -1.14| -1.14| s-row0| b-row0| 2022-08-01 07:23:17.000000| -1.140000000| c-row0| vc-row0| 2022-08-01| 2022-08-01| 2022-08-01 07:23:17| +| -1| -1| -1| -1| 0| -1.14| -1.14| s-row0| b-row0| 2022-08-01 07:23:17.000000| -1.140000000| c-row0| vc-row0| 2022-08-01| 2022-08-01| 2022-08-01 07:23:17| +| -1| -1| -1| -1| 0| -1.14| -1.14| s-row0| b-row0| 2022-08-01 07:23:17.000000| -1.140000000| c-row0| vc-row0| 2022-08-01| 2022-08-01| 2022-08-01 07:23:17| +| -1| -1| -1| -1| 0| -1.14| -1.14| s-row0| b-row0| 2022-08-01 07:23:17.000000| -1.140000000| c-row0| vc-row0| 2022-08-01| 2022-08-01| 2022-08-01 07:23:17| +| -1| -1| -1| -1| 0| -1.14| -1.14| s-row0| b-row0| 2022-08-01 07:23:17.000000| -1.140000000| c-row0| vc-row0| 2022-08-01| 2022-08-01| 2022-08-01 07:23:17| +| -1| -1| -1| -1| 0| -1.14| -1.14| s-row0| b-row0| 2022-08-01 07:23:17.000000| -1.140000000| c-row0| vc-row0| 2022-08-01| 2022-08-01| 2022-08-01 07:23:17| +| -1| -1| -1| -1| 0| -1.14| -1.14| s-row0| b-row0| 2022-08-01 07:23:17.000000| -1.140000000| c-row0| vc-row0| 2022-08-01| 2022-08-01| 2022-08-01 07:23:17| ++---------------------------+-----------------------------+------------------------+---------------------------+----------------------------+----------------------------+-----------------------------+----------------------------+----------------------------+-----------------------------------+-------------------------------------+--------------------------+-----------------------------+--------------------------+-----------------------------+--------------------------------------+ diff --git a/be/test/exec/test_data/parquet_scanner/type-decoder.txt b/be/test/exec/test_data/parquet_scanner/type-decoder.txt index 6a2805d66157df..e56b5574b18f7d 100644 
--- a/be/test/exec/test_data/parquet_scanner/type-decoder.txt +++ b/be/test/exec/test_data/parquet_scanner/type-decoder.txt @@ -1,14 +1,14 @@ -+---------------------------+-----------------------------+------------------------+---------------------------+----------------------------+----------------------------+-----------------------------+----------------------------+----------------------------+---------------------------------+-------------------------------------+--------------------------+-----------------------------+------------------------+-----------------------------+--------------------------------------+ -|tinyint_col(Nullable(Int8))|smallint_col(Nullable(Int16))|int_col(Nullable(Int32))|bigint_col(Nullable(Int64))|boolean_col(Nullable(UInt8))|float_col(Nullable(Float32))|double_col(Nullable(Float64))|string_col(Nullable(String))|binary_col(Nullable(String))|timestamp_col(Nullable(DateTime))|decimal_col(Nullable(Decimal(27, 9)))|char_col(Nullable(String))|varchar_col(Nullable(String))|date_col(Nullable(Date))|date_v2_col(Nullable(DateV2))|timestamp_v2_col(Nullable(DateTimeV2))| -+---------------------------+-----------------------------+------------------------+---------------------------+----------------------------+----------------------------+-----------------------------+----------------------------+----------------------------+---------------------------------+-------------------------------------+--------------------------+-----------------------------+------------------------+-----------------------------+--------------------------------------+ -| -1| -1| -1| -1| 0| -1.14| -1.14| s-row0| b-row0| 2022-08-01 07:23:17| -1.140000000| c-row0| vc-row0| 2022-08-01| 2022-08-01| 2022-08-01 07:23:17| -| 2| 2| 2| 2| 1| 2.14| 2.14| NULL| b-row1| 2022-08-02 07:23:18| 2.140000000| c-row1| vc-row1| 2022-08-02| 2022-08-02| 2022-08-02 07:23:18| -| -3| -3| -3| -3| 0| -3.14| -3.14| s-row2| b-row2| 2022-08-03 07:23:19| -3.140000000| c-row2| vc-row2| 2022-08-03| 
2022-08-03| 2022-08-03 07:23:19| -| 4| 4| 4| 4| 1| 4.14| 4.14| NULL| b-row3| 2022-08-04 07:24:17| 4.140000000| c-row3| vc-row3| 2022-08-04| 2022-08-04| 2022-08-04 07:24:17| -| -5| -5| -5| -5| 0| -5.14| -5.14| s-row4| b-row4| 2022-08-05 07:25:17| -5.140000000| c-row4| vc-row4| 2022-08-05| 2022-08-05| 2022-08-05 07:25:17| -| 6| 6| 6| 6| 0| 6.14| 6.14| s-row5| b-row5| 2022-08-06 07:26:17| 6.140000000| c-row5| vc-row5| 2022-08-06| 2022-08-06| 2022-08-06 07:26:17| -| -7| -7| -7| -7| 1| -7.14| -7.14| s-row6| b-row6| 2022-08-07 07:27:17| -7.140000000| c-row6| vc-row6| 2022-08-07| 2022-08-07| 2022-08-07 07:27:17| -| 8| 8| 8| 8| 0| 8.14| 8.14| NULL| b-row7| 2022-08-08 07:28:17| 8.140000000| c-row7| vc-row7| 2022-08-08| 2022-08-08| 2022-08-08 07:28:17| -| -9| -9| -9| -9| 0| -9.14| -9.14| s-row8| b-row8| 2022-08-09 07:29:17| -9.140000000| c-row8| vc-row8| 2022-08-09| 2022-08-09| 2022-08-09 07:29:17| -| 10| 10| 10| 10| 0| 10.14| 10.14| s-row9| b-row9| 2022-08-10 07:21:17| 10.140000000| c-row9| vc-row9| 2022-08-10| 2022-08-10| 2022-08-10 07:21:17| -+---------------------------+-----------------------------+------------------------+---------------------------+----------------------------+----------------------------+-----------------------------+----------------------------+----------------------------+---------------------------------+-------------------------------------+--------------------------+-----------------------------+------------------------+-----------------------------+--------------------------------------+ 
++---------------------------+-----------------------------+------------------------+---------------------------+----------------------------+----------------------------+-----------------------------+----------------------------+----------------------------+-----------------------------------+-------------------------------------+--------------------------+-----------------------------+--------------------------+-----------------------------+--------------------------------------+ +|tinyint_col(Nullable(Int8))|smallint_col(Nullable(Int16))|int_col(Nullable(Int32))|bigint_col(Nullable(Int64))|boolean_col(Nullable(UInt8))|float_col(Nullable(Float32))|double_col(Nullable(Float64))|string_col(Nullable(String))|binary_col(Nullable(String))|timestamp_col(Nullable(DateTimeV2))|decimal_col(Nullable(Decimal(27, 9)))|char_col(Nullable(String))|varchar_col(Nullable(String))|date_col(Nullable(DateV2))|date_v2_col(Nullable(DateV2))|timestamp_v2_col(Nullable(DateTimeV2))| ++---------------------------+-----------------------------+------------------------+---------------------------+----------------------------+----------------------------+-----------------------------+----------------------------+----------------------------+-----------------------------------+-------------------------------------+--------------------------+-----------------------------+--------------------------+-----------------------------+--------------------------------------+ +| -1| -1| -1| -1| 0| -1.14| -1.14| s-row0| b-row0| 2022-08-01 07:23:17.000000| -1.140000000| c-row0| vc-row0| 2022-08-01| 2022-08-01| 2022-08-01 07:23:17| +| 2| 2| 2| 2| 1| 2.14| 2.14| NULL| b-row1| 2022-08-02 07:23:18.000000| 2.140000000| c-row1| vc-row1| 2022-08-02| 2022-08-02| 2022-08-02 07:23:18| +| -3| -3| -3| -3| 0| -3.14| -3.14| s-row2| b-row2| 2022-08-03 07:23:19.000000| -3.140000000| c-row2| vc-row2| 2022-08-03| 2022-08-03| 2022-08-03 07:23:19| +| 4| 4| 4| 4| 1| 4.14| 4.14| NULL| b-row3| 2022-08-04 07:24:17.000000| 
4.140000000| c-row3| vc-row3| 2022-08-04| 2022-08-04| 2022-08-04 07:24:17| +| -5| -5| -5| -5| 0| -5.14| -5.14| s-row4| b-row4| 2022-08-05 07:25:17.000000| -5.140000000| c-row4| vc-row4| 2022-08-05| 2022-08-05| 2022-08-05 07:25:17| +| 6| 6| 6| 6| 0| 6.14| 6.14| s-row5| b-row5| 2022-08-06 07:26:17.000000| 6.140000000| c-row5| vc-row5| 2022-08-06| 2022-08-06| 2022-08-06 07:26:17| +| -7| -7| -7| -7| 1| -7.14| -7.14| s-row6| b-row6| 2022-08-07 07:27:17.000000| -7.140000000| c-row6| vc-row6| 2022-08-07| 2022-08-07| 2022-08-07 07:27:17| +| 8| 8| 8| 8| 0| 8.14| 8.14| NULL| b-row7| 2022-08-08 07:28:17.000000| 8.140000000| c-row7| vc-row7| 2022-08-08| 2022-08-08| 2022-08-08 07:28:17| +| -9| -9| -9| -9| 0| -9.14| -9.14| s-row8| b-row8| 2022-08-09 07:29:17.000000| -9.140000000| c-row8| vc-row8| 2022-08-09| 2022-08-09| 2022-08-09 07:29:17| +| 10| 10| 10| 10| 0| 10.14| 10.14| s-row9| b-row9| 2022-08-10 07:21:17.000000| 10.140000000| c-row9| vc-row9| 2022-08-10| 2022-08-10| 2022-08-10 07:21:17| ++---------------------------+-----------------------------+------------------------+---------------------------+----------------------------+----------------------------+-----------------------------+----------------------------+----------------------------+-----------------------------------+-------------------------------------+--------------------------+-----------------------------+--------------------------+-----------------------------+--------------------------------------+ diff --git a/be/test/vec/exec/parquet/parquet_thrift_test.cpp b/be/test/vec/exec/parquet/parquet_thrift_test.cpp index be78d46815f1eb..047d4d0a5b6a22 100644 --- a/be/test/vec/exec/parquet/parquet_thrift_test.cpp +++ b/be/test/vec/exec/parquet/parquet_thrift_test.cpp @@ -59,6 +59,7 @@ #include "vec/core/column_with_type_and_name.h" #include "vec/data_types/data_type.h" #include "vec/data_types/data_type_factory.hpp" +#include "vec/exec/format/parquet/parquet_column_convert.h" #include 
"vec/exec/format/parquet/parquet_common.h" #include "vec/exec/format/parquet/parquet_thrift_util.h" #include "vec/exec/format/parquet/schema_desc.h" @@ -167,8 +168,8 @@ TEST_F(ParquetThriftReaderTest, complex_nested_file) { static int fill_nullable_column(ColumnPtr& doris_column, level_t* definitions, size_t num_values) { CHECK(doris_column->is_nullable()); - auto* nullable_column = reinterpret_cast( - (*std::move(doris_column)).mutate().get()); + auto* nullable_column = const_cast( + static_cast(doris_column.get())); NullMap& map_data = nullable_column->get_null_map_data(); int null_cnt = 0; for (int i = 0; i < num_values; ++i) { @@ -189,6 +190,19 @@ static Status get_column_values(io::FileReaderSPtr file_reader, tparquet::Column ? chunk_meta.dictionary_page_offset : chunk_meta.data_page_offset; size_t chunk_size = chunk_meta.total_compressed_size; + + bool need_convert = false; + auto& physical_type = column_chunk->meta_data.type; + DataTypePtr src_type; + + RETURN_IF_ERROR(ParquetConvert::convert_data_type_from_parquet( + physical_type, field_schema->type.type, src_type, data_type, &need_convert)); + + ColumnPtr src_column = doris_column->assume_mutable(); + if (need_convert) { + src_column = src_type->create_column(); + } + io::BufferedFileStreamReader stream_reader(file_reader, start_offset, chunk_size, 1024); cctz::time_zone ctz; @@ -208,14 +222,14 @@ static Status get_column_values(io::FileReaderSPtr file_reader, tparquet::Column chunk_reader.get_def_levels(definitions, rows); } MutableColumnPtr data_column; - if (doris_column->is_nullable()) { + if (src_column->is_nullable()) { // fill nullable values - fill_nullable_column(doris_column, definitions, rows); - auto* nullable_column = reinterpret_cast( - (*std::move(doris_column)).mutate().get()); + fill_nullable_column(src_column, definitions, rows); + auto* nullable_column = const_cast( + static_cast(src_column.get())); data_column = nullable_column->get_nested_column_ptr(); } else { - data_column = 
doris_column->assume_mutable(); + data_column = src_column->assume_mutable(); } ColumnSelectVector run_length_map; // decode page data @@ -223,7 +237,7 @@ static Status get_column_values(io::FileReaderSPtr file_reader, tparquet::Column // required column std::vector null_map = {(u_short)rows}; run_length_map.set_run_length_null_map(null_map, rows, nullptr); - return chunk_reader.decode_values(data_column, data_type, run_length_map, false); + RETURN_IF_ERROR(chunk_reader.decode_values(data_column, data_type, run_length_map, false)); } else { // column with null values level_t level_type = definitions[0]; @@ -254,8 +268,17 @@ static Status get_column_values(io::FileReaderSPtr file_reader, tparquet::Column RETURN_IF_ERROR( chunk_reader.decode_values(data_column, data_type, run_length_map, false)); } - return Status::OK(); } + if (need_convert) { + std::unique_ptr converter; + ParquetConvert::ConvertParams convert_params; + convert_params.init(field_schema, &ctz); + convert_params.start_idx = doris_column->size(); + RETURN_IF_ERROR(ParquetConvert::get_converter(src_type, field_schema->type.type, data_type, + &converter, &convert_params)); + RETURN_IF_ERROR(converter->convert(src_column, const_cast(doris_column.get()))); + } + return Status::OK(); } // Only the unit test depend on this, but it is wrong, should not use TTupleDesc to create tuple desc, not @@ -340,11 +363,11 @@ static void create_block(std::unique_ptr& block) { // binary is not supported, use string instead {"binary_col", TYPE_STRING, sizeof(StringRef), true}, // 64-bit-length, see doris::get_slot_size in primitive_type.cpp - {"timestamp_col", TYPE_DATETIME, sizeof(int128_t), true}, + {"timestamp_col", TYPE_DATETIMEV2, sizeof(int128_t), true}, {"decimal_col", TYPE_DECIMALV2, sizeof(DecimalV2Value), true}, {"char_col", TYPE_CHAR, sizeof(StringRef), true}, {"varchar_col", TYPE_VARCHAR, sizeof(StringRef), true}, - {"date_col", TYPE_DATE, sizeof(int128_t), true}, + {"date_col", TYPE_DATEV2, sizeof(uint32_t), 
true}, {"date_v2_col", TYPE_DATEV2, sizeof(uint32_t), true}, {"timestamp_v2_col", TYPE_DATETIMEV2, sizeof(int128_t), true, 18, 0}}; SchemaScanner schema_scanner(column_descs); @@ -448,118 +471,6 @@ TEST_F(ParquetThriftReaderTest, dict_decoder) { read_parquet_data_and_check("./be/test/exec/test_data/parquet_scanner/dict-decoder.parquet", "./be/test/exec/test_data/parquet_scanner/dict-decoder.txt", 12); } - -TEST_F(ParquetThriftReaderTest, group_reader) { - std::vector column_descs = { - {"tinyint_col", TYPE_TINYINT, sizeof(int8_t), true}, - {"smallint_col", TYPE_SMALLINT, sizeof(int16_t), true}, - {"int_col", TYPE_INT, sizeof(int32_t), true}, - {"bigint_col", TYPE_BIGINT, sizeof(int64_t), true}, - {"boolean_col", TYPE_BOOLEAN, sizeof(bool), true}, - {"float_col", TYPE_FLOAT, sizeof(float_t), true}, - {"double_col", TYPE_DOUBLE, sizeof(double_t), true}, - {"string_col", TYPE_STRING, sizeof(StringRef), true}, - {"binary_col", TYPE_STRING, sizeof(StringRef), true}, - {"timestamp_col", TYPE_DATETIME, sizeof(int128_t), true}, - {"decimal_col", TYPE_DECIMALV2, sizeof(DecimalV2Value), true}, - {"char_col", TYPE_CHAR, sizeof(StringRef), true}, - {"varchar_col", TYPE_VARCHAR, sizeof(StringRef), true}, - {"date_col", TYPE_DATE, sizeof(int128_t), true}}; - SchemaScanner schema_scanner(column_descs); - ObjectPool object_pool; - doris::TupleDescriptor* tuple_desc = create_tuple_desc(&object_pool, column_descs); - auto tuple_slots = tuple_desc->slots(); - - TSlotDescriptor tslot_desc; - { - tslot_desc.id = 14; - tslot_desc.parent = 0; - TTypeDesc type; - { - TTypeNode node; - node.__set_type(TTypeNodeType::ARRAY); - std::vector contains_nulls {true}; - node.__set_contains_nulls(contains_nulls); - TTypeNode inner; - inner.__set_type(TTypeNodeType::SCALAR); - TScalarType scalar_type; - scalar_type.__set_type(TPrimitiveType::STRING); - inner.__set_scalar_type(scalar_type); - inner.__set_contains_nulls(contains_nulls); - type.types.push_back(node); - type.types.push_back(inner); - } 
- tslot_desc.slotType = type; - tslot_desc.columnPos = 14; - tslot_desc.byteOffset = 0; - tslot_desc.nullIndicatorByte = 0; - tslot_desc.nullIndicatorBit = -1; - tslot_desc.colName = "list_string"; - tslot_desc.slotIdx = 14; - tslot_desc.isMaterialized = true; - } - SlotDescriptor string_slot(tslot_desc); - tuple_slots.emplace_back(&string_slot); - - std::vector read_columns; - RowGroupReader::LazyReadContext lazy_read_ctx; - for (const auto& slot : tuple_slots) { - lazy_read_ctx.all_read_columns.emplace_back(slot->col_name()); - read_columns.emplace_back(slot->col_name()); - } - io::FileSystemSPtr local_fs = io::LocalFileSystem::create(""); - io::FileReaderSPtr file_reader; - auto st = local_fs->open_file("./be/test/exec/test_data/parquet_scanner/type-decoder.parquet", - &file_reader); - EXPECT_TRUE(st.ok()); - - // prepare metadata - FileMetaData* meta_data; - size_t meta_size; - static_cast(parse_thrift_footer(file_reader, &meta_data, &meta_size, nullptr)); - tparquet::FileMetaData t_metadata = meta_data->to_thrift(); - - cctz::time_zone ctz; - TimezoneUtils::find_cctz_time_zone(TimezoneUtils::default_time_zone, ctz); - auto row_group = t_metadata.row_groups[0]; - std::shared_ptr row_group_reader; - RowGroupReader::PositionDeleteContext position_delete_ctx(row_group.num_rows, 0); - row_group_reader.reset(new RowGroupReader(file_reader, read_columns, 0, row_group, &ctz, - nullptr, position_delete_ctx, lazy_read_ctx, - nullptr)); - std::vector row_ranges; - row_ranges.emplace_back(0, row_group.num_rows); - - auto col_offsets = std::unordered_map(); - auto stg = row_group_reader->init(meta_data->schema(), row_ranges, col_offsets, nullptr, - nullptr, nullptr, nullptr, nullptr); - EXPECT_TRUE(stg.ok()); - - vectorized::Block block; - for (const auto& slot_desc : tuple_slots) { - auto data_type = - vectorized::DataTypeFactory::instance().create_data_type(slot_desc->type(), true); - MutableColumnPtr data_column = data_type->create_column(); - block.insert( - 
ColumnWithTypeAndName(std::move(data_column), data_type, slot_desc->col_name())); - } - bool batch_eof = false; - size_t read_rows = 0; - auto stb = row_group_reader->next_batch(&block, 1024, &read_rows, &batch_eof); - EXPECT_TRUE(stb.ok()); - - io::FileReaderSPtr result; - auto rst = local_fs->open_file("./be/test/exec/test_data/parquet_scanner/group-reader.txt", - &result); - EXPECT_TRUE(rst.ok()); - uint8_t result_buf[result->size() + 1]; - result_buf[result->size()] = '\0'; - size_t bytes_read; - Slice res(result_buf, result->size()); - static_cast(result->read_at(0, res, &bytes_read)); - ASSERT_STREQ(block.dump_data(0, 10).c_str(), reinterpret_cast(result_buf)); - delete meta_data; -} } // namespace vectorized } // namespace doris From 516535b81d2df05606187cfda2c1d12c7af4625f Mon Sep 17 00:00:00 2001 From: changyuwei <2017501503@qq.com> Date: Mon, 16 Oct 2023 22:33:16 +0800 Subject: [PATCH 16/21] fix p0 --- be/src/vec/exec/format/parquet/vparquet_column_reader.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp b/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp index 0a56352b9e7cb3..66ec1081a1733a 100644 --- a/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp +++ b/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp @@ -509,6 +509,7 @@ Status ScalarColumnReader::read_column_data(ColumnPtr& doris_column, DataTypePtr std::unique_ptr converter; ParquetConvert::ConvertParams convert_params; convert_params.init(_field_schema, _ctz); + convert_params.start_idx = doris_column->size(); RETURN_IF_ERROR(ParquetConvert::get_converter(src_type, _field_schema->type.type, type, &converter, &convert_params)); RETURN_IF_ERROR( From cd54c6f370a926633bf3b4f1f3cca56a522f75ef Mon Sep 17 00:00:00 2001 From: changyuwei <2017501503@qq.com> Date: Sun, 22 Oct 2023 02:38:21 +0800 Subject: [PATCH 17/21] fix code format --- .../parquet/fix_length_dict_decoder.hpp | 17 +- 
.../format/parquet/fix_length_plain_decoder.h | 91 ++-- .../format/parquet/parquet_column_convert.cpp | 85 ++-- .../format/parquet/parquet_column_convert.h | 434 ++++++++---------- .../format/parquet/vparquet_column_reader.cpp | 185 ++++---- 5 files changed, 365 insertions(+), 447 deletions(-) diff --git a/be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp b/be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp index 2e35186cae2ded..3cb5dd3840458b 100644 --- a/be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp +++ b/be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp @@ -83,13 +83,7 @@ class FixLengthDictDecoder final : public BaseDictDecoder { char* dict_item_address = reinterpret_cast(_dict.get()); _dict_items.resize(num_values); for (size_t i = 0; i < num_values; ++i) { - if (PhysicalType == tparquet::Type::INT96) { - ParquetInt96 value = *(ParquetInt96*)dict_item_address; - _dict_items[i] = value.to_int128(); - - } else { - _dict_items[i] = *(DataType*)dict_item_address; - } + _dict_items[i] = *(DataType*)dict_item_address; dict_item_address += _type_length; } return Status::OK(); @@ -98,17 +92,18 @@ class FixLengthDictDecoder final : public BaseDictDecoder { protected: template Status _decode_numeric(MutableColumnPtr& doris_column, ColumnSelectVector& select_vector) { - auto& column_data = static_cast(*doris_column).get_data(); + auto& column_data = reinterpret_cast&>(*doris_column).get_data(); size_t data_index = column_data.size(); - column_data.resize(data_index + select_vector.num_values() - select_vector.num_filtered()); + column_data.resize(data_index + _type_length * (select_vector.num_values() - + select_vector.num_filtered())); size_t dict_index = 0; + DataType* data = (DataType*)column_data.data(); ColumnSelectVector::DataReadType read_type; while (size_t run_length = select_vector.get_next_run(&read_type)) { switch (read_type) { case ColumnSelectVector::CONTENT: { for (size_t i = 0; i < run_length; ++i) { - 
column_data[data_index++] = - static_cast(_dict_items[_indexes[dict_index++]]); + data[data_index++] = _dict_items[_indexes[dict_index++]]; } break; } diff --git a/be/src/vec/exec/format/parquet/fix_length_plain_decoder.h b/be/src/vec/exec/format/parquet/fix_length_plain_decoder.h index 5d62b14f1b73ec..1382ba6b75444a 100644 --- a/be/src/vec/exec/format/parquet/fix_length_plain_decoder.h +++ b/be/src/vec/exec/format/parquet/fix_length_plain_decoder.h @@ -99,18 +99,28 @@ template template Status FixLengthPlainDecoder::_decode_string(MutableColumnPtr& doris_column, ColumnSelectVector& select_vector) { + auto& string_column = static_cast(*doris_column); + + auto& data = string_column.get_chars(); + size_t data_index = data.size(); + data.resize(data_index + + _type_length * (select_vector.num_values() - select_vector.num_filtered())); + auto& offset = string_column.get_offsets(); + size_t offset_index = offset.size(); + offset.resize(offset_index + select_vector.num_values() - select_vector.num_filtered()); + ColumnSelectVector::DataReadType read_type; while (size_t run_length = select_vector.get_next_run(&read_type)) { switch (read_type) { case ColumnSelectVector::CONTENT: { - std::vector string_values; - string_values.reserve(run_length); - for (size_t i = 0; i < run_length; ++i) { - char* buf_start = _data->data + _offset; - string_values.emplace_back(buf_start, _type_length); - _offset += _type_length; + memcpy(data.data() + data_index, _data->data + _offset, _type_length * run_length); + _offset += _type_length * run_length; + data_index += _type_length * run_length; + + for (int i = 0; i < run_length; i++) { + offset[offset_index] = offset[offset_index - 1] + _type_length; + offset_index++; } - doris_column->insert_many_strings(&string_values[0], run_length); break; } case ColumnSelectVector::NULL_DATA: { @@ -134,47 +144,34 @@ template template Status FixLengthPlainDecoder::_decode_numeric(MutableColumnPtr& doris_column, ColumnSelectVector& select_vector) { - if 
constexpr (PhysicalType == tparquet::Type::FIXED_LEN_BYTE_ARRAY || - PhysicalType == tparquet::Type::BYTE_ARRAY) { - return Status::OK(); - } else { - using ColumnType = ParquetConvert::PhysicalTypeTraits::ColumnType; - using DataType = ParquetConvert::PhysicalTypeTraits::DataType; - - auto& column_data = static_cast(*doris_column).get_data(); - size_t data_index = column_data.size(); - column_data.resize(data_index + select_vector.num_values() - select_vector.num_filtered()); - ColumnSelectVector::DataReadType read_type; - while (size_t run_length = select_vector.get_next_run(&read_type)) { - switch (read_type) { - case ColumnSelectVector::CONTENT: { - for (size_t i = 0; i < run_length; ++i) { - char* buf_start = _data->data + _offset; - if constexpr (PhysicalType != tparquet::Type::INT96) { - column_data[data_index++] = *(DataType*)buf_start; - } else { - ParquetInt96 value = *(ParquetInt96*)buf_start; - column_data[data_index++] = value.to_int128(); - } - _offset += _type_length; - } - break; - } - case ColumnSelectVector::NULL_DATA: { - data_index += run_length; - break; - } - case ColumnSelectVector::FILTERED_CONTENT: { - _offset += _type_length * run_length; - break; - } - case ColumnSelectVector::FILTERED_NULL: { - // do nothing - break; - } - } + auto& column_data = reinterpret_cast&>(*doris_column).get_data(); + size_t data_index = column_data.size(); + column_data.resize(data_index + + _type_length * (select_vector.num_values() - select_vector.num_filtered())); + ColumnSelectVector::DataReadType read_type; + while (size_t run_length = select_vector.get_next_run(&read_type)) { + switch (read_type) { + case ColumnSelectVector::CONTENT: { + memcpy(column_data.data() + data_index, _data->data + _offset, + run_length * _type_length); + _offset += run_length * _type_length; + data_index += run_length * _type_length; + break; + } + case ColumnSelectVector::NULL_DATA: { + data_index += run_length; + break; + } + case ColumnSelectVector::FILTERED_CONTENT: { + 
_offset += _type_length * run_length; + break; + } + case ColumnSelectVector::FILTERED_NULL: { + // do nothing + break; + } } - return Status::OK(); } + return Status::OK(); } } // namespace doris::vectorized diff --git a/be/src/vec/exec/format/parquet/parquet_column_convert.cpp b/be/src/vec/exec/format/parquet/parquet_column_convert.cpp index e6a0c7be54e0f0..34b6da3e571115 100644 --- a/be/src/vec/exec/format/parquet/parquet_column_convert.cpp +++ b/be/src/vec/exec/format/parquet/parquet_column_convert.cpp @@ -19,85 +19,64 @@ #include +#include "vec/columns/column_nullable.h" namespace doris::vectorized { namespace ParquetConvert { const cctz::time_zone ConvertParams::utc0 = cctz::utc_time_zone(); -Status convert_data_type_from_parquet(tparquet::Type::type parquet_type, PrimitiveType show_type, - vectorized::DataTypePtr& ans_data_type, DataTypePtr& src_type, - bool* need_convert) { - if (is_complex_type(src_type)) { - *need_convert = false; - return Status::OK(); - } - switch (parquet_type) { +ColumnPtr get_column(tparquet::Type::type parquet_physical_type, PrimitiveType show_type, + ColumnPtr& doris_column, DataTypePtr& doris_type, bool* need_convert) { + ColumnPtr ans_column = doris_column; + DataTypePtr tmp_data_type; + + switch (parquet_physical_type) { case tparquet::Type::type::BOOLEAN: - ans_data_type = std::make_shared(); + tmp_data_type = std::make_shared(); break; case tparquet::Type::type::INT32: - ans_data_type = std::make_shared(); + tmp_data_type = std::make_shared(); break; case tparquet::Type::type::INT64: - ans_data_type = std::make_shared(); + tmp_data_type = std::make_shared(); break; case tparquet::Type::type::FLOAT: - ans_data_type = std::make_shared(); + tmp_data_type = std::make_shared(); break; case tparquet::Type::type::DOUBLE: - ans_data_type = std::make_shared(); + tmp_data_type = std::make_shared(); break; case tparquet::Type::type::BYTE_ARRAY: case tparquet::Type::type::FIXED_LEN_BYTE_ARRAY: - ans_data_type = std::make_shared(); + 
tmp_data_type = std::make_shared(); break; case tparquet::Type::type::INT96: - ans_data_type = std::make_shared(); + tmp_data_type = std::make_shared(); break; - default: - return Status::IOError("Can't read parquet type : {}", parquet_type); } - if (ans_data_type->get_type_id() == src_type->get_type_id()) { - if (ans_data_type->get_type_id() == TypeIndex::String && - show_type == PrimitiveType::TYPE_DECIMAL64) { - *need_convert = true; - return Status::OK(); - } - *need_convert = false; - return Status::OK(); - } - - if (src_type->is_nullable()) { - auto& nested_src_type = - static_cast(src_type.get())->get_nested_type(); - auto sub = ans_data_type; - ans_data_type = std::make_shared(ans_data_type); - if (nested_src_type->get_type_id() == sub->get_type_id()) { - if (sub->get_type_id() == TypeIndex::String && - show_type == PrimitiveType::TYPE_DECIMAL64) { - *need_convert = true; - return Status::OK(); - } + if (tmp_data_type->get_type_id() == remove_nullable(doris_type)->get_type_id()) { + if (tmp_data_type->get_type_id() == TypeIndex::String && + (show_type == PrimitiveType::TYPE_DECIMAL32 || + show_type == PrimitiveType::TYPE_DECIMAL64 || + show_type == PrimitiveType::TYPE_DECIMALV2 || + show_type == PrimitiveType::TYPE_DECIMAL128I)) { + *need_convert = true; + ans_column = tmp_data_type->create_column(); + } else { *need_convert = false; - return Status::OK(); } + } else { + ans_column = tmp_data_type->create_column(); + *need_convert = true; } - *need_convert = true; - return Status::OK(); -} - -Status get_converter(std::shared_ptr src_type, PrimitiveType show_type, - std::shared_ptr dst_type, - std::unique_ptr* converter, ConvertParams* convert_param) { - if (src_type->is_nullable()) { - auto src = reinterpret_cast(src_type.get())->get_nested_type(); - auto dst = reinterpret_cast(dst_type.get())->get_nested_type(); - - return get_converter_impl(src, show_type, dst, converter, convert_param); - } else { - return get_converter_impl(src_type, show_type, 
dst_type, converter, convert_param); + if (*need_convert && doris_type->is_nullable()) { + auto doris_nullable_column = static_cast(doris_column.get()); + ans_column = ColumnNullable::create(ans_column, + doris_nullable_column->get_null_map_column_ptr()); } + return ans_column; } + } // namespace ParquetConvert } // namespace doris::vectorized diff --git a/be/src/vec/exec/format/parquet/parquet_column_convert.h b/be/src/vec/exec/format/parquet/parquet_column_convert.h index 9a177f975edd0e..d6bd85bf5b9e4b 100644 --- a/be/src/vec/exec/format/parquet/parquet_column_convert.h +++ b/be/src/vec/exec/format/parquet/parquet_column_convert.h @@ -56,6 +56,12 @@ struct PhysicalTypeTraits { using ColumnType = ColumnVector; }; +template <> +struct PhysicalTypeTraits { + using DataType = uint8; + using ColumnType = ColumnVector; +}; + template <> struct PhysicalTypeTraits { using DataType = int64_t; @@ -88,19 +94,15 @@ struct PhysicalTypeTraits { template <> struct PhysicalTypeTraits { - using DataType = Int128; - using ColumnType = ColumnVector; + using DataType = ParquetInt96; + using ColumnType = ColumnVector; }; #define FOR_LOGICAL_NUMERIC_TYPES(M) \ M(TypeIndex::Int8, Int8, Int32) \ - M(TypeIndex::UInt8, UInt8, Int32) \ M(TypeIndex::Int16, Int16, Int32) \ - M(TypeIndex::UInt16, UInt16, Int32) \ M(TypeIndex::Int32, Int32, Int32) \ - M(TypeIndex::UInt32, UInt32, Int32) \ M(TypeIndex::Int64, Int64, Int64) \ - M(TypeIndex::UInt64, UInt64, Int64) \ M(TypeIndex::Float32, Float32, Float32) \ M(TypeIndex::Float64, Float64, Float64) @@ -187,103 +189,77 @@ struct ConvertParams { } }; -Status convert_data_type_from_parquet(tparquet::Type::type parquet_type, PrimitiveType, - vectorized::DataTypePtr& ans_data_type, DataTypePtr& src_type, - bool* need_convert); +ColumnPtr get_column(tparquet::Type::type parquet_physical_type, PrimitiveType show_type, + ColumnPtr& doris_column, DataTypePtr& doris_type, bool* need_convert); struct ColumnConvert { - virtual Status convert(const IColumn* 
src_col, IColumn* dst_col) { return Status::OK(); } + virtual Status convert(ColumnPtr& src_col, MutableColumnPtr& dst_col) { return Status::OK(); } virtual ~ColumnConvert() = default; - void convert_null(const IColumn** src_col, IColumn** dst_col) const { - size_t rows = (*src_col)->size(); - if ((*src_col)->is_nullable()) { - auto src_nullable_column = static_cast(*src_col); - auto dst_nullable_column = static_cast(*dst_col); - auto& src_null_data = src_nullable_column->get_null_map_column().get_data(); - dst_nullable_column->get_null_map_column().resize(_convert_params->start_idx + rows); - auto& dst_null_data = dst_nullable_column->get_null_map_column().get_data(); - for (auto j = 0; j < rows; j++) { - dst_null_data[_convert_params->start_idx + j] = src_null_data[j]; - } - - *src_col = &src_nullable_column->get_nested_column(); - *dst_col = &dst_nullable_column->get_nested_column(); - } + void convert_null(ColumnPtr& src_col, MutableColumnPtr& dst_col) { + src_col = remove_nullable(src_col); + dst_col = remove_nullable(dst_col->get_ptr())->assume_mutable(); } public: ConvertParams* _convert_params; }; -template -struct NumberColumnConvert : public ColumnConvert { - Status convert(const IColumn* src_col, IColumn* dst_col) override; -}; +template +struct NumberToNumberConvert : public ColumnConvert { + Status convert(ColumnPtr& src_col, MutableColumnPtr& dst_col) override { + using ColumnType = typename PhysicalTypeTraits::ColumnType; + convert_null(src_col, dst_col); -template -Status NumberColumnConvert::convert(const IColumn* src_col, - IColumn* dst_col) { - size_t rows = src_col->size(); - if constexpr (is_nullable) { - convert_null(&src_col, &dst_col); - } - auto& src_data = static_cast*>(src_col)->get_data(); - dst_col->resize(_convert_params->start_idx + rows); - auto& data = static_cast*>(dst_col)->get_data(); + size_t rows = src_col->size(); + auto& src_data = static_cast(src_col.get())->get_data(); - for (int i = 0; i < rows; i++) { - dst_type value = 
static_cast(src_data[i]); - data[_convert_params->start_idx + i] = value; - } + dst_col->resize(_convert_params->start_idx + rows); + auto& data = static_cast&>(*dst_col.get()).get_data(); + for (int i = 0; i < rows; i++) { + dst_type value = static_cast(src_data[i]); + data[_convert_params->start_idx + i] = value; + } - return Status::OK(); -} -template -struct NumberColumnToStringConvert : public ColumnConvert { - Status convert(const IColumn* src_col, IColumn* dst_col) override; + return Status::OK(); + } }; -template -Status NumberColumnToStringConvert::convert(const IColumn* src_col, - IColumn* dst_col) { - size_t rows = src_col->size(); - if constexpr (is_nullable) { - convert_null(&src_col, &dst_col); - } - auto& src_data = static_cast*>(src_col)->get_data(); - auto str_col = static_cast(dst_col); - for (int i = 0; i < rows; i++) { - std::string value = std::to_string(src_data[i]); - str_col->insert_data(value.data(), value.size()); - } - return Status::OK(); -} +template +struct NumberToStringConvert : public ColumnConvert { + Status convert(ColumnPtr& src_col, MutableColumnPtr& dst_col) override { + using ColumnType = typename PhysicalTypeTraits::ColumnType; + convert_null(src_col, dst_col); -template -struct int128totimestamp : public ColumnConvert { -public: - [[nodiscard]] static uint64_t to_timestamp_micros(uint32_t hi, uint64_t lo) { - return (hi - ParquetInt96::JULIAN_EPOCH_OFFSET_DAYS) * ParquetInt96::MICROS_IN_DAY + - lo / ParquetInt96::NANOS_PER_MICROSECOND; - } - Status convert(const IColumn* src_col, IColumn* dst_col) override { size_t rows = src_col->size(); - if constexpr (is_nullable) { - convert_null(&src_col, &dst_col); + auto& src_data = static_cast(src_col.get())->get_data(); + + auto str_col = static_cast(dst_col.get()); + for (int i = 0; i < rows; i++) { + std::string value = std::to_string(src_data[i]); + str_col->insert_data(value.data(), value.size()); } - auto& src_data = static_cast*>(src_col)->get_data(); + return Status::OK(); + 
} +}; + +struct Int96toTimestamp : public ColumnConvert { +public: + Status convert(ColumnPtr& src_col, MutableColumnPtr& dst_col) override { + convert_null(src_col, dst_col); + + size_t rows = src_col->size() / sizeof(ParquetInt96); + auto& src_data = static_cast*>(src_col.get())->get_data(); + auto ParquetInt96_data = (ParquetInt96*)src_data.data(); dst_col->resize(_convert_params->start_idx + rows); - auto& data = static_cast*>(dst_col)->get_data(); + auto& data = static_cast*>(dst_col.get())->get_data(); for (int i = 0; i < rows; i++) { - __int128 x = src_data[i]; - uint32_t hi = x >> 64; - uint64_t lo = (x << 64) >> 64; + ParquetInt96 x = ParquetInt96_data[i]; auto& num = data[_convert_params->start_idx + i]; auto& value = reinterpret_cast&>(num); - int64_t micros = to_timestamp_micros(hi, lo); + int64_t micros = x.to_timestamp_micros(); value.from_unixtime(micros / 1000000, *_convert_params->ctz); value.set_microsecond(micros % 1000000); } @@ -291,21 +267,19 @@ struct int128totimestamp : public ColumnConvert { } }; -template -struct int64totimestamp : public ColumnConvert { +struct Int64ToTimestamp : public ColumnConvert { public: - Status convert(const IColumn* src_col, IColumn* dst_col) override { + Status convert(ColumnPtr& src_col, MutableColumnPtr& dst_col) override { + convert_null(src_col, dst_col); + size_t rows = src_col->size(); - if constexpr (is_nullable) { - convert_null(&src_col, &dst_col); - } dst_col->resize(_convert_params->start_idx + rows); - auto& src_data = static_cast*>(src_col)->get_data(); - auto& data = static_cast*>(dst_col)->get_data(); + auto& src_data = static_cast*>(src_col.get())->get_data(); + auto& data = static_cast*>(dst_col.get())->get_data(); + for (int i = 0; i < rows; i++) { int64 x = src_data[i]; - dst_col = static_cast*>(dst_col); auto& num = data[_convert_params->start_idx + i]; auto& value = reinterpret_cast&>(num); value.from_unixtime(x / _convert_params->second_mask, *_convert_params->ctz); @@ -316,18 +290,16 @@ 
struct int64totimestamp : public ColumnConvert { } }; -template -class int32todate : public ColumnConvert { +class Int32ToDate : public ColumnConvert { public: - Status convert(const IColumn* src_col, IColumn* dst_col) override { + Status convert(ColumnPtr& src_col, MutableColumnPtr& dst_col) override { + convert_null(src_col, dst_col); + size_t rows = src_col->size(); - if constexpr (is_nullable) { - convert_null(&src_col, &dst_col); - } dst_col->resize(_convert_params->start_idx + rows); - auto& src_data = static_cast*>(src_col)->get_data(); - auto& data = static_cast(dst_col)->get_data(); + auto& src_data = static_cast*>(src_col.get())->get_data(); + auto& data = static_cast(dst_col.get())->get_data(); date_day_offset_dict& date_dict = date_day_offset_dict::get(); for (int i = 0; i < rows; i++) { @@ -341,25 +313,24 @@ class int32todate : public ColumnConvert { } }; -template -class stringtodecimal : public ColumnConvert { +template +class StringToDecimal : public ColumnConvert { public: - Status convert(const IColumn* src_col, IColumn* dst_col) override { + Status convert(ColumnPtr& src_col, MutableColumnPtr& dst_col) override { + convert_null(src_col, dst_col); + size_t rows = src_col->size(); - if constexpr (is_nullable) { - convert_null(&src_col, &dst_col); - } DecimalScaleParams& scale_params = _convert_params->decimal_scale; - auto buf = static_cast(src_col)->get_chars().data(); - auto& offset = static_cast(src_col)->get_offsets(); + auto buf = static_cast(src_col.get())->get_chars().data(); + auto& offset = static_cast(src_col.get())->get_offsets(); dst_col->resize(_convert_params->start_idx + rows); - auto& data = static_cast*>(dst_col)->get_data(); + auto& data = static_cast*>(dst_col.get())->get_data(); for (int i = 0; i < rows; i++) { int len = offset[i] - offset[i - 1]; // When Decimal in parquet is stored in byte arrays, binary and fixed, // the unscaled number must be encoded as two's complement using big-endian byte order. 
- Int128 value = 0; + ValueCopyType value = 0; memcpy(reinterpret_cast(&value), buf + offset[i - 1], len); value = BitUtil::big_endian_to_host(value); value = value >> ((sizeof(value) - len) * 8); @@ -380,25 +351,25 @@ class stringtodecimal : public ColumnConvert { return Status::OK(); } }; -template -class numbertodecimal : public ColumnConvert { +class NumberToDecimal : public ColumnConvert { public: - Status convert(const IColumn* src_col, IColumn* dst_col) override { + Status convert(ColumnPtr& src_col, MutableColumnPtr& dst_col) override { + convert_null(src_col, dst_col); + size_t rows = src_col->size(); - if constexpr (is_nullable) { - convert_null(&src_col, &dst_col); - } - auto* src_data = static_cast*>(src_col)->get_data().data(); + auto* src_data = + static_cast*>(src_col.get())->get_data().data(); dst_col->resize(_convert_params->start_idx + rows); DecimalScaleParams& scale_params = _convert_params->decimal_scale; - auto* data = static_cast>*>(dst_col) + auto* data = static_cast>*>(dst_col.get()) ->get_data() .data(); for (int i = 0; i < rows; i++) { - Int128 value = src_data[i]; + ValueCopyType value = src_data[i]; if constexpr (ScaleType == DecimalScaleParams::SCALE_UP) { value *= scale_params.scale_factor; } else if constexpr (ScaleType == DecimalScaleParams::SCALE_DOWN) { @@ -410,18 +381,17 @@ class numbertodecimal : public ColumnConvert { } }; -template -class stringtodecimalstring : public ColumnConvert { +class StringToDecimalString : public ColumnConvert { public: - Status convert(const IColumn* src_col, IColumn* dst_col) override { + Status convert(ColumnPtr& src_col, MutableColumnPtr& dst_col) override { + convert_null(src_col, dst_col); + size_t rows = src_col->size(); - if constexpr (is_nullable) { - convert_null(&src_col, &dst_col); - } - auto buf = static_cast(src_col)->get_chars().data(); - auto& offset = static_cast(src_col)->get_offsets(); - auto data = static_cast(dst_col); + auto buf = static_cast(src_col.get())->get_chars().data(); + 
auto& offset = static_cast(src_col.get())->get_offsets(); + + auto data = static_cast(dst_col.get()); for (int i = 0; i < rows; i++) { int len = offset[i] - offset[i - 1]; // When Decimal in parquet is stored in byte arrays, binary and fixed, @@ -438,162 +408,163 @@ class stringtodecimalstring : public ColumnConvert { } }; -template -class int128totimestampstring : public ColumnConvert { +class Int96ToTimestampString : public ColumnConvert { public: - Status convert(const IColumn* src_col, IColumn* dst_col) override { - size_t rows = src_col->size(); - if constexpr (is_nullable) { - convert_null(&src_col, &dst_col); - } + Status convert(ColumnPtr& src_col, MutableColumnPtr& dst_col) override { + convert_null(src_col, dst_col); + + auto& src_data = static_cast*>(src_col.get())->get_data(); + auto dst_data = static_cast(dst_col.get()); - auto& src_data = static_cast*>(src_col)->get_data(); - auto data = static_cast(dst_col); + size_t rows = src_col->size() / sizeof(ParquetInt96); + ParquetInt96* data = (ParquetInt96*)src_data.data(); + std::string buf; + buf.resize(50); for (int i = 0; i < rows; i++) { - __int128 x = src_data[i]; - uint32_t hi = x >> 64; - uint64_t lo = (x << 64) >> 64; uint64_t num = 0; auto& value = reinterpret_cast&>(num); - int64_t micros = int128totimestamp::to_timestamp_micros(hi, lo); + int64_t micros = data[i].to_timestamp_micros(); value.from_unixtime(micros / 1000000, *_convert_params->ctz); value.set_microsecond(micros % 1000000); - std::string buf; - buf.resize(20); char* end = value.to_string(buf.data()); - data->insert_data(buf.data(), end - buf.data()); + dst_data->insert_data(buf.data(), end - buf.data()); } - return Status::OK(); } }; -template -inline Status get_converter_impl(std::shared_ptr src_data_type, - PrimitiveType show_type, - std::shared_ptr dst_data_type, - std::unique_ptr* converter, - ConvertParams* convert_params) { - auto src_type = src_data_type->get_type_id(); - auto dst_type = dst_data_type->get_type_id(); +inline 
Status get_converter(tparquet::Type::type parquet_physical_type, PrimitiveType show_type, + std::shared_ptr dst_data_type, + std::unique_ptr* converter, + ConvertParams* convert_params) { + auto dst_type = remove_nullable(dst_data_type)->get_type_id(); switch (dst_type) { -#define DISPATCH(NUMERIC_TYPE, CPP_NUMERIC_TYPE, PHYSICAL_TYPE) \ - case NUMERIC_TYPE: \ - switch (src_type) { \ - case TypeIndex::UInt8: \ - *converter = \ - std::make_unique>(); \ - break; \ - case TypeIndex::Int32: \ - *converter = \ - std::make_unique>(); \ - break; \ - case TypeIndex::Int64: \ - *converter = \ - std::make_unique>(); \ - break; \ - case TypeIndex::Float32: \ - *converter = std::make_unique< \ - NumberColumnConvert>(); \ - break; \ - case TypeIndex::Float64: \ - *converter = std::make_unique< \ - NumberColumnConvert>(); \ - break; \ - case TypeIndex::Int128: \ - *converter = std::make_unique< \ - NumberColumnConvert>(); \ - break; \ - default: \ - break; \ - } \ +#define DISPATCH(NUMERIC_TYPE, CPP_NUMERIC_TYPE, PHYSICAL_TYPE) \ + case NUMERIC_TYPE: \ + switch (parquet_physical_type) { \ + case tparquet::Type::BOOLEAN: \ + *converter = std::make_unique< \ + NumberToNumberConvert>(); \ + break; \ + case tparquet::Type::INT32: \ + *converter = std::make_unique< \ + NumberToNumberConvert>(); \ + break; \ + case tparquet::Type::INT64: \ + *converter = std::make_unique< \ + NumberToNumberConvert>(); \ + break; \ + case tparquet::Type::FLOAT: \ + *converter = std::make_unique< \ + NumberToNumberConvert>(); \ + break; \ + case tparquet::Type::DOUBLE: \ + *converter = std::make_unique< \ + NumberToNumberConvert>(); \ + break; \ + default: \ + break; \ + } \ break; FOR_LOGICAL_NUMERIC_TYPES(DISPATCH) #undef DISPATCH case TypeIndex::String: { - if (src_type == TypeIndex::String) { + if (tparquet::Type::FIXED_LEN_BYTE_ARRAY == parquet_physical_type) { if (show_type == PrimitiveType::TYPE_DECIMAL64) { - *converter = std::make_unique>(); + *converter = std::make_unique(); break; } - } else 
if (src_type == TypeIndex::Int128) { - *converter = std::make_unique>(); + } else if (tparquet::Type::INT96 == parquet_physical_type) { + *converter = std::make_unique(); break; } - switch (src_type) { -#define DISPATCH1(NUMERIC_TYPE, CPP_NUMERIC_TYPE, PHYSICAL_TYPE) \ - case NUMERIC_TYPE: \ - *converter = \ - std::make_unique>(); \ - break; - FOR_LOGICAL_NUMERIC_TYPES(DISPATCH1) -#undef DISPATCH1 - default: - break; + if (parquet_physical_type == tparquet::Type::BOOLEAN) { + *converter = std::make_unique>(); + } else if (parquet_physical_type == tparquet::Type::INT32) { + *converter = std::make_unique>(); + + } else if (parquet_physical_type == tparquet::Type::INT64) { + *converter = std::make_unique>(); + + } else if (parquet_physical_type == tparquet::Type::FLOAT) { + *converter = std::make_unique>(); + + } else if (parquet_physical_type == tparquet::Type::DOUBLE) { + *converter = std::make_unique>(); } + break; } case TypeIndex::DateV2: - if (src_type == TypeIndex::Int32) { - *converter = std::make_unique>(); + if (tparquet::Type::INT32 == parquet_physical_type) { + *converter = std::make_unique(); } break; case TypeIndex::DateTimeV2: - if (src_type == TypeIndex::Int128) { - *converter = std::make_unique>(); - } else if (src_type == TypeIndex::Int64) { - *converter = std::make_unique>(); + if (tparquet::Type::INT96 == parquet_physical_type) { + *converter = std::make_unique(); + } else if (tparquet::Type::INT64 == parquet_physical_type) { + *converter = std::make_unique(); } break; #define DISPATCH2(TypeIndex_DECIMAL_TYPE, DECIMAL_TYPE, PRIMARY_TYPE) \ case TypeIndex_DECIMAL_TYPE: { \ convert_params->init_decimal_converter(dst_data_type); \ DecimalScaleParams& scale_params = convert_params->decimal_scale; \ - if (src_type == TypeIndex::Int128) { \ - if (scale_params.scale_type == DecimalScaleParams::SCALE_UP) { \ - *converter = std::make_unique>(); \ - } else if (scale_params.scale_type == DecimalScaleParams::SCALE_DOWN) { \ - *converter = std::make_unique>(); 
\ - } else { \ - *converter = std::make_unique>(); \ + if (tparquet::Type::FIXED_LEN_BYTE_ARRAY == parquet_physical_type) { \ + size_t string_length = convert_params->field_schema->parquet_schema.type_length; \ + if (string_length <= 8) { \ + if (scale_params.scale_type == DecimalScaleParams::SCALE_UP) { \ + *converter = \ + std::make_unique>(); \ + } else if (scale_params.scale_type == DecimalScaleParams::SCALE_DOWN) { \ + *converter = \ + std::make_unique>(); \ + } else { \ + *converter = \ + std::make_unique>(); \ + } \ + } else if (string_length <= 16) { \ + if (scale_params.scale_type == DecimalScaleParams::SCALE_UP) { \ + *converter = \ + std::make_unique>(); \ + } else if (scale_params.scale_type == DecimalScaleParams::SCALE_DOWN) { \ + *converter = \ + std::make_unique>(); \ + } else { \ + *converter = \ + std::make_unique>(); \ + } \ } \ - } else if (src_type == TypeIndex::String) { \ + } else if (tparquet::Type::INT32 == parquet_physical_type) { \ if (scale_params.scale_type == DecimalScaleParams::SCALE_UP) { \ - *converter = std::make_unique>(); \ } else if (scale_params.scale_type == DecimalScaleParams::SCALE_DOWN) { \ - *converter = std::make_unique>(); \ } else { \ - *converter = std::make_unique>(); \ } \ - } else if (src_type == TypeIndex::Int32) { \ + } else if (tparquet::Type::INT64 == parquet_physical_type) { \ if (scale_params.scale_type == DecimalScaleParams::SCALE_UP) { \ - *converter = std::make_unique>(); \ } else if (scale_params.scale_type == DecimalScaleParams::SCALE_DOWN) { \ - *converter = std::make_unique>(); \ } else { \ - *converter = std::make_unique>(); \ - } \ - } else if (src_type == TypeIndex::Int64) { \ - if (scale_params.scale_type == DecimalScaleParams::SCALE_UP) { \ - *converter = std::make_unique>(); \ - } else if (scale_params.scale_type == DecimalScaleParams::SCALE_DOWN) { \ - *converter = std::make_unique>(); \ - } else { \ - *converter = std::make_unique>(); \ } \ } \ @@ -607,17 +578,14 @@ inline Status 
get_converter_impl(std::shared_ptr src_data_type, } if (*converter == nullptr) { - return Status::NotSupported("Can't cast type {} to type {}", getTypeName(src_type), + return Status::NotSupported("Can't cast type parquet physical {} to doris logical type {}", + tparquet::to_string(parquet_physical_type), getTypeName(dst_type)); } (*converter)->_convert_params = convert_params; return Status::OK(); } -Status get_converter(std::shared_ptr src_type, PrimitiveType show_type, - std::shared_ptr dst_type, - std::unique_ptr* converter, ConvertParams* convert_param); - }; // namespace ParquetConvert }; // namespace doris::vectorized \ No newline at end of file diff --git a/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp b/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp index 66ec1081a1733a..46bc6bd787b10c 100644 --- a/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp +++ b/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp @@ -482,127 +482,106 @@ Status ScalarColumnReader::read_column_data(ColumnPtr& doris_column, DataTypePtr ColumnSelectVector& select_vector, size_t batch_size, size_t* read_rows, bool* eof, bool is_dict_filter) { bool need_convert = false; - auto& physical_type = _chunk_meta.meta_data.type; - DataTypePtr src_type; + auto& parquet_physical_type = _chunk_meta.meta_data.type; + auto& show_type = _field_schema->type.type; - RETURN_IF_ERROR(ParquetConvert::convert_data_type_from_parquet( - physical_type, _field_schema->type.type, src_type, type, &need_convert)); + ColumnPtr src_column = ParquetConvert::get_column(parquet_physical_type, show_type, + doris_column, type, &need_convert); - ColumnPtr src_column = doris_column; - if (need_convert) { - src_column = src_type->create_column(); - } - - if (_chunk_reader->remaining_num_values() == 0) { - if (!_chunk_reader->has_next_page()) { - *eof = true; - *read_rows = 0; - return Status::OK(); + do { + if (_chunk_reader->remaining_num_values() == 0) { + if 
(!_chunk_reader->has_next_page()) { + *eof = true; + *read_rows = 0; + return Status::OK(); + } + RETURN_IF_ERROR(_chunk_reader->next_page()); } - RETURN_IF_ERROR(_chunk_reader->next_page()); - } - if (_nested_column) { - RETURN_IF_ERROR(_chunk_reader->load_page_data_idempotent()); - RETURN_IF_ERROR(_read_nested_column(src_column, type, select_vector, batch_size, read_rows, - eof, is_dict_filter)); - if (need_convert) { - std::unique_ptr converter; - ParquetConvert::ConvertParams convert_params; - convert_params.init(_field_schema, _ctz); - convert_params.start_idx = doris_column->size(); - RETURN_IF_ERROR(ParquetConvert::get_converter(src_type, _field_schema->type.type, type, - &converter, &convert_params)); - RETURN_IF_ERROR( - converter->convert(src_column, const_cast(doris_column.get()))); + if (_nested_column) { + RETURN_IF_ERROR(_chunk_reader->load_page_data_idempotent()); + RETURN_IF_ERROR(_read_nested_column(src_column, type, select_vector, batch_size, + read_rows, eof, is_dict_filter)); + break; } - return Status::OK(); - } - // generate the row ranges that should be read - std::list read_ranges; - _generate_read_ranges(_current_row_index, - _current_row_index + _chunk_reader->remaining_num_values(), read_ranges); - if (read_ranges.size() == 0) { - // skip the whole page - _current_row_index += _chunk_reader->remaining_num_values(); - RETURN_IF_ERROR(_chunk_reader->skip_page()); - *read_rows = 0; - } else { - bool skip_whole_batch = false; - // Determining whether to skip page or batch will increase the calculation time. - // When the filtering effect is greater than 60%, it is possible to skip the page or batch. 
- if (select_vector.has_filter() && select_vector.filter_ratio() > 0.6) { - // lazy read - size_t remaining_num_values = 0; - for (auto& range : read_ranges) { - remaining_num_values += range.last_row - range.first_row; - } - if (batch_size >= remaining_num_values && - select_vector.can_filter_all(remaining_num_values)) { - // We can skip the whole page if the remaining values is filtered by predicate columns - select_vector.skip(remaining_num_values); - _current_row_index += _chunk_reader->remaining_num_values(); - RETURN_IF_ERROR(_chunk_reader->skip_page()); - *read_rows = remaining_num_values; - if (!_chunk_reader->has_next_page()) { - *eof = true; + // generate the row ranges that should be read + std::list read_ranges; + _generate_read_ranges(_current_row_index, + _current_row_index + _chunk_reader->remaining_num_values(), + read_ranges); + if (read_ranges.size() == 0) { + // skip the whole page + _current_row_index += _chunk_reader->remaining_num_values(); + RETURN_IF_ERROR(_chunk_reader->skip_page()); + *read_rows = 0; + } else { + bool skip_whole_batch = false; + // Determining whether to skip page or batch will increase the calculation time. + // When the filtering effect is greater than 60%, it is possible to skip the page or batch. 
+ if (select_vector.has_filter() && select_vector.filter_ratio() > 0.6) { + // lazy read + size_t remaining_num_values = 0; + for (auto& range : read_ranges) { + remaining_num_values += range.last_row - range.first_row; } - if (need_convert) { - std::unique_ptr converter; - ParquetConvert::ConvertParams convert_params; - convert_params.init(_field_schema, _ctz); - convert_params.start_idx = doris_column->size(); - RETURN_IF_ERROR(ParquetConvert::get_converter( - src_type, _field_schema->type.type, type, &converter, &convert_params)); - RETURN_IF_ERROR(converter->convert(src_column, - const_cast(doris_column.get()))); + if (batch_size >= remaining_num_values && + select_vector.can_filter_all(remaining_num_values)) { + // We can skip the whole page if the remaining values is filtered by predicate columns + select_vector.skip(remaining_num_values); + _current_row_index += _chunk_reader->remaining_num_values(); + RETURN_IF_ERROR(_chunk_reader->skip_page()); + *read_rows = remaining_num_values; + if (!_chunk_reader->has_next_page()) { + *eof = true; + } + break; + } + skip_whole_batch = batch_size <= remaining_num_values && + select_vector.can_filter_all(batch_size); + if (skip_whole_batch) { + select_vector.skip(batch_size); } - - return Status::OK(); } - skip_whole_batch = - batch_size <= remaining_num_values && select_vector.can_filter_all(batch_size); - if (skip_whole_batch) { - select_vector.skip(batch_size); + // load page data to decode or skip values + RETURN_IF_ERROR(_chunk_reader->load_page_data_idempotent()); + size_t has_read = 0; + for (auto& range : read_ranges) { + // generate the skipped values + size_t skip_values = range.first_row - _current_row_index; + RETURN_IF_ERROR(_skip_values(skip_values)); + _current_row_index += skip_values; + // generate the read values + size_t read_values = + std::min((size_t)(range.last_row - range.first_row), batch_size - has_read); + if (skip_whole_batch) { + RETURN_IF_ERROR(_skip_values(read_values)); + } else { + 
RETURN_IF_ERROR(_read_values(read_values, src_column, type, select_vector, + is_dict_filter)); + } + has_read += read_values; + _current_row_index += read_values; + if (has_read == batch_size) { + break; + } } + *read_rows = has_read; } - // load page data to decode or skip values - RETURN_IF_ERROR(_chunk_reader->load_page_data_idempotent()); - size_t has_read = 0; - for (auto& range : read_ranges) { - // generate the skipped values - size_t skip_values = range.first_row - _current_row_index; - RETURN_IF_ERROR(_skip_values(skip_values)); - _current_row_index += skip_values; - // generate the read values - size_t read_values = - std::min((size_t)(range.last_row - range.first_row), batch_size - has_read); - if (skip_whole_batch) { - RETURN_IF_ERROR(_skip_values(read_values)); - } else { - RETURN_IF_ERROR( - _read_values(read_values, src_column, type, select_vector, is_dict_filter)); - } - has_read += read_values; - _current_row_index += read_values; - if (has_read == batch_size) { - break; - } + + if (_chunk_reader->remaining_num_values() == 0 && !_chunk_reader->has_next_page()) { + *eof = true; } - *read_rows = has_read; - } + } while (0); - if (_chunk_reader->remaining_num_values() == 0 && !_chunk_reader->has_next_page()) { - *eof = true; - } if (need_convert) { std::unique_ptr converter; ParquetConvert::ConvertParams convert_params; convert_params.init(_field_schema, _ctz); convert_params.start_idx = doris_column->size(); - RETURN_IF_ERROR(ParquetConvert::get_converter(src_type, _field_schema->type.type, type, + RETURN_IF_ERROR(ParquetConvert::get_converter(parquet_physical_type, show_type, type, &converter, &convert_params)); - RETURN_IF_ERROR(converter->convert(src_column, const_cast(doris_column.get()))); + auto x = doris_column->assume_mutable(); + RETURN_IF_ERROR(converter->convert(src_column, x)); } return Status::OK(); From 5911a4ca1e363555ea2b761496a9da89f912e190 Mon Sep 17 00:00:00 2001 From: changyuwei <2017501503@qq.com> Date: Sun, 22 Oct 2023 14:04:01 
+0800 Subject: [PATCH 18/21] fix ut --- .../parquet/fix_length_dict_decoder.hpp | 6 +- .../format/parquet/fix_length_plain_decoder.h | 24 +- .../format/parquet/parquet_column_convert.h | 74 ++- .../format/parquet/vparquet_column_reader.cpp | 5 +- .../vec/exec/parquet/parquet_thrift_test.cpp | 22 +- .../hive/test_hive_parquet_alter_column.out | 598 ++++++++++++++++-- .../test_hive_parquet_alter_column.groovy | 170 +++++ 7 files changed, 813 insertions(+), 86 deletions(-) diff --git a/be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp b/be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp index 3cb5dd3840458b..c29e742f511737 100644 --- a/be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp +++ b/be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp @@ -93,9 +93,9 @@ class FixLengthDictDecoder final : public BaseDictDecoder { template Status _decode_numeric(MutableColumnPtr& doris_column, ColumnSelectVector& select_vector) { auto& column_data = reinterpret_cast&>(*doris_column).get_data(); - size_t data_index = column_data.size(); - column_data.resize(data_index + _type_length * (select_vector.num_values() - - select_vector.num_filtered())); + size_t data_index = column_data.size() / _type_length; + column_data.resize(column_data.size() + _type_length * (select_vector.num_values() - + select_vector.num_filtered())); size_t dict_index = 0; DataType* data = (DataType*)column_data.data(); ColumnSelectVector::DataReadType read_type; diff --git a/be/src/vec/exec/format/parquet/fix_length_plain_decoder.h b/be/src/vec/exec/format/parquet/fix_length_plain_decoder.h index 1382ba6b75444a..bc35b76140f594 100644 --- a/be/src/vec/exec/format/parquet/fix_length_plain_decoder.h +++ b/be/src/vec/exec/format/parquet/fix_length_plain_decoder.h @@ -99,28 +99,18 @@ template template Status FixLengthPlainDecoder::_decode_string(MutableColumnPtr& doris_column, ColumnSelectVector& select_vector) { - auto& string_column = static_cast(*doris_column); - - auto& data = 
string_column.get_chars(); - size_t data_index = data.size(); - data.resize(data_index + - _type_length * (select_vector.num_values() - select_vector.num_filtered())); - auto& offset = string_column.get_offsets(); - size_t offset_index = offset.size(); - offset.resize(offset_index + select_vector.num_values() - select_vector.num_filtered()); - ColumnSelectVector::DataReadType read_type; while (size_t run_length = select_vector.get_next_run(&read_type)) { switch (read_type) { case ColumnSelectVector::CONTENT: { - memcpy(data.data() + data_index, _data->data + _offset, _type_length * run_length); - _offset += _type_length * run_length; - data_index += _type_length * run_length; - - for (int i = 0; i < run_length; i++) { - offset[offset_index] = offset[offset_index - 1] + _type_length; - offset_index++; + std::vector string_values; + string_values.reserve(run_length); + for (size_t i = 0; i < run_length; ++i) { + char* buf_start = _data->data + _offset; + string_values.emplace_back(buf_start, _type_length); + _offset += _type_length; } + doris_column->insert_many_strings(&string_values[0], run_length); break; } case ColumnSelectVector::NULL_DATA: { diff --git a/be/src/vec/exec/format/parquet/parquet_column_convert.h b/be/src/vec/exec/format/parquet/parquet_column_convert.h index d6bd85bf5b9e4b..3e57013ecd5b74 100644 --- a/be/src/vec/exec/format/parquet/parquet_column_convert.h +++ b/be/src/vec/exec/format/parquet/parquet_column_convert.h @@ -124,7 +124,7 @@ struct ConvertParams { FieldSchema* field_schema = nullptr; size_t start_idx = 0; - void init(FieldSchema* field_schema_, cctz::time_zone* ctz_) { + void init(FieldSchema* field_schema_, cctz::time_zone* ctz_, size_t start_idx_ = 0) { field_schema = field_schema_; if (ctz_ != nullptr) { ctz = ctz_; @@ -163,6 +163,7 @@ struct ConvertParams { t.from_unixtime(0, *ctz); offset_days = t.day() == 31 ? 
0 : 1; } + start_idx = start_idx_; } template @@ -189,6 +190,21 @@ struct ConvertParams { } }; +/* +* parquet_physical_type : The type of data stored in parquet. +* Read data into columns returned by get_column according to the physical type of parquet. +* show_type : The data format that should be displayed. +* doris_column : What type of column does the upper layer need to put the data in. +* +* example : +* In hive, if decimal is stored as FIXED_LENBYTE_ARRAY in parquet, +* then we use `ALTER TABLE TableName CHANGE COLUMN Col_Decimal Col_Decimal String;` +* to convert this column to string type. +* parquet_type : FIXED_LEN_BYTE_ARRAY. +* ans_data_type : ColumnInt8 +* show_type : Decimal. +* doris_column : ColumnString. +*/ ColumnPtr get_column(tparquet::Type::type parquet_physical_type, PrimitiveType show_type, ColumnPtr& doris_column, DataTypePtr& doris_type, bool* need_convert); @@ -327,7 +343,7 @@ class StringToDecimal : public ColumnConvert { auto& data = static_cast*>(dst_col.get())->get_data(); for (int i = 0; i < rows; i++) { - int len = offset[i] - offset[i - 1]; + size_t len = offset[i] - offset[i - 1]; // When Decimal in parquet is stored in byte arrays, binary and fixed, // the unscaled number must be encoded as two's complement using big-endian byte order. ValueCopyType value = 0; @@ -381,6 +397,7 @@ class NumberToDecimal : public ColumnConvert { } }; +template class StringToDecimalString : public ColumnConvert { public: Status convert(ColumnPtr& src_col, MutableColumnPtr& dst_col) override { @@ -396,11 +413,11 @@ class StringToDecimalString : public ColumnConvert { int len = offset[i] - offset[i - 1]; // When Decimal in parquet is stored in byte arrays, binary and fixed, // the unscaled number must be encoded as two's complement using big-endian byte order. 
- Int64 value = 0; + ValueCopyType value = 0; memcpy(reinterpret_cast(&value), buf + offset[i - 1], len); value = BitUtil::big_endian_to_host(value); value = value >> ((sizeof(value) - len) * 8); - std::string ans = reinterpret_cast(value).to_string( + std::string ans = reinterpret_cast(value).to_string( _convert_params->field_schema->parquet_schema.scale); data->insert_data(ans.data(), ans.size()); } @@ -408,6 +425,29 @@ class StringToDecimalString : public ColumnConvert { } }; +class Int32ToDateString : public ColumnConvert { +public: + Status convert(ColumnPtr& src_col, MutableColumnPtr& dst_col) override { + convert_null(src_col, dst_col); + + size_t rows = src_col->size(); + + auto& src_data = static_cast*>(src_col.get())->get_data(); + date_day_offset_dict& date_dict = date_day_offset_dict::get(); + + auto str_col = static_cast(dst_col.get()); + char buf[50]; + for (int i = 0; i < rows; i++) { + int64_t date_value = (int64_t)src_data[i] + _convert_params->offset_days; + DateV2Value value = date_dict[date_value]; + char* end = value.to_string(buf); + str_col->insert_data(buf, end - buf); + } + + return Status::OK(); + } +}; + class Int96ToTimestampString : public ColumnConvert { public: Status convert(ColumnPtr& src_col, MutableColumnPtr& dst_col) override { @@ -419,16 +459,15 @@ class Int96ToTimestampString : public ColumnConvert { size_t rows = src_col->size() / sizeof(ParquetInt96); ParquetInt96* data = (ParquetInt96*)src_data.data(); - std::string buf; - buf.resize(50); + char buf[50]; for (int i = 0; i < rows; i++) { uint64_t num = 0; auto& value = reinterpret_cast&>(num); int64_t micros = data[i].to_timestamp_micros(); value.from_unixtime(micros / 1000000, *_convert_params->ctz); value.set_microsecond(micros % 1000000); - char* end = value.to_string(buf.data()); - dst_data->insert_data(buf.data(), end - buf.data()); + char* end = value.to_string(buf); + dst_data->insert_data(buf, end - buf); } return Status::OK(); } @@ -472,13 +511,28 @@ inline Status 
get_converter(tparquet::Type::type parquet_physical_type, Primitiv case TypeIndex::String: { if (tparquet::Type::FIXED_LEN_BYTE_ARRAY == parquet_physical_type) { - if (show_type == PrimitiveType::TYPE_DECIMAL64) { - *converter = std::make_unique(); + if (show_type == PrimitiveType::TYPE_DECIMAL32) { + *converter = std::make_unique>(); + break; + } else if (show_type == PrimitiveType::TYPE_DECIMAL64) { + *converter = std::make_unique>(); + break; + } else if (show_type == PrimitiveType::TYPE_DECIMALV2) { + *converter = std::make_unique>(); + break; + } else if (show_type == PrimitiveType::TYPE_DECIMAL128I) { + *converter = std::make_unique>(); break; } + } else if (tparquet::Type::INT96 == parquet_physical_type) { *converter = std::make_unique(); break; + } else if (tparquet::Type::INT32 == parquet_physical_type) { + if (show_type == PrimitiveType::TYPE_DATEV2) { + *converter = std::make_unique(); + break; + } } if (parquet_physical_type == tparquet::Type::BOOLEAN) { diff --git a/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp b/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp index 46bc6bd787b10c..d6f7d746bcd1d3 100644 --- a/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp +++ b/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp @@ -571,13 +571,12 @@ Status ScalarColumnReader::read_column_data(ColumnPtr& doris_column, DataTypePtr if (_chunk_reader->remaining_num_values() == 0 && !_chunk_reader->has_next_page()) { *eof = true; } - } while (0); + } while (false); if (need_convert) { std::unique_ptr converter; ParquetConvert::ConvertParams convert_params; - convert_params.init(_field_schema, _ctz); - convert_params.start_idx = doris_column->size(); + convert_params.init(_field_schema, _ctz, doris_column->size()); RETURN_IF_ERROR(ParquetConvert::get_converter(parquet_physical_type, show_type, type, &converter, &convert_params)); auto x = doris_column->assume_mutable(); diff --git a/be/test/vec/exec/parquet/parquet_thrift_test.cpp 
b/be/test/vec/exec/parquet/parquet_thrift_test.cpp index 047d4d0a5b6a22..d5c4a2d56a282c 100644 --- a/be/test/vec/exec/parquet/parquet_thrift_test.cpp +++ b/be/test/vec/exec/parquet/parquet_thrift_test.cpp @@ -192,16 +192,11 @@ static Status get_column_values(io::FileReaderSPtr file_reader, tparquet::Column size_t chunk_size = chunk_meta.total_compressed_size; bool need_convert = false; - auto& physical_type = column_chunk->meta_data.type; - DataTypePtr src_type; + auto& parquet_physical_type = column_chunk.meta_data.type; + auto& show_type = field_schema->type.type; - RETURN_IF_ERROR(ParquetConvert::convert_data_type_from_parquet( - physical_type, field_schema->type.type, src_type, data_type, &need_convert)); - - ColumnPtr src_column = doris_column->assume_mutable(); - if (need_convert) { - src_column = src_type->create_column(); - } + ColumnPtr src_column = ParquetConvert::get_column(parquet_physical_type, show_type, + doris_column, data_type, &need_convert); io::BufferedFileStreamReader stream_reader(file_reader, start_offset, chunk_size, 1024); @@ -272,12 +267,13 @@ static Status get_column_values(io::FileReaderSPtr file_reader, tparquet::Column if (need_convert) { std::unique_ptr converter; ParquetConvert::ConvertParams convert_params; - convert_params.init(field_schema, &ctz); - convert_params.start_idx = doris_column->size(); - RETURN_IF_ERROR(ParquetConvert::get_converter(src_type, field_schema->type.type, data_type, + convert_params.init(field_schema, &ctz, doris_column->size()); + RETURN_IF_ERROR(ParquetConvert::get_converter(parquet_physical_type, show_type, data_type, &converter, &convert_params)); - RETURN_IF_ERROR(converter->convert(src_column, const_cast(doris_column.get()))); + auto x = doris_column->assume_mutable(); + RETURN_IF_ERROR(converter->convert(src_column, x)); } + return Status::OK(); } diff --git a/regression-test/data/external_table_p2/hive/test_hive_parquet_alter_column.out 
b/regression-test/data/external_table_p2/hive/test_hive_parquet_alter_column.out index 0c1ee1531ab4aa..cca084ff0f6ca0 100644 --- a/regression-test/data/external_table_p2/hive/test_hive_parquet_alter_column.out +++ b/regression-test/data/external_table_p2/hive/test_hive_parquet_alter_column.out @@ -638,13 +638,13 @@ col_timestamp TEXT Yes true \N col_decimal TEXT Yes true \N -- !show -- --1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 19636 2023-10-09 17:15:00 1238.45 --1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 19636 2023-10-09 17:15:00 1238.45 --1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 19636 2023-10-09 17:15:00 1238.45 --1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 19636 2023-10-09 17:15:00 1238.45 --1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 19636 2023-10-09 17:15:00 1238.45 --1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 19636 2023-10-09 17:15:00 1238.45 --1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 19636 2023-10-09 17:15:00 1238.45 +-1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 2023-10-06 2023-10-09 17:15:00 1238.45 +-1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 2023-10-06 2023-10-09 17:15:00 1238.45 +-1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 2023-10-06 2023-10-09 17:15:00 1238.45 +-1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 2023-10-06 2023-10-09 17:15:00 1238.45 +-1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 2023-10-06 2023-10-09 17:15:00 1238.45 +-1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 2023-10-06 2023-10-09 17:15:00 1238.45 +-1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 2023-10-06 2023-10-09 17:15:00 1238.45 -- !order -- -1 @@ -697,9 +697,9 @@ ADC ADC -- !order -- -19636 -19636 -19636 +2023-10-06 +2023-10-06 +2023-10-06 -- !order -- 2023-10-06 14:30:00 @@ -727,13 +727,13 @@ col_timestamp CHAR(10) Yes true \N col_decimal CHAR(10) Yes true \N -- !show 
-- --1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 19636 2023-10-09 17:15:00 1238.45 --1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 19636 2023-10-09 17:15:00 1238.45 --1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 19636 2023-10-09 17:15:00 1238.45 --1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 19636 2023-10-09 17:15:00 1238.45 --1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 19636 2023-10-09 17:15:00 1238.45 --1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 19636 2023-10-09 17:15:00 1238.45 --1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 19636 2023-10-09 17:15:00 1238.45 +-1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 2023-10-06 2023-10-09 17:15:00 1238.45 +-1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 2023-10-06 2023-10-09 17:15:00 1238.45 +-1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 2023-10-06 2023-10-09 17:15:00 1238.45 +-1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 2023-10-06 2023-10-09 17:15:00 1238.45 +-1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 2023-10-06 2023-10-09 17:15:00 1238.45 +-1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 2023-10-06 2023-10-09 17:15:00 1238.45 +-1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 2023-10-06 2023-10-09 17:15:00 1238.45 -- !order -- -1 @@ -786,9 +786,9 @@ ADC ADC -- !order -- -19636 -19636 -19636 +2023-10-06 +2023-10-06 +2023-10-06 -- !order -- 2023-10-06 14:30:00 @@ -816,13 +816,13 @@ col_timestamp VARCHAR(20) Yes true \N col_decimal VARCHAR(20) Yes true \N -- !show -- --1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 19636 2023-10-09 17:15:00 1238.45 --1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 19636 2023-10-09 17:15:00 1238.45 --1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 19636 2023-10-09 17:15:00 1238.45 --1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 19636 2023-10-09 17:15:00 
1238.45 --1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 19636 2023-10-09 17:15:00 1238.45 --1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 19636 2023-10-09 17:15:00 1238.45 --1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 19636 2023-10-09 17:15:00 1238.45 +-1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 2023-10-06 2023-10-09 17:15:00 1238.45 +-1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 2023-10-06 2023-10-09 17:15:00 1238.45 +-1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 2023-10-06 2023-10-09 17:15:00 1238.45 +-1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 2023-10-06 2023-10-09 17:15:00 1238.45 +-1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 2023-10-06 2023-10-09 17:15:00 1238.45 +-1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 2023-10-06 2023-10-09 17:15:00 1238.45 +-1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 2023-10-06 2023-10-09 17:15:00 1238.45 -- !order -- -1 @@ -875,9 +875,9 @@ ADC ADC -- !order -- -19636 -19636 -19636 +2023-10-06 +2023-10-06 +2023-10-06 -- !order -- 2023-10-06 14:30:00 @@ -1083,13 +1083,13 @@ col_timestamp DATETIME(6) Yes true \N col_decimal DECIMAL(5, 1) Yes true \N -- !show -- --1.0 -400.0 -20.0 29496729.6 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 0.0 --1.0 -400.0 -20.0 29496729.6 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 0.0 --1.0 -400.0 -20.0 29496729.6 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 0.0 --1.0 -400.0 -20.0 29496729.6 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 0.0 --1.0 -400.0 -20.0 29496729.6 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 0.0 --1.0 -400.0 -20.0 29496729.6 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 0.0 --1.0 -400.0 -20.0 29496729.6 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 0.0 +-1.0 -400.0 -20.0 29496729.6 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 
1238.4 +-1.0 -400.0 -20.0 29496729.6 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.4 +-1.0 -400.0 -20.0 29496729.6 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.4 +-1.0 -400.0 -20.0 29496729.6 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.4 +-1.0 -400.0 -20.0 29496729.6 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.4 +-1.0 -400.0 -20.0 29496729.6 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.4 +-1.0 -400.0 -20.0 29496729.6 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.4 -- !order -- -1.0 @@ -1152,7 +1152,525 @@ ADC 2023-10-06T14:30 -- !order -- -0.0 -0.0 -0.0 +123.4 +123.4 +123.4 + +-- !int_int -- +2 +2 +2 + +-- !int_smallint -- +100 +100 +100 + +-- !int_tinyint -- +5 +5 +5 + +-- !int_bigint -- +1000000000 +1000000000 +1000000000 + +-- !int_float -- + +-- !int_double -- + +-- !int_boolean -- + +-- !int_string -- + +-- !int_char -- +B +B +B + +-- !int_varchar -- + +-- !int_date -- +2023-10-06 +2023-10-06 +2023-10-06 + +-- !int_timestamp -- +2023-10-06T14:30 +2023-10-06T14:30 +2023-10-06T14:30 + +-- !int_decimal -- + +-- !smallint_int -- +1 +1 +1 + +-- !smallint_smallint -- +100 +100 +100 + +-- !smallint_tinyint -- +5 +5 +5 + +-- !smallint_bigint -- +1000000000 +1000000000 +1000000000 + +-- !smallint_float -- + +-- !smallint_double -- + +-- !smallint_boolean -- + +-- !smallint_string -- + +-- !smallint_char -- +C +C +C + +-- !smallint_varchar -- + +-- !smallint_date -- +2023-10-06 +2023-10-06 +2023-10-06 + +-- !smallint_timestamp -- +2023-10-06T14:30 +2023-10-06T14:30 +2023-10-06T14:30 + +-- !smallint_decimal -- + +-- !tinyint_int -- +3 +3 +3 + +-- !tinyint_smallint -- +100 +100 +100 + +-- !tinyint_tinyint -- +5 +5 +5 + +-- !tinyint_bigint -- +1000000000 +1000000000 +1000000000 + +-- !tinyint_float -- + +-- !tinyint_double -- + +-- !tinyint_boolean -- + +-- !tinyint_string -- + +-- !tinyint_char -- +A +A +A + +-- !tinyint_varchar -- + +-- !tinyint_date -- 
+2023-10-06 +2023-10-06 +2023-10-06 + +-- !tinyint_timestamp -- +2023-10-06T14:30 +2023-10-06T14:30 +2023-10-06T14:30 + +-- !tinyint_decimal -- + +-- !bigint_int -- +3 +3 +3 + +-- !bigint_smallint -- +100 +100 +100 + +-- !bigint_tinyint -- +5 +5 +5 + +-- !bigint_bigint -- +1000000000 +1000000000 +1000000000 + +-- !bigint_float -- + +-- !bigint_double -- + +-- !bigint_boolean -- + +-- !bigint_string -- + +-- !bigint_char -- +A +A +A + +-- !bigint_varchar -- + +-- !bigint_date -- +2023-10-06 +2023-10-06 +2023-10-06 + +-- !bigint_timestamp -- +2023-10-06T14:30 +2023-10-06T14:30 +2023-10-06T14:30 + +-- !bigint_decimal -- + +-- !float_int -- + +-- !float_smallint -- + +-- !float_tinyint -- + +-- !float_bigint -- + +-- !float_float -- + +-- !float_double -- + +-- !float_boolean -- + +-- !float_string -- + +-- !float_char -- + +-- !float_varchar -- + +-- !float_date -- +2023-10-06 +2023-10-06 +2023-10-06 + +-- !float_timestamp -- +2023-10-06T14:30 +2023-10-06T14:30 +2023-10-06T14:30 + +-- !float_decimal -- + +-- !double_int -- +2.0 +2.0 +2.0 + +-- !double_smallint -- + +-- !double_tinyint -- + +-- !double_bigint -- + +-- !double_float -- + +-- !double_double -- + +-- !double_boolean -- + +-- !double_string -- + +-- !double_char -- +A +A +A + +-- !double_varchar -- + +-- !double_date -- +2023-10-06 +2023-10-06 +2023-10-06 + +-- !double_timestamp -- +2023-10-06T14:30 +2023-10-06T14:30 +2023-10-06T14:30 + +-- !double_decimal -- + +-- !boolean_int -- +3 +3 +3 + +-- !boolean_smallint -- +100 +100 +100 + +-- !boolean_tinyint -- +5 +5 +5 + +-- !boolean_bigint -- +1000000000 +1000000000 +1000000000 + +-- !boolean_float -- + +-- !boolean_double -- + +-- !boolean_boolean -- + +-- !boolean_string -- + +-- !boolean_char -- +A +A +A + +-- !boolean_varchar -- + +-- !boolean_date -- +2023-10-06 +2023-10-06 +2023-10-06 + +-- !boolean_timestamp -- +2023-10-06T14:30 +2023-10-06T14:30 +2023-10-06T14:30 + +-- !boolean_decimal -- + +-- !string_int -- + +-- !string_smallint -- + +-- 
!string_tinyint -- + +-- !string_bigint -- + +-- !string_float -- + +-- !string_double -- + +-- !string_boolean -- + +-- !string_string -- + +-- !string_char -- +A +A +A + +-- !string_varchar -- + +-- !string_date -- + +-- !string_timestamp -- + +-- !string_decimal -- + +-- !char_int -- + +-- !char_smallint -- + +-- !char_tinyint -- + +-- !char_bigint -- + +-- !char_float -- + +-- !char_double -- + +-- !char_boolean -- + +-- !char_string -- + +-- !char_char -- +A +A +A + +-- !char_varchar -- + +-- !char_date -- + +-- !char_timestamp -- + +-- !char_decimal -- + +-- !varchar_int -- + +-- !varchar_smallint -- + +-- !varchar_tinyint -- + +-- !varchar_bigint -- + +-- !varchar_float -- + +-- !varchar_double -- + +-- !varchar_boolean -- + +-- !varchar_string -- + +-- !varchar_char -- +B +B +B + +-- !varchar_varchar -- + +-- !varchar_date -- + +-- !varchar_timestamp -- + +-- !varchar_decimal -- + +-- !date_int -- +3 +3 +3 + +-- !date_smallint -- +100 +100 +100 + +-- !date_tinyint -- +5 +5 +5 + +-- !date_bigint -- +1000000000 +1000000000 +1000000000 + +-- !date_float -- + +-- !date_double -- + +-- !date_boolean -- + +-- !date_string -- + +-- !date_char -- +A +A +A + +-- !date_varchar -- + +-- !date_date -- +2023-10-06 +2023-10-06 +2023-10-06 + +-- !date_timestamp -- +2023-10-06T14:30 +2023-10-06T14:30 +2023-10-06T14:30 + +-- !date_decimal -- + +-- !timestamp_int -- +3 +3 +3 + +-- !timestamp_smallint -- +100 +100 +100 + +-- !timestamp_tinyint -- +5 +5 +5 + +-- !timestamp_bigint -- +1000000000 +1000000000 +1000000000 + +-- !timestamp_float -- + +-- !timestamp_double -- + +-- !timestamp_boolean -- + +-- !timestamp_string -- + +-- !timestamp_char -- +B +B +B + +-- !timestamp_varchar -- + +-- !timestamp_date -- +2023-10-06 +2023-10-06 +2023-10-06 + +-- !timestamp_timestamp -- +2023-10-06T14:30 +2023-10-06T14:30 +2023-10-06T14:30 + +-- !timestamp_decimal -- + +-- !decimal_int -- + +-- !decimal_smallint -- + +-- !decimal_tinyint -- + +-- !decimal_bigint -- + +-- !decimal_float -- 
+ +-- !decimal_double -- + +-- !decimal_boolean -- + +-- !decimal_string -- + +-- !decimal_char -- + +-- !decimal_varchar -- + +-- !decimal_date -- +2023-10-06 +2023-10-06 +2023-10-06 + +-- !decimal_timestamp -- +2023-10-06T14:30 +2023-10-06T14:30 +2023-10-06T14:30 + +-- !decimal_decimal -- diff --git a/regression-test/suites/external_table_p2/hive/test_hive_parquet_alter_column.groovy b/regression-test/suites/external_table_p2/hive/test_hive_parquet_alter_column.groovy index 3583d0bec10217..379554c7f18e7e 100644 --- a/regression-test/suites/external_table_p2/hive/test_hive_parquet_alter_column.groovy +++ b/regression-test/suites/external_table_p2/hive/test_hive_parquet_alter_column.groovy @@ -54,5 +54,175 @@ suite("test_hive_parquet_alter_column", "p2,external,hive,external_remote,extern } } + order_qt_int_int """ select col_int from parquet_alter_column_to_int where col_int>=2 order by col_int limit 3""" + order_qt_int_smallint """ select col_smallint from parquet_alter_column_to_int where col_smallint>=3 order by col_smallint limit 3""" + order_qt_int_tinyint """ select col_tinyint from parquet_alter_column_to_int where col_tinyint>=3 order by col_tinyint limit 3""" + order_qt_int_bigint """ select col_bigint from parquet_alter_column_to_int where col_bigint>=3 order by col_bigint limit 3""" + order_qt_int_float """ select col_float from parquet_alter_column_to_int where col_float=2.6 order by col_float limit 3""" + order_qt_int_double """ select col_double from parquet_alter_column_to_int where col_double=0.8 order by col_double limit 3""" + order_qt_int_boolean """ select col_boolean from parquet_alter_column_to_int where year(col_boolean)=2023 order by col_boolean limit 3""" + order_qt_int_string """ select col_string from parquet_alter_column_to_int where col_string="B" order by col_string limit 3""" + order_qt_int_char """ select col_char from parquet_alter_column_to_int where col_char="B" order by col_char limit 3""" + order_qt_int_varchar """ select 
col_varchar from parquet_alter_column_to_int where col_varchar="C" order by col_varchar limit 3""" + order_qt_int_date """ select col_date from parquet_alter_column_to_int where year(col_date)=2023 order by col_date limit 3""" + order_qt_int_timestamp """ select col_timestamp from parquet_alter_column_to_int where year(col_timestamp)=2023 order by col_timestamp limit 3""" + order_qt_int_decimal """ select col_decimal from parquet_alter_column_to_int where col_decimal=1.1 order by col_decimal limit 3""" + order_qt_smallint_int """ select col_int from parquet_alter_column_to_smallint where col_int>=1 order by col_int limit 3""" + order_qt_smallint_smallint """ select col_smallint from parquet_alter_column_to_smallint where col_smallint>=3 order by col_smallint limit 3""" + order_qt_smallint_tinyint """ select col_tinyint from parquet_alter_column_to_smallint where col_tinyint>=2 order by col_tinyint limit 3""" + order_qt_smallint_bigint """ select col_bigint from parquet_alter_column_to_smallint where col_bigint>=2 order by col_bigint limit 3""" + order_qt_smallint_float """ select col_float from parquet_alter_column_to_smallint where col_float=3.0 order by col_float limit 3""" + order_qt_smallint_double """ select col_double from parquet_alter_column_to_smallint where col_double=0.5 order by col_double limit 3""" + order_qt_smallint_boolean """ select col_boolean from parquet_alter_column_to_smallint where year(col_boolean)=2023 order by col_boolean limit 3""" + order_qt_smallint_string """ select col_string from parquet_alter_column_to_smallint where col_string="helloworld" order by col_string limit 3""" + order_qt_smallint_char """ select col_char from parquet_alter_column_to_smallint where col_char="C" order by col_char limit 3""" + order_qt_smallint_varchar """ select col_varchar from parquet_alter_column_to_smallint where col_varchar="A" order by col_varchar limit 3""" + order_qt_smallint_date """ select col_date from parquet_alter_column_to_smallint where 
year(col_date)=2023 order by col_date limit 3""" + order_qt_smallint_timestamp """ select col_timestamp from parquet_alter_column_to_smallint where year(col_timestamp)=2023 order by col_timestamp limit 3""" + order_qt_smallint_decimal """ select col_decimal from parquet_alter_column_to_smallint where col_decimal=2.5 order by col_decimal limit 3""" + order_qt_tinyint_int """ select col_int from parquet_alter_column_to_tinyint where col_int>=3 order by col_int limit 3""" + order_qt_tinyint_smallint """ select col_smallint from parquet_alter_column_to_tinyint where col_smallint>=3 order by col_smallint limit 3""" + order_qt_tinyint_tinyint """ select col_tinyint from parquet_alter_column_to_tinyint where col_tinyint>=3 order by col_tinyint limit 3""" + order_qt_tinyint_bigint """ select col_bigint from parquet_alter_column_to_tinyint where col_bigint>=1 order by col_bigint limit 3""" + order_qt_tinyint_float """ select col_float from parquet_alter_column_to_tinyint where col_float=0.6 order by col_float limit 3""" + order_qt_tinyint_double """ select col_double from parquet_alter_column_to_tinyint where col_double=1.1 order by col_double limit 3""" + order_qt_tinyint_boolean """ select col_boolean from parquet_alter_column_to_tinyint where year(col_boolean)=2023 order by col_boolean limit 3""" + order_qt_tinyint_string """ select col_string from parquet_alter_column_to_tinyint where col_string="helloworld" order by col_string limit 3""" + order_qt_tinyint_char """ select col_char from parquet_alter_column_to_tinyint where col_char="A" order by col_char limit 3""" + order_qt_tinyint_varchar """ select col_varchar from parquet_alter_column_to_tinyint where col_varchar="C" order by col_varchar limit 3""" + order_qt_tinyint_date """ select col_date from parquet_alter_column_to_tinyint where year(col_date)=2023 order by col_date limit 3""" + order_qt_tinyint_timestamp """ select col_timestamp from parquet_alter_column_to_tinyint where year(col_timestamp)=2023 order by 
col_timestamp limit 3""" + order_qt_tinyint_decimal """ select col_decimal from parquet_alter_column_to_tinyint where col_decimal=1.4 order by col_decimal limit 3""" + order_qt_bigint_int """ select col_int from parquet_alter_column_to_bigint where col_int>=3 order by col_int limit 3""" + order_qt_bigint_smallint """ select col_smallint from parquet_alter_column_to_bigint where col_smallint>=2 order by col_smallint limit 3""" + order_qt_bigint_tinyint """ select col_tinyint from parquet_alter_column_to_bigint where col_tinyint>=2 order by col_tinyint limit 3""" + order_qt_bigint_bigint """ select col_bigint from parquet_alter_column_to_bigint where col_bigint>=1 order by col_bigint limit 3""" + order_qt_bigint_float """ select col_float from parquet_alter_column_to_bigint where col_float=2.5 order by col_float limit 3""" + order_qt_bigint_double """ select col_double from parquet_alter_column_to_bigint where col_double=0.2 order by col_double limit 3""" + order_qt_bigint_boolean """ select col_boolean from parquet_alter_column_to_bigint where year(col_boolean)=2023 order by col_boolean limit 3""" + order_qt_bigint_string """ select col_string from parquet_alter_column_to_bigint where col_string="A" order by col_string limit 3""" + order_qt_bigint_char """ select col_char from parquet_alter_column_to_bigint where col_char="A" order by col_char limit 3""" + order_qt_bigint_varchar """ select col_varchar from parquet_alter_column_to_bigint where col_varchar="A" order by col_varchar limit 3""" + order_qt_bigint_date """ select col_date from parquet_alter_column_to_bigint where year(col_date)=2023 order by col_date limit 3""" + order_qt_bigint_timestamp """ select col_timestamp from parquet_alter_column_to_bigint where year(col_timestamp)=2023 order by col_timestamp limit 3""" + order_qt_bigint_decimal """ select col_decimal from parquet_alter_column_to_bigint where col_decimal=0.8 order by col_decimal limit 3""" + order_qt_float_int """ select col_int from 
parquet_alter_column_to_float where col_int=1.4 order by col_int limit 3""" + order_qt_float_smallint """ select col_smallint from parquet_alter_column_to_float where col_smallint=0.3 order by col_smallint limit 3""" + order_qt_float_tinyint """ select col_tinyint from parquet_alter_column_to_float where col_tinyint=0.2 order by col_tinyint limit 3""" + order_qt_float_bigint """ select col_bigint from parquet_alter_column_to_float where col_bigint=2.2 order by col_bigint limit 3""" + order_qt_float_float """ select col_float from parquet_alter_column_to_float where col_float=1.2 order by col_float limit 3""" + order_qt_float_double """ select col_double from parquet_alter_column_to_float where col_double=1.5 order by col_double limit 3""" + order_qt_float_boolean """ select col_boolean from parquet_alter_column_to_float where year(col_boolean)=2023 order by col_boolean limit 3""" + order_qt_float_string """ select col_string from parquet_alter_column_to_float where col_string="A" order by col_string limit 3""" + order_qt_float_char """ select col_char from parquet_alter_column_to_float where col_char="helloworld" order by col_char limit 3""" + order_qt_float_varchar """ select col_varchar from parquet_alter_column_to_float where col_varchar="1" order by col_varchar limit 3""" + order_qt_float_date """ select col_date from parquet_alter_column_to_float where year(col_date)=2023 order by col_date limit 3""" + order_qt_float_timestamp """ select col_timestamp from parquet_alter_column_to_float where year(col_timestamp)=2023 order by col_timestamp limit 3""" + order_qt_float_decimal """ select col_decimal from parquet_alter_column_to_float where col_decimal=0.8 order by col_decimal limit 3""" + order_qt_double_int """ select col_int from parquet_alter_column_to_double where col_int=2.0 order by col_int limit 3""" + order_qt_double_smallint """ select col_smallint from parquet_alter_column_to_double where col_smallint=2.0 order by col_smallint limit 3""" + 
order_qt_double_tinyint """ select col_tinyint from parquet_alter_column_to_double where col_tinyint=1.4 order by col_tinyint limit 3""" + order_qt_double_bigint """ select col_bigint from parquet_alter_column_to_double where col_bigint=1.5 order by col_bigint limit 3""" + order_qt_double_float """ select col_float from parquet_alter_column_to_double where col_float=2.2 order by col_float limit 3""" + order_qt_double_double """ select col_double from parquet_alter_column_to_double where col_double=0.6 order by col_double limit 3""" + order_qt_double_boolean """ select col_boolean from parquet_alter_column_to_double where year(col_boolean)=2023 order by col_boolean limit 3""" + order_qt_double_string """ select col_string from parquet_alter_column_to_double where col_string="B" order by col_string limit 3""" + order_qt_double_char """ select col_char from parquet_alter_column_to_double where col_char="A" order by col_char limit 3""" + order_qt_double_varchar """ select col_varchar from parquet_alter_column_to_double where col_varchar="C" order by col_varchar limit 3""" + order_qt_double_date """ select col_date from parquet_alter_column_to_double where year(col_date)=2023 order by col_date limit 3""" + order_qt_double_timestamp """ select col_timestamp from parquet_alter_column_to_double where year(col_timestamp)=2023 order by col_timestamp limit 3""" + order_qt_double_decimal """ select col_decimal from parquet_alter_column_to_double where col_decimal=0.3 order by col_decimal limit 3""" + order_qt_boolean_int """ select col_int from parquet_alter_column_to_boolean where col_int>=3 order by col_int limit 3""" + order_qt_boolean_smallint """ select col_smallint from parquet_alter_column_to_boolean where col_smallint>=2 order by col_smallint limit 3""" + order_qt_boolean_tinyint """ select col_tinyint from parquet_alter_column_to_boolean where col_tinyint>=1 order by col_tinyint limit 3""" + order_qt_boolean_bigint """ select col_bigint from 
parquet_alter_column_to_boolean where col_bigint>=3 order by col_bigint limit 3""" + order_qt_boolean_float """ select col_float from parquet_alter_column_to_boolean where col_float=1.1 order by col_float limit 3""" + order_qt_boolean_double """ select col_double from parquet_alter_column_to_boolean where col_double=0.5 order by col_double limit 3""" + order_qt_boolean_boolean """ select col_boolean from parquet_alter_column_to_boolean where year(col_boolean)=2023 order by col_boolean limit 3""" + order_qt_boolean_string """ select col_string from parquet_alter_column_to_boolean where col_string="1" order by col_string limit 3""" + order_qt_boolean_char """ select col_char from parquet_alter_column_to_boolean where col_char="A" order by col_char limit 3""" + order_qt_boolean_varchar """ select col_varchar from parquet_alter_column_to_boolean where col_varchar="B" order by col_varchar limit 3""" + order_qt_boolean_date """ select col_date from parquet_alter_column_to_boolean where year(col_date)=2023 order by col_date limit 3""" + order_qt_boolean_timestamp """ select col_timestamp from parquet_alter_column_to_boolean where year(col_timestamp)=2023 order by col_timestamp limit 3""" + order_qt_boolean_decimal """ select col_decimal from parquet_alter_column_to_boolean where col_decimal=2.8 order by col_decimal limit 3""" + order_qt_string_int """ select col_int from parquet_alter_column_to_string where col_int="C" order by col_int limit 3""" + order_qt_string_smallint """ select col_smallint from parquet_alter_column_to_string where col_smallint="C" order by col_smallint limit 3""" + order_qt_string_tinyint """ select col_tinyint from parquet_alter_column_to_string where col_tinyint="B" order by col_tinyint limit 3""" + order_qt_string_bigint """ select col_bigint from parquet_alter_column_to_string where col_bigint="helloworld" order by col_bigint limit 3""" + order_qt_string_float """ select col_float from parquet_alter_column_to_string where col_float="1" order by 
col_float limit 3""" + order_qt_string_double """ select col_double from parquet_alter_column_to_string where col_double="C" order by col_double limit 3""" + order_qt_string_boolean """ select col_boolean from parquet_alter_column_to_string where year(col_boolean)=2023 order by col_boolean limit 3""" + order_qt_string_string """ select col_string from parquet_alter_column_to_string where col_string="B" order by col_string limit 3""" + order_qt_string_char """ select col_char from parquet_alter_column_to_string where col_char="A" order by col_char limit 3""" + order_qt_string_varchar """ select col_varchar from parquet_alter_column_to_string where col_varchar="B" order by col_varchar limit 3""" + order_qt_string_date """ select col_date from parquet_alter_column_to_string where col_date="helloworld" order by col_date limit 3""" + order_qt_string_timestamp """ select col_timestamp from parquet_alter_column_to_string where col_timestamp="B" order by col_timestamp limit 3""" + order_qt_string_decimal """ select col_decimal from parquet_alter_column_to_string where col_decimal="1" order by col_decimal limit 3""" + order_qt_char_int """ select col_int from parquet_alter_column_to_char where col_int="B" order by col_int limit 3""" + order_qt_char_smallint """ select col_smallint from parquet_alter_column_to_char where col_smallint="A" order by col_smallint limit 3""" + order_qt_char_tinyint """ select col_tinyint from parquet_alter_column_to_char where col_tinyint="A" order by col_tinyint limit 3""" + order_qt_char_bigint """ select col_bigint from parquet_alter_column_to_char where col_bigint="B" order by col_bigint limit 3""" + order_qt_char_float """ select col_float from parquet_alter_column_to_char where col_float="C" order by col_float limit 3""" + order_qt_char_double """ select col_double from parquet_alter_column_to_char where col_double="A" order by col_double limit 3""" + order_qt_char_boolean """ select col_boolean from parquet_alter_column_to_char where 
year(col_boolean)=2023 order by col_boolean limit 3""" + order_qt_char_string """ select col_string from parquet_alter_column_to_char where col_string="C" order by col_string limit 3""" + order_qt_char_char """ select col_char from parquet_alter_column_to_char where col_char="A" order by col_char limit 3""" + order_qt_char_varchar """ select col_varchar from parquet_alter_column_to_char where col_varchar="B" order by col_varchar limit 3""" + order_qt_char_date """ select col_date from parquet_alter_column_to_char where col_date="B" order by col_date limit 3""" + order_qt_char_timestamp """ select col_timestamp from parquet_alter_column_to_char where col_timestamp="A" order by col_timestamp limit 3""" + order_qt_char_decimal """ select col_decimal from parquet_alter_column_to_char where col_decimal="C" order by col_decimal limit 3""" + order_qt_varchar_int """ select col_int from parquet_alter_column_to_varchar where col_int="B" order by col_int limit 3""" + order_qt_varchar_smallint """ select col_smallint from parquet_alter_column_to_varchar where col_smallint="helloworld" order by col_smallint limit 3""" + order_qt_varchar_tinyint """ select col_tinyint from parquet_alter_column_to_varchar where col_tinyint="A" order by col_tinyint limit 3""" + order_qt_varchar_bigint """ select col_bigint from parquet_alter_column_to_varchar where col_bigint="helloworld" order by col_bigint limit 3""" + order_qt_varchar_float """ select col_float from parquet_alter_column_to_varchar where col_float="1" order by col_float limit 3""" + order_qt_varchar_double """ select col_double from parquet_alter_column_to_varchar where col_double="B" order by col_double limit 3""" + order_qt_varchar_boolean """ select col_boolean from parquet_alter_column_to_varchar where year(col_boolean)=2023 order by col_boolean limit 3""" + order_qt_varchar_string """ select col_string from parquet_alter_column_to_varchar where col_string="A" order by col_string limit 3""" + order_qt_varchar_char """ 
select col_char from parquet_alter_column_to_varchar where col_char="B" order by col_char limit 3""" + order_qt_varchar_varchar """ select col_varchar from parquet_alter_column_to_varchar where col_varchar="B" order by col_varchar limit 3""" + order_qt_varchar_date """ select col_date from parquet_alter_column_to_varchar where col_date="C" order by col_date limit 3""" + order_qt_varchar_timestamp """ select col_timestamp from parquet_alter_column_to_varchar where col_timestamp="C" order by col_timestamp limit 3""" + order_qt_varchar_decimal """ select col_decimal from parquet_alter_column_to_varchar where col_decimal="helloworld" order by col_decimal limit 3""" + order_qt_date_int """ select col_int from parquet_alter_column_to_date where col_int>=3 order by col_int limit 3""" + order_qt_date_smallint """ select col_smallint from parquet_alter_column_to_date where col_smallint>=1 order by col_smallint limit 3""" + order_qt_date_tinyint """ select col_tinyint from parquet_alter_column_to_date where col_tinyint>=3 order by col_tinyint limit 3""" + order_qt_date_bigint """ select col_bigint from parquet_alter_column_to_date where col_bigint>=1 order by col_bigint limit 3""" + order_qt_date_float """ select col_float from parquet_alter_column_to_date where col_float=2.8 order by col_float limit 3""" + order_qt_date_double """ select col_double from parquet_alter_column_to_date where col_double=2.5 order by col_double limit 3""" + order_qt_date_boolean """ select col_boolean from parquet_alter_column_to_date where year(col_boolean)=2023 order by col_boolean limit 3""" + order_qt_date_string """ select col_string from parquet_alter_column_to_date where col_string="helloworld" order by col_string limit 3""" + order_qt_date_char """ select col_char from parquet_alter_column_to_date where col_char="A" order by col_char limit 3""" + order_qt_date_varchar """ select col_varchar from parquet_alter_column_to_date where col_varchar="1" order by col_varchar limit 3""" + 
order_qt_date_date """ select col_date from parquet_alter_column_to_date where year(col_date)=2023 order by col_date limit 3""" + order_qt_date_timestamp """ select col_timestamp from parquet_alter_column_to_date where year(col_timestamp)=2023 order by col_timestamp limit 3""" + order_qt_date_decimal """ select col_decimal from parquet_alter_column_to_date where col_decimal=0.3 order by col_decimal limit 3""" + order_qt_timestamp_int """ select col_int from parquet_alter_column_to_timestamp where col_int>=3 order by col_int limit 3""" + order_qt_timestamp_smallint """ select col_smallint from parquet_alter_column_to_timestamp where col_smallint>=3 order by col_smallint limit 3""" + order_qt_timestamp_tinyint """ select col_tinyint from parquet_alter_column_to_timestamp where col_tinyint>=1 order by col_tinyint limit 3""" + order_qt_timestamp_bigint """ select col_bigint from parquet_alter_column_to_timestamp where col_bigint>=3 order by col_bigint limit 3""" + order_qt_timestamp_float """ select col_float from parquet_alter_column_to_timestamp where col_float=2.4 order by col_float limit 3""" + order_qt_timestamp_double """ select col_double from parquet_alter_column_to_timestamp where col_double=1.3 order by col_double limit 3""" + order_qt_timestamp_boolean """ select col_boolean from parquet_alter_column_to_timestamp where year(col_boolean)=2023 order by col_boolean limit 3""" + order_qt_timestamp_string """ select col_string from parquet_alter_column_to_timestamp where col_string="C" order by col_string limit 3""" + order_qt_timestamp_char """ select col_char from parquet_alter_column_to_timestamp where col_char="B" order by col_char limit 3""" + order_qt_timestamp_varchar """ select col_varchar from parquet_alter_column_to_timestamp where col_varchar="C" order by col_varchar limit 3""" + order_qt_timestamp_date """ select col_date from parquet_alter_column_to_timestamp where year(col_date)=2023 order by col_date limit 3""" + order_qt_timestamp_timestamp """ 
select col_timestamp from parquet_alter_column_to_timestamp where year(col_timestamp)=2023 order by col_timestamp limit 3""" + order_qt_timestamp_decimal """ select col_decimal from parquet_alter_column_to_timestamp where col_decimal=1.3 order by col_decimal limit 3""" + order_qt_decimal_int """ select col_int from parquet_alter_column_to_decimal where col_int=2.8 order by col_int limit 3""" + order_qt_decimal_smallint """ select col_smallint from parquet_alter_column_to_decimal where col_smallint=0.1 order by col_smallint limit 3""" + order_qt_decimal_tinyint """ select col_tinyint from parquet_alter_column_to_decimal where col_tinyint=2.9 order by col_tinyint limit 3""" + order_qt_decimal_bigint """ select col_bigint from parquet_alter_column_to_decimal where col_bigint=2.3 order by col_bigint limit 3""" + order_qt_decimal_float """ select col_float from parquet_alter_column_to_decimal where col_float=2.5 order by col_float limit 3""" + order_qt_decimal_double """ select col_double from parquet_alter_column_to_decimal where col_double=1.7 order by col_double limit 3""" + order_qt_decimal_boolean """ select col_boolean from parquet_alter_column_to_decimal where year(col_boolean)=2023 order by col_boolean limit 3""" + order_qt_decimal_string """ select col_string from parquet_alter_column_to_decimal where col_string="helloworld" order by col_string limit 3""" + order_qt_decimal_char """ select col_char from parquet_alter_column_to_decimal where col_char="helloworld" order by col_char limit 3""" + order_qt_decimal_varchar """ select col_varchar from parquet_alter_column_to_decimal where col_varchar="helloworld" order by col_varchar limit 3""" + order_qt_decimal_date """ select col_date from parquet_alter_column_to_decimal where year(col_date)=2023 order by col_date limit 3""" + order_qt_decimal_timestamp """ select col_timestamp from parquet_alter_column_to_decimal where year(col_timestamp)=2023 order by col_timestamp limit 3""" + order_qt_decimal_decimal """ select 
col_decimal from parquet_alter_column_to_decimal where col_decimal=1.5 order by col_decimal limit 3""" + } } From 09f9a617ec1dbc9dfbedd2dc7be3e941064e4c73 Mon Sep 17 00:00:00 2001 From: changyuwei <2017501503@qq.com> Date: Sun, 22 Oct 2023 16:14:53 +0800 Subject: [PATCH 19/21] fix compile --- .../exec/format/parquet/fix_length_plain_decoder.h | 2 +- .../vec/exec/format/parquet/parquet_column_convert.h | 12 ++++++------ be/test/vec/exec/parquet/parquet_thrift_test.cpp | 2 +- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/be/src/vec/exec/format/parquet/fix_length_plain_decoder.h b/be/src/vec/exec/format/parquet/fix_length_plain_decoder.h index bc35b76140f594..b21f58601d3f5f 100644 --- a/be/src/vec/exec/format/parquet/fix_length_plain_decoder.h +++ b/be/src/vec/exec/format/parquet/fix_length_plain_decoder.h @@ -149,7 +149,7 @@ Status FixLengthPlainDecoder::_decode_numeric(MutableColumnPtr& do break; } case ColumnSelectVector::NULL_DATA: { - data_index += run_length; + data_index += run_length * _type_length; break; } case ColumnSelectVector::FILTERED_CONTENT: { diff --git a/be/src/vec/exec/format/parquet/parquet_column_convert.h b/be/src/vec/exec/format/parquet/parquet_column_convert.h index 3e57013ecd5b74..433b8846b2e3e4 100644 --- a/be/src/vec/exec/format/parquet/parquet_column_convert.h +++ b/be/src/vec/exec/format/parquet/parquet_column_convert.h @@ -291,16 +291,16 @@ struct Int64ToTimestamp : public ColumnConvert { size_t rows = src_col->size(); dst_col->resize(_convert_params->start_idx + rows); - auto& src_data = static_cast*>(src_col.get())->get_data(); + auto src_data = static_cast*>(src_col.get())->get_data().data(); auto& data = static_cast*>(dst_col.get())->get_data(); for (int i = 0; i < rows; i++) { - int64 x = src_data[i]; + int64_t x = src_data[i]; auto& num = data[_convert_params->start_idx + i]; auto& value = reinterpret_cast&>(num); value.from_unixtime(x / _convert_params->second_mask, *_convert_params->ctz); value.set_microsecond((x 
% _convert_params->second_mask) * - _convert_params->scale_to_nano_factor / 1000); + (_convert_params->scale_to_nano_factor / 1000)); } return Status::OK(); } @@ -512,16 +512,16 @@ inline Status get_converter(tparquet::Type::type parquet_physical_type, Primitiv case TypeIndex::String: { if (tparquet::Type::FIXED_LEN_BYTE_ARRAY == parquet_physical_type) { if (show_type == PrimitiveType::TYPE_DECIMAL32) { - *converter = std::make_unique>(); + *converter = std::make_unique>(); break; } else if (show_type == PrimitiveType::TYPE_DECIMAL64) { *converter = std::make_unique>(); break; } else if (show_type == PrimitiveType::TYPE_DECIMALV2) { - *converter = std::make_unique>(); + *converter = std::make_unique>(); break; } else if (show_type == PrimitiveType::TYPE_DECIMAL128I) { - *converter = std::make_unique>(); + *converter = std::make_unique>(); break; } diff --git a/be/test/vec/exec/parquet/parquet_thrift_test.cpp b/be/test/vec/exec/parquet/parquet_thrift_test.cpp index d5c4a2d56a282c..4daa548e2eca6c 100644 --- a/be/test/vec/exec/parquet/parquet_thrift_test.cpp +++ b/be/test/vec/exec/parquet/parquet_thrift_test.cpp @@ -192,7 +192,7 @@ static Status get_column_values(io::FileReaderSPtr file_reader, tparquet::Column size_t chunk_size = chunk_meta.total_compressed_size; bool need_convert = false; - auto& parquet_physical_type = column_chunk.meta_data.type; + auto& parquet_physical_type = column_chunk->meta_data.type; auto& show_type = field_schema->type.type; ColumnPtr src_column = ParquetConvert::get_column(parquet_physical_type, show_type, From 0139e3bd9750d9499b942840369782de53c5bcd5 Mon Sep 17 00:00:00 2001 From: changyuwei <2017501503@qq.com> Date: Wed, 25 Oct 2023 21:02:34 +0800 Subject: [PATCH 20/21] opt to string --- .../format/parquet/parquet_column_convert.h | 24 +- .../hive/test_hive_parquet_alter_column.out | 356 +++++++++--------- 2 files changed, 200 insertions(+), 180 deletions(-) diff --git a/be/src/vec/exec/format/parquet/parquet_column_convert.h 
b/be/src/vec/exec/format/parquet/parquet_column_convert.h index 433b8846b2e3e4..b21a7732d8dbdc 100644 --- a/be/src/vec/exec/format/parquet/parquet_column_convert.h +++ b/be/src/vec/exec/format/parquet/parquet_column_convert.h @@ -28,6 +28,7 @@ #include "common/status.h" #include "gen_cpp/descriptors.pb.h" #include "gutil/endian.h" +#include "gutil/strings/numbers.h" #include "io/file_factory.h" #include "olap/olap_common.h" #include "util/coding.h" @@ -43,6 +44,7 @@ #include "vec/exec/format/format_common.h" #include "vec/exec/format/parquet/decoder.h" #include "vec/exec/format/parquet/parquet_common.h" + namespace doris::vectorized { namespace ParquetConvert { @@ -251,10 +253,28 @@ struct NumberToStringConvert : public ColumnConvert { size_t rows = src_col->size(); auto& src_data = static_cast(src_col.get())->get_data(); + char buf[100]; auto str_col = static_cast(dst_col.get()); for (int i = 0; i < rows; i++) { - std::string value = std::to_string(src_data[i]); - str_col->insert_data(value.data(), value.size()); + if constexpr (parquet_physical_type == tparquet::Type::FLOAT) { + int len = FastFloatToBuffer(src_data[i], buf, true); + str_col->insert_data(buf, len); + + } else if constexpr (parquet_physical_type == tparquet::Type::DOUBLE) { + int len = FastDoubleToBuffer(src_data[i], buf, true); + str_col->insert_data(buf, len); + } else if constexpr (parquet_physical_type == tparquet::Type::INT32) { + char* end = FastInt32ToBufferLeft(src_data[i], buf); + str_col->insert_data(buf, end - buf); + + } else if constexpr (parquet_physical_type == tparquet::Type::INT64) { + char* end = FastInt64ToBufferLeft(src_data[i], buf); + str_col->insert_data(buf, end - buf); + + } else { + string value = std::to_string(src_data[i]); + str_col->insert_data(value.data(), value.size()); + } } return Status::OK(); } diff --git a/regression-test/data/external_table_p2/hive/test_hive_parquet_alter_column.out 
b/regression-test/data/external_table_p2/hive/test_hive_parquet_alter_column.out index cca084ff0f6ca0..d96fc0101dfcd9 100644 --- a/regression-test/data/external_table_p2/hive/test_hive_parquet_alter_column.out +++ b/regression-test/data/external_table_p2/hive/test_hive_parquet_alter_column.out @@ -15,13 +15,13 @@ col_timestamp DATETIME(6) Yes true \N col_decimal DECIMAL(10, 2) Yes true \N -- !show -- --1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 --1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 --1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 --1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 --1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 --1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 --1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 -- !order -- -1 @@ -74,9 +74,9 @@ ADC ADC -- !order -- -2023-10-06 -2023-10-06 -2023-10-06 +2023-10-07 +2023-10-07 +2023-10-07 -- !order -- 2023-10-06T14:30 @@ -104,13 +104,13 @@ col_timestamp DATETIME(6) Yes true \N col_decimal DECIMAL(10, 2) Yes true \N -- !show -- --1 -400 -20 -400000000 40.5444 50.75 
false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 --1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 --1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 --1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 --1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 --1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 --1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 -- !order -- -1 @@ -163,9 +163,9 @@ ADC ADC -- !order -- -2023-10-06 -2023-10-06 -2023-10-06 +2023-10-07 +2023-10-07 +2023-10-07 -- !order -- 2023-10-06T14:30 @@ -193,13 +193,13 @@ col_timestamp DATETIME(6) Yes true \N col_decimal DECIMAL(10, 2) Yes true \N -- !show -- --1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 --1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 --1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 --1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 --1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 --1 -400 -20 
-400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 --1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 -- !order -- -1 @@ -252,9 +252,9 @@ ADC ADC -- !order -- -2023-10-06 -2023-10-06 -2023-10-06 +2023-10-07 +2023-10-07 +2023-10-07 -- !order -- 2023-10-06T14:30 @@ -282,13 +282,13 @@ col_timestamp DATETIME(6) Yes true \N col_decimal DECIMAL(10, 2) Yes true \N -- !show -- --1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 --1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 --1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 --1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 --1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 --1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 --1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 
1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 -- !order -- -1 @@ -341,9 +341,9 @@ ADC ADC -- !order -- -2023-10-06 -2023-10-06 -2023-10-06 +2023-10-07 +2023-10-07 +2023-10-07 -- !order -- 2023-10-06T14:30 @@ -371,13 +371,13 @@ col_timestamp DATETIME(6) Yes true \N col_decimal DECIMAL(10, 2) Yes true \N -- !show -- --1.0 -400.0 -20.0 -4.0E8 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 --1.0 -400.0 -20.0 -4.0E8 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 --1.0 -400.0 -20.0 -4.0E8 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 --1.0 -400.0 -20.0 -4.0E8 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 --1.0 -400.0 -20.0 -4.0E8 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 --1.0 -400.0 -20.0 -4.0E8 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 --1.0 -400.0 -20.0 -4.0E8 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1.0 -400.0 -20.0 -4.0E8 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 +-1.0 -400.0 -20.0 -4.0E8 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 +-1.0 -400.0 -20.0 -4.0E8 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 +-1.0 -400.0 -20.0 -4.0E8 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 +-1.0 -400.0 -20.0 -4.0E8 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 +-1.0 -400.0 -20.0 -4.0E8 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 +-1.0 -400.0 -20.0 -4.0E8 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 -- !order -- -1.0 @@ -430,9 +430,9 @@ 
ADC ADC -- !order -- -2023-10-06 -2023-10-06 -2023-10-06 +2023-10-07 +2023-10-07 +2023-10-07 -- !order -- 2023-10-06T14:30 @@ -460,13 +460,13 @@ col_timestamp DATETIME(6) Yes true \N col_decimal DECIMAL(10, 2) Yes true \N -- !show -- --1.0 -400.0 -20.0 -4.0E8 40.54439926147461 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 --1.0 -400.0 -20.0 -4.0E8 40.54439926147461 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 --1.0 -400.0 -20.0 -4.0E8 40.54439926147461 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 --1.0 -400.0 -20.0 -4.0E8 40.54439926147461 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 --1.0 -400.0 -20.0 -4.0E8 40.54439926147461 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 --1.0 -400.0 -20.0 -4.0E8 40.54439926147461 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 --1.0 -400.0 -20.0 -4.0E8 40.54439926147461 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1.0 -400.0 -20.0 -4.0E8 40.54439926147461 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 +-1.0 -400.0 -20.0 -4.0E8 40.54439926147461 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 +-1.0 -400.0 -20.0 -4.0E8 40.54439926147461 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 +-1.0 -400.0 -20.0 -4.0E8 40.54439926147461 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 +-1.0 -400.0 -20.0 -4.0E8 40.54439926147461 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 +-1.0 -400.0 -20.0 -4.0E8 40.54439926147461 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 +-1.0 -400.0 -20.0 -4.0E8 40.54439926147461 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 -- !order -- -1.0 @@ -519,9 +519,9 @@ ADC ADC -- !order -- -2023-10-06 -2023-10-06 -2023-10-06 +2023-10-07 +2023-10-07 +2023-10-07 -- !order -- 2023-10-06T14:30 @@ -549,13 +549,13 @@ col_timestamp DATETIME(6) Yes true \N col_decimal DECIMAL(10, 2) Yes true \N -- !show -- --1 -400 -20 -400000000 40.5444 50.75 
false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 --1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 --1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 --1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 --1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 --1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 --1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 -- !order -- -1 @@ -608,9 +608,9 @@ ADC ADC -- !order -- -2023-10-06 -2023-10-06 -2023-10-06 +2023-10-07 +2023-10-07 +2023-10-07 -- !order -- 2023-10-06T14:30 @@ -638,13 +638,13 @@ col_timestamp TEXT Yes true \N col_decimal TEXT Yes true \N -- !show -- --1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 2023-10-06 2023-10-09 17:15:00 1238.45 --1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 2023-10-06 2023-10-09 17:15:00 1238.45 --1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 2023-10-06 2023-10-09 17:15:00 1238.45 --1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 2023-10-06 2023-10-09 17:15:00 1238.45 --1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 2023-10-06 2023-10-09 17:15:00 
1238.45 --1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 2023-10-06 2023-10-09 17:15:00 1238.45 --1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 2023-10-06 2023-10-09 17:15:00 1238.45 +-1 -200 -10 -20000000 2.05777E1 3.075E1 false First A ADC 2023-10-07 2023-10-09 17:15:00 1238.45 +-1 -200 -10 -20000000 2.05777E1 3.075E1 false First A ADC 2023-10-07 2023-10-09 17:15:00 1238.45 +-1 -200 -10 -20000000 2.05777E1 3.075E1 false First A ADC 2023-10-07 2023-10-09 17:15:00 1238.45 +-1 -200 -10 -20000000 2.05777E1 3.075E1 false First A ADC 2023-10-07 2023-10-09 17:15:00 1238.45 +-1 -200 -10 -20000000 2.05777E1 3.075E1 false First A ADC 2023-10-07 2023-10-09 17:15:00 1238.45 +-1 -200 -10 -20000000 2.05777E1 3.075E1 false First A ADC 2023-10-07 2023-10-09 17:15:00 1238.45 +-1 -200 -10 -20000000 2.05777E1 3.075E1 false First A ADC 2023-10-07 2023-10-09 17:15:00 1238.45 -- !order -- -1 @@ -667,14 +667,14 @@ col_decimal TEXT Yes true \N -20000000 -- !order -- -10.500000 -10.500000 -10.500000 +1.05E1 +1.05E1 +1.05E1 -- !order -- -20.750000 -20.750000 -20.750000 +2.075E1 +2.075E1 +2.075E1 -- !order -- false @@ -697,9 +697,9 @@ ADC ADC -- !order -- -2023-10-06 -2023-10-06 -2023-10-06 +2023-10-07 +2023-10-07 +2023-10-07 -- !order -- 2023-10-06 14:30:00 @@ -727,13 +727,13 @@ col_timestamp CHAR(10) Yes true \N col_decimal CHAR(10) Yes true \N -- !show -- --1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 2023-10-06 2023-10-09 17:15:00 1238.45 --1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 2023-10-06 2023-10-09 17:15:00 1238.45 --1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 2023-10-06 2023-10-09 17:15:00 1238.45 --1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 2023-10-06 2023-10-09 17:15:00 1238.45 --1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 2023-10-06 2023-10-09 17:15:00 1238.45 --1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 2023-10-06 2023-10-09 17:15:00 1238.45 --1 -200 -10 
-20000000 20.577700 30.750000 false First A ADC 2023-10-06 2023-10-09 17:15:00 1238.45 +-1 -200 -10 -20000000 2.05777E1 3.075E1 false First A ADC 2023-10-07 2023-10-09 17:15:00 1238.45 +-1 -200 -10 -20000000 2.05777E1 3.075E1 false First A ADC 2023-10-07 2023-10-09 17:15:00 1238.45 +-1 -200 -10 -20000000 2.05777E1 3.075E1 false First A ADC 2023-10-07 2023-10-09 17:15:00 1238.45 +-1 -200 -10 -20000000 2.05777E1 3.075E1 false First A ADC 2023-10-07 2023-10-09 17:15:00 1238.45 +-1 -200 -10 -20000000 2.05777E1 3.075E1 false First A ADC 2023-10-07 2023-10-09 17:15:00 1238.45 +-1 -200 -10 -20000000 2.05777E1 3.075E1 false First A ADC 2023-10-07 2023-10-09 17:15:00 1238.45 +-1 -200 -10 -20000000 2.05777E1 3.075E1 false First A ADC 2023-10-07 2023-10-09 17:15:00 1238.45 -- !order -- -1 @@ -756,14 +756,14 @@ col_decimal CHAR(10) Yes true \N -20000000 -- !order -- -10.500000 -10.500000 -10.500000 +1.05E1 +1.05E1 +1.05E1 -- !order -- -20.750000 -20.750000 -20.750000 +2.075E1 +2.075E1 +2.075E1 -- !order -- false @@ -786,9 +786,9 @@ ADC ADC -- !order -- -2023-10-06 -2023-10-06 -2023-10-06 +2023-10-07 +2023-10-07 +2023-10-07 -- !order -- 2023-10-06 14:30:00 @@ -816,13 +816,13 @@ col_timestamp VARCHAR(20) Yes true \N col_decimal VARCHAR(20) Yes true \N -- !show -- --1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 2023-10-06 2023-10-09 17:15:00 1238.45 --1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 2023-10-06 2023-10-09 17:15:00 1238.45 --1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 2023-10-06 2023-10-09 17:15:00 1238.45 --1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 2023-10-06 2023-10-09 17:15:00 1238.45 --1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 2023-10-06 2023-10-09 17:15:00 1238.45 --1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 2023-10-06 2023-10-09 17:15:00 1238.45 --1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 2023-10-06 2023-10-09 17:15:00 1238.45 +-1 -200 -10 -20000000 
2.05777E1 3.075E1 false First A ADC 2023-10-07 2023-10-09 17:15:00 1238.45 +-1 -200 -10 -20000000 2.05777E1 3.075E1 false First A ADC 2023-10-07 2023-10-09 17:15:00 1238.45 +-1 -200 -10 -20000000 2.05777E1 3.075E1 false First A ADC 2023-10-07 2023-10-09 17:15:00 1238.45 +-1 -200 -10 -20000000 2.05777E1 3.075E1 false First A ADC 2023-10-07 2023-10-09 17:15:00 1238.45 +-1 -200 -10 -20000000 2.05777E1 3.075E1 false First A ADC 2023-10-07 2023-10-09 17:15:00 1238.45 +-1 -200 -10 -20000000 2.05777E1 3.075E1 false First A ADC 2023-10-07 2023-10-09 17:15:00 1238.45 +-1 -200 -10 -20000000 2.05777E1 3.075E1 false First A ADC 2023-10-07 2023-10-09 17:15:00 1238.45 -- !order -- -1 @@ -845,14 +845,14 @@ col_decimal VARCHAR(20) Yes true \N -20000000 -- !order -- -10.500000 -10.500000 -10.500000 +1.05E1 +1.05E1 +1.05E1 -- !order -- -20.750000 -20.750000 -20.750000 +2.075E1 +2.075E1 +2.075E1 -- !order -- false @@ -875,9 +875,9 @@ ADC ADC -- !order -- -2023-10-06 -2023-10-06 -2023-10-06 +2023-10-07 +2023-10-07 +2023-10-07 -- !order -- 2023-10-06 14:30:00 @@ -905,13 +905,13 @@ col_timestamp DATETIME(6) Yes true \N col_decimal DECIMAL(10, 2) Yes true \N -- !show -- --1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 --1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 --1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 --1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 --1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 --1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 --1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 +-1 
-400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 -- !order -- -1 @@ -964,9 +964,9 @@ ADC ADC -- !order -- -2023-10-06 -2023-10-06 -2023-10-06 +2023-10-07 +2023-10-07 +2023-10-07 -- !order -- 2023-10-06T14:30 @@ -994,13 +994,13 @@ col_timestamp DATETIME(6) Yes true \N col_decimal DECIMAL(10, 2) Yes true \N -- !show -- --1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 --1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 --1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 --1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 --1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 --1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 --1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-07 
2023-10-09T17:15 1238.45 -- !order -- -1 @@ -1053,9 +1053,9 @@ ADC ADC -- !order -- -2023-10-06 -2023-10-06 -2023-10-06 +2023-10-07 +2023-10-07 +2023-10-07 -- !order -- 2023-10-06T14:30 @@ -1083,13 +1083,13 @@ col_timestamp DATETIME(6) Yes true \N col_decimal DECIMAL(5, 1) Yes true \N -- !show -- --1.0 -400.0 -20.0 29496729.6 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.4 --1.0 -400.0 -20.0 29496729.6 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.4 --1.0 -400.0 -20.0 29496729.6 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.4 --1.0 -400.0 -20.0 29496729.6 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.4 --1.0 -400.0 -20.0 29496729.6 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.4 --1.0 -400.0 -20.0 29496729.6 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.4 --1.0 -400.0 -20.0 29496729.6 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.4 +-1.0 -400.0 -20.0 29496729.6 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.4 +-1.0 -400.0 -20.0 29496729.6 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.4 +-1.0 -400.0 -20.0 29496729.6 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.4 +-1.0 -400.0 -20.0 29496729.6 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.4 +-1.0 -400.0 -20.0 29496729.6 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.4 +-1.0 -400.0 -20.0 29496729.6 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.4 +-1.0 -400.0 -20.0 29496729.6 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.4 -- !order -- -1.0 @@ -1142,9 +1142,9 @@ ADC ADC -- !order -- -2023-10-06 -2023-10-06 -2023-10-06 +2023-10-07 +2023-10-07 +2023-10-07 -- !order -- 2023-10-06T14:30 @@ -1192,9 +1192,9 @@ B -- !int_varchar -- -- !int_date -- -2023-10-06 -2023-10-06 -2023-10-06 +2023-10-07 +2023-10-07 +2023-10-07 -- !int_timestamp -- 2023-10-06T14:30 @@ -1239,9 
+1239,9 @@ C -- !smallint_varchar -- -- !smallint_date -- -2023-10-06 -2023-10-06 -2023-10-06 +2023-10-07 +2023-10-07 +2023-10-07 -- !smallint_timestamp -- 2023-10-06T14:30 @@ -1286,9 +1286,9 @@ A -- !tinyint_varchar -- -- !tinyint_date -- -2023-10-06 -2023-10-06 -2023-10-06 +2023-10-07 +2023-10-07 +2023-10-07 -- !tinyint_timestamp -- 2023-10-06T14:30 @@ -1333,9 +1333,9 @@ A -- !bigint_varchar -- -- !bigint_date -- -2023-10-06 -2023-10-06 -2023-10-06 +2023-10-07 +2023-10-07 +2023-10-07 -- !bigint_timestamp -- 2023-10-06T14:30 @@ -1365,9 +1365,9 @@ A -- !float_varchar -- -- !float_date -- -2023-10-06 -2023-10-06 -2023-10-06 +2023-10-07 +2023-10-07 +2023-10-07 -- !float_timestamp -- 2023-10-06T14:30 @@ -1403,9 +1403,9 @@ A -- !double_varchar -- -- !double_date -- -2023-10-06 -2023-10-06 -2023-10-06 +2023-10-07 +2023-10-07 +2023-10-07 -- !double_timestamp -- 2023-10-06T14:30 @@ -1450,9 +1450,9 @@ A -- !boolean_varchar -- -- !boolean_date -- -2023-10-06 -2023-10-06 -2023-10-06 +2023-10-07 +2023-10-07 +2023-10-07 -- !boolean_timestamp -- 2023-10-06T14:30 @@ -1584,9 +1584,9 @@ A -- !date_varchar -- -- !date_date -- -2023-10-06 -2023-10-06 -2023-10-06 +2023-10-07 +2023-10-07 +2023-10-07 -- !date_timestamp -- 2023-10-06T14:30 @@ -1631,9 +1631,9 @@ B -- !timestamp_varchar -- -- !timestamp_date -- -2023-10-06 -2023-10-06 -2023-10-06 +2023-10-07 +2023-10-07 +2023-10-07 -- !timestamp_timestamp -- 2023-10-06T14:30 @@ -1663,9 +1663,9 @@ B -- !decimal_varchar -- -- !decimal_date -- -2023-10-06 -2023-10-06 -2023-10-06 +2023-10-07 +2023-10-07 +2023-10-07 -- !decimal_timestamp -- 2023-10-06T14:30 From 0adc21fd1ae5626447021b2e01a451b904b55b1e Mon Sep 17 00:00:00 2001 From: changyuwei <2017501503@qq.com> Date: Wed, 25 Oct 2023 23:17:53 +0800 Subject: [PATCH 21/21] fix rebase date. 
--- .../format/parquet/parquet_column_convert.h | 2 +- .../hive/test_hive_parquet_alter_column.out | 356 +++++++++--------- 2 files changed, 179 insertions(+), 179 deletions(-) diff --git a/be/src/vec/exec/format/parquet/parquet_column_convert.h b/be/src/vec/exec/format/parquet/parquet_column_convert.h index b21a7732d8dbdc..6cf3cfb6c502fb 100644 --- a/be/src/vec/exec/format/parquet/parquet_column_convert.h +++ b/be/src/vec/exec/format/parquet/parquet_column_convert.h @@ -163,7 +163,7 @@ struct ConvertParams { if (ctz) { VecDateTimeValue t; t.from_unixtime(0, *ctz); - offset_days = t.day() == 31 ? 0 : 1; + offset_days = t.day() == 31 ? -1 : 0; } start_idx = start_idx_; } diff --git a/regression-test/data/external_table_p2/hive/test_hive_parquet_alter_column.out b/regression-test/data/external_table_p2/hive/test_hive_parquet_alter_column.out index d96fc0101dfcd9..cca084ff0f6ca0 100644 --- a/regression-test/data/external_table_p2/hive/test_hive_parquet_alter_column.out +++ b/regression-test/data/external_table_p2/hive/test_hive_parquet_alter_column.out @@ -15,13 +15,13 @@ col_timestamp DATETIME(6) Yes true \N col_decimal DECIMAL(10, 2) Yes true \N -- !show -- --1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 --1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 --1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 --1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 --1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 --1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 --1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 
1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 -- !order -- -1 @@ -74,9 +74,9 @@ ADC ADC -- !order -- -2023-10-07 -2023-10-07 -2023-10-07 +2023-10-06 +2023-10-06 +2023-10-06 -- !order -- 2023-10-06T14:30 @@ -104,13 +104,13 @@ col_timestamp DATETIME(6) Yes true \N col_decimal DECIMAL(10, 2) Yes true \N -- !show -- --1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 --1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 --1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 --1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 --1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 --1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 --1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 
2023-10-09T17:15 1238.45 -- !order -- -1 @@ -163,9 +163,9 @@ ADC ADC -- !order -- -2023-10-07 -2023-10-07 -2023-10-07 +2023-10-06 +2023-10-06 +2023-10-06 -- !order -- 2023-10-06T14:30 @@ -193,13 +193,13 @@ col_timestamp DATETIME(6) Yes true \N col_decimal DECIMAL(10, 2) Yes true \N -- !show -- --1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 --1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 --1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 --1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 --1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 --1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 --1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 -- !order -- -1 @@ -252,9 +252,9 @@ ADC ADC -- !order -- -2023-10-07 -2023-10-07 -2023-10-07 +2023-10-06 +2023-10-06 +2023-10-06 -- !order -- 2023-10-06T14:30 @@ -282,13 +282,13 @@ col_timestamp DATETIME(6) Yes true \N col_decimal DECIMAL(10, 2) Yes true \N -- !show -- --1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 --1 -400 -20 -400000000 40.5444 50.75 false First A 
ADC 2023-10-07 2023-10-09T17:15 1238.45 --1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 --1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 --1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 --1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 --1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 -- !order -- -1 @@ -341,9 +341,9 @@ ADC ADC -- !order -- -2023-10-07 -2023-10-07 -2023-10-07 +2023-10-06 +2023-10-06 +2023-10-06 -- !order -- 2023-10-06T14:30 @@ -371,13 +371,13 @@ col_timestamp DATETIME(6) Yes true \N col_decimal DECIMAL(10, 2) Yes true \N -- !show -- --1.0 -400.0 -20.0 -4.0E8 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 --1.0 -400.0 -20.0 -4.0E8 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 --1.0 -400.0 -20.0 -4.0E8 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 --1.0 -400.0 -20.0 -4.0E8 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 --1.0 -400.0 -20.0 -4.0E8 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 --1.0 -400.0 -20.0 -4.0E8 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 --1.0 -400.0 -20.0 
-4.0E8 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 +-1.0 -400.0 -20.0 -4.0E8 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1.0 -400.0 -20.0 -4.0E8 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1.0 -400.0 -20.0 -4.0E8 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1.0 -400.0 -20.0 -4.0E8 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1.0 -400.0 -20.0 -4.0E8 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1.0 -400.0 -20.0 -4.0E8 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1.0 -400.0 -20.0 -4.0E8 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 -- !order -- -1.0 @@ -430,9 +430,9 @@ ADC ADC -- !order -- -2023-10-07 -2023-10-07 -2023-10-07 +2023-10-06 +2023-10-06 +2023-10-06 -- !order -- 2023-10-06T14:30 @@ -460,13 +460,13 @@ col_timestamp DATETIME(6) Yes true \N col_decimal DECIMAL(10, 2) Yes true \N -- !show -- --1.0 -400.0 -20.0 -4.0E8 40.54439926147461 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 --1.0 -400.0 -20.0 -4.0E8 40.54439926147461 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 --1.0 -400.0 -20.0 -4.0E8 40.54439926147461 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 --1.0 -400.0 -20.0 -4.0E8 40.54439926147461 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 --1.0 -400.0 -20.0 -4.0E8 40.54439926147461 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 --1.0 -400.0 -20.0 -4.0E8 40.54439926147461 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 --1.0 -400.0 -20.0 -4.0E8 40.54439926147461 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 +-1.0 -400.0 -20.0 -4.0E8 40.54439926147461 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1.0 -400.0 -20.0 -4.0E8 40.54439926147461 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1.0 -400.0 -20.0 -4.0E8 40.54439926147461 50.75 false 
First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1.0 -400.0 -20.0 -4.0E8 40.54439926147461 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1.0 -400.0 -20.0 -4.0E8 40.54439926147461 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1.0 -400.0 -20.0 -4.0E8 40.54439926147461 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1.0 -400.0 -20.0 -4.0E8 40.54439926147461 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 -- !order -- -1.0 @@ -519,9 +519,9 @@ ADC ADC -- !order -- -2023-10-07 -2023-10-07 -2023-10-07 +2023-10-06 +2023-10-06 +2023-10-06 -- !order -- 2023-10-06T14:30 @@ -549,13 +549,13 @@ col_timestamp DATETIME(6) Yes true \N col_decimal DECIMAL(10, 2) Yes true \N -- !show -- --1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 --1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 --1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 --1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 --1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 --1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 --1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 
2023-10-09T17:15 1238.45 -- !order -- -1 @@ -608,9 +608,9 @@ ADC ADC -- !order -- -2023-10-07 -2023-10-07 -2023-10-07 +2023-10-06 +2023-10-06 +2023-10-06 -- !order -- 2023-10-06T14:30 @@ -638,13 +638,13 @@ col_timestamp TEXT Yes true \N col_decimal TEXT Yes true \N -- !show -- --1 -200 -10 -20000000 2.05777E1 3.075E1 false First A ADC 2023-10-07 2023-10-09 17:15:00 1238.45 --1 -200 -10 -20000000 2.05777E1 3.075E1 false First A ADC 2023-10-07 2023-10-09 17:15:00 1238.45 --1 -200 -10 -20000000 2.05777E1 3.075E1 false First A ADC 2023-10-07 2023-10-09 17:15:00 1238.45 --1 -200 -10 -20000000 2.05777E1 3.075E1 false First A ADC 2023-10-07 2023-10-09 17:15:00 1238.45 --1 -200 -10 -20000000 2.05777E1 3.075E1 false First A ADC 2023-10-07 2023-10-09 17:15:00 1238.45 --1 -200 -10 -20000000 2.05777E1 3.075E1 false First A ADC 2023-10-07 2023-10-09 17:15:00 1238.45 --1 -200 -10 -20000000 2.05777E1 3.075E1 false First A ADC 2023-10-07 2023-10-09 17:15:00 1238.45 +-1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 2023-10-06 2023-10-09 17:15:00 1238.45 +-1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 2023-10-06 2023-10-09 17:15:00 1238.45 +-1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 2023-10-06 2023-10-09 17:15:00 1238.45 +-1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 2023-10-06 2023-10-09 17:15:00 1238.45 +-1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 2023-10-06 2023-10-09 17:15:00 1238.45 +-1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 2023-10-06 2023-10-09 17:15:00 1238.45 +-1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 2023-10-06 2023-10-09 17:15:00 1238.45 -- !order -- -1 @@ -667,14 +667,14 @@ col_decimal TEXT Yes true \N -20000000 -- !order -- -1.05E1 -1.05E1 -1.05E1 +10.500000 +10.500000 +10.500000 -- !order -- -2.075E1 -2.075E1 -2.075E1 +20.750000 +20.750000 +20.750000 -- !order -- false @@ -697,9 +697,9 @@ ADC ADC -- !order -- -2023-10-07 -2023-10-07 -2023-10-07 +2023-10-06 
+2023-10-06 +2023-10-06 -- !order -- 2023-10-06 14:30:00 @@ -727,13 +727,13 @@ col_timestamp CHAR(10) Yes true \N col_decimal CHAR(10) Yes true \N -- !show -- --1 -200 -10 -20000000 2.05777E1 3.075E1 false First A ADC 2023-10-07 2023-10-09 17:15:00 1238.45 --1 -200 -10 -20000000 2.05777E1 3.075E1 false First A ADC 2023-10-07 2023-10-09 17:15:00 1238.45 --1 -200 -10 -20000000 2.05777E1 3.075E1 false First A ADC 2023-10-07 2023-10-09 17:15:00 1238.45 --1 -200 -10 -20000000 2.05777E1 3.075E1 false First A ADC 2023-10-07 2023-10-09 17:15:00 1238.45 --1 -200 -10 -20000000 2.05777E1 3.075E1 false First A ADC 2023-10-07 2023-10-09 17:15:00 1238.45 --1 -200 -10 -20000000 2.05777E1 3.075E1 false First A ADC 2023-10-07 2023-10-09 17:15:00 1238.45 --1 -200 -10 -20000000 2.05777E1 3.075E1 false First A ADC 2023-10-07 2023-10-09 17:15:00 1238.45 +-1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 2023-10-06 2023-10-09 17:15:00 1238.45 +-1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 2023-10-06 2023-10-09 17:15:00 1238.45 +-1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 2023-10-06 2023-10-09 17:15:00 1238.45 +-1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 2023-10-06 2023-10-09 17:15:00 1238.45 +-1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 2023-10-06 2023-10-09 17:15:00 1238.45 +-1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 2023-10-06 2023-10-09 17:15:00 1238.45 +-1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 2023-10-06 2023-10-09 17:15:00 1238.45 -- !order -- -1 @@ -756,14 +756,14 @@ col_decimal CHAR(10) Yes true \N -20000000 -- !order -- -1.05E1 -1.05E1 -1.05E1 +10.500000 +10.500000 +10.500000 -- !order -- -2.075E1 -2.075E1 -2.075E1 +20.750000 +20.750000 +20.750000 -- !order -- false @@ -786,9 +786,9 @@ ADC ADC -- !order -- -2023-10-07 -2023-10-07 -2023-10-07 +2023-10-06 +2023-10-06 +2023-10-06 -- !order -- 2023-10-06 14:30:00 @@ -816,13 +816,13 @@ col_timestamp VARCHAR(20) Yes true \N 
col_decimal VARCHAR(20) Yes true \N -- !show -- --1 -200 -10 -20000000 2.05777E1 3.075E1 false First A ADC 2023-10-07 2023-10-09 17:15:00 1238.45 --1 -200 -10 -20000000 2.05777E1 3.075E1 false First A ADC 2023-10-07 2023-10-09 17:15:00 1238.45 --1 -200 -10 -20000000 2.05777E1 3.075E1 false First A ADC 2023-10-07 2023-10-09 17:15:00 1238.45 --1 -200 -10 -20000000 2.05777E1 3.075E1 false First A ADC 2023-10-07 2023-10-09 17:15:00 1238.45 --1 -200 -10 -20000000 2.05777E1 3.075E1 false First A ADC 2023-10-07 2023-10-09 17:15:00 1238.45 --1 -200 -10 -20000000 2.05777E1 3.075E1 false First A ADC 2023-10-07 2023-10-09 17:15:00 1238.45 --1 -200 -10 -20000000 2.05777E1 3.075E1 false First A ADC 2023-10-07 2023-10-09 17:15:00 1238.45 +-1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 2023-10-06 2023-10-09 17:15:00 1238.45 +-1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 2023-10-06 2023-10-09 17:15:00 1238.45 +-1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 2023-10-06 2023-10-09 17:15:00 1238.45 +-1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 2023-10-06 2023-10-09 17:15:00 1238.45 +-1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 2023-10-06 2023-10-09 17:15:00 1238.45 +-1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 2023-10-06 2023-10-09 17:15:00 1238.45 +-1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 2023-10-06 2023-10-09 17:15:00 1238.45 -- !order -- -1 @@ -845,14 +845,14 @@ col_decimal VARCHAR(20) Yes true \N -20000000 -- !order -- -1.05E1 -1.05E1 -1.05E1 +10.500000 +10.500000 +10.500000 -- !order -- -2.075E1 -2.075E1 -2.075E1 +20.750000 +20.750000 +20.750000 -- !order -- false @@ -875,9 +875,9 @@ ADC ADC -- !order -- -2023-10-07 -2023-10-07 -2023-10-07 +2023-10-06 +2023-10-06 +2023-10-06 -- !order -- 2023-10-06 14:30:00 @@ -905,13 +905,13 @@ col_timestamp DATETIME(6) Yes true \N col_decimal DECIMAL(10, 2) Yes true \N -- !show -- --1 -400 -20 -400000000 40.5444 50.75 false First A ADC 
2023-10-07 2023-10-09T17:15 1238.45 --1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 --1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 --1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 --1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 --1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 --1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 -- !order -- -1 @@ -964,9 +964,9 @@ ADC ADC -- !order -- -2023-10-07 -2023-10-07 -2023-10-07 +2023-10-06 +2023-10-06 +2023-10-06 -- !order -- 2023-10-06T14:30 @@ -994,13 +994,13 @@ col_timestamp DATETIME(6) Yes true \N col_decimal DECIMAL(10, 2) Yes true \N -- !show -- --1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 --1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 --1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 --1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 --1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 --1 -400 -20 -400000000 40.5444 50.75 
false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 --1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 -- !order -- -1 @@ -1053,9 +1053,9 @@ ADC ADC -- !order -- -2023-10-07 -2023-10-07 -2023-10-07 +2023-10-06 +2023-10-06 +2023-10-06 -- !order -- 2023-10-06T14:30 @@ -1083,13 +1083,13 @@ col_timestamp DATETIME(6) Yes true \N col_decimal DECIMAL(5, 1) Yes true \N -- !show -- --1.0 -400.0 -20.0 29496729.6 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.4 --1.0 -400.0 -20.0 29496729.6 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.4 --1.0 -400.0 -20.0 29496729.6 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.4 --1.0 -400.0 -20.0 29496729.6 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.4 --1.0 -400.0 -20.0 29496729.6 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.4 --1.0 -400.0 -20.0 29496729.6 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.4 --1.0 -400.0 -20.0 29496729.6 40.5444 50.75 false First A ADC 2023-10-07 2023-10-09T17:15 1238.4 +-1.0 -400.0 -20.0 29496729.6 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.4 +-1.0 -400.0 -20.0 29496729.6 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.4 +-1.0 -400.0 -20.0 29496729.6 40.5444 50.75 false First A ADC 
2023-10-06 2023-10-09T17:15 1238.4 +-1.0 -400.0 -20.0 29496729.6 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.4 +-1.0 -400.0 -20.0 29496729.6 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.4 +-1.0 -400.0 -20.0 29496729.6 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.4 +-1.0 -400.0 -20.0 29496729.6 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.4 -- !order -- -1.0 @@ -1142,9 +1142,9 @@ ADC ADC -- !order -- -2023-10-07 -2023-10-07 -2023-10-07 +2023-10-06 +2023-10-06 +2023-10-06 -- !order -- 2023-10-06T14:30 @@ -1192,9 +1192,9 @@ B -- !int_varchar -- -- !int_date -- -2023-10-07 -2023-10-07 -2023-10-07 +2023-10-06 +2023-10-06 +2023-10-06 -- !int_timestamp -- 2023-10-06T14:30 @@ -1239,9 +1239,9 @@ C -- !smallint_varchar -- -- !smallint_date -- -2023-10-07 -2023-10-07 -2023-10-07 +2023-10-06 +2023-10-06 +2023-10-06 -- !smallint_timestamp -- 2023-10-06T14:30 @@ -1286,9 +1286,9 @@ A -- !tinyint_varchar -- -- !tinyint_date -- -2023-10-07 -2023-10-07 -2023-10-07 +2023-10-06 +2023-10-06 +2023-10-06 -- !tinyint_timestamp -- 2023-10-06T14:30 @@ -1333,9 +1333,9 @@ A -- !bigint_varchar -- -- !bigint_date -- -2023-10-07 -2023-10-07 -2023-10-07 +2023-10-06 +2023-10-06 +2023-10-06 -- !bigint_timestamp -- 2023-10-06T14:30 @@ -1365,9 +1365,9 @@ A -- !float_varchar -- -- !float_date -- -2023-10-07 -2023-10-07 -2023-10-07 +2023-10-06 +2023-10-06 +2023-10-06 -- !float_timestamp -- 2023-10-06T14:30 @@ -1403,9 +1403,9 @@ A -- !double_varchar -- -- !double_date -- -2023-10-07 -2023-10-07 -2023-10-07 +2023-10-06 +2023-10-06 +2023-10-06 -- !double_timestamp -- 2023-10-06T14:30 @@ -1450,9 +1450,9 @@ A -- !boolean_varchar -- -- !boolean_date -- -2023-10-07 -2023-10-07 -2023-10-07 +2023-10-06 +2023-10-06 +2023-10-06 -- !boolean_timestamp -- 2023-10-06T14:30 @@ -1584,9 +1584,9 @@ A -- !date_varchar -- -- !date_date -- -2023-10-07 -2023-10-07 -2023-10-07 +2023-10-06 +2023-10-06 +2023-10-06 -- !date_timestamp -- 
2023-10-06T14:30 @@ -1631,9 +1631,9 @@ B -- !timestamp_varchar -- -- !timestamp_date -- -2023-10-07 -2023-10-07 -2023-10-07 +2023-10-06 +2023-10-06 +2023-10-06 -- !timestamp_timestamp -- 2023-10-06T14:30 @@ -1663,9 +1663,9 @@ B -- !decimal_varchar -- -- !decimal_date -- -2023-10-07 -2023-10-07 -2023-10-07 +2023-10-06 +2023-10-06 +2023-10-06 -- !decimal_timestamp -- 2023-10-06T14:30