From 6f0a95000e282308d3d32451ba20e86986a0bd01 Mon Sep 17 00:00:00 2001 From: daidai Date: Fri, 28 Feb 2025 10:12:21 +0800 Subject: [PATCH 1/2] [enchement](schema change)Standardize the behavior after a table schema change. (#47471) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Related PR: #32873 Problem Summary: Explicitly defines the behavior of column type conversions. [image omitted] Special notes are as follows: `String => boolean`: In Parquet, only "false", "off", "no", "0", and the empty string ("") are considered false (compared case-insensitively); any other string is true. In ORC, the string is parsed as a number: 0 is false and any other number is true; if parsing fails, the result is null. Conversion between `Int/smallint/tinyint/bigint`: An error is reported unless the value can be represented exactly in the target type. For example, Bigint column => smallint column fails with: Reason: [INTERNAL_ERROR] Failed to cast value '9223372036854775807' to Nullable(Int16) column. Conversion between `Decimal` types: An error is reported unless the value can be converted exactly. `String => Int/smallint/tinyint/bigint`: If the string parses as a number that fits in the target type, that number is stored; otherwise, the result is null. `Int/smallint/tinyint/bigint => float`: The conversion succeeds only if abs(value) < 2^23. `Int/smallint/tinyint/bigint => double`: The conversion succeeds only if abs(value) < 2^52. `Decimal => Int/smallint/tinyint/bigint`: If the integer part of the decimal fits in the target type, only the integer part is kept; otherwise, the result is null. `Float => double`: Equivalent to the C++ `static_cast<double>(float)`. `Decimal => float/double`: The closest approximate value is stored. `Boolean => string`: The result is “TRUE” or “FALSE”. TODO: conversion to `char/varchar` types requires truncation (not yet implemented). 
--- .../vec/exec/format/column_type_convert.cpp | 182 +- be/src/vec/exec/format/column_type_convert.h | 412 +++- be/src/vec/exec/format/orc/vorc_reader.cpp | 64 +- .../format/parquet/parquet_column_convert.cpp | 9 +- .../format/parquet/parquet_column_convert.h | 2 +- be/test/vec/exec/column_type_convert_test.cpp | 1117 +++++++++++ .../create_preinstalled_scripts/run07.hql | 14 +- .../create_preinstalled_scripts/run75.hql | 515 +++++ .../orc_schema_change/origin_file.orc | Bin 0 -> 2533 bytes .../parquet_schema_change/origin_file.parquet | Bin 0 -> 3651 bytes .../hive/test_hive_schema_change_orc.out | 1611 ++++++++++++++++ .../hive/test_hive_schema_change_parquet.out | 1653 +++++++++++++++++ .../test_hive_parquet_alter_column.groovy | 5 + .../hive/test_hive_schema_change.groovy | 4 +- .../hive/test_hive_schema_change_orc.groovy | 1034 +++++++++++ .../test_hive_schema_change_parquet.groovy | 1034 +++++++++++ 16 files changed, 7534 insertions(+), 122 deletions(-) create mode 100644 be/test/vec/exec/column_type_convert_test.cpp create mode 100644 docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_scripts/run75.hql create mode 100644 docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/orc_table/orc_schema_change/origin_file.orc create mode 100644 docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/parquet_table/parquet_schema_change/origin_file.parquet create mode 100644 regression-test/data/external_table_p0/hive/test_hive_schema_change_orc.out create mode 100644 regression-test/data/external_table_p0/hive/test_hive_schema_change_parquet.out create mode 100644 regression-test/suites/external_table_p0/hive/test_hive_schema_change_orc.groovy create mode 100644 regression-test/suites/external_table_p0/hive/test_hive_schema_change_parquet.groovy diff --git a/be/src/vec/exec/format/column_type_convert.cpp b/be/src/vec/exec/format/column_type_convert.cpp index a2c226c91d6799..cfe6f8cbd98e1a 100644 --- 
a/be/src/vec/exec/format/column_type_convert.cpp +++ b/be/src/vec/exec/format/column_type_convert.cpp @@ -19,8 +19,17 @@ namespace doris::vectorized::converter { +const std::set SafeCastString::FALSE_VALUES = {"false", "off", "no", "0", + ""}; + +#define FOR_LOGICAL_INTEGER_TYPES(M) \ + M(TYPE_TINYINT) \ + M(TYPE_SMALLINT) \ + M(TYPE_INT) \ + M(TYPE_BIGINT) \ + M(TYPE_LARGEINT) + #define FOR_LOGICAL_NUMERIC_TYPES(M) \ - M(TYPE_BOOLEAN) \ M(TYPE_TINYINT) \ M(TYPE_SMALLINT) \ M(TYPE_INT) \ @@ -30,7 +39,6 @@ namespace doris::vectorized::converter { M(TYPE_DOUBLE) #define FOR_LOGICAL_DECIMAL_TYPES(M) \ - M(TYPE_DECIMALV2) \ M(TYPE_DECIMAL32) \ M(TYPE_DECIMAL64) \ M(TYPE_DECIMAL128I) \ @@ -126,46 +134,70 @@ static std::unique_ptr _numeric_converter(const TypeDescrip PrimitiveType src_primitive_type = src_type.type; PrimitiveType dst_primitive_type = remove_nullable(dst_type)->get_type_as_type_descriptor().type; - switch (src_primitive_type) { -#define DISPATCH(SRC_PTYPE) \ - case SRC_PTYPE: { \ - switch (dst_primitive_type) { \ - case TYPE_BOOLEAN: \ - return std::make_unique>(); \ + + switch (dst_primitive_type) { +#define DISPATCH(DST_PTYPE) \ + case DST_PTYPE: { \ + switch (src_primitive_type) { \ case TYPE_TINYINT: \ - return std::make_unique>(); \ + return std::make_unique>(); \ case TYPE_SMALLINT: \ - return std::make_unique>(); \ + return std::make_unique>(); \ case TYPE_INT: \ - return std::make_unique>(); \ + return std::make_unique>(); \ case TYPE_BIGINT: \ - return std::make_unique>(); \ + return std::make_unique>(); \ case TYPE_LARGEINT: \ - return std::make_unique>(); \ - case TYPE_FLOAT: \ - return std::make_unique>(); \ - case TYPE_DOUBLE: \ - return std::make_unique>(); \ + return std::make_unique>(); \ default: \ return std::make_unique(src_type, dst_type); \ } \ } - FOR_LOGICAL_NUMERIC_TYPES(DISPATCH) + FOR_LOGICAL_INTEGER_TYPES(DISPATCH) +#undef DISPATCH + + case TYPE_FLOAT: { + switch (src_primitive_type) { +#define DISPATCH(SRC_PTYPE) \ + case 
SRC_PTYPE: { \ + return std::make_unique>(); \ + } + FOR_LOGICAL_INTEGER_TYPES(DISPATCH) #undef DISPATCH + default: + return std::make_unique(src_type, dst_type); + } + } + + case TYPE_DOUBLE: { + switch (src_primitive_type) { +#define DISPATCH(SRC_PTYPE) \ + case SRC_PTYPE: { \ + return std::make_unique>(); \ + } + FOR_LOGICAL_NUMERIC_TYPES(DISPATCH) +#undef DISPATCH + default: + return std::make_unique(src_type, dst_type); + } + } default: return std::make_unique(src_type, dst_type); } } +template static std::unique_ptr _to_string_converter(const TypeDescriptor& src_type, const DataTypePtr& dst_type) { PrimitiveType src_primitive_type = src_type.type; // numeric type to string, using native std::to_string - if (_is_numeric_type(src_primitive_type)) { + if (src_primitive_type == TYPE_BOOLEAN) { + return std::make_unique(); + } else if (_is_numeric_type(src_primitive_type)) { switch (src_primitive_type) { #define DISPATCH(SRC_PTYPE) \ case SRC_PTYPE: \ - return std::make_unique>(); + return std::make_unique>(); FOR_LOGICAL_NUMERIC_TYPES(DISPATCH) #undef DISPATCH default: @@ -195,14 +227,16 @@ static std::unique_ptr _to_string_converter(const TypeDescr return std::make_unique(src_type, dst_type); } +template static std::unique_ptr _from_string_converter(const TypeDescriptor& src_type, const DataTypePtr& dst_type) { PrimitiveType dst_primitive_type = remove_nullable(dst_type)->get_type_as_type_descriptor().type; switch (dst_primitive_type) { -#define DISPATCH(DST_PTYPE) \ - case DST_PTYPE: \ - return std::make_unique>(remove_nullable(dst_type)); +#define DISPATCH(DST_PTYPE) \ + case DST_PTYPE: \ + return std::make_unique>( \ + remove_nullable(dst_type)); FOR_ALL_LOGICAL_TYPES(DISPATCH) #undef DISPATCH default: @@ -216,24 +250,26 @@ static std::unique_ptr _numeric_to_decimal_converter( PrimitiveType dst_primitive_type = remove_nullable(dst_type)->get_type_as_type_descriptor().type; int scale = remove_nullable(dst_type)->get_scale(); + int precision = 
remove_nullable(dst_type)->get_precision(); switch (src_primitive_type) { -#define DISPATCH(SRC_PTYPE) \ - case SRC_PTYPE: { \ - switch (dst_primitive_type) { \ - case TYPE_DECIMALV2: \ - return std::make_unique>(scale); \ - case TYPE_DECIMAL32: \ - return std::make_unique>(scale); \ - case TYPE_DECIMAL64: \ - return std::make_unique>(scale); \ - case TYPE_DECIMAL128I: \ - return std::make_unique>( \ - scale); \ - case TYPE_DECIMAL256: \ - return std::make_unique>(scale); \ - default: \ - return std::make_unique(src_type, dst_type); \ - } \ +#define DISPATCH(SRC_PTYPE) \ + case SRC_PTYPE: { \ + switch (dst_primitive_type) { \ + case TYPE_DECIMAL32: \ + return std::make_unique>( \ + precision, scale); \ + case TYPE_DECIMAL64: \ + return std::make_unique>( \ + precision, scale); \ + case TYPE_DECIMAL128I: \ + return std::make_unique>( \ + precision, scale); \ + case TYPE_DECIMAL256: \ + return std::make_unique>( \ + precision, scale); \ + default: \ + return std::make_unique(src_type, dst_type); \ + } \ } FOR_LOGICAL_NUMERIC_TYPES(DISPATCH) #undef DISPATCH @@ -252,8 +288,6 @@ static std::unique_ptr _decimal_to_numeric_converter( #define DISPATCH(DST_PTYPE) \ case DST_PTYPE: { \ switch (src_primitive_type) { \ - case TYPE_DECIMALV2: \ - return std::make_unique>(scale); \ case TYPE_DECIMAL32: \ return std::make_unique>(scale); \ case TYPE_DECIMAL64: \ @@ -274,18 +308,62 @@ static std::unique_ptr _decimal_to_numeric_converter( } } -std::unique_ptr ColumnTypeConverter::get_converter( - const TypeDescriptor& src_type, const DataTypePtr& dst_type) { +static std::unique_ptr _decimal_converter(const TypeDescriptor& src_type, + const DataTypePtr& dst_type) { + int from_precision = src_type.precision; + int from_scale = src_type.scale; + int to_precision = remove_nullable(dst_type)->get_precision(); + int to_scale = remove_nullable(dst_type)->get_scale(); + + if (from_scale == to_scale && from_precision == to_precision) { + return std::make_unique(); + } + PrimitiveType 
src_primitive_type = src_type.type; PrimitiveType dst_primitive_type = remove_nullable(dst_type)->get_type_as_type_descriptor().type; - if (src_primitive_type == dst_primitive_type) { - return std::make_unique(); + switch (dst_primitive_type) { +#define DISPATCH(DST_PTYPE) \ + case DST_PTYPE: { \ + switch (src_primitive_type) { \ + case TYPE_DECIMAL32: \ + return std::make_unique>( \ + from_precision, from_scale, to_precision, to_scale); \ + case TYPE_DECIMAL64: \ + return std::make_unique>( \ + from_precision, from_scale, to_precision, to_scale); \ + case TYPE_DECIMAL128I: \ + return std::make_unique>( \ + from_precision, from_scale, to_precision, to_scale); \ + case TYPE_DECIMAL256: \ + return std::make_unique>( \ + from_precision, from_scale, to_precision, to_scale); \ + default: \ + return std::make_unique(src_type, dst_type); \ + } \ + } + FOR_LOGICAL_DECIMAL_TYPES(DISPATCH) +#undef DISPATCH + default: + return std::make_unique(src_type, dst_type); } +} + +std::unique_ptr ColumnTypeConverter::get_converter( + const TypeDescriptor& src_type, const DataTypePtr& dst_type, FileFormat file_format) { + PrimitiveType src_primitive_type = src_type.type; + PrimitiveType dst_primitive_type = + remove_nullable(dst_type)->get_type_as_type_descriptor().type; + //todo: type to varchar/char. 
if (is_string_type(src_primitive_type) && is_string_type(dst_primitive_type)) { return std::make_unique(); } + if (_is_decimal_type(src_primitive_type) && _is_decimal_type(dst_primitive_type)) { + return _decimal_converter(src_type, dst_type); + } + + if (src_primitive_type == dst_primitive_type) { return std::make_unique(); } @@ -298,13 +376,21 @@ std::unique_ptr ColumnTypeConverter::get_converter( // change to string type // example: decimal -> string if (is_string_type(dst_primitive_type)) { - return _to_string_converter(src_type, dst_type); + if (file_format == ORC) { + return _to_string_converter(src_type, dst_type); + } else { + return _to_string_converter(src_type, dst_type); + } } // string type to other type // example: string -> date if (is_string_type(src_primitive_type)) { - return _from_string_converter(src_type, dst_type); + if (file_format == ORC) { + return _from_string_converter(src_type, dst_type); + } else { + return _from_string_converter(src_type, dst_type); + } } // date to datetime, datetime to date diff --git a/be/src/vec/exec/format/column_type_convert.h b/be/src/vec/exec/format/column_type_convert.h index d4a8186549ab1d..f18bcaa3cd6bd6 100644 --- a/be/src/vec/exec/format/column_type_convert.h +++ b/be/src/vec/exec/format/column_type_convert.h @@ -26,12 +26,30 @@ namespace doris::vectorized::converter { +enum FileFormat { COMMON, ORC, PARQUET }; + template -constexpr bool is_decimal_type_const() { +constexpr bool is_decimal_type() { return type == TYPE_DECIMALV2 || type == TYPE_DECIMAL32 || type == TYPE_DECIMAL64 || type == TYPE_DECIMAL128I || type == TYPE_DECIMAL256; } +template +constexpr bool is_integer_type() { + return type == TYPE_INT || type == TYPE_TINYINT || type == TYPE_SMALLINT || + type == TYPE_BIGINT || type == TYPE_LARGEINT; +} + +template +constexpr bool is_real_type() { + return type == TYPE_FLOAT || type == TYPE_DOUBLE; +} + +template +constexpr bool is_numeric_type() { + return is_integer_type() || is_real_type(); +} + /** 
* Unified schema change interface for all format readers: * @@ -55,7 +73,8 @@ class ColumnTypeConverter { * @param dst_type column type from FE planner(the changed column type) */ static std::unique_ptr get_converter(const TypeDescriptor& src_type, - const DataTypePtr& dst_type); + const DataTypePtr& dst_type, + FileFormat file_format); ColumnTypeConverter() = default; virtual ~ColumnTypeConverter() = default; @@ -123,11 +142,14 @@ class UnsupportedConverter : public ColumnTypeConverter { }; template -class NumericToNumericConverter : public ColumnTypeConverter { + requires(is_integer_type() && is_integer_type()) +class IntegerToIntegerConverter : public ColumnTypeConverter { +public: Status convert(ColumnPtr& src_col, MutableColumnPtr& dst_col) override { using SrcColumnType = typename PrimitiveTypeTraits::ColumnType; - using DstCppType = typename PrimitiveTypeTraits::CppType; + using SrcCppType = typename PrimitiveTypeTraits::CppType; using DstColumnType = typename PrimitiveTypeTraits::ColumnType; + using DstCppType = typename PrimitiveTypeTraits::CppType; ColumnPtr from_col = remove_nullable(src_col); MutableColumnPtr to_col = remove_nullable(dst_col->get_ptr())->assume_mutable(); @@ -137,29 +159,140 @@ class NumericToNumericConverter : public ColumnTypeConverter { to_col->resize(start_idx + rows); auto& data = static_cast(*to_col.get()).get_data(); for (int i = 0; i < rows; ++i) { + if constexpr (sizeof(DstCppType) < sizeof(SrcCppType)) { + SrcCppType src_value = src_data[i]; + if ((SrcCppType)std::numeric_limits::min() > src_value || + src_value > (SrcCppType)std::numeric_limits::max()) { + return Status::InternalError("Failed to cast value '{}' to {} column", + src_value, dst_col->get_name()); + } + } + data[start_idx + i] = static_cast(src_data[i]); } + return Status::OK(); + } +}; +template + requires(is_numeric_type() && is_real_type()) +class NumericToFloatPointConverter : public ColumnTypeConverter { + static constexpr long MIN_EXACT_DOUBLE = -(1L << 
52); // -2^52 + static constexpr long MAX_EXACT_DOUBLE = (1L << 52) - 1; // 2^52 - 1 + static constexpr long MIN_EXACT_FLOAT = -(1L << 23); // -2^23 + static constexpr long MAX_EXACT_FLOAT = (1L << 23) - 1; // 2^23 - 1 + + bool overflow(typename PrimitiveTypeTraits::CppType value) const { + if constexpr (DstPrimitiveType == TYPE_DOUBLE) { + return value < MIN_EXACT_DOUBLE || value > MAX_EXACT_DOUBLE; + } else if constexpr (DstPrimitiveType == TYPE_FLOAT) { + return value < MIN_EXACT_FLOAT || value > MAX_EXACT_FLOAT; + } + return true; // Default case, should not occur + } + +public: + Status convert(ColumnPtr& src_col, MutableColumnPtr& dst_col) override { + using SrcColumnType = typename PrimitiveTypeTraits::ColumnType; + using SrcCppType = typename PrimitiveTypeTraits::CppType; + using DstColumnType = typename PrimitiveTypeTraits::ColumnType; + using DstCppType = typename PrimitiveTypeTraits::CppType; + ColumnPtr from_col = remove_nullable(src_col); + MutableColumnPtr to_col = remove_nullable(dst_col->get_ptr())->assume_mutable(); + + NullMap* null_map = nullptr; + if (dst_col->is_nullable()) { + null_map = + &static_cast(dst_col.get())->get_null_map_data(); + } + + size_t rows = from_col->size(); + auto& src_data = static_cast(from_col.get())->get_data(); + size_t start_idx = to_col->size(); + to_col->resize(start_idx + rows); + auto& data = static_cast(*to_col.get()).get_data(); + for (int i = 0; i < rows; ++i) { + SrcCppType src_value = src_data[i]; + if constexpr (is_integer_type()) { + if (overflow(src_value)) { + if (null_map == nullptr) { + return Status::InternalError("Failed to cast value '{}' to {} column", + src_value, dst_col->get_name()); + } else { + (*null_map)[start_idx + i] = 1; + } + } + } + + data[start_idx + i] = static_cast(src_value); + } return Status::OK(); } }; -template +class BooleanToStringConverter : public ColumnTypeConverter { +public: + Status convert(ColumnPtr& src_col, MutableColumnPtr& dst_col) override { + using SrcColumnType = 
typename PrimitiveTypeTraits::ColumnType; + ColumnPtr from_col = remove_nullable(src_col); + MutableColumnPtr to_col = remove_nullable(dst_col->get_ptr())->assume_mutable(); + + size_t rows = from_col->size(); + auto& src_data = static_cast(from_col.get())->get_data(); + auto& string_col = static_cast(*to_col.get()); + for (int i = 0; i < rows; ++i) { + std::string value = src_data[i] != 0 ? "TRUE" : "FALSE"; + string_col.insert_data(value.data(), value.size()); + } + return Status::OK(); + } +}; + +template + requires(is_numeric_type()) class NumericToStringConverter : public ColumnTypeConverter { +private: +public: Status convert(ColumnPtr& src_col, MutableColumnPtr& dst_col) override { using SrcColumnType = typename PrimitiveTypeTraits::ColumnType; ColumnPtr from_col = remove_nullable(src_col); MutableColumnPtr to_col = remove_nullable(dst_col->get_ptr())->assume_mutable(); + NullMap* null_map = nullptr; + if (dst_col->is_nullable()) { + null_map = &reinterpret_cast(dst_col.get()) + ->get_null_map_data(); + } + size_t rows = from_col->size(); + size_t start_idx = to_col->size(); auto& src_data = static_cast(from_col.get())->get_data(); auto& string_col = static_cast(*to_col.get()); for (int i = 0; i < rows; ++i) { - if constexpr (SrcPrimitiveType == TYPE_LARGEINT) { - string value = int128_to_string(src_data[i]); - string_col.insert_data(value.data(), value.size()); + if constexpr (SrcPrimitiveType == TYPE_FLOAT || SrcPrimitiveType == TYPE_DOUBLE) { + if (fileFormat == FileFormat::ORC && std::isnan(src_data[i])) { + if (null_map == nullptr) { + return Status::InternalError("Failed to cast value '{}' to {} column", + src_data[i], dst_col->get_name()); + } else { + (*null_map)[start_idx + i] = 1; + } + } + char buf[128]; + int strlen; + if constexpr (SrcPrimitiveType == TYPE_FLOAT) { + strlen = FastFloatToBuffer(src_data[i], buf); + } else { + strlen = FastDoubleToBuffer(src_data[i], buf); + } + string_col.insert_data(buf, strlen); } else { - string value = 
std::to_string(src_data[i]); + std::string value; + if constexpr (SrcPrimitiveType == TYPE_LARGEINT) { + value = int128_to_string(src_data[i]); + } else { + value = std::to_string(src_data[i]); + } string_col.insert_data(value.data(), value.size()); } } @@ -193,8 +326,9 @@ class DecimalToStringConverter : public ColumnTypeConverter { } }; -template +template class TimeToStringConverter : public ColumnTypeConverter { +public: Status convert(ColumnPtr& src_col, MutableColumnPtr& dst_col) override { using SrcCppType = typename PrimitiveTypeTraits::CppType; using SrcColumnType = typename PrimitiveTypeTraits::ColumnType; @@ -214,16 +348,33 @@ class TimeToStringConverter : public ColumnTypeConverter { } }; -template +template struct SafeCastString {}; template <> struct SafeCastString { + // Ref: https://github.com/apache/hive/blob/4df4d75bf1e16fe0af75aad0b4179c34c07fc975/serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/primitive/PrimitiveObjectInspectorUtils.java#L559 + static const std::set FALSE_VALUES; static bool safe_cast_string(const char* startptr, const int buffer_size, PrimitiveTypeTraits::ColumnType::value_type* value) { - int32 cast_to_int = 0; - bool can_cast = safe_strto32(startptr, buffer_size, &cast_to_int); - *value = cast_to_int == 0 ? 0 : 1; + std::string str_value(startptr, buffer_size); + std::transform(str_value.begin(), str_value.end(), str_value.begin(), ::tolower); + bool is_false = (FALSE_VALUES.find(str_value) != FALSE_VALUES.end()); + *value = is_false ? 
0 : 1; + return true; + } +}; + +//Apache Hive reads 0 as false, numeric string as true and non-numeric string as null for ORC file format +// https://github.com/apache/orc/blob/fb1c4cb9461d207db652fc253396e57640ed805b/java/core/src/java/org/apache/orc/impl/ConvertTreeReaderFactory.java#L567 +template <> +struct SafeCastString { + static bool safe_cast_string(const char* startptr, const int buffer_size, + PrimitiveTypeTraits::ColumnType::value_type* value) { + std::string str_value(startptr, buffer_size); + int64 cast_to_long = 0; + bool can_cast = safe_strto64(startptr, buffer_size, &cast_to_long); + *value = cast_to_long == 0 ? 0 : 1; return can_cast; } }; @@ -285,23 +436,34 @@ struct SafeCastString { } }; -template <> -struct SafeCastString { +template +struct SafeCastString { static bool safe_cast_string(const char* startptr, const int buffer_size, PrimitiveTypeTraits::ColumnType::value_type* value) { float cast_to_float = 0; bool can_cast = safe_strtof(std::string(startptr, buffer_size), &cast_to_float); + if (can_cast && fileFormat == ORC) { + // Apache Hive reads Float.NaN as null when coerced to varchar for ORC file format. 
+ if (std::isnan(cast_to_float)) { + return false; + } + } *value = cast_to_float; return can_cast; } }; -template <> -struct SafeCastString { +template +struct SafeCastString { static bool safe_cast_string(const char* startptr, const int buffer_size, PrimitiveTypeTraits::ColumnType::value_type* value) { double cast_to_double = 0; bool can_cast = safe_strtod(std::string(startptr, buffer_size), &cast_to_double); + if (can_cast && fileFormat == ORC) { + if (std::isnan(cast_to_double)) { + return false; + } + } *value = cast_to_double; return can_cast; } @@ -357,7 +519,7 @@ struct SafeCastDecimalString { } }; -template +template class CastStringConverter : public ColumnTypeConverter { private: DataTypePtr _dst_type_desc; @@ -387,17 +549,21 @@ class CastStringConverter : public ColumnTypeConverter { DstCppType& value = data[start_idx + i]; auto string_value = string_col.get_data_at(i); bool can_cast = false; - if constexpr (is_decimal_type_const()) { + if constexpr (is_decimal_type()) { can_cast = SafeCastDecimalString::safe_cast_string( string_value.data, string_value.size, &value, _dst_type_desc->get_precision(), _dst_type_desc->get_scale()); } else if constexpr (DstPrimitiveType == TYPE_DATETIMEV2) { can_cast = SafeCastString::safe_cast_string( string_value.data, string_value.size, &value, _dst_type_desc->get_scale()); + } else if constexpr (DstPrimitiveType == TYPE_BOOLEAN && fileFormat == ORC) { + can_cast = SafeCastString::safe_cast_string( + string_value.data, string_value.size, &value); } else { can_cast = SafeCastString::safe_cast_string( string_value.data, string_value.size, &value); } + if (!can_cast) { if (null_map == nullptr) { return Status::InternalError("Failed to cast string '{}' to not null column", @@ -415,6 +581,7 @@ class CastStringConverter : public ColumnTypeConverter { // only support date & datetime v2 template class TimeV2Converter : public ColumnTypeConverter { +public: Status convert(ColumnPtr& src_col, MutableColumnPtr& dst_col) override { 
using SrcColumnType = typename PrimitiveTypeTraits::ColumnType; using DstColumnType = typename PrimitiveTypeTraits::ColumnType; @@ -442,61 +609,96 @@ class TimeV2Converter : public ColumnTypeConverter { }; template + requires(is_numeric_type() && is_decimal_type()) class NumericToDecimalConverter : public ColumnTypeConverter { private: + int _precision; int _scale; public: - NumericToDecimalConverter(int scale) : _scale(scale) {} + NumericToDecimalConverter(int precision, int scale) : _precision(precision), _scale(scale) {} Status convert(ColumnPtr& src_col, MutableColumnPtr& dst_col) override { using SrcColumnType = typename PrimitiveTypeTraits::ColumnType; + using SrcCppType = typename PrimitiveTypeTraits::CppType; using DstColumnType = typename PrimitiveTypeTraits::ColumnType; using DstNativeType = typename PrimitiveTypeTraits::ColumnType::value_type::NativeType; - using DstCppType = typename PrimitiveTypeTraits::ColumnType::value_type; + using DstDorisType = typename PrimitiveTypeTraits::ColumnType::value_type; ColumnPtr from_col = remove_nullable(src_col); MutableColumnPtr to_col = remove_nullable(dst_col->get_ptr())->assume_mutable(); + NullMap* null_map = nullptr; + if (dst_col->is_nullable()) { + null_map = &reinterpret_cast(dst_col.get()) + ->get_null_map_data(); + } + size_t rows = from_col->size(); auto& src_data = static_cast(from_col.get())->get_data(); size_t start_idx = to_col->size(); to_col->resize(start_idx + rows); auto& data = static_cast(*to_col.get()).get_data(); - int64_t scale_factor = 1; - if (_scale > DecimalV2Value::SCALE) { - scale_factor = common::exp10_i64(_scale - DecimalV2Value::SCALE); - } else if (_scale < DecimalV2Value::SCALE) { - scale_factor = common::exp10_i64(DecimalV2Value::SCALE - _scale); - } + + auto max_result = DataTypeDecimal::get_max_digits_number(_precision); + auto multiplier = DataTypeDecimal::get_scale_multiplier(_scale).value; for (int i = 0; i < rows; ++i) { - if constexpr (SrcPrimitiveType == TYPE_FLOAT || 
SrcPrimitiveType == TYPE_DOUBLE) { - DecimalV2Value decimal_value; - if constexpr (SrcPrimitiveType == TYPE_FLOAT) { - decimal_value.assign_from_float(src_data[i]); - } else { - decimal_value.assign_from_double(src_data[i]); + const SrcCppType& src_value = src_data[i]; + DstDorisType& res = data[start_idx + i]; + + if constexpr (is_integer_type()) { + if constexpr (sizeof(DstNativeType) < sizeof(SrcCppType)) { + if (src_value > std::numeric_limits::max() || + src_value < std::numeric_limits::min()) { + if (null_map == nullptr) { + return Status::InternalError("Failed to cast value '{}' to {} column", + src_data[i], dst_col->get_name()); + } else { + (*null_map)[start_idx + i] = 1; + } + } } - int128_t decimal_int128 = reinterpret_cast(decimal_value); - if (_scale > DecimalV2Value::SCALE) { - decimal_int128 *= scale_factor; - } else if (_scale < DecimalV2Value::SCALE) { - decimal_int128 /= scale_factor; + if (common::mul_overflow(static_cast(src_value), multiplier, + res.value)) { + if (null_map == nullptr) { + return Status::InternalError("Failed to cast value '{}' to {} column", + src_data[i], dst_col->get_name()); + } else { + (*null_map)[start_idx + i] = 1; + } + } else { + if (res.value > max_result.value || res.value < -max_result.value) { + if (null_map == nullptr) { + return Status::InternalError("Failed to cast value '{}' to {} column", + src_data[i], dst_col->get_name()); + } else { + (*null_map)[start_idx + i] = 1; + } + } } - auto& v = reinterpret_cast(data[start_idx + i]); - v = (DstNativeType)decimal_int128; } else { - data[start_idx + i] = DstCppType::from_int_frac(src_data[i], 0, _scale); + SrcCppType dst_value = src_value * static_cast(multiplier); + res = static_cast(dst_value); + if (UNLIKELY(!std::isfinite(src_value) || + dst_value > static_cast(max_result.value) || + dst_value < static_cast(-max_result.value))) { + if (null_map == nullptr) { + return Status::InternalError("Failed to cast value '{}' to {} column", + src_data[i], 
dst_col->get_name()); + } else { + (*null_map)[start_idx + i] = 1; + } + } } } - return Status::OK(); } }; template + requires(is_numeric_type() && is_decimal_type()) class DecimalToNumericConverter : public ColumnTypeConverter { private: int _scale; @@ -506,6 +708,8 @@ class DecimalToNumericConverter : public ColumnTypeConverter { Status convert(ColumnPtr& src_col, MutableColumnPtr& dst_col) override { using SrcColumnType = typename PrimitiveTypeTraits::ColumnType; + using SrcNativeType = + typename PrimitiveTypeTraits::ColumnType::value_type::NativeType; using DstColumnType = typename PrimitiveTypeTraits::ColumnType; using DstCppType = typename PrimitiveTypeTraits::CppType; @@ -518,13 +722,44 @@ class DecimalToNumericConverter : public ColumnTypeConverter { to_col->resize(start_idx + rows); auto& data = static_cast(*to_col.get()).get_data(); - int64_t scale_factor = common::exp10_i64(_scale); + NullMap* null_map = nullptr; + if (dst_col->is_nullable()) { + null_map = &reinterpret_cast(dst_col.get()) + ->get_null_map_data(); + } + + SrcNativeType scale_factor; + if constexpr (sizeof(SrcNativeType) <= sizeof(int)) { + scale_factor = common::exp10_i32(_scale); + } else if constexpr (sizeof(SrcNativeType) <= sizeof(int64)) { + scale_factor = common::exp10_i64(_scale); + } else if constexpr (sizeof(SrcNativeType) <= sizeof(__int128)) { + scale_factor = common::exp10_i128(_scale); + } else if constexpr (sizeof(SrcNativeType) <= sizeof(wide::Int256)) { + scale_factor = common::exp10_i256(_scale); + } + for (int i = 0; i < rows; ++i) { if constexpr (DstPrimitiveType == TYPE_FLOAT || DstPrimitiveType == TYPE_DOUBLE) { data[start_idx + i] = static_cast(src_data[i].value / (double)scale_factor); } else { - data[start_idx + i] = static_cast(src_data[i].value / scale_factor); + SrcNativeType tmp_value = src_data[i].value / scale_factor; + + if constexpr (sizeof(SrcNativeType) > sizeof(DstCppType)) { + if ((SrcNativeType)std::numeric_limits::min() > tmp_value || + tmp_value > 
(SrcNativeType)std::numeric_limits::max()) { + if (null_map == nullptr) { + return Status::InternalError("Failed to cast value '{}' to {} column", + src_data[i].to_string(_scale), + dst_col->get_name()); + } else { + (*null_map)[start_idx + i] = 1; + } + } + } + + data[start_idx + i] = static_cast(tmp_value); } } @@ -532,4 +767,91 @@ class DecimalToNumericConverter : public ColumnTypeConverter { } }; +template +class DecimalToDecimalConverter : public ColumnTypeConverter { +private: + int _from_precision; + int _from_scale; + int _to_precision; + int _to_scale; + +public: + DecimalToDecimalConverter(int from_precision, int from_scale, int to_precision, int to_scale) + : _from_precision(from_precision), + _from_scale(from_scale), + _to_precision(to_precision), + _to_scale(to_scale) {} + + Status convert(ColumnPtr& src_col, MutableColumnPtr& dst_col) override { + using SrcColumnType = typename PrimitiveTypeTraits::ColumnType; + using DstColumnType = typename PrimitiveTypeTraits::ColumnType; + using SrcNativeType = typename PrimitiveTypeTraits< + SrcDecimalPrimitiveType>::ColumnType::value_type::NativeType; + using DstNativeType = typename PrimitiveTypeTraits< + DstDecimalPrimitiveType>::ColumnType::value_type::NativeType; + using MaxNativeType = std::conditional_t<(sizeof(SrcNativeType) > sizeof(DstNativeType)), + SrcNativeType, DstNativeType>; + + auto max_result = + DataTypeDecimal>::get_max_digits_number(_to_precision); + bool narrow_integral = (_to_precision - _to_scale) < (_from_precision - _from_scale); + + ColumnPtr from_col = remove_nullable(src_col); + MutableColumnPtr to_col = remove_nullable(dst_col->get_ptr())->assume_mutable(); + + size_t rows = from_col->size(); + auto& src_data = static_cast(from_col.get())->get_data(); + size_t start_idx = to_col->size(); + to_col->resize(start_idx + rows); + auto& data = static_cast(*to_col.get()).get_data(); + + for (int i = 0; i < rows; ++i) { + SrcNativeType src_value = src_data[i].value; + DstNativeType& 
res_value = data[start_idx + i].value; + + if (_to_scale > _from_scale) { + const MaxNativeType multiplier = + DataTypeDecimal>::get_scale_multiplier(_to_scale - + _from_scale); + MaxNativeType res; + if (common::mul_overflow(src_value, multiplier, res)) { + return Status::InternalError("Failed to cast value '{}' to {} column", + src_data[i].to_string(_from_scale), + dst_col->get_name()); + } else { + if (res > max_result.value || res < -max_result.value) { + return Status::InternalError("Failed to cast value '{}' to {} column", + src_data[i].to_string(_from_scale), + dst_col->get_name()); + } else { + res_value = static_cast(res); + } + } + } else if (_to_scale == _from_scale) { + res_value = static_cast(src_value); + if (narrow_integral && + (src_value > max_result.value || src_value < -max_result.value)) { + return Status::InternalError("Failed to cast value '{}' to {} column", + src_data[i].to_string(_from_scale), + dst_col->get_name()); + } + } else { + MaxNativeType multiplier = + DataTypeDecimal>::get_scale_multiplier(_from_scale - + _to_scale) + .value; + MaxNativeType res = src_value / multiplier; + if (src_value % multiplier != 0 || res > max_result.value || + res < -max_result.value) { + return Status::InternalError("Failed to cast value '{}' to {} column", + src_data[i].to_string(_from_scale), + dst_col->get_name()); + } + res_value = static_cast(res); + } + } + return Status::OK(); + } +}; + } // namespace doris::vectorized::converter diff --git a/be/src/vec/exec/format/orc/vorc_reader.cpp b/be/src/vec/exec/format/orc/vorc_reader.cpp index 293e6e9b98f9db..54fb8b18ce853b 100644 --- a/be/src/vec/exec/format/orc/vorc_reader.cpp +++ b/be/src/vec/exec/format/orc/vorc_reader.cpp @@ -467,20 +467,41 @@ std::tuple convert_to_orc_literal(const orc::Type* type, con int precision, int scale) { try { switch (type->getKind()) { - case orc::TypeKind::BOOLEAN: + case orc::TypeKind::BOOLEAN: { + if (primitive_type != TYPE_BOOLEAN) { + return std::make_tuple(false, 
orc::Literal(false)); + } return std::make_tuple(true, orc::Literal(bool(*((uint8_t*)value)))); + } case orc::TypeKind::BYTE: - return std::make_tuple(true, orc::Literal(int64_t(*((int8_t*)value)))); case orc::TypeKind::SHORT: - return std::make_tuple(true, orc::Literal(int64_t(*((int16_t*)value)))); case orc::TypeKind::INT: - return std::make_tuple(true, orc::Literal(int64_t(*((int32_t*)value)))); - case orc::TypeKind::LONG: - return std::make_tuple(true, orc::Literal(*((int64_t*)value))); - case orc::TypeKind::FLOAT: - return std::make_tuple(true, orc::Literal(double(*((float*)value)))); - case orc::TypeKind::DOUBLE: - return std::make_tuple(true, orc::Literal(*((double*)value))); + case orc::TypeKind::LONG: { + if constexpr (primitive_type == TYPE_TINYINT) { + return std::make_tuple(true, orc::Literal(int64_t(*((int8_t*)value)))); + } else if constexpr (primitive_type == TYPE_SMALLINT) { + return std::make_tuple(true, orc::Literal(int64_t(*((int16_t*)value)))); + } else if constexpr (primitive_type == TYPE_INT) { + return std::make_tuple(true, orc::Literal(int64_t(*((int32_t*)value)))); + } else if constexpr (primitive_type == TYPE_BIGINT) { + return std::make_tuple(true, orc::Literal(int64_t(*((int64_t*)value)))); + } + return std::make_tuple(false, orc::Literal(false)); + } + case orc::TypeKind::FLOAT: { + if constexpr (primitive_type == TYPE_FLOAT) { + return std::make_tuple(true, orc::Literal(double(*((float*)value)))); + } else if constexpr (primitive_type == TYPE_DOUBLE) { + return std::make_tuple(true, orc::Literal(double(*((double*)value)))); + } + return std::make_tuple(false, orc::Literal(false)); + } + case orc::TypeKind::DOUBLE: { + if (primitive_type == TYPE_DOUBLE) { + return std::make_tuple(true, orc::Literal(*((double*)value))); + } + return std::make_tuple(false, orc::Literal(false)); + } case orc::TypeKind::STRING: [[fallthrough]]; case orc::TypeKind::BINARY: @@ -489,8 +510,16 @@ std::tuple convert_to_orc_literal(const orc::Type* type, con // 
case orc::TypeKind::CHAR: // [[fallthrough]]; case orc::TypeKind::VARCHAR: { +<<<<<<< HEAD StringRef* string_value = (StringRef*)value; return std::make_tuple(true, orc::Literal(string_value->data, string_value->size)); +======= + if (primitive_type == TYPE_STRING || primitive_type == TYPE_CHAR || + primitive_type == TYPE_VARCHAR) { + return std::make_tuple(true, orc::Literal(literal_data.data, literal_data.size)); + } + return std::make_tuple(false, orc::Literal(false)); +>>>>>>> 3f85ad6d75c ([enchement](schema change)Standardize the behavior after a table schema change. (#47471)) } case orc::TypeKind::DECIMAL: { int128_t decimal_value; @@ -502,8 +531,10 @@ std::tuple convert_to_orc_literal(const orc::Type* type, con decimal_value = *((int32_t*)value); } else if constexpr (primitive_type == TYPE_DECIMAL64) { decimal_value = *((int64_t*)value); - } else { + } else if constexpr (primitive_type == TYPE_DECIMAL128I) { decimal_value = *((int128_t*)value); + } else { + return std::make_tuple(false, orc::Literal(false)); } return std::make_tuple(true, orc::Literal(orc::Int128(uint64_t(decimal_value >> 64), uint64_t(decimal_value)), @@ -517,12 +548,14 @@ std::tuple convert_to_orc_literal(const orc::Type* type, con cctz::civil_day civil_date(date_v1.year(), date_v1.month(), date_v1.day()); day_offset = cctz::convert(civil_date, utc0).time_since_epoch().count() / (24 * 60 * 60); - } else { // primitive_type == TYPE_DATEV2 + } else if (primitive_type == TYPE_DATEV2) { const DateV2Value date_v2 = *reinterpret_cast*>(value); cctz::civil_day civil_date(date_v2.year(), date_v2.month(), date_v2.day()); day_offset = cctz::convert(civil_date, utc0).time_since_epoch().count() / (24 * 60 * 60); + } else { + return std::make_tuple(false, orc::Literal(false)); } return std::make_tuple(true, orc::Literal(orc::PredicateDataType::DATE, day_offset)); } @@ -539,7 +572,7 @@ std::tuple convert_to_orc_literal(const orc::Type* type, con datetime_v1.minute(), datetime_v1.second()); seconds = 
cctz::convert(civil_seconds, utc0).time_since_epoch().count(); nanos = 0; - } else { // primitive_type == TYPE_DATETIMEV2 + } else if (primitive_type == TYPE_DATETIMEV2) { const DateV2Value datetime_v2 = *reinterpret_cast*>(value); cctz::civil_second civil_seconds(datetime_v2.year(), datetime_v2.month(), @@ -547,6 +580,8 @@ std::tuple convert_to_orc_literal(const orc::Type* type, con datetime_v2.minute(), datetime_v2.second()); seconds = cctz::convert(civil_seconds, utc0).time_since_epoch().count(); nanos = datetime_v2.microsecond() * 1000; + } else { + return std::make_tuple(false, orc::Literal(false)); } return std::make_tuple(true, orc::Literal(seconds, nanos)); } @@ -1565,7 +1600,8 @@ Status OrcReader::_orc_column_to_doris_column(const std::string& col_name, Colum if (!_converters.contains(converter_key)) { std::unique_ptr converter = - converter::ColumnTypeConverter::get_converter(src_type, data_type); + converter::ColumnTypeConverter::get_converter(src_type, data_type, + converter::FileFormat::ORC); if (!converter->support()) { return Status::InternalError( "The column type of '{}' has changed and is not supported: ", col_name, diff --git a/be/src/vec/exec/format/parquet/parquet_column_convert.cpp b/be/src/vec/exec/format/parquet/parquet_column_convert.cpp index 0a5ef2913dd940..0d24a14612a3d7 100644 --- a/be/src/vec/exec/format/parquet/parquet_column_convert.cpp +++ b/be/src/vec/exec/format/parquet/parquet_column_convert.cpp @@ -25,7 +25,6 @@ namespace doris::vectorized::parquet { const cctz::time_zone ConvertParams::utc0 = cctz::utc_time_zone(); #define FOR_LOGICAL_DECIMAL_TYPES(M) \ - M(TYPE_DECIMALV2) \ M(TYPE_DECIMAL32) \ M(TYPE_DECIMAL64) \ M(TYPE_DECIMAL128I) \ @@ -133,8 +132,8 @@ static void get_decimal_converter(FieldSchema* field_schema, TypeDescriptor src_ std::unique_ptr& physical_converter) { const tparquet::SchemaElement& parquet_schema = field_schema->parquet_schema; if (is_decimal(remove_nullable(dst_logical_type))) { - // using destination 
decimal type, avoid type and scale change - src_logical_type = remove_nullable(dst_logical_type)->get_type_as_type_descriptor(); + src_logical_type = create_decimal(parquet_schema.precision, parquet_schema.scale, false) + ->get_type_as_type_descriptor(); } tparquet::Type::type src_physical_type = parquet_schema.type; @@ -298,8 +297,8 @@ std::unique_ptr PhysicalToLogicalConverter::get_conv if (physical_converter->support()) { physical_converter->_convert_params = std::move(convert_params); - physical_converter->_logical_converter = - converter::ColumnTypeConverter::get_converter(src_logical_type, dst_logical_type); + physical_converter->_logical_converter = converter::ColumnTypeConverter::get_converter( + src_logical_type, dst_logical_type, converter::FileFormat::PARQUET); if (!physical_converter->_logical_converter->support()) { physical_converter.reset(new UnsupportedConverter( "Unsupported type change: " + diff --git a/be/src/vec/exec/format/parquet/parquet_column_convert.h b/be/src/vec/exec/format/parquet/parquet_column_convert.h index cf6f8aa13fa1d1..4bf8cbdbf8a652 100644 --- a/be/src/vec/exec/format/parquet/parquet_column_convert.h +++ b/be/src/vec/exec/format/parquet/parquet_column_convert.h @@ -154,7 +154,7 @@ struct ConvertParams { * * Ultimate performance optimization: * 1. If process of (First => Second) is consistent, eg. from BYTE_ARRAY to string, no additional copies and conversions will be introduced; - * 2. If process of (Second => Third) is consistent, eg. from decimal(12, 4) to decimal(8, 2), no additional copies and conversions will be introduced; + * 2. If process of (Second => Third) is consistent, no additional copies and conversions will be introduced; * 3. Null map is share among all processes, no additional copies and conversions will be introduced in null map; * 4. Only create one physical column in physical conversion, and reused in each loop; * 5. 
Only create one logical column in logical conversion, and reused in each loop; diff --git a/be/test/vec/exec/column_type_convert_test.cpp b/be/test/vec/exec/column_type_convert_test.cpp new file mode 100644 index 00000000000000..3a45befd028530 --- /dev/null +++ b/be/test/vec/exec/column_type_convert_test.cpp @@ -0,0 +1,1117 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "vec/exec/format/column_type_convert.h" + +#include + +#include + +#include "vec/columns/column_decimal.h" +#include "vec/columns/column_nullable.h" +#include "vec/columns/column_string.h" +#include "vec/core/types.h" +#include "vec/data_types/data_type_date.h" +#include "vec/data_types/data_type_date_time.h" +#include "vec/data_types/data_type_decimal.h" +#include "vec/data_types/data_type_factory.hpp" +#include "vec/data_types/data_type_nullable.h" +#include "vec/data_types/data_type_number.h" +#include "vec/data_types/data_type_string.h" + +namespace doris::vectorized { + +class ColumnTypeConverterTest : public testing::Test { +public: + ColumnTypeConverterTest() = default; + virtual ~ColumnTypeConverterTest() = default; +}; + +// Test integer type conversions (widening) +TEST_F(ColumnTypeConverterTest, TestIntegerWideningConversions) { + // Test TINYINT -> SMALLINT + { + TypeDescriptor src_type(TYPE_TINYINT); + auto dst_type = std::make_shared(); + + auto converter = converter::ColumnTypeConverter::get_converter(src_type, dst_type, + converter::COMMON); + ASSERT_TRUE(converter->support()); + ASSERT_FALSE(converter->is_consistent()); + + auto src_col = ColumnInt8::create(); + auto& src_data = src_col->get_data(); + // Test normal values + src_data.push_back(42); + src_data.push_back(-42); + // Test boundary values + src_data.push_back(std::numeric_limits::max()); + src_data.push_back(std::numeric_limits::min()); + + auto dst_col = dst_type->create_column(); + auto mutable_dst = dst_col->assume_mutable(); + + Status st = converter->convert(reinterpret_cast(src_col), mutable_dst); + ASSERT_TRUE(st.ok()); + + auto& dst_data = static_cast(*mutable_dst).get_data(); + ASSERT_EQ(4, dst_data.size()); + EXPECT_EQ(42, dst_data[0]); + EXPECT_EQ(-42, dst_data[1]); + EXPECT_EQ(std::numeric_limits::max(), dst_data[2]); + EXPECT_EQ(std::numeric_limits::min(), dst_data[3]); + } + + // Test SMALLINT -> INT + { + TypeDescriptor src_type(TYPE_SMALLINT); + auto dst_type 
= std::make_shared(); + + auto converter = converter::ColumnTypeConverter::get_converter(src_type, dst_type, + converter::COMMON); + ASSERT_TRUE(converter->support()); + ASSERT_FALSE(converter->is_consistent()); + + auto src_col = ColumnInt16::create(); + auto& src_data = src_col->get_data(); + // Test normal values + src_data.push_back(1234); + src_data.push_back(-1234); + // Test boundary values + src_data.push_back(std::numeric_limits::max()); + src_data.push_back(std::numeric_limits::min()); + + auto dst_col = dst_type->create_column(); + auto mutable_dst = dst_col->assume_mutable(); + + Status st = converter->convert(reinterpret_cast(src_col), mutable_dst); + ASSERT_TRUE(st.ok()); + + auto& dst_data = static_cast(*mutable_dst).get_data(); + ASSERT_EQ(4, dst_data.size()); + EXPECT_EQ(1234, dst_data[0]); + EXPECT_EQ(-1234, dst_data[1]); + EXPECT_EQ(std::numeric_limits::max(), dst_data[2]); + EXPECT_EQ(std::numeric_limits::min(), dst_data[3]); + } +} + +// Test integer type conversions (narrowing) +TEST_F(ColumnTypeConverterTest, TestIntegerNarrowingConversions) { + // Test INT -> SMALLINT with values in range + { + TypeDescriptor src_type(TYPE_INT); + auto dst_type = std::make_shared(); + + auto converter = converter::ColumnTypeConverter::get_converter(src_type, dst_type, + converter::COMMON); + ASSERT_TRUE(converter->support()); + ASSERT_FALSE(converter->is_consistent()); + + auto src_col = ColumnInt32::create(); + auto& src_data = src_col->get_data(); + src_data.push_back(1234); + src_data.push_back(-1234); + src_data.push_back(std::numeric_limits::max()); + src_data.push_back(std::numeric_limits::min()); + + auto dst_col = dst_type->create_column(); + auto mutable_dst = dst_col->assume_mutable(); + + Status st = converter->convert(reinterpret_cast(src_col), mutable_dst); + ASSERT_TRUE(st.ok()); + + auto& dst_data = static_cast(*mutable_dst).get_data(); + ASSERT_EQ(4, dst_data.size()); + EXPECT_EQ(1234, dst_data[0]); + EXPECT_EQ(-1234, dst_data[1]); + 
EXPECT_EQ(std::numeric_limits::max(), dst_data[2]); + EXPECT_EQ(std::numeric_limits::min(), dst_data[3]); + } + + // Test INT -> SMALLINT with out of range values + { + TypeDescriptor src_type(TYPE_INT); + auto dst_type = std::make_shared(); + auto nullable_dst_type = std::make_shared(dst_type); + + auto converter = converter::ColumnTypeConverter::get_converter(src_type, nullable_dst_type, + converter::COMMON); + ASSERT_TRUE(converter->support()); + ASSERT_FALSE(converter->is_consistent()); + + auto src_col = ColumnInt32::create(); + auto& src_data = src_col->get_data(); + src_data.push_back(std::numeric_limits::max() + 1); + src_data.push_back(std::numeric_limits::min() - 1); + + auto dst_col = nullable_dst_type->create_column(); + auto mutable_dst = dst_col->assume_mutable(); + + Status st = converter->convert(reinterpret_cast(src_col), mutable_dst); + ASSERT_TRUE(!st.ok()); + } +} + +// Test floating point type conversions +TEST_F(ColumnTypeConverterTest, TestFloatingPointConversions) { + // TEST INT -> FLOAT + { + TypeDescriptor src_type(TYPE_INT); + auto dst_type = std::make_shared(); + auto nullable_dst_type = std::make_shared(dst_type); + auto converter = converter::ColumnTypeConverter::get_converter(src_type, dst_type, + converter::COMMON); + ASSERT_TRUE(converter->support()); + ASSERT_FALSE(converter->is_consistent()); + + auto src_col = ColumnInt32::create(); + auto& src_data = src_col->get_data(); + src_data.resize(0); + // Add test values + src_data.push_back(12345); + src_data.push_back(-67890); + src_data.push_back((1L << 23) - 1); + src_data.push_back(1L << 23); + src_data.push_back((1L << 23) + 1); + auto dst_nullable_col = nullable_dst_type->create_column(); + auto mutable_dst = dst_nullable_col->assume_mutable(); + auto& nullable_col = static_cast(*mutable_dst); + auto& nested_col = static_cast(nullable_col.get_nested_column()); + auto& null_map = nullable_col.get_null_map_data(); + null_map.resize_fill(src_data.size(), 0); + + // Perform 
conversion + Status st = converter->convert(reinterpret_cast(src_col), mutable_dst); + ASSERT_TRUE(st.ok()); + + ASSERT_EQ(5, nested_col.size()); + EXPECT_FLOAT_EQ(12345.0f, nested_col.get_data()[0]); + EXPECT_FLOAT_EQ(-67890.0f, nested_col.get_data()[1]); + EXPECT_FLOAT_EQ((float)((1L << 23) - 1), nested_col.get_data()[2]); + EXPECT_FLOAT_EQ(1, null_map[3]); + EXPECT_FLOAT_EQ(1, null_map[4]); + } + // TEST STRING -> FLOAT + { + TypeDescriptor src_type(TYPE_STRING); + auto dst_type = std::make_shared(); + auto nullable_dst_type = std::make_shared(dst_type); + + auto converter = converter::ColumnTypeConverter::get_converter(src_type, dst_type, + converter::COMMON); + ASSERT_TRUE(converter->support()); + ASSERT_FALSE(converter->is_consistent()); + + auto src_col = ColumnString::create(); + // Add test strings + src_col->insert_data("0", 1); // Zero + src_col->insert_data("123.45", 6); // Positive float + src_col->insert_data("-678.90", 7); // Negative float + src_col->insert_data("1.17549e-38", 11); // Smallest positive float + src_col->insert_data("3.40282e+38", 11); // Largest positive float (FLT_MAX) + src_col->insert_data("-3.40282e+38", 12); // Largest negative float (-FLT_MAX) + src_col->insert_data("Infinity", 8); // Infinity + src_col->insert_data("-Infinity", 9); // Negative infinity + src_col->insert_data("NaN", 3); // Not-a-number + src_col->insert_data("invalid", 7); // Invalid string + src_col->insert_data("", 0); // Empty string + + auto dst_nullable_col = nullable_dst_type->create_column(); + auto mutable_dst = dst_nullable_col->assume_mutable(); + + auto& nullable_col = static_cast(*mutable_dst); + auto& nested_col = static_cast(nullable_col.get_nested_column()); + auto& null_map = nullable_col.get_null_map_data(); + null_map.resize_fill(src_col->size(), 0); + + // Perform conversion + Status st = converter->convert(reinterpret_cast(src_col), mutable_dst); + ASSERT_TRUE(st.ok()); + + ASSERT_EQ(11, nested_col.size()); + + // Valid conversions + 
EXPECT_FLOAT_EQ(0.0f, nested_col.get_data()[0]); // "0" + EXPECT_FLOAT_EQ(123.45f, nested_col.get_data()[1]); // "123.45" + EXPECT_FLOAT_EQ(-678.90f, nested_col.get_data()[2]); // "-678.90" + EXPECT_FLOAT_EQ(1.17549e-38f, nested_col.get_data()[3]); // Smallest positive float + EXPECT_FLOAT_EQ(3.40282e+38f, nested_col.get_data()[4]); // Largest positive float + EXPECT_FLOAT_EQ(-3.40282e+38f, nested_col.get_data()[5]); // Largest negative float + + EXPECT_TRUE(std::isinf(nested_col.get_data()[6]) && + nested_col.get_data()[6] > 0); // Infinity + EXPECT_TRUE(std::isinf(nested_col.get_data()[7]) && + nested_col.get_data()[7] < 0); // Negative infinity + EXPECT_TRUE(std::isnan(nested_col.get_data()[8])); // NaN + + // Invalid conversions marked as null + for (int i = 0; i < 9; i++) { + EXPECT_EQ(0, null_map[i]); + } + + EXPECT_EQ(1, null_map[9]); // "invalid" + EXPECT_EQ(1, null_map[10]); // Empty string + } + + // Test FLOAT -> DOUBLE (widening) + { + TypeDescriptor src_type(TYPE_FLOAT); + auto dst_type = std::make_shared(); + + auto converter = converter::ColumnTypeConverter::get_converter(src_type, dst_type, + converter::COMMON); + ASSERT_TRUE(converter->support()); + ASSERT_FALSE(converter->is_consistent()); + + auto src_col = ColumnFloat32::create(); + auto& src_data = src_col->get_data(); + // Test normal values + src_data.push_back(3.14159f); + src_data.push_back(-2.71828f); + // Test special values + src_data.push_back(std::numeric_limits::infinity()); + src_data.push_back(-std::numeric_limits::infinity()); + src_data.push_back(std::numeric_limits::quiet_NaN()); + + auto dst_col = dst_type->create_column(); + auto mutable_dst = dst_col->assume_mutable(); + + Status st = converter->convert(reinterpret_cast(src_col), mutable_dst); + ASSERT_TRUE(st.ok()); + + auto& dst_data = static_cast(*mutable_dst).get_data(); + ASSERT_EQ(5, dst_data.size()); + EXPECT_FLOAT_EQ(3.14159, dst_data[0]); + EXPECT_FLOAT_EQ(-2.71828, dst_data[1]); + EXPECT_TRUE(std::isinf(dst_data[2]) 
&& dst_data[2] > 0); + EXPECT_TRUE(std::isinf(dst_data[3]) && dst_data[3] < 0); + EXPECT_TRUE(std::isnan(dst_data[4])); + } +} + +// Test decimal type conversions +TEST_F(ColumnTypeConverterTest, TestDecimalConversions) { + // Test DECIMAL32 -> DECIMAL64 (widening) + { + TypeDescriptor src_type(TYPE_DECIMAL32); + src_type.precision = 9; + src_type.scale = 2; + + auto dst_type = std::make_shared>(18, 2); + + auto converter = converter::ColumnTypeConverter::get_converter(src_type, dst_type, + converter::COMMON); + ASSERT_TRUE(converter->support()); + ASSERT_FALSE(converter->is_consistent()); + + auto src_col = ColumnDecimal::create(9, 2); + auto& src_data = src_col->get_data(); + // Test normal values + src_data.resize(0); + src_data.push_back(Decimal32(12345)); // 123.45 + src_data.push_back(Decimal32(-12345)); // -123.45 + + auto dst_col = dst_type->create_column(); + auto mutable_dst = dst_col->assume_mutable(); + + Status st = converter->convert(reinterpret_cast(src_col), mutable_dst); + ASSERT_TRUE(st.ok()); + auto& dst_data = static_cast&>(*mutable_dst).get_data(); + ASSERT_EQ(2, dst_data.size()); + EXPECT_EQ(12345, dst_data[0].value); + EXPECT_EQ(-12345, dst_data[1].value); + } + + // Test DECIMAL32 -> DECIMAL128 (from small decimal to large decimal) + { + TypeDescriptor src_type(TYPE_DECIMAL32); + src_type.precision = 9; + src_type.scale = 2; + + auto dst_type = std::make_shared>(38, 10); + + auto converter = converter::ColumnTypeConverter::get_converter(src_type, dst_type, + converter::COMMON); + ASSERT_TRUE(converter->support()); + ASSERT_FALSE(converter->is_consistent()); + + auto src_col = ColumnDecimal::create(9, 2); + src_col->resize(0); + auto& src_data = src_col->get_data(); + // Test normal values + src_data.push_back(Decimal32(12345)); // 123.45 + src_data.push_back(Decimal32(-67890)); // -678.90 + + auto dst_col = dst_type->create_column(); + auto mutable_dst = dst_col->assume_mutable(); + + Status st = converter->convert(reinterpret_cast(src_col), 
mutable_dst); + ASSERT_TRUE(st.ok()); + + auto& dst_data = static_cast&>(*mutable_dst).get_data(); + ASSERT_EQ(2, dst_data.size()); + EXPECT_EQ(1234500000000L, dst_data[0].value); // 12345 scaled to 123.45000000 + EXPECT_EQ(-6789000000000L, dst_data[1].value); // -67890 scaled to -678.90000000 + } + + // Test DECIMAL64 -> DECIMAL256 (from medium decimal to large decimal) + { + TypeDescriptor src_type(TYPE_DECIMAL64); + src_type.precision = 18; + src_type.scale = 4; + + auto dst_type = std::make_shared>(76, 35); + + auto converter = converter::ColumnTypeConverter::get_converter(src_type, dst_type, + converter::COMMON); + ASSERT_TRUE(converter->support()); + ASSERT_FALSE(converter->is_consistent()); + + auto src_col = ColumnDecimal::create(18, 4); + src_col->resize(0); + auto& src_data = src_col->get_data(); + + // Add test values + src_data.push_back(Decimal64(12345678901234)); // Normal value: 1234567890.1234 + src_data.push_back(Decimal64(-98765432109876)); // Negative value: -9876543210.9876 + + auto dst_col = dst_type->create_column(); + auto mutable_dst = dst_col->assume_mutable(); + + // Perform conversion + Status st = converter->convert(reinterpret_cast(src_col), mutable_dst); + ASSERT_TRUE(st.ok()); + + auto& dst_data = static_cast&>(*mutable_dst).get_data(); + ASSERT_EQ(2, dst_data.size()); + // Verify data + EXPECT_EQ("1234567890.12340000000000000000000000000000000", + dst_data[0].to_string(76, 35)); // Scaled correctly + EXPECT_EQ("-9876543210.98760000000000000000000000000000000", dst_data[1].to_string(76, 35)); + } + + // Test DECIMAL -> INT (with potential precision loss) + { + TypeDescriptor src_type(TYPE_DECIMAL32); + src_type.precision = 9; + src_type.scale = 2; + + auto dst_type = std::make_shared(); + auto nullable_dst_type = std::make_shared(dst_type); + + auto converter = converter::ColumnTypeConverter::get_converter(src_type, nullable_dst_type, + converter::COMMON); + ASSERT_TRUE(converter->support()); + 
ASSERT_FALSE(converter->is_consistent()); + + auto src_col = ColumnDecimal::create(9, 2); + auto& src_data = src_col->get_data(); + src_data.resize(0); + src_data.push_back(Decimal32(12345)); // 123.45 + src_data.push_back(Decimal32(-12345)); // -123.45 + src_data.push_back(Decimal32(23345)); // Too large 233.45 + + auto dst_col = nullable_dst_type->create_column(); + auto mutable_dst = dst_col->assume_mutable(); + auto& nullable_col = static_cast(*mutable_dst); + auto& nested_col = static_cast(nullable_col.get_nested_column()); + auto& null_map = nullable_col.get_null_map_data(); + null_map.resize_fill(src_data.size(), 0); + + ASSERT_EQ(3, src_data.size()); + Status st = converter->convert(reinterpret_cast(src_col), mutable_dst); + ASSERT_TRUE(st.ok()); + ASSERT_EQ(3, nested_col.size()); + EXPECT_EQ(123, nested_col.get_data()[0]); // Truncated to 123 + EXPECT_EQ(-123, nested_col.get_data()[1]); // Truncated to -123 + for (int i = 0; i < 2; i++) { + EXPECT_EQ(0, null_map[i]); + } + EXPECT_EQ(1, null_map[2]); // Should be null due to overflow + } + // TEST INT -> DECIMAL + { + TypeDescriptor src_type(TYPE_INT); + auto dst_type = std::make_shared>(10, 2); + + auto converter = converter::ColumnTypeConverter::get_converter(src_type, dst_type, + converter::COMMON); + ASSERT_TRUE(converter->support()); + ASSERT_FALSE(converter->is_consistent()); + + auto src_col = ColumnInt32::create(); + auto& src_data = src_col->get_data(); + // Test normal values + src_data.resize(0); + src_data.push_back(12345); // 123.45 after scaling + src_data.push_back(-67890); // -678.90 after scaling + src_data.push_back(0); // Zero check + + auto dst_col = dst_type->create_column(); + auto mutable_dst = dst_col->assume_mutable(); + + Status st = converter->convert(reinterpret_cast(src_col), mutable_dst); + ASSERT_TRUE(st.ok()); + auto& dst_data = static_cast&>(*mutable_dst).get_data(); + ASSERT_EQ(3, dst_data.size()); + EXPECT_EQ(1234500, dst_data[0].value); // 1234500 represents 123.45 + 
EXPECT_EQ(-6789000, dst_data[1].value); // -6789000 represents -678.90 + EXPECT_EQ(0, dst_data[2].value); // Zero remains zero + } + + // TEST DECIMAL64 -> DECIMAL32 (narrowing) (1) + { + TypeDescriptor src_type(TYPE_DECIMAL64); + src_type.precision = 18; + src_type.scale = 4; + + auto dst_type = std::make_shared>(9, 4); + auto nullable_dst_type = std::make_shared(dst_type); + + auto converter = converter::ColumnTypeConverter::get_converter(src_type, nullable_dst_type, + converter::COMMON); + ASSERT_TRUE(converter->support()); + ASSERT_FALSE(converter->is_consistent()); + + auto src_col = ColumnDecimal::create(18, 4); + auto& src_data = src_col->get_data(); + src_data.resize(0); + // Add test values + src_data.push_back(Decimal64(1234567890)); // In range + src_data.push_back(Decimal64(999999999)); // Edge case: max for Decimal32 + src_data.push_back(Decimal64(1000000000)); // Out of range (overflow) + src_data.push_back(Decimal64(-999999999)); // Edge case: negative max for Decimal32 + src_data.push_back(Decimal64(-1000000000)); // Out of range (underflow) + + auto dst_col = nullable_dst_type->create_column(); + auto mutable_dst = dst_col->assume_mutable(); + + auto& nullable_col = static_cast(*mutable_dst); + auto& null_map = nullable_col.get_null_map_data(); + null_map.resize_fill(src_data.size(), 0); + + // Perform conversion + Status st = converter->convert(reinterpret_cast(src_col), mutable_dst); + ASSERT_FALSE(st.ok()); + } + + // TEST DECIMAL64 -> DECIMAL32 (narrowing) (2) + { + TypeDescriptor src_type(TYPE_DECIMAL64); + src_type.precision = 18; + src_type.scale = 4; + + auto dst_type = std::make_shared>(9, 4); + auto nullable_dst_type = std::make_shared(dst_type); + + auto converter = converter::ColumnTypeConverter::get_converter(src_type, nullable_dst_type, + converter::COMMON); + ASSERT_TRUE(converter->support()); + ASSERT_FALSE(converter->is_consistent()); + + auto src_col = ColumnDecimal::create(18, 4); + auto& src_data = src_col->get_data(); + // Add 
test values + src_data.resize(0); + src_data.push_back(Decimal64(123456789)); // In range + src_data.push_back(Decimal64(999999999)); // Edge case: max for Decimal32 + src_data.push_back(Decimal64(-999999999)); // Edge case: negative max for Decimal32 + ASSERT_EQ(3, src_data.size()); + auto dst_col = nullable_dst_type->create_column(); + dst_col->resize(0); + auto mutable_dst = dst_col->assume_mutable(); + + auto& nullable_col = static_cast(*mutable_dst); + auto& nested_col = static_cast&>(nullable_col.get_nested_column()); + auto& null_map = nullable_col.get_null_map_data(); + null_map.resize_fill(src_data.size(), 0); + + // Perform conversion + Status st = converter->convert(reinterpret_cast(src_col), mutable_dst); + ASSERT_TRUE(st.ok()); + + ASSERT_EQ(3, nested_col.size()); + EXPECT_EQ(123456789, nested_col.get_data()[0].value); // Valid conversion + EXPECT_EQ(999999999, nested_col.get_data()[1].value); // Valid edge case + EXPECT_EQ(-999999999, nested_col.get_data()[2].value); // Valid negative edge case + + for (int i = 0; i < 3; i++) { + EXPECT_EQ(0, null_map[i]); + } + } + + // TEST FLOAT -> DECIMAL + { + TypeDescriptor src_type(TYPE_FLOAT); + auto dst_type = std::make_shared>(10, 2); + auto nullable_dst_type = std::make_shared(dst_type); + + auto converter = converter::ColumnTypeConverter::get_converter(src_type, nullable_dst_type, + converter::COMMON); + ASSERT_TRUE(converter->support()); + ASSERT_FALSE(converter->is_consistent()); + + auto src_col = ColumnFloat32::create(); + auto& src_data = src_col->get_data(); + // Add test values + src_data.resize(0); + src_data.push_back(123.45f); // Normal value + src_data.push_back(-678.90f); // Negative value + src_data.push_back(std::numeric_limits::max()); // Overflow (too large for Decimal64) + src_data.push_back(0.0f); // Zero value + src_data.push_back(std::numeric_limits::infinity()); // Infinity + src_data.push_back(std::numeric_limits::quiet_NaN()); // NaN + + auto dst_col = 
nullable_dst_type->create_column(); + dst_col->resize(0); + auto mutable_dst = dst_col->assume_mutable(); + + auto& nullable_col = static_cast(*mutable_dst); + auto& nested_col = static_cast&>(nullable_col.get_nested_column()); + auto& null_map = nullable_col.get_null_map_data(); + null_map.resize_fill(src_data.size(), 0); + + // Perform conversion + Status st = converter->convert(reinterpret_cast(src_col), mutable_dst); + ASSERT_TRUE(st.ok()); + + ASSERT_EQ(6, nested_col.size()); + EXPECT_EQ(12345, nested_col.get_data()[0].value); // 123.45 scaled to 12345 + EXPECT_EQ(-67890, nested_col.get_data()[1].value); // -678.90 scaled to -67890 + for (int i = 0; i < 2; i++) { + EXPECT_EQ(0, null_map[i]); + } + EXPECT_EQ(0, null_map[3]); + EXPECT_EQ(1, null_map[2]); // Overflow: value too large + EXPECT_EQ(0, nested_col.get_data()[3].value); // Zero remains zero + EXPECT_EQ(1, null_map[4]); // Infinity should be null + EXPECT_EQ(1, null_map[5]); // NaN should be null + } + + // TEST STRING -> DECIMAL + { + TypeDescriptor src_type(TYPE_STRING); + auto dst_type = std::make_shared>(10, 2); + auto nullable_dst_type = std::make_shared(dst_type); + + auto converter = converter::ColumnTypeConverter::get_converter(src_type, nullable_dst_type, + converter::COMMON); + ASSERT_TRUE(converter->support()); + ASSERT_FALSE(converter->is_consistent()); + + auto src_col = ColumnString::create(); + // Add test strings + src_col->resize(0); + src_col->insert_data("123.45", 6); // Normal string + src_col->insert_data("-678.90", 7); // Negative value string + src_col->insert_data("1e20", 4); // Out of range for Decimal64 + src_col->insert_data("abc", 3); // Invalid format + src_col->insert_data("", 0); // Empty string + src_col->insert_data("0.0", 3); // Zero value + src_col->insert_data("9999999999.99", 13); // Edge case: max valid value within precision + + auto dst_col = nullable_dst_type->create_column(); + dst_col->resize(0); + auto mutable_dst = dst_col->assume_mutable(); + + auto& 
nullable_col = static_cast(*mutable_dst); + auto& nested_col = static_cast&>(nullable_col.get_nested_column()); + auto& null_map = nullable_col.get_null_map_data(); + null_map.resize_fill(src_col->size(), 0); + + // Perform conversion + Status st = converter->convert(reinterpret_cast(src_col), mutable_dst); + ASSERT_TRUE(st.ok()); + + ASSERT_EQ(7, nested_col.size()); + EXPECT_EQ(12345, nested_col.get_data()[0].value); // "123.45" -> 12345 + EXPECT_EQ(-67890, nested_col.get_data()[1].value); // "-678.90" -> -67890 + EXPECT_EQ(1, null_map[2]); // Out of range -> null + EXPECT_EQ(1, null_map[3]); // Invalid format -> null + EXPECT_EQ(1, null_map[4]); // Empty string -> null + EXPECT_EQ(0, nested_col.get_data()[5].value); // "0.0" -> 0 + EXPECT_EQ(1, null_map[6]); // Edge case: maximum valid conversion + for (int i = 0; i < 2; i++) { + EXPECT_EQ(0, null_map[i]); + } + EXPECT_EQ(0, null_map[5]); + } +} + +// Test string type conversions +TEST_F(ColumnTypeConverterTest, TestStringConversions) { + // Test numeric to string conversions + {// INT -> STRING + {TypeDescriptor src_type(TYPE_INT); + auto dst_type = std::make_shared(); + + auto converter = + converter::ColumnTypeConverter::get_converter(src_type, dst_type, converter::COMMON); + ASSERT_TRUE(converter->support()); + ASSERT_FALSE(converter->is_consistent()); + + auto src_col = ColumnInt32::create(); + auto& src_data = src_col->get_data(); + src_data.push_back(std::numeric_limits::max()); + src_data.push_back(std::numeric_limits::min()); + src_data.push_back(0); + + auto dst_col = dst_type->create_column(); + auto mutable_dst = dst_col->assume_mutable(); + + Status st = converter->convert(reinterpret_cast(src_col), mutable_dst); + ASSERT_TRUE(st.ok()); + + auto& string_col = static_cast(*mutable_dst); + ASSERT_EQ(3, string_col.size()); + EXPECT_EQ(std::to_string(std::numeric_limits::max()), + string_col.get_data_at(0).to_string()); + EXPECT_EQ(std::to_string(std::numeric_limits::min()), + 
string_col.get_data_at(1).to_string()); + EXPECT_EQ("0", string_col.get_data_at(2).to_string()); +} + +// DOUBLE -> STRING +{ + TypeDescriptor src_type(TYPE_DOUBLE); + auto dst_type = std::make_shared(); + + auto converter = + converter::ColumnTypeConverter::get_converter(src_type, dst_type, converter::COMMON); + ASSERT_TRUE(converter->support()); + ASSERT_FALSE(converter->is_consistent()); + + auto src_col = ColumnFloat64::create(); + auto& src_data = src_col->get_data(); + src_data.push_back(3.14159265359); + src_data.push_back(-2.71828182846); + src_data.push_back(std::numeric_limits::infinity()); + src_data.push_back(std::numeric_limits::quiet_NaN()); + + auto dst_col = dst_type->create_column(); + auto mutable_dst = dst_col->assume_mutable(); + + Status st = converter->convert(reinterpret_cast(src_col), mutable_dst); + ASSERT_TRUE(st.ok()); + + auto& string_col = static_cast(*mutable_dst); + ASSERT_EQ(4, string_col.size()); + // Note: Exact string representation may vary by platform + EXPECT_TRUE(string_col.get_data_at(0).to_string().find("3.14159") == 0); + EXPECT_TRUE(string_col.get_data_at(1).to_string().find("-2.71828") == 0); + EXPECT_TRUE(string_col.get_data_at(2).to_string().find("inf") != std::string::npos); + EXPECT_TRUE(string_col.get_data_at(3).to_string().find("nan") != std::string::npos); +} +} // namespace doris::vectorized + +// Test string to numeric conversions with invalid input +{ + TypeDescriptor src_type(TYPE_STRING); + auto dst_type = std::make_shared(); + auto nullable_dst_type = std::make_shared(dst_type); + + auto converter = converter::ColumnTypeConverter::get_converter(src_type, nullable_dst_type, + converter::COMMON); + ASSERT_TRUE(converter->support()); + ASSERT_FALSE(converter->is_consistent()); + + auto src_col = ColumnString::create(); + src_col->resize(0); + src_col->insert_data("42", 2); + src_col->insert_data("not a number", 11); + src_col->insert_data("2147483648", 10); // Greater than INT32_MAX + + auto dst_col = 
nullable_dst_type->create_column(); + auto mutable_dst = dst_col->assume_mutable(); + auto& nullable_col = static_cast(*mutable_dst); + auto& nested_col = static_cast(nullable_col.get_nested_column()); + auto& null_map = nullable_col.get_null_map_data(); + null_map.resize_fill(src_col->size(), 0); + Status st = converter->convert(reinterpret_cast(src_col), mutable_dst); + ASSERT_TRUE(st.ok()); + + ASSERT_EQ(3, nested_col.size()); + EXPECT_EQ(42, nested_col.get_data()[0]); + EXPECT_EQ(1, null_map[1]); // Invalid format + EXPECT_EQ(1, null_map[2]); // Out of range + EXPECT_EQ(0, null_map[0]); +} +// TEST DECIMAL -> STRING +{ + TypeDescriptor src_type(TYPE_DECIMAL32); + src_type.precision = 9; + src_type.scale = 2; + + auto dst_type = std::make_shared(); + + auto converter = + converter::ColumnTypeConverter::get_converter(src_type, dst_type, converter::COMMON); + ASSERT_TRUE(converter->support()); + ASSERT_FALSE(converter->is_consistent()); + + auto src_col = ColumnDecimal::create(9, 2); + auto& src_data = src_col->get_data(); + // Add test values + src_data.resize(0); + src_data.push_back(Decimal32(12345)); // 123.45 + src_data.push_back(Decimal32(-67890)); // -678.90 + src_data.push_back(Decimal32(0)); // Zero + + auto dst_col = dst_type->create_column(); + dst_col->resize(0); + auto mutable_dst = dst_col->assume_mutable(); + + Status st = converter->convert(reinterpret_cast(src_col), mutable_dst); + ASSERT_TRUE(st.ok()); + + auto& string_col = static_cast(*mutable_dst); + ASSERT_EQ(3, string_col.size()); + EXPECT_EQ("123.45", string_col.get_data_at(0).to_string()); // 123.45 + EXPECT_EQ("-678.90", string_col.get_data_at(1).to_string()); // -678.90 + EXPECT_EQ("0.00", string_col.get_data_at(2).to_string()); // Zero value +} + +// TEST DATE/TIMESTAMP -> STRING +{ + TypeDescriptor src_type(TYPE_DATEV2); + auto dst_type = std::make_shared(); + + auto converter = + converter::ColumnTypeConverter::get_converter(src_type, dst_type, converter::COMMON); + 
ASSERT_TRUE(converter->support()); + ASSERT_FALSE(converter->is_consistent()); + + auto src_col = ColumnDateV2::create(); + auto& src_data = src_col->get_data(); + // Add test date values + src_data.resize(0); + DateV2Value value; + value.unchecked_set_time(2021, 1, 1, 0, 0, 0); + src_data.push_back( + *reinterpret_cast(&value)); // "2021-01-01" in days format + value.unchecked_set_time(1970, 1, 1, 0, 0, 0); + src_data.push_back(*reinterpret_cast( + &value)); // "1970-01-01" in days format (epoch start) + value.unchecked_set_time(2070, 1, 1, 0, 0, 0); + src_data.push_back( + *reinterpret_cast(&value)); // "2070-01-01" in days format + + auto dst_col = dst_type->create_column(); + dst_col->resize(0); + auto mutable_dst = dst_col->assume_mutable(); + + Status st = converter->convert(reinterpret_cast(src_col), mutable_dst); + ASSERT_TRUE(st.ok()); + + auto& string_col = static_cast(*mutable_dst); + ASSERT_EQ(3, string_col.size()); + EXPECT_EQ("2021-01-01", string_col.get_data_at(0).to_string()); + EXPECT_EQ("1970-01-01", string_col.get_data_at(1).to_string()); + EXPECT_EQ("2070-01-01", string_col.get_data_at(2).to_string()); +} + +// TEST BOOLEAN -> STRING +{ + TypeDescriptor src_type(TYPE_BOOLEAN); + auto dst_type = std::make_shared(); + + auto converter = + converter::ColumnTypeConverter::get_converter(src_type, dst_type, converter::COMMON); + ASSERT_TRUE(converter->support()); + ASSERT_FALSE(converter->is_consistent()); + + auto src_col = ColumnUInt8::create(); + auto& src_data = src_col->get_data(); + src_data.resize(0); + // Add boolean values + src_data.push_back(1); // true + src_data.push_back(0); // false + src_data.push_back(1); // true + src_data.push_back(0); // false + + auto dst_col = dst_type->create_column(); + auto mutable_dst = dst_col->assume_mutable(); + + Status st = converter->convert(reinterpret_cast(src_col), mutable_dst); + ASSERT_TRUE(st.ok()); + + auto& string_col = static_cast(*mutable_dst); + ASSERT_EQ(4, string_col.size()); + 
EXPECT_EQ("TRUE", string_col.get_data_at(0).to_string()); // true + EXPECT_EQ("FALSE", string_col.get_data_at(1).to_string()); // false + EXPECT_EQ("TRUE", string_col.get_data_at(2).to_string()); // true + EXPECT_EQ("FALSE", string_col.get_data_at(3).to_string()); // false +} + +// TEST STRING -> BOOLEAN (for ORC file format, Apache Hive behavior) +{ + TypeDescriptor src_type(TYPE_STRING); + auto dst_type = std::make_shared(); // BOOLEAN represented as UInt8 + auto nullable_dst_type = std::make_shared(dst_type); + + auto converter = converter::ColumnTypeConverter::get_converter(src_type, nullable_dst_type, + converter::ORC); + ASSERT_TRUE(converter->support()); + ASSERT_FALSE(converter->is_consistent()); + + auto src_col = ColumnString::create(); + // Add test strings + src_col->resize(0); + src_col->insert_data("0", 1); // Hive: false + src_col->insert_data("123", 3); // Hive: true + src_col->insert_data("-1", 2); // Hive: true + src_col->insert_data(" ", 1); // Hive: null + src_col->insert_data("not_a_number", 13); // Hive: null + src_col->insert_data("1.5", 3); // Hive: null (not an integer) + src_col->insert_data("", 0); // Hive: null + + auto dst_col = nullable_dst_type->create_column(); + auto mutable_dst = dst_col->assume_mutable(); + + auto& nullable_col = static_cast(*mutable_dst); + auto& nested_col = static_cast( + nullable_col.get_nested_column()); // Boolean as UInt8 (0 or 1) + auto& null_map = nullable_col.get_null_map_data(); + null_map.resize_fill(src_col->size(), 0); + + // Perform conversion + Status st = converter->convert(reinterpret_cast(src_col), mutable_dst); + ASSERT_TRUE(st.ok()); + + ASSERT_EQ(7, nested_col.size()); + EXPECT_EQ(0, nested_col.get_data()[0]); // "0" -> false (0) + EXPECT_EQ(1, nested_col.get_data()[1]); // "123" -> true (1) + EXPECT_EQ(1, nested_col.get_data()[2]); // "-1" -> true (1) + EXPECT_EQ(1, null_map[3]); // " " -> null + EXPECT_EQ(1, null_map[4]); // "not_a_number" -> null + EXPECT_EQ(1, null_map[5]); // "1.5" -> 
null + EXPECT_EQ(1, null_map[6]); // "" -> null + + for (int i = 0; i < 3; i++) { + EXPECT_EQ(0, null_map[i]); + } +} +} + +TEST_F(ColumnTypeConverterTest, TestUnsupportedConversions) { + { + std::vector> unsupported_conversions = { + + {TYPE_BOOLEAN, TYPE_TINYINT}, + {TYPE_BOOLEAN, TYPE_SMALLINT}, + {TYPE_BOOLEAN, TYPE_INT}, + {TYPE_BOOLEAN, TYPE_BIGINT}, + {TYPE_BOOLEAN, TYPE_FLOAT}, + {TYPE_BOOLEAN, TYPE_DOUBLE}, + {TYPE_BOOLEAN, TYPE_DATE}, + {TYPE_BOOLEAN, TYPE_DATEV2}, + {TYPE_BOOLEAN, TYPE_TIMEV2}, + {TYPE_BOOLEAN, TYPE_DATETIME}, + {TYPE_BOOLEAN, TYPE_DATETIMEV2}, + + {TYPE_TINYINT, TYPE_BOOLEAN}, + {TYPE_SMALLINT, TYPE_BOOLEAN}, + {TYPE_INT, TYPE_BOOLEAN}, + {TYPE_BIGINT, TYPE_BOOLEAN}, + + {TYPE_TINYINT, TYPE_DATE}, + {TYPE_SMALLINT, TYPE_DATE}, + {TYPE_INT, TYPE_DATE}, + {TYPE_BIGINT, TYPE_DATE}, + {TYPE_TINYINT, TYPE_DATEV2}, + {TYPE_SMALLINT, TYPE_DATEV2}, + {TYPE_INT, TYPE_DATEV2}, + {TYPE_BIGINT, TYPE_DATEV2}, + {TYPE_TINYINT, TYPE_DATETIME}, + {TYPE_SMALLINT, TYPE_DATETIME}, + {TYPE_INT, TYPE_DATETIME}, + {TYPE_BIGINT, TYPE_DATETIME}, + {TYPE_TINYINT, TYPE_DATETIMEV2}, + {TYPE_SMALLINT, TYPE_DATETIMEV2}, + {TYPE_INT, TYPE_DATETIMEV2}, + {TYPE_BIGINT, TYPE_DATETIMEV2}, + {TYPE_TINYINT, TYPE_TIMEV2}, + {TYPE_SMALLINT, TYPE_TIMEV2}, + {TYPE_INT, TYPE_TIMEV2}, + {TYPE_BIGINT, TYPE_TIMEV2}, + + {TYPE_FLOAT, TYPE_BOOLEAN}, + {TYPE_FLOAT, TYPE_INT}, + {TYPE_FLOAT, TYPE_SMALLINT}, + {TYPE_FLOAT, TYPE_TINYINT}, + {TYPE_FLOAT, TYPE_BIGINT}, + {TYPE_FLOAT, TYPE_DATE}, + {TYPE_FLOAT, TYPE_DATEV2}, + {TYPE_FLOAT, TYPE_TIMEV2}, + {TYPE_FLOAT, TYPE_DATETIME}, + {TYPE_FLOAT, TYPE_DATETIMEV2}, + + {TYPE_DOUBLE, TYPE_BOOLEAN}, + {TYPE_DOUBLE, TYPE_INT}, + {TYPE_DOUBLE, TYPE_SMALLINT}, + {TYPE_DOUBLE, TYPE_TINYINT}, + {TYPE_DOUBLE, TYPE_BIGINT}, + {TYPE_DOUBLE, TYPE_DATE}, + {TYPE_DOUBLE, TYPE_DATEV2}, + {TYPE_DOUBLE, TYPE_TIMEV2}, + {TYPE_DOUBLE, TYPE_DATETIME}, + {TYPE_DOUBLE, TYPE_DATETIMEV2}, + + {TYPE_DOUBLE, TYPE_FLOAT}, + + {TYPE_DATE, TYPE_BOOLEAN}, + 
{TYPE_DATE, TYPE_TINYINT}, + {TYPE_DATE, TYPE_SMALLINT}, + {TYPE_DATE, TYPE_INT}, + {TYPE_DATE, TYPE_BIGINT}, + {TYPE_DATE, TYPE_FLOAT}, + {TYPE_DATE, TYPE_DOUBLE}, + {TYPE_DATEV2, TYPE_BOOLEAN}, + {TYPE_DATEV2, TYPE_TINYINT}, + {TYPE_DATEV2, TYPE_SMALLINT}, + {TYPE_DATEV2, TYPE_INT}, + {TYPE_DATEV2, TYPE_BIGINT}, + {TYPE_DATEV2, TYPE_FLOAT}, + {TYPE_DATEV2, TYPE_DOUBLE}, + {TYPE_TIMEV2, TYPE_BOOLEAN}, + {TYPE_TIMEV2, TYPE_TINYINT}, + {TYPE_TIMEV2, TYPE_SMALLINT}, + {TYPE_TIMEV2, TYPE_INT}, + {TYPE_TIMEV2, TYPE_BIGINT}, + {TYPE_TIMEV2, TYPE_FLOAT}, + {TYPE_TIMEV2, TYPE_DOUBLE}, + {TYPE_DATETIME, TYPE_BOOLEAN}, + {TYPE_DATETIME, TYPE_TINYINT}, + {TYPE_DATETIME, TYPE_SMALLINT}, + {TYPE_DATETIME, TYPE_INT}, + {TYPE_DATETIME, TYPE_BIGINT}, + {TYPE_DATETIME, TYPE_FLOAT}, + {TYPE_DATETIME, TYPE_DOUBLE}, + {TYPE_DATETIMEV2, TYPE_BOOLEAN}, + {TYPE_DATETIMEV2, TYPE_TINYINT}, + {TYPE_DATETIMEV2, TYPE_SMALLINT}, + {TYPE_DATETIMEV2, TYPE_INT}, + {TYPE_DATETIMEV2, TYPE_BIGINT}, + {TYPE_DATETIMEV2, TYPE_FLOAT}, + {TYPE_DATETIMEV2, TYPE_DOUBLE}, + }; + + for (const auto& [src_type_enum, dst_type_enum] : unsupported_conversions) { + TypeDescriptor src_type(src_type_enum); + for (auto len : {-1, 1, 2}) { + TypeDescriptor dst_type(dst_type_enum); + dst_type.len = len; + auto converter = converter::ColumnTypeConverter::get_converter( + src_type, DataTypeFactory::instance().create_data_type(dst_type, false), + converter::COMMON); + + ASSERT_FALSE(converter->support()) + << "Conversion from " << src_type.debug_string() << " to " + << dst_type.debug_string() << " should not be supported"; + } + } + } + //to decimal + { + std::vector> unsupported_conversions = { + {TYPE_BOOLEAN, TYPE_DECIMAL32}, {TYPE_DATE, TYPE_DECIMAL32}, + {TYPE_DATEV2, TYPE_DECIMAL32}, {TYPE_TIMEV2, TYPE_DECIMAL32}, + {TYPE_DATETIME, TYPE_DECIMAL32}, {TYPE_DATETIMEV2, TYPE_DECIMAL32}, + }; + + for (const auto& [src_type_enum, dst_type_enum] : unsupported_conversions) { + TypeDescriptor src_type(src_type_enum); + + 
for (int precision = min_decimal_precision(); + precision <= BeConsts::MAX_DECIMAL256_PRECISION; precision++) { + for (int scale = 0; scale <= precision; scale++) { + TypeDescriptor dst_type(dst_type_enum); + dst_type.precision = precision; + dst_type.scale = scale; + auto converter = converter::ColumnTypeConverter::get_converter( + src_type, DataTypeFactory::instance().create_data_type(dst_type, false), + converter::COMMON); + + ASSERT_FALSE(converter->support()) + << "Conversion from " << src_type.debug_string() << " to " + << dst_type.debug_string() << " should not be supported"; + } + } + } + } + + //from decimal + { + std::vector> unsupported_conversions = { + {TYPE_DECIMAL32, TYPE_BOOLEAN}, {TYPE_DECIMAL32, TYPE_DATE}, + {TYPE_DECIMAL32, TYPE_DATEV2}, {TYPE_DECIMAL32, TYPE_TIMEV2}, + {TYPE_DECIMAL32, TYPE_DATETIME}, {TYPE_DECIMAL32, TYPE_DATETIMEV2}, + }; + + for (const auto& [src_type_enum, dst_type_enum] : unsupported_conversions) { + TypeDescriptor src_type(src_type_enum); + + for (int precision = min_decimal_precision(); + precision <= BeConsts::MAX_DECIMAL256_PRECISION; precision++) { + for (int scale = 0; scale <= precision; scale++) { + src_type.precision = precision; + src_type.scale = scale; + auto decimal_date_type = + DataTypeFactory::instance().create_data_type(src_type, false); + + TypeDescriptor dst_type(dst_type_enum); + auto converter = converter::ColumnTypeConverter::get_converter( + decimal_date_type->get_type_as_type_descriptor(), + DataTypeFactory::instance().create_data_type(dst_type, false), + converter::COMMON); + + ASSERT_FALSE(converter->support()) + << "Conversion from " << src_type.debug_string() << " to " + << dst_type.debug_string() << " should not be supported"; + } + } + } + } +} + +TEST_F(ColumnTypeConverterTest, TestEmptyColumnConversions) { + // Test empty column + { + TypeDescriptor src_type(TYPE_INT); + auto dst_type = std::make_shared(); + + auto converter = converter::ColumnTypeConverter::get_converter(src_type, dst_type, 
+ converter::COMMON); + ASSERT_TRUE(converter->support()); + ASSERT_FALSE(converter->is_consistent()); + + auto src_col = ColumnInt32::create(); // Empty column (no data) + auto dst_col = dst_type->create_column(); + auto mutable_dst = dst_col->assume_mutable(); + src_col->resize(0); + dst_col->resize(0); + // Perform conversion + Status st = converter->convert(reinterpret_cast(src_col), mutable_dst); + ASSERT_TRUE(st.ok()); + + // Check size remains zero + ASSERT_EQ(0, static_cast(*mutable_dst).get_data().size()); + } +} + +} // namespace doris::vectorized \ No newline at end of file diff --git a/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_scripts/run07.hql b/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_scripts/run07.hql index ffceea62a78b73..a2b19e7d071302 100755 --- a/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_scripts/run07.hql +++ b/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_scripts/run07.hql @@ -1,11 +1,11 @@ CREATE EXTERNAL TABLE `delta_encoding_optional_column`( - c_customer_sk int, - c_current_cdemo_sk int, - c_current_hdemo_sk int, - c_current_addr_sk int, - c_first_shipto_date_sk int, - c_first_sales_date_sk int, - c_birth_year int, + c_customer_sk bigint, + c_current_cdemo_sk bigint, + c_current_hdemo_sk bigint, + c_current_addr_sk bigint, + c_first_shipto_date_sk bigint, + c_first_sales_date_sk bigint, + c_birth_year bigint, c_customer_id string, c_salutation string, c_first_name string, diff --git a/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_scripts/run75.hql b/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_scripts/run75.hql new file mode 100644 index 00000000000000..41db62fbaba961 --- /dev/null +++ b/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_scripts/run75.hql @@ -0,0 +1,515 @@ +create database if not exists schema_change; +use schema_change; + +CREATE TABLE IF NOT EXISTS 
parquet_primitive_types_to_boolean ( + id INT, + bool_col BOOLEAN, + int_col BOOLEAN, + smallint_col BOOLEAN, + tinyint_col BOOLEAN, + bigint_col BOOLEAN, + float_col BOOLEAN, + double_col BOOLEAN, + string_col BOOLEAN, + char1_col BOOLEAN, + char2_col BOOLEAN, + varchar_col BOOLEAN, + date_col BOOLEAN, + timestamp_col BOOLEAN, + decimal1_col BOOLEAN, + decimal2_col BOOLEAN +) STORED AS PARQUET +LOCATION '/user/doris/preinstalled_data/parquet_table/parquet_schema_change'; + +CREATE TABLE IF NOT EXISTS parquet_primitive_types_to_bigint ( + id INT, + bool_col BIGINT, + int_col BIGINT, + smallint_col BIGINT, + tinyint_col BIGINT, + bigint_col BIGINT, + float_col BIGINT, + double_col BIGINT, + string_col BIGINT, + char1_col BIGINT, + char2_col BIGINT, + varchar_col BIGINT, + date_col BIGINT, + timestamp_col BIGINT, + decimal1_col BIGINT, + decimal2_col BIGINT +) STORED AS PARQUET +LOCATION '/user/doris/preinstalled_data/parquet_table/parquet_schema_change'; + +CREATE TABLE IF NOT EXISTS parquet_primitive_types_to_int ( + id INT, + bool_col INT, + int_col INT, + smallint_col INT, + tinyint_col INT, + bigint_col INT, + float_col INT, + double_col INT, + string_col INT, + char1_col INT, + char2_col INT, + varchar_col INT, + date_col INT, + timestamp_col INT, + decimal1_col INT, + decimal2_col INT +) STORED AS PARQUET +LOCATION '/user/doris/preinstalled_data/parquet_table/parquet_schema_change'; + +CREATE TABLE IF NOT EXISTS parquet_primitive_types_to_smallint ( + id INT, + bool_col SMALLINT, + int_col SMALLINT, + smallint_col SMALLINT, + tinyint_col SMALLINT, + bigint_col SMALLINT, + float_col SMALLINT, + double_col SMALLINT, + string_col SMALLINT, + char1_col SMALLINT, + char2_col SMALLINT, + varchar_col SMALLINT, + date_col SMALLINT, + timestamp_col SMALLINT, + decimal1_col SMALLINT, + decimal2_col SMALLINT +) STORED AS PARQUET +LOCATION '/user/doris/preinstalled_data/parquet_table/parquet_schema_change'; + +CREATE TABLE IF NOT EXISTS parquet_primitive_types_to_tinyint 
( + id INT, + bool_col TINYINT, + int_col TINYINT, + smallint_col TINYINT, + tinyint_col TINYINT, + bigint_col TINYINT, + float_col TINYINT, + double_col TINYINT, + string_col TINYINT, + char1_col TINYINT, + char2_col TINYINT, + varchar_col TINYINT, + date_col TINYINT, + timestamp_col TINYINT, + decimal1_col TINYINT, + decimal2_col TINYINT +) STORED AS PARQUET +LOCATION '/user/doris/preinstalled_data/parquet_table/parquet_schema_change'; + +CREATE TABLE IF NOT EXISTS parquet_primitive_types_to_float ( + id INT, + bool_col FLOAT, + int_col FLOAT, + smallint_col FLOAT, + tinyint_col FLOAT, + bigint_col FLOAT, + float_col FLOAT, + double_col FLOAT, + string_col FLOAT, + char1_col FLOAT, + char2_col FLOAT, + varchar_col FLOAT, + date_col FLOAT, + timestamp_col FLOAT, + decimal1_col FLOAT, + decimal2_col FLOAT +) STORED AS PARQUET +LOCATION '/user/doris/preinstalled_data/parquet_table/parquet_schema_change'; + +CREATE TABLE IF NOT EXISTS parquet_primitive_types_to_double ( + id INT, + bool_col DOUBLE, + int_col DOUBLE, + smallint_col DOUBLE, + tinyint_col DOUBLE, + bigint_col DOUBLE, + float_col DOUBLE, + double_col DOUBLE, + string_col DOUBLE, + char1_col DOUBLE, + char2_col DOUBLE, + varchar_col DOUBLE, + date_col DOUBLE, + timestamp_col DOUBLE, + decimal1_col DOUBLE, + decimal2_col DOUBLE +) STORED AS PARQUET +LOCATION '/user/doris/preinstalled_data/parquet_table/parquet_schema_change'; + +CREATE TABLE IF NOT EXISTS parquet_primitive_types_to_string ( + id INT, + bool_col STRING, + int_col STRING, + smallint_col STRING, + tinyint_col STRING, + bigint_col STRING, + float_col STRING, + double_col STRING, + string_col STRING, + char1_col STRING, + char2_col STRING, + varchar_col STRING, + date_col STRING, + timestamp_col STRING, + decimal1_col STRING, + decimal2_col STRING +) STORED AS PARQUET +LOCATION '/user/doris/preinstalled_data/parquet_table/parquet_schema_change'; + +CREATE TABLE IF NOT EXISTS parquet_primitive_types_to_date ( + id INT, + bool_col DATE, + int_col 
DATE, + smallint_col DATE, + tinyint_col DATE, + bigint_col DATE, + float_col DATE, + double_col DATE, + string_col DATE, + char1_col DATE, + char2_col DATE, + varchar_col DATE, + date_col DATE, + timestamp_col DATE, + decimal1_col DATE, + decimal2_col DATE +) STORED AS PARQUET +LOCATION '/user/doris/preinstalled_data/parquet_table/parquet_schema_change'; + +CREATE TABLE IF NOT EXISTS parquet_primitive_types_to_timestamp ( + id INT, + bool_col TIMESTAMP, + int_col TIMESTAMP, + smallint_col TIMESTAMP, + tinyint_col TIMESTAMP, + bigint_col TIMESTAMP, + float_col TIMESTAMP, + double_col TIMESTAMP, + string_col TIMESTAMP, + char1_col TIMESTAMP, + char2_col TIMESTAMP, + varchar_col TIMESTAMP, + date_col TIMESTAMP, + timestamp_col TIMESTAMP, + decimal1_col TIMESTAMP, + decimal2_col TIMESTAMP +) STORED AS PARQUET +LOCATION '/user/doris/preinstalled_data/parquet_table/parquet_schema_change'; + + +CREATE TABLE IF NOT EXISTS parquet_primitive_types_to_decimal1 ( + id INT, + bool_col DECIMAL(20,5), + int_col DECIMAL(20,5), + smallint_col DECIMAL(20,5), + tinyint_col DECIMAL(20,5), + bigint_col DECIMAL(20,5), + float_col DECIMAL(20,5), + double_col DECIMAL(20,5), + string_col DECIMAL(20,5), + char1_col DECIMAL(20,5), + char2_col DECIMAL(20,5), + varchar_col DECIMAL(20,5), + date_col DECIMAL(20,5), + timestamp_col DECIMAL(20,5), + decimal1_col DECIMAL(20,5), + decimal2_col DECIMAL(20,5) +) STORED AS PARQUET +LOCATION '/user/doris/preinstalled_data/parquet_table/parquet_schema_change'; + +CREATE TABLE IF NOT EXISTS parquet_primitive_types_to_decimal2 ( + id INT, + bool_col DECIMAL(7,1), + int_col DECIMAL(7,1), + smallint_col DECIMAL(7,1), + tinyint_col DECIMAL(7,1), + bigint_col DECIMAL(7,1), + float_col DECIMAL(7,1), + double_col DECIMAL(7,1), + string_col DECIMAL(7,1), + char1_col DECIMAL(7,1), + char2_col DECIMAL(7,1), + varchar_col DECIMAL(7,1), + date_col DECIMAL(7,1), + timestamp_col DECIMAL(7,1), + decimal1_col DECIMAL(7,1), + decimal2_col DECIMAL(7,1) +) STORED AS 
PARQUET +LOCATION '/user/doris/preinstalled_data/parquet_table/parquet_schema_change'; + + + + +CREATE TABLE IF NOT EXISTS orc_primitive_types_to_boolean ( + id INT, + bool_col BOOLEAN, + int_col BOOLEAN, + smallint_col BOOLEAN, + tinyint_col BOOLEAN, + bigint_col BOOLEAN, + float_col BOOLEAN, + double_col BOOLEAN, + string_col BOOLEAN, + char1_col BOOLEAN, + char2_col BOOLEAN, + varchar_col BOOLEAN, + date_col BOOLEAN, + timestamp_col BOOLEAN, + decimal1_col BOOLEAN, + decimal2_col BOOLEAN +) STORED AS orc +LOCATION '/user/doris/preinstalled_data/orc_table/orc_schema_change'; + + +CREATE TABLE IF NOT EXISTS orc_primitive_types_to_bigint ( + id INT, + bool_col BIGINT, + int_col BIGINT, + smallint_col BIGINT, + tinyint_col BIGINT, + bigint_col BIGINT, + float_col BIGINT, + double_col BIGINT, + string_col BIGINT, + char1_col BIGINT, + char2_col BIGINT, + varchar_col BIGINT, + date_col BIGINT, + timestamp_col BIGINT, + decimal1_col BIGINT, + decimal2_col BIGINT +) STORED AS orc +LOCATION '/user/doris/preinstalled_data/orc_table/orc_schema_change'; + +CREATE TABLE IF NOT EXISTS orc_primitive_types_to_int ( + id INT, + bool_col INT, + int_col INT, + smallint_col INT, + tinyint_col INT, + bigint_col INT, + float_col INT, + double_col INT, + string_col INT, + char1_col INT, + char2_col INT, + varchar_col INT, + date_col INT, + timestamp_col INT, + decimal1_col INT, + decimal2_col INT +) STORED AS orc +LOCATION '/user/doris/preinstalled_data/orc_table/orc_schema_change'; + +CREATE TABLE IF NOT EXISTS orc_primitive_types_to_smallint ( + id INT, + bool_col SMALLINT, + int_col SMALLINT, + smallint_col SMALLINT, + tinyint_col SMALLINT, + bigint_col SMALLINT, + float_col SMALLINT, + double_col SMALLINT, + string_col SMALLINT, + char1_col SMALLINT, + char2_col SMALLINT, + varchar_col SMALLINT, + date_col SMALLINT, + timestamp_col SMALLINT, + decimal1_col SMALLINT, + decimal2_col SMALLINT +) STORED AS orc +LOCATION '/user/doris/preinstalled_data/orc_table/orc_schema_change'; + 
+CREATE TABLE IF NOT EXISTS orc_primitive_types_to_tinyint ( + id INT, + bool_col TINYINT, + int_col TINYINT, + smallint_col TINYINT, + tinyint_col TINYINT, + bigint_col TINYINT, + float_col TINYINT, + double_col TINYINT, + string_col TINYINT, + char1_col TINYINT, + char2_col TINYINT, + varchar_col TINYINT, + date_col TINYINT, + timestamp_col TINYINT, + decimal1_col TINYINT, + decimal2_col TINYINT +) STORED AS orc +LOCATION '/user/doris/preinstalled_data/orc_table/orc_schema_change'; + +CREATE TABLE IF NOT EXISTS orc_primitive_types_to_float ( + id INT, + bool_col FLOAT, + int_col FLOAT, + smallint_col FLOAT, + tinyint_col FLOAT, + bigint_col FLOAT, + float_col FLOAT, + double_col FLOAT, + string_col FLOAT, + char1_col FLOAT, + char2_col FLOAT, + varchar_col FLOAT, + date_col FLOAT, + timestamp_col FLOAT, + decimal1_col FLOAT, + decimal2_col FLOAT +) STORED AS orc +LOCATION '/user/doris/preinstalled_data/orc_table/orc_schema_change'; + +CREATE TABLE IF NOT EXISTS orc_primitive_types_to_double ( + id INT, + bool_col DOUBLE, + int_col DOUBLE, + smallint_col DOUBLE, + tinyint_col DOUBLE, + bigint_col DOUBLE, + float_col DOUBLE, + double_col DOUBLE, + string_col DOUBLE, + char1_col DOUBLE, + char2_col DOUBLE, + varchar_col DOUBLE, + date_col DOUBLE, + timestamp_col DOUBLE, + decimal1_col DOUBLE, + decimal2_col DOUBLE +) STORED AS orc +LOCATION '/user/doris/preinstalled_data/orc_table/orc_schema_change'; + +CREATE TABLE IF NOT EXISTS orc_primitive_types_to_string ( + id INT, + bool_col STRING, + int_col STRING, + smallint_col STRING, + tinyint_col STRING, + bigint_col STRING, + float_col STRING, + double_col STRING, + string_col STRING, + char1_col STRING, + char2_col STRING, + varchar_col STRING, + date_col STRING, + timestamp_col STRING, + decimal1_col STRING, + decimal2_col STRING +) STORED AS orc +LOCATION '/user/doris/preinstalled_data/orc_table/orc_schema_change'; + +CREATE TABLE IF NOT EXISTS orc_primitive_types_to_date ( + id INT, + bool_col DATE, + int_col 
DATE, + smallint_col DATE, + tinyint_col DATE, + bigint_col DATE, + float_col DATE, + double_col DATE, + string_col DATE, + char1_col DATE, + char2_col DATE, + varchar_col DATE, + date_col DATE, + timestamp_col DATE, + decimal1_col DATE, + decimal2_col DATE +) STORED AS orc +LOCATION '/user/doris/preinstalled_data/orc_table/orc_schema_change'; + +CREATE TABLE IF NOT EXISTS orc_primitive_types_to_timestamp ( + id INT, + bool_col TIMESTAMP, + int_col TIMESTAMP, + smallint_col TIMESTAMP, + tinyint_col TIMESTAMP, + bigint_col TIMESTAMP, + float_col TIMESTAMP, + double_col TIMESTAMP, + string_col TIMESTAMP, + char1_col TIMESTAMP, + char2_col TIMESTAMP, + varchar_col TIMESTAMP, + date_col TIMESTAMP, + timestamp_col TIMESTAMP, + decimal1_col TIMESTAMP, + decimal2_col TIMESTAMP +) STORED AS orc +LOCATION '/user/doris/preinstalled_data/orc_table/orc_schema_change'; + + +CREATE TABLE IF NOT EXISTS orc_primitive_types_to_decimal1 ( + id INT, + bool_col DECIMAL(20,5), + int_col DECIMAL(20,5), + smallint_col DECIMAL(20,5), + tinyint_col DECIMAL(20,5), + bigint_col DECIMAL(20,5), + float_col DECIMAL(20,5), + double_col DECIMAL(20,5), + string_col DECIMAL(20,5), + char1_col DECIMAL(20,5), + char2_col DECIMAL(20,5), + varchar_col DECIMAL(20,5), + date_col DECIMAL(20,5), + timestamp_col DECIMAL(20,5), + decimal1_col DECIMAL(20,5), + decimal2_col DECIMAL(20,5) +) STORED AS orc +LOCATION '/user/doris/preinstalled_data/orc_table/orc_schema_change'; + +CREATE TABLE IF NOT EXISTS orc_primitive_types_to_decimal2 ( + id INT, + bool_col DECIMAL(7,1), + int_col DECIMAL(7,1), + smallint_col DECIMAL(7,1), + tinyint_col DECIMAL(7,1), + bigint_col DECIMAL(7,1), + float_col DECIMAL(7,1), + double_col DECIMAL(7,1), + string_col DECIMAL(7,1), + char1_col DECIMAL(7,1), + char2_col DECIMAL(7,1), + varchar_col DECIMAL(7,1), + date_col DECIMAL(7,1), + timestamp_col DECIMAL(7,1), + decimal1_col DECIMAL(7,1), + decimal2_col DECIMAL(7,1) +) STORED AS orc +LOCATION 
'/user/doris/preinstalled_data/orc_table/orc_schema_change'; + + +MSCK REPAIR TABLE parquet_primitive_types_to_boolean; +MSCK REPAIR TABLE parquet_primitive_types_to_bigint; +MSCK REPAIR TABLE parquet_primitive_types_to_int; +MSCK REPAIR TABLE parquet_primitive_types_to_smallint; +MSCK REPAIR TABLE parquet_primitive_types_to_tinyint; +MSCK REPAIR TABLE parquet_primitive_types_to_float; +MSCK REPAIR TABLE parquet_primitive_types_to_double; +MSCK REPAIR TABLE parquet_primitive_types_to_string; +MSCK REPAIR TABLE parquet_primitive_types_to_date; +MSCK REPAIR TABLE parquet_primitive_types_to_timestamp; +MSCK REPAIR TABLE parquet_primitive_types_to_decimal1; +MSCK REPAIR TABLE parquet_primitive_types_to_decimal2; + +MSCK REPAIR TABLE orc_primitive_types_to_boolean; +MSCK REPAIR TABLE orc_primitive_types_to_bigint; +MSCK REPAIR TABLE orc_primitive_types_to_int; +MSCK REPAIR TABLE orc_primitive_types_to_smallint; +MSCK REPAIR TABLE orc_primitive_types_to_tinyint; +MSCK REPAIR TABLE orc_primitive_types_to_float; +MSCK REPAIR TABLE orc_primitive_types_to_double; +MSCK REPAIR TABLE orc_primitive_types_to_string; +MSCK REPAIR TABLE orc_primitive_types_to_date; +MSCK REPAIR TABLE orc_primitive_types_to_timestamp; +MSCK REPAIR TABLE orc_primitive_types_to_decimal1; +MSCK REPAIR TABLE orc_primitive_types_to_decimal2; \ No newline at end of file diff --git a/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/orc_table/orc_schema_change/origin_file.orc b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/orc_table/orc_schema_change/origin_file.orc new file mode 100644 index 0000000000000000000000000000000000000000..ab1af67fcf549c38f2f6c50d446f47a43b931ed1 GIT binary patch literal 2533 zcmZ8h3pkW%8-C|#W=w-IWk@n%h>RJOoYokc%5j`R4m%;z)+}nHAto};hf!fdv7~ey zivPEMHHfkaYj>|jcAJK#En92{e<{BI8+QNe`v2?t-uHU1_r9O!zMuDfzSlp{Lk9o? zmW;*`dB7T8QV1vj0C^3LNY=n1wW-uv1P{;wm?l3{@n)QTaE)vIJ#nH1^xnt zNu^LR5~7+FBdG{=;xXk4bR33umqJXr#k|(=nbk;7HLQw>1t{7Ml3;V9pYf+22%0O? 
z;?FPNH9iwygRgYuE7cC{)efF1<3tD=f+R3wFtD`Bif_k) z3ogUOan$kCDs_0m*7|R}n74ZPbHIYaS0G5~M)pMg_b?pOy)$}3+L2ky& zBLLyUE6monF*JK=tpku)%WlGh3@pi#s1qNOx;`eZ}d&>wZ9m z*FNW~$+`7kp>O~oC@TvdYANt&sMsw&9xMMC{WBjPil(GGx7=9?`>F6VJK0sqUe(D5 z;1jeF{o5}&Ezlff$3DrTW^`QGs*9#-nf_su(eD3X5ao3t?D?|HEAH2X^oSd?MVg0l z4Hvae{G9)yDs>*j%5H60t#Cu2{@fiG|z&AzjQJ7!Zo-;^s=T z9T!`SS-L@BEdX3pDiVR>B9HP{!_xpbvXDfyJ<2tsDCGBknnL!n>}|KPw%XZ+@|dR{ z8uWyJ$4ai({Za= zDSu^+feSoBOi`P(w-hXez zEt((m&20VfC^s>5I3uF(onBD>)2Y1!spir>uG052Gk;8X38FqOz8PBb-zl)0bLx3F z^SDkk-A2|i)Gd=9cZXhUUDMigI28+E^L_*h1>kUG9DztAoJWAe<<3P&7KIxdUomL< z78kMQ$*^U18Hac159_$_Etn-JYyVtkXKwzE?X^Ls4i1ZoTkeHW?bD*$A55*b4DNj~ z?<;(djj-kWc7&DdthY^^JL9UF5v_Gpvg1GF65eE**uS}RNzXvqn3lVtd(TbBGD&Bl zbMeueuaDoAI2JiG!)0KBm5K zwLQVx$i&x;ww6uqd_97H*d3leP~vKmHO`+IH=nFpn0j))ML1=XHI4++lOw(Ty*;{e zdH+a{$-UtoQ)3z{k0qjI*A>;|L=;JjB(5S>cAj@_PDK8NX*cb8&Ce(axrQPlFGlvV z@q&7^9j{t{)8z+dfHI@_Fk&%Z7SM~c*!2BH97Tej)j)A&G;O-)X;zi3#y`&3o1;*D zIS}+@quwdoDKmYxLR5g_Nj3-Tcs~CGW%IbMZJWMi(EjJ`wy3Xm-PKrOuUCaM#3f%e zQ&BEuPmoioCkr$FIdrn{U8g5S256G(vWOfOXy}q;8FsIhlFo#%D zZ1%m#_J+J(AyX>CwpQ(~kG0d~c@}#Y=kyi%$Ml_gH4?qk%HC!6P#;H6n6n{UP=z^tAREJgTS3Ai^OIn}a zJ2sZ>l+-l$+g!%;jORvCin9)F=aJFJ%@5op4>)YIsrfRzJS6v7+CB!2Q?V<>vM%80 z@K>{=hQ!b5*H7Loigv>lxV-GVLXZ+h{~C6~@uT2o;Htz+jh&6SP(-`{ry zc+cK(zFl@%*LUihywh8J>R(|cthH&~!xl;DTHjhv)>Y;ejc4tDiaw1p>bd8T$Jar| zkRr79u2AM-;e|}JSZ=;9mRqxJVT#pHI6cpBKOds+w+W7T%?jhwq9BK$o?7ZO> z>0p&P_lQJ0cAe$Kv84d>naR|3H8*~=+;+XZHkos|Gos}2EOcek+^D1`*rPGqrs7fi zz}b>^@heJu)U|8Vp#YP%R1tLkgn}o%^cMUwHu<=_?9<)X2L1~dpydhACBt@h${Wu+ zVMUF+t1V-#o>fT)TlQLq63*H>9a=oIb*ZUY(E;D53F=%ke9bEzwuP?c(*U xI)WGz{hanV$BtaG;D@MRyE=P5BXF%~Z3Id`zc<;tozt_m!_Lmk zoA-Y2&6{~MZ{4!>24_6M6N|WxP9b-=bz2EzOt&fcYphBdfuPcqLQ~W$K((RTQ43M0 zp*m2DP@Sm7s00S&S$vkHLQ!;uX=;^d!9YOv{?hNJfwPUbNu0KHI^D06lHK5!MwS$D z&^#(5%O*j%tR7nJeHj1d4U8{<$f4mJOwNN3S>W$Zrv-j?1VPiPDfDz2lq~*YA7kHN zN~ixDpG?zjfEW1I$igQY&7;ZU??8V)XaXNC$M~FR`LKdd@{j{$D*ZaSo(g#?4;3?+ z6`2x(+w>gu$$M1DgH-t2zrT6EZ2rtUZ)tmq$FJQbYp9S%sgPHxkY}mn@ou^wy9eW@ 
zyqg_?IcrOQKh2Z;k3Sr!YilbU`TMn>wxIs{S@*|v`|f;Df1_`re%sQqdf-1{_2XE* zFgZO=nlgQfDr+RpacHG7E;N(&fGGFsj(kK#8QJpfw%fzAUT84e3ny3I=qo>YuWqUH z=TEzr&Z<9n_OGf}4?a`hK;%_!vTj7dXYEZGCpMMrjp+n$u9sqDL$sqi5i_i=pBTuA zeuLNye?&B1KQ#ewaEcfpfq>Tt8llA$5d=+PXL$XAV2IoJ2?bvxFXx!0_*>q9_!d75 z7>NW!ks3NC21$g&5!y(Detw#*23gDkFjj5&D6ZS0iJlluLQ4=!oX_Xe0&U{1zyj@p zL0+NF&YoQ!dl#1gxJDIxjR=20lmo>w$?AbDu8mO5V#DVRN9f8R^CC6gNH`P>kbMC% zVWDK-iiL)cXie)kt{^+BpJ1Pj7#=gPX*<>l^G+%FB#&)cwqDrf4**j5Fc1nFhT(-h zMN5R0%NjSVpq+tH;YdEOIW0wx*_Lr#6|ZXKPIBC6BV(IZ zF}4wPXbodW*NRxvxo&NRZX+(;Mm(CMN*ciqIzRT7(t?8hD?h}lckmr1GTTyfo|Q7T zZlDq@1H~;t#%}CIKRkY^In%G-7CnH~H0-x{7Q9zU(Vk;Ln2t`4CdPO1T{+=kSDT7| zOY8QBU*30<69s#Jsi00{r*ccKIywG&)>YK)Mv&st9NYprTar7mSVLCW zx%TQs##PtE%!cCj`1aW5o|g7lN0e89SgGo2ycIiwj-$D&E3vtyD=|f%7Vk`1qmrJE zXd+>fDoxSiRJ?O%hBgmj0TgF*d}~(H0YXt*qAO}5Ifj@Qh~hb>!U!DN{5g*dG*(~% ztw2rf6=5=S*%%{jOWb7l;-Ml1& zZAFH_o>a6+#dBnOt&|_gvWH7eS({4=At8EP)sCy4!=)seeR*AraP)0BnmC`-;Dip( zd;~%Sf`AxSwP6(uLc94m=0TX4Z6U}pDlkr~+G*7@F`F=QpRznigPM7?u$*jU3z$Jg z8&o`l8aC{Sk5?YRTXWpJkbDck+?zq5UQx6wis#mxT0BU3q=(?@8RpL9+#o{utfHM& zJVP$w;GD-w-l564k7HOKzFDB%EbvUuC5)$ejU;64>O4(nF1KFxdNQ7hCB1a*FitVvvPrR$M!tf%2@>Ntd_r?>g6~2HG z4I_c_`4-0l%{6|XpE8xGQ4 Date: Mon, 23 Jun 2025 15:12:29 +0800 Subject: [PATCH 2/2] fix build --- be/src/vec/exec/format/orc/vorc_reader.cpp | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/be/src/vec/exec/format/orc/vorc_reader.cpp b/be/src/vec/exec/format/orc/vorc_reader.cpp index 54fb8b18ce853b..4391917cf29ea3 100644 --- a/be/src/vec/exec/format/orc/vorc_reader.cpp +++ b/be/src/vec/exec/format/orc/vorc_reader.cpp @@ -510,16 +510,12 @@ std::tuple convert_to_orc_literal(const orc::Type* type, con // case orc::TypeKind::CHAR: // [[fallthrough]]; case orc::TypeKind::VARCHAR: { -<<<<<<< HEAD - StringRef* string_value = (StringRef*)value; - return std::make_tuple(true, orc::Literal(string_value->data, string_value->size)); -======= if (primitive_type == TYPE_STRING || primitive_type == TYPE_CHAR || 
primitive_type == TYPE_VARCHAR) { - return std::make_tuple(true, orc::Literal(literal_data.data, literal_data.size)); + StringRef* string_value = (StringRef*)value; + return std::make_tuple(true, orc::Literal(string_value->data, string_value->size)); } return std::make_tuple(false, orc::Literal(false)); ->>>>>>> 3f85ad6d75c ([enchement](schema change)Standardize the behavior after a table schema change. (#47471)) } case orc::TypeKind::DECIMAL: { int128_t decimal_value;