From 715178e5abf9ab3f660d631aaee5b94fc30b0984 Mon Sep 17 00:00:00 2001 From: zhangstar333 Date: Tue, 2 Dec 2025 17:19:30 +0800 Subject: [PATCH] [bug](parquet) fix parquet type not handle float16 type (#58528) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### What problem does this PR solve? Problem Summary: Before this change the float16 type was not handled, so it was mapped to the string type and the result was invalid, e.g.: ``` mysql> select * from HDFS("uri" = "hdfs://127.0.0.1:8020/user/doris/tvf_data/test_hdfs_parquet/group0/float16_nonzeros_and_nans.parquet", "hadoop.username" = "doris", "format" = "parquet"); +------+ | x | +------+ | NULL | | < | | � | | ~ | | | | � | | � | | @ | +------+ ``` but it can be mapped to the float type, e.g.: ``` mysql> select x from file( "fs.local.support" = "true", "file_path" = "float16_nonzeros_and_nans.parquet", "format" = "parquet",'shared_storage'='true' ); +------+ | x | +------+ | NULL | | 1 | | -2 | | NaN | | 0 | | -1 | | -0 | | 2 | +------+ ``` --- .../format/parquet/parquet_column_convert.cpp | 5 ++ .../format/parquet/parquet_column_convert.h | 84 ++++++++++++++++++ .../vec/exec/format/parquet/schema_desc.cpp | 2 + .../tvf/test_hdfs_parquet_group0.out | Bin 23857 -> 23955 bytes .../tvf/test_hdfs_tvf_float16.out | 19 ++++ .../tvf/test_hdfs_tvf_float16.groovy | 46 ++++++++++ 6 files changed, 156 insertions(+) create mode 100644 regression-test/data/external_table_p0/tvf/test_hdfs_tvf_float16.out create mode 100644 regression-test/suites/external_table_p0/tvf/test_hdfs_tvf_float16.groovy diff --git a/be/src/vec/exec/format/parquet/parquet_column_convert.cpp b/be/src/vec/exec/format/parquet/parquet_column_convert.cpp index d0b2d62d68cd0d..d703d6b1a5af89 100644 --- a/be/src/vec/exec/format/parquet/parquet_column_convert.cpp +++ b/be/src/vec/exec/format/parquet/parquet_column_convert.cpp @@ -231,6 +231,11 @@ std::unique_ptr PhysicalToLogicalConverter::get_conv // for FixedSizeBinary physical_converter = 
std::make_unique(parquet_schema.type_length); + } else if (src_logical_primitive == TYPE_FLOAT && + src_physical_type == tparquet::Type::FIXED_LEN_BYTE_ARRAY && + parquet_schema.logicalType.__isset.FLOAT16) { + physical_converter = + std::make_unique(parquet_schema.type_length); } else { physical_converter = std::make_unique(); } diff --git a/be/src/vec/exec/format/parquet/parquet_column_convert.h b/be/src/vec/exec/format/parquet/parquet_column_convert.h index b4749a3c8298dc..e8b29131bdf843 100644 --- a/be/src/vec/exec/format/parquet/parquet_column_convert.h +++ b/be/src/vec/exec/format/parquet/parquet_column_convert.h @@ -20,6 +20,7 @@ #include #include "common/cast_set.h" +#include "runtime/primitive_type.h" #include "vec/columns/column_varbinary.h" #include "vec/core/extended_types.h" #include "vec/core/field.h" @@ -354,6 +355,89 @@ class FixedSizeBinaryConverter : public PhysicalToLogicalConverter { } }; +class Float16PhysicalConverter : public PhysicalToLogicalConverter { +private: + int _type_length; + +public: + Float16PhysicalConverter(int type_length) : _type_length(type_length) { + DCHECK_EQ(_type_length, 2); + } + + Status physical_convert(ColumnPtr& src_physical_col, ColumnPtr& src_logical_column) override { + ColumnPtr from_col = remove_nullable(src_physical_col); + MutableColumnPtr to_col = remove_nullable(src_logical_column)->assume_mutable(); + + const auto* src_data = assert_cast(from_col.get()); + size_t length = src_data->size(); + size_t num_values = length / _type_length; + auto* to_float_column = assert_cast(to_col.get()); + size_t start_idx = to_float_column->size(); + to_float_column->resize(start_idx + num_values); + auto& to_float_column_data = to_float_column->get_data(); + const uint8_t* ptr = src_data->get_data().data(); + for (int i = 0; i < num_values; ++i) { + size_t offset = i * _type_length; + const uint8_t* data_ptr = ptr + offset; + uint16_t raw; + memcpy(&raw, data_ptr, sizeof(uint16_t)); + float value = half_to_float(raw); + 
to_float_column_data[start_idx + i] = value; + } + + return Status::OK(); + } + + float half_to_float(uint16_t h) { + // uint16_t h: half precision floating point + // bit 15: sign(1 bit) + // bits 14..10 : exponent(5 bits) + // bits 9..0 : mantissa(10 bits) + + // sign bit placed to float32 bit31 + uint32_t sign = (h & 0x8000U) << 16; // 0x8000 << 16 = 0x8000_0000 + // exponent:(5 bits) + uint32_t exp = (h & 0x7C00U) >> 10; // 0x7C00 = 0111 1100 0000 (half exponent mask) + // mantissa(10 bits) + uint32_t mant = (h & 0x03FFU); // 10-bit fraction + + // cases:Zero/Subnormal, Normal, Inf/NaN + if (exp == 0) { + // exp==0: Zero or Subnormal ---------- + if (mant == 0) { + // ±0.0 + // sign = either 0x00000000 or 0x80000000 + return std::bit_cast(sign); + } else { + // ---------- Subnormal ---------- + // half subnormal: + // value = (-1)^sign * (mant / 2^10) * 2^(1 - bias) + // half bias = 15 → exponent = 1 - 15 = -14 + float f = (static_cast(mant) / 1024.0F) * std::powf(2.0F, -14.0F); + return sign ? 
-f : f; + } + } else if (exp == 0x1F) { + // exp==31: Inf or NaN ---------- + // float32: + // exponent = 255 (0xFF) + // mantissa = mant << 13 + uint32_t f = sign | 0x7F800000U | (mant << 13); + return std::bit_cast(f); + } else { + // Normalized ---------- + // float32 exponent: + // exp32 = exp16 - bias16 + bias32 + // bias16 = 15 + // bias32 = 127 + // + // so: exp32 = exp + (127 - 15) + uint32_t f = sign | ((exp + (127 - 15)) << 23) // place to float32 exponent + | (mant << 13); // mantissa align to 23 bits + return std::bit_cast(f); + } + } +}; + class UUIDVarBinaryConverter : public PhysicalToLogicalConverter { public: UUIDVarBinaryConverter(int type_length) : _type_length(type_length) {} diff --git a/be/src/vec/exec/format/parquet/schema_desc.cpp b/be/src/vec/exec/format/parquet/schema_desc.cpp index 677898da6e75b5..a3ef4fb222b5e9 100644 --- a/be/src/vec/exec/format/parquet/schema_desc.cpp +++ b/be/src/vec/exec/format/parquet/schema_desc.cpp @@ -308,6 +308,8 @@ std::pair FieldDescriptor::convert_to_doris_type( } else if (logicalType.__isset.UUID) { ans.first = DataTypeFactory::instance().create_data_type(TYPE_VARBINARY, nullable, -1, -1, 16); + } else if (logicalType.__isset.FLOAT16) { + ans.first = DataTypeFactory::instance().create_data_type(TYPE_FLOAT, nullable); } else { throw Exception(Status::InternalError("Not supported parquet logicalType")); } diff --git a/regression-test/data/external_table_p0/tvf/test_hdfs_parquet_group0.out b/regression-test/data/external_table_p0/tvf/test_hdfs_parquet_group0.out index 3cb34820fcc25c8a59f6fc3256ccc7db3f4c3a3c..0e21a8fad6f690858499234dde89675694b89fa2 100644 GIT binary patch delta 267 zcmXwyI|{-;5QfPcA3;(J)@icrCYxlpwU%5!EL5zlC4?MfE~2EVr#v$L101vl&bwL`&~Nb`{iI69{YG2^n(;5t)i4Zy>zBh7?V672x651LT3{;COB`z`KiT*IFR kfS2EP&tT;)63+NI5j0X9)Tj_w+p1em=W#%i%1++nKNpBUwEzGB delta 169 zcmbQdn{neV#tko+SlK){**qo-vR1MRcybDOPX5YN%Bt?ksqQ(snz@=a(t|V7WAbd) za@GV-&IHfNj4b7>6`q_Go|D^IDp{*NIIBG-E3=hzzTdys158XdWUofz?M30OW~