From 0aad20bff2d989ca2cb0e27f13d16fd8960a49a4 Mon Sep 17 00:00:00 2001 From: Socrates Date: Sun, 6 Oct 2024 16:24:22 +0800 Subject: [PATCH 1/4] support parquet --- be/src/gutil/endian.h | 13 +++++++++++-- be/src/util/bit_util.h | 9 ++++++++- .../format/parquet/parquet_column_convert.cpp | 5 ++++- .../format/parquet/parquet_column_convert.h | 19 ++++++++++++++++++- 4 files changed, 41 insertions(+), 5 deletions(-) diff --git a/be/src/gutil/endian.h b/be/src/gutil/endian.h index 4bc04e1e303eb5..f78480b3cf5fec 100644 --- a/be/src/gutil/endian.h +++ b/be/src/gutil/endian.h @@ -60,8 +60,8 @@ inline unsigned __int128 gbswap_128(unsigned __int128 host_int) { } inline wide::UInt256 gbswap_256(wide::UInt256 host_int) { - wide::UInt256 result{gbswap_64(host_int.items[3]), gbswap_64(host_int.items[2]), - gbswap_64(host_int.items[1]), gbswap_64(host_int.items[0])}; + wide::UInt256 result {gbswap_64(host_int.items[3]), gbswap_64(host_int.items[2]), + gbswap_64(host_int.items[1]), gbswap_64(host_int.items[0])}; return result; } @@ -136,6 +136,9 @@ class LittleEndian { static unsigned __int128 FromHost128(unsigned __int128 x) { return x; } static unsigned __int128 ToHost128(unsigned __int128 x) { return x; } + static wide::UInt256 FromHost256(wide::UInt256 x) { return x; } + static wide::UInt256 ToHost256(wide::UInt256 x) { return x; } + static bool IsLittleEndian() { return true; } #elif defined IS_BIG_ENDIAN @@ -149,6 +152,12 @@ class LittleEndian { static uint64 FromHost64(uint64 x) { return gbswap_64(x); } static uint64 ToHost64(uint64 x) { return gbswap_64(x); } + static unsigned __int128 FromHost128(unsigned __int128 x) { return gbswap_128(x); } + static unsigned __int128 ToHost128(unsigned __int128 x) { return gbswap_128(x); } + + static wide::UInt256 FromHost256(wide::UInt256 x) { return gbswap_256(x); } + static wide::UInt256 ToHost256(wide::UInt256 x) { return gbswap_256(x); } + static bool IsLittleEndian() { return false; } #endif /* ENDIAN */ diff --git a/be/src/util/bit_util.h b/be/src/util/bit_util.h index 44b391f44dae34..504b0b27428190 100644 --- a/be/src/util/bit_util.h +++ b/be/src/util/bit_util.h @@ -20,6 +20,9 @@ #pragma once +#include + +#include "vec/core/wide_integer.h" #ifndef __APPLE__ #include #endif @@ -209,7 +212,11 @@ class BitUtil { template static T big_endian_to_host(T value) { - if constexpr (std::is_same_v) { + if constexpr (std::is_same_v) { + return BigEndian::ToHost256(value); + } else if constexpr (std::is_same_v) { + return BigEndian::ToHost256(value); + } else if constexpr (std::is_same_v) { return BigEndian::ToHost128(value); } else if constexpr (std::is_same_v) { return BigEndian::ToHost128(value); diff --git a/be/src/vec/exec/format/parquet/parquet_column_convert.cpp b/be/src/vec/exec/format/parquet/parquet_column_convert.cpp index 2fb0afea82ae8a..0a5ef2913dd940 100644 --- a/be/src/vec/exec/format/parquet/parquet_column_convert.cpp +++ b/be/src/vec/exec/format/parquet/parquet_column_convert.cpp @@ -19,6 +19,7 @@ #include +#include "runtime/define_primitive_type.h" #include "vec/columns/column_nullable.h" namespace doris::vectorized::parquet { const cctz::time_zone ConvertParams::utc0 = cctz::utc_time_zone(); @@ -27,7 +28,8 @@ const cctz::time_zone ConvertParams::utc0 = cctz::utc_time_zone(); M(TYPE_DECIMALV2) \ M(TYPE_DECIMAL32) \ M(TYPE_DECIMAL64) \ - M(TYPE_DECIMAL128I) + M(TYPE_DECIMAL128I) \ + M(TYPE_DECIMAL256) bool PhysicalToLogicalConverter::is_parquet_native_type(PrimitiveType type) { switch (type) { @@ -50,6 +52,7 @@ bool PhysicalToLogicalConverter::is_decimal_type(doris::PrimitiveType type) { case TYPE_DECIMAL32: case TYPE_DECIMAL64: case TYPE_DECIMAL128I: + case TYPE_DECIMAL256: case TYPE_DECIMALV2: return true; default: diff --git a/be/src/vec/exec/format/parquet/parquet_column_convert.h b/be/src/vec/exec/format/parquet/parquet_column_convert.h index 91b81121aa4303..cf6f8aa13fa1d1 100644 --- a/be/src/vec/exec/format/parquet/parquet_column_convert.h +++ b/be/src/vec/exec/format/parquet/parquet_column_convert.h @@ -20,6 +20,7 @@ #include #include "vec/core/types.h" +#include "vec/core/wide_integer.h" #include "vec/data_types/data_type_factory.hpp" #include "vec/exec/format/column_type_convert.h" #include "vec/exec/format/format_common.h" @@ -401,7 +402,23 @@ class FixedSizeToDecimal : public PhysicalToLogicalConverter { M(13, int128_t) \ M(14, int128_t) \ M(15, int128_t) \ - M(16, int128_t) + M(16, int128_t) \ + M(17, wide::Int256) \ + M(18, wide::Int256) \ + M(19, wide::Int256) \ + M(20, wide::Int256) \ + M(21, wide::Int256) \ + M(22, wide::Int256) \ + M(23, wide::Int256) \ + M(24, wide::Int256) \ + M(25, wide::Int256) \ + M(26, wide::Int256) \ + M(27, wide::Int256) \ + M(28, wide::Int256) \ + M(29, wide::Int256) \ + M(30, wide::Int256) \ + M(31, wide::Int256) \ + M(32, wide::Int256) switch (_type_length) { APPLY_FOR_DECIMALS() From 66d2e0635a6dbcd8d1d200ae881b142028768a4e Mon Sep 17 00:00:00 2001 From: Socrates Date: Tue, 8 Oct 2024 15:08:11 +0800 Subject: [PATCH 2/4] add regression --- .../hdfs_tvf/test_parqeut_decimal256.parquet | Bin 0 -> 1321 bytes .../external_table_p0/tvf/test_hdfs_tvf.groovy | 8 ++++++++ 2 files changed, 8 insertions(+) create mode 100644 docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/hdfs_tvf/test_parqeut_decimal256.parquet diff --git a/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/hdfs_tvf/test_parqeut_decimal256.parquet b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/hdfs_tvf/test_parqeut_decimal256.parquet new file mode 100644 index 0000000000000000000000000000000000000000..4b039d5ddfa4e35f16213c743d8364cab1c99f6b GIT binary patch literal 1321 zcmWG=3^EjD5!DdY@e$<`We{Ru5b6P3~dsu2)n6J?W; zU;&9ruz=KxF=!z1fCAhaAnja?%*-8b3~EOhWki`IO&Qn(Bqf+KQ$(4>xWu+FitS-k z3sB?0XON^0&}9swOd!hzfE3UbETRXPM3*qa+`511FRTiH{#tbUQISy^7X z^kd;v{{MoEF)yDON4!gW<@IB>cZm3`Yi%7}QG9(_9~gKU85nsv;($(P6!ZX6ah41+ zK*PZR!=DPcEcg!v5(>ENff~sO3?C|(0yGUZT)(i%i1I`{kIz>Xofht{~ zhPryVf&`txMuV9kuA?JJsZ&N$xTRrI9>iLpJwQev$b6tZ`5<|)eO^%efdYY!PGEU= zN5^DG7oY&h5pY8TJ;4TffW+Zm0Wv}Q-E*>mB+!xNAoGDrAcg|PoWPy{GJwFAK|-M* yv8b>#wL~|$pg=dVs3^Z&p)9qiI5R&_!O&dKK+ix@2UsM4N`)f literal 0 HcmV?d00001 diff --git a/regression-test/suites/external_table_p0/tvf/test_hdfs_tvf.groovy b/regression-test/suites/external_table_p0/tvf/test_hdfs_tvf.groovy index 8c4028bfefe021..02bda4ec0ddc1b 100644 --- a/regression-test/suites/external_table_p0/tvf/test_hdfs_tvf.groovy +++ b/regression-test/suites/external_table_p0/tvf/test_hdfs_tvf.groovy @@ -108,6 +108,14 @@ suite("test_hdfs_tvf","external,hive,tvf,external_docker") { "hadoop.username" = "${hdfsUserName}", "format" = "${format}") order by s_suppkey limit 20; """ + // test parquet decimal256 + uri = "${defaultFS}" + "/user/doris/preinstalled_data/hdfs_tvf/test_parquet_decimal256.parquet" + format = "parquet" + qt_parquet_decimal256 """ select * from HDFS( + "uri" = "${uri}", + "hadoop.username" = "${hdfsUserName}", + "format" = "${format}") order by id; """ + // test orc uri = "${defaultFS}" + "/user/doris/preinstalled_data/hdfs_tvf/test_orc.snappy.orc" format = "orc" From cbbf9d22558eaeb841a69a59e15f9b024bb25552 Mon Sep 17 00:00:00 2001 From: Socrates Date: Tue, 8 Oct 2024 16:18:19 +0800 Subject: [PATCH 3/4] fix regression --- ...arquet => test_parquet_decimal256.parquet} | Bin 1321 -> 1320 bytes .../external_table_p0/tvf/test_hdfs_tvf.out | 7 +++++++ 2 files changed, 7 insertions(+) rename docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/hdfs_tvf/{test_parqeut_decimal256.parquet => test_parquet_decimal256.parquet} (52%) diff --git a/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/hdfs_tvf/test_parqeut_decimal256.parquet b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/hdfs_tvf/test_parquet_decimal256.parquet similarity index 52% rename from docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/hdfs_tvf/test_parqeut_decimal256.parquet rename to docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/hdfs_tvf/test_parquet_decimal256.parquet index 4b039d5ddfa4e35f16213c743d8364cab1c99f6b..323ded32160e00a77ca68cf493b3d300796155a1 100644 GIT binary patch delta 206 zcmZ3vnHWSF zfa;hS7(mnkCebB~5DmNI8CfPvGcE@yoqUvW6+fz4rpa|ou25qnP^8qpu*rz>Nt#0S zalj-uA7M6TG?Xt$EGjHbEzwObD9}wTD#|ZcC`&CW&dkqKFw-;8GnCYkVPF6vh9y8J Hfjj{KX>ul` delta 207 zcmZ3%wUUcJz%j^BltolSRL4h@OO#Nt#0S zalj-uA7M6TG*T!?EGjHbEzwObD9}wTD#|ZcC`&CW&dkqKFf`XQ&@+(Kkzrr}B8H_v IH-UTs0KR!AIRF3v diff --git a/regression-test/data/external_table_p0/tvf/test_hdfs_tvf.out b/regression-test/data/external_table_p0/tvf/test_hdfs_tvf.out index 3f12b44858136c..e850e38a237b06 100644 --- a/regression-test/data/external_table_p0/tvf/test_hdfs_tvf.out +++ b/regression-test/data/external_table_p0/tvf/test_hdfs_tvf.out @@ -221,6 +221,13 @@ 19 Supplier#000000019 edZT3es,nBFD8lBXTGeTl 24 34-278-310-2731 6150.38 refully final foxes across the dogged theodolites sleep slyly abou 20 Supplier#000000020 iybAE,RmTymrZVYaFZva2SH,j 3 13-715-945-6730 530.82 n, ironic ideas would nag blithely about the slyly regular accounts. silent, expr +-- !parquet_decimal256 -- +1 99999999999999999999999999999999999999.99999999999999999999999999999999999999 +2 -99999999999999999999999999999999999999.99999999999999999999999999999999999999 +3 1E-38 +4 -1E-38 +5 0E-38 + -- !orc -- 1 goldenrod lavender spring chocolate lace Manufacturer#1 Brand#13 PROMO BURNISHED COPPER 7 JUMBO PKG 901.00 ly. slyly ironi 2 blush thistle blue yellow saddle Manufacturer#1 Brand#13 LARGE BRUSHED BRASS 1 LG CASE 902.00 lar accounts amo From 5beb621f57f9393fefeb608d668c9f0168bfda5b Mon Sep 17 00:00:00 2001 From: Socrates Date: Tue, 8 Oct 2024 17:56:17 +0800 Subject: [PATCH 4/4] add be ut --- be/test/util/bit_util_test.cpp | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/be/test/util/bit_util_test.cpp b/be/test/util/bit_util_test.cpp index 514daafa604745..fd3bee0143223a 100644 --- a/be/test/util/bit_util_test.cpp +++ b/be/test/util/bit_util_test.cpp @@ -21,7 +21,6 @@ #include #include -#include #include "gtest/gtest_pred_impl.h" @@ -48,4 +47,22 @@ TEST(BitUtil, Popcount) { EXPECT_EQ(BitUtil::popcount_no_hw(0), 0); } +TEST(BitUtil, BigEndianToHost) { + uint16_t v16 = 0x1234; + uint32_t v32 = 0x12345678; + uint64_t v64 = 0x123456789abcdef0; + unsigned __int128 v128 = ((__int128)0x123456789abcdef0LL << 64) | 0x123456789abcdef0LL; + wide::UInt256 v256 = + wide::UInt256(0x123456789abcdef0) << 192 | wide::UInt256(0x123456789abcdef0) << 128 | + wide::UInt256(0x123456789abcdef0) << 64 | wide::UInt256(0x123456789abcdef0); + EXPECT_EQ(BitUtil::big_endian_to_host(v16), 0x3412); + EXPECT_EQ(BitUtil::big_endian_to_host(v32), 0x78563412); + EXPECT_EQ(BitUtil::big_endian_to_host(v64), 0xf0debc9a78563412); + EXPECT_EQ(BitUtil::big_endian_to_host(v128), + ((__int128)0xf0debc9a78563412LL << 64) | 0xf0debc9a78563412LL); + EXPECT_EQ(BitUtil::big_endian_to_host(v256), + wide::UInt256(0xf0debc9a78563412) << 192 | wide::UInt256(0xf0debc9a78563412) << 128 | + wide::UInt256(0xf0debc9a78563412) << 64 | wide::UInt256(0xf0debc9a78563412)); +} + } // namespace doris