From 044105ace0d791dce1904b1bd1a8b14757db91b0 Mon Sep 17 00:00:00 2001 From: changyuwei <2017501503@qq.com> Date: Wed, 29 Nov 2023 19:59:57 +0800 Subject: [PATCH 1/2] [feature](parquet)support read parquet lzo compression. --- be/src/util/block_compression.cpp | 88 +++++++++++++++++++ .../hive/test_compress_type.out | 87 ++++++++++++++++++ .../hive/test_compress_type.groovy | 46 ++++++++++ 3 files changed, 221 insertions(+) diff --git a/be/src/util/block_compression.cpp b/be/src/util/block_compression.cpp index 58c75a0c433f7f..5b53bcb446c1ab 100644 --- a/be/src/util/block_compression.cpp +++ b/be/src/util/block_compression.cpp @@ -38,6 +38,7 @@ #include #include +#include #include #include #include @@ -45,6 +46,7 @@ #include "common/config.h" #include "exec/decompressor.h" +#include "gutil/endian.h" #include "gutil/strings/substitute.h" #include "util/bit_util.h" #include "util/defer_op.h" @@ -1071,6 +1073,89 @@ class GzipBlockCompressionByLibdeflate final : public GzipBlockCompression { }; #endif +class LzoBlockCompression final : public BlockCompressionCodec { +public: + static LzoBlockCompression* instance() { + static LzoBlockCompression s_instance; + return &s_instance; + } + + Status compress(const Slice& input, faststring* output) override { + return Status::InvalidArgument("not impl lzo compress."); + } + size_t max_compressed_len(size_t len) override { return 0; }; + Status decompress(const Slice& input, Slice* output) override { + auto* input_ptr = input.data; + auto remain_input_size = input.size; + auto* output_ptr = output->data; + auto remain_output_size = output->size; + auto* output_limit = output->data + output->size; + + // Example: + // OriginData(The original data will be divided into several large data block.) : + // large data block1 | large data block2 | large data block3 | .... + // The large data block will be divided into several small data block. 
+ // Suppose a large data block is divided into three small blocks: + // large data block1: | small block1 | small block2 | small block3 | + // CompressData: | A | B1 | compress(small block1) | B2 | compress(small block2) | B3 | compress(small block3) | + // + // A : uncompressed length of the current large data block. + // sizeof(A) = 4 bytes (big-endian). + // A = length(small block1) + length(small block2) + length(small block3) + // Bx : compressed length of small data block x. + // sizeof(Bx) = 4 bytes (big-endian). + // Bx = length(compress(small blockx)) + try { + while (remain_input_size > 0) { + if (remain_input_size < 4) { + return Status::InvalidArgument( + "Need more input buffer to get large_block_uncompressed_len."); + } + + uint32_t large_block_uncompressed_len = BigEndian::Load32(input_ptr); + input_ptr += 4; + remain_input_size -= 4; + + if (remain_output_size < large_block_uncompressed_len) { + return Status::InvalidArgument( + "Need more output buffer to get uncompressed data."); + } + + while (large_block_uncompressed_len > 0) { + if (remain_input_size < 4) { + return Status::InvalidArgument( + "Need more input buffer to get small_block_compressed_len."); + } + + uint32_t small_block_compressed_len = BigEndian::Load32(input_ptr); + input_ptr += 4; + remain_input_size -= 4; + + if (remain_input_size < small_block_compressed_len) { + return Status::InvalidArgument( + "Need more input buffer to decompress small block."); + } + + auto small_block_uncompressed_len = + orc::lzoDecompress(input_ptr, input_ptr + small_block_compressed_len, + output_ptr, output_limit); + + input_ptr += small_block_compressed_len; + remain_input_size -= small_block_compressed_len; + + output_ptr += small_block_uncompressed_len; + large_block_uncompressed_len -= small_block_uncompressed_len; + remain_output_size -= small_block_uncompressed_len; + } + } + } catch (const orc::ParseError& e) { + // Prevent the BE from hanging when orc::lzoDecompress throws an exception. + return Status::InternalError("Fail to do LZO decompress, error={}", e.what()); + } + return Status::OK(); + } +}; + Status 
get_block_compression_codec(segment_v2::CompressionTypePB type, BlockCompressionCodec** codec) { switch (type) { @@ -1127,6 +1212,9 @@ Status get_block_compression_codec(tparquet::CompressionCodec::type parquet_code *codec = GzipBlockCompression::instance(); #endif break; + case tparquet::CompressionCodec::LZO: + *codec = LzoBlockCompression::instance(); + break; default: return Status::InternalError("unknown compression type({})", parquet_codec); } diff --git a/regression-test/data/external_table_p2/hive/test_compress_type.out b/regression-test/data/external_table_p2/hive/test_compress_type.out index ed88a8f6967827..1f18839b53a73e 100644 --- a/regression-test/data/external_table_p2/hive/test_compress_type.out +++ b/regression-test/data/external_table_p2/hive/test_compress_type.out @@ -484,3 +484,90 @@ 8 800 40 8000000000 45.75 55.25 false Eighth H Theta 2023-10-13 2023-10-13T21:45 890.12 9 900 45 9000000000 50.0 60.5 true Ninth I Iota 2023-10-14 2023-10-14T22:15 901.23 +-- !lzo_1 -- +127 317 22 139027217294 5.8534396E7 1.097115615520323E10 true NxvCOVAHCAzWEFOs VdEf vXxekmctPmPmmbecHgf 2023-12-15 2023-12-28T23:15:48 147638.24 +135 194 7 57894842960 1.3718646E7 2.1169820465574505E10 true aseqfHnnrtaL HwV IqXKe 2023-12-28 2023-12-19T06:14:48 32041.77 +139 146 4 149816593644 4.9618156E7 1.3744723380110355E10 false sreHCjYoJoBOjUJMMBSQ dD iBaixPwGysIVgkomhg 2023-12-27 2023-12-19T15:51:48 191090.57 +167 275 28 46739421643 2.790689E7 5.638235691917528E8 false BDX iY pOrAYVd 2023-12-01 2023-12-09T12:59:48 105181.01 +241 496 63 26957970271 3.7214888E7 1.2043262506506804E10 true VJPXXigvP wfZp cwyoMdOxN 2023-12-10 2023-12-18T05:33:48 105023.30 +285 43 47 124246184718 2162507.5 1.6279579779299034E10 false gXIEVQzqfokBv raxj NbGVRlQeotLBDWbDqP 2023-12-17 2023-12-24T03:36:48 75425.14 +311 44 67 79901279497 3.0787934E7 1.5853816694193293E10 false LTsSxeetbYKCwcJvg BCrf XkuC 2023-12-11 2023-12-05T13:16:48 103792.88 +333 390 29 61080978873 2916969.0 1.053228375816898E10 true 
HcZnbf Wp iHqLLiPhgZ 2023-12-01 2023-12-10T11:31:48 68471.38 +36 369 2 24371701950 5.54394E7 8.576150848699297E9 false uQpDcwEZT sd SwzJInNDb 2023-12-05 2023-12-08T15:00:40 8954.61 +363 375 1 20494251127 8.9166856E7 2.2005002173871223E10 false hkHvijevoRfHhK szl hwHUAjwqTQOmLEPDFbt 2023-12-05 2023-12-08T05:28:48 96630.28 +368 37 42 60649320592 2.3388714E7 1.81031191987985E9 true yXoDmKpjjRsVV Hq MbWlyi 2023-12-19 2023-12-06T00:45:48 197736.91 +414 301 63 87524210634 1.2944316E7 3.5428357192711325E9 true piECj tGM pkOyUdxLBFCw 2023-12-19 2023-12-10T15:28:48 112255.75 +42 132 39 128076453206 3.1733946E7 2.001312160047691E9 false kzviLgVNqxrDQ kr YhdXGtPun 2023-12-15 2023-12-10T01:42:48 189135.64 +427 286 67 78312070726 3.1794338E7 1.7713252925472687E10 true lHoUCBbY LTkc CgMrDWTGppMIaZPk 2023-12-13 2023-12-08T16:02:48 75175.71 +438 491 21 66065079309 6.6624016E7 1.5542114222539822E10 false CEbvKZRdvMHxzVOIejq wJ eoTkUlht 2023-12-08 2023-12-17T19:49:48 86666.80 +469 156 25 41259191749 6.2344956E7 1.5674967382662376E10 true dfyMUJYNppBDDD az lVofKt 2023-12-19 2023-12-09T10:37:48 15427.43 +540 416 70 110655654086 4.9027904E7 1.1345965638449787E10 true gZF oPNx kDYTiiCPhyQqnmPLd 2023-12-26 2023-12-20T22:47:48 177628.27 +563 327 1 86402793406 1.4668673E7 2.1932020019521263E10 false uEPywVtgb IN HCcPuRYlwlezseie 2023-12-27 2023-12-01T09:02:48 12840.38 +585 423 69 141894410515 1.7955736E7 8.784239710423233E9 false IsWEZJsPRXIFqapTTb yO qRAEvl 2023-11-29 2023-12-26T04:50:48 46733.25 +618 390 70 40611757422 4.9496784E7 1.90943138552761E9 true cuqniQE dxKv KlxZsrJad 2023-12-05 2023-11-30T13:41:48 13904.80 + +-- !lzo_2 -- +1078 229 63 79026532317 1.4363472E7 1.193746461651589E10 true znYrIGhEXITIdyiifBPZ BBh klhSDtg 2023-12-14 2023-12-02T06:34:48 80402.53 +1105 186 31 129159878912 1.3102703E7 1.6989058048889019E10 false OxhcUomBMLjVjdwgOI Qa eJoODDnkdDd 2023-12-20 2023-12-25T03:33:48 83174.73 +1108 223 24 64158736405 1.3341401E7 2.0128416779917E10 false rLXbarkH xU ggGSZGxLwT 
2023-11-28 2023-12-23T01:43:48 17986.48 +1126 178 38 137633520558 9.6421152E7 1.2075476530488207E10 true vHgghYPQNpzTmYx EKhO Pg 2023-12-03 2023-12-03T18:17:48 119990.49 +1215 20 18 15934394806 9.6266544E7 3.303291140952643E8 true zzkAwmKNf RKO VzyGx 2023-12-26 2023-12-14T02:36:48 59236.59 +1225 131 17 119517491015 7.868396E7 1.2812171639342154E10 true maOgXoCzsrPVZqxaeS vm AJNnbqdEzk 2023-12-28 2023-12-22T23:18:48 85523.88 +1252 142 68 92511639613 5.2273456E7 2.0197789593796345E10 true zFl Avwm Yi 2023-12-24 2023-12-01T22:31:48 181634.60 +1262 279 57 63627626380 2.3360408E7 6.674186807593108E9 true wjuW ueO tOWuzwJj 2023-12-24 2023-12-04T17:27:48 112884.97 +1266 253 10 139941604087 2.5471874E7 2.6004794480891223E9 true YBx MqsR sLu 2023-12-03 2023-12-23T10:00:48 83930.38 +1267 155 54 38456715756 4.2582072E7 3.350085153856542E9 true qFXXKbhqXfSYFXteGF WMH CWZwGCkmg 2023-12-17 2023-12-20T19:06:48 13843.42 + +-- !lzo_3 -- +127 317 22 139027217294 5.8534396E7 1.097115615520323E10 true NxvCOVAHCAzWEFOs VdEf vXxekmctPmPmmbecHgf 2023-12-15 2023-12-28T23:15:48 147638.24 +135 194 7 57894842960 1.3718646E7 2.1169820465574505E10 true aseqfHnnrtaL HwV IqXKe 2023-12-28 2023-12-19T06:14:48 32041.77 +241 496 63 26957970271 3.7214888E7 1.2043262506506804E10 true VJPXXigvP wfZp cwyoMdOxN 2023-12-10 2023-12-18T05:33:48 105023.30 +333 390 29 61080978873 2916969.0 1.053228375816898E10 true HcZnbf Wp iHqLLiPhgZ 2023-12-01 2023-12-10T11:31:48 68471.38 +368 37 42 60649320592 2.3388714E7 1.81031191987985E9 true yXoDmKpjjRsVV Hq MbWlyi 2023-12-19 2023-12-06T00:45:48 197736.91 +414 301 63 87524210634 1.2944316E7 3.5428357192711325E9 true piECj tGM pkOyUdxLBFCw 2023-12-19 2023-12-10T15:28:48 112255.75 +427 286 67 78312070726 3.1794338E7 1.7713252925472687E10 true lHoUCBbY LTkc CgMrDWTGppMIaZPk 2023-12-13 2023-12-08T16:02:48 75175.71 +469 156 25 41259191749 6.2344956E7 1.5674967382662376E10 true dfyMUJYNppBDDD az lVofKt 2023-12-19 2023-12-09T10:37:48 15427.43 +540 416 70 110655654086 
4.9027904E7 1.1345965638449787E10 true gZF oPNx kDYTiiCPhyQqnmPLd 2023-12-26 2023-12-20T22:47:48 177628.27 +618 390 70 40611757422 4.9496784E7 1.90943138552761E9 true cuqniQE dxKv KlxZsrJad 2023-12-05 2023-11-30T13:41:48 13904.80 + +-- !lzo_4 -- +139 146 4 149816593644 4.9618156E7 1.3744723380110355E10 false sreHCjYoJoBOjUJMMBSQ dD iBaixPwGysIVgkomhg 2023-12-27 2023-12-19T15:51:48 191090.57 +167 275 28 46739421643 2.790689E7 5.638235691917528E8 false BDX iY pOrAYVd 2023-12-01 2023-12-09T12:59:48 105181.01 +285 43 47 124246184718 2162507.5 1.6279579779299034E10 false gXIEVQzqfokBv raxj NbGVRlQeotLBDWbDqP 2023-12-17 2023-12-24T03:36:48 75425.14 +311 44 67 79901279497 3.0787934E7 1.5853816694193293E10 false LTsSxeetbYKCwcJvg BCrf XkuC 2023-12-11 2023-12-05T13:16:48 103792.88 +36 369 2 24371701950 5.54394E7 8.576150848699297E9 false uQpDcwEZT sd SwzJInNDb 2023-12-05 2023-12-08T15:00:40 8954.61 +363 375 1 20494251127 8.9166856E7 2.2005002173871223E10 false hkHvijevoRfHhK szl hwHUAjwqTQOmLEPDFbt 2023-12-05 2023-12-08T05:28:48 96630.28 +42 132 39 128076453206 3.1733946E7 2.001312160047691E9 false kzviLgVNqxrDQ kr YhdXGtPun 2023-12-15 2023-12-10T01:42:48 189135.64 +438 491 21 66065079309 6.6624016E7 1.5542114222539822E10 false CEbvKZRdvMHxzVOIejq wJ eoTkUlht 2023-12-08 2023-12-17T19:49:48 86666.80 +563 327 1 86402793406 1.4668673E7 2.1932020019521263E10 false uEPywVtgb IN HCcPuRYlwlezseie 2023-12-27 2023-12-01T09:02:48 12840.38 +585 423 69 141894410515 1.7955736E7 8.784239710423233E9 false IsWEZJsPRXIFqapTTb yO qRAEvl 2023-11-29 2023-12-26T04:50:48 46733.25 + +-- !lzo_5 -- +127 317 22 139027217294 5.8534396E7 1.097115615520323E10 true NxvCOVAHCAzWEFOs VdEf vXxekmctPmPmmbecHgf 2023-12-15 2023-12-28T23:15:48 147638.24 +139 146 4 149816593644 4.9618156E7 1.3744723380110355E10 false sreHCjYoJoBOjUJMMBSQ dD iBaixPwGysIVgkomhg 2023-12-27 2023-12-19T15:51:48 191090.57 +167 275 28 46739421643 2.790689E7 5.638235691917528E8 false BDX iY pOrAYVd 2023-12-01 2023-12-09T12:59:48 
105181.01 +241 496 63 26957970271 3.7214888E7 1.2043262506506804E10 true VJPXXigvP wfZp cwyoMdOxN 2023-12-10 2023-12-18T05:33:48 105023.30 +285 43 47 124246184718 2162507.5 1.6279579779299034E10 false gXIEVQzqfokBv raxj NbGVRlQeotLBDWbDqP 2023-12-17 2023-12-24T03:36:48 75425.14 +311 44 67 79901279497 3.0787934E7 1.5853816694193293E10 false LTsSxeetbYKCwcJvg BCrf XkuC 2023-12-11 2023-12-05T13:16:48 103792.88 +333 390 29 61080978873 2916969.0 1.053228375816898E10 true HcZnbf Wp iHqLLiPhgZ 2023-12-01 2023-12-10T11:31:48 68471.38 +36 369 2 24371701950 5.54394E7 8.576150848699297E9 false uQpDcwEZT sd SwzJInNDb 2023-12-05 2023-12-08T15:00:40 8954.61 +368 37 42 60649320592 2.3388714E7 1.81031191987985E9 true yXoDmKpjjRsVV Hq MbWlyi 2023-12-19 2023-12-06T00:45:48 197736.91 +42 132 39 128076453206 3.1733946E7 2.001312160047691E9 false kzviLgVNqxrDQ kr YhdXGtPun 2023-12-15 2023-12-10T01:42:48 189135.64 + +-- !lzo_6 -- +9379 258 6 31310350438 3.1661348E7 8.857541516631796E8 false nuXBDInOfoaWz AKyn ggtgZNvWuC 2023-11-28 2023-12-06T03:40:40 50071.94 + +-- !lzo_7 -- +135 194 7 57894842960 1.3718646E7 2.1169820465574505E10 true aseqfHnnrtaL HwV IqXKe 2023-12-28 2023-12-19T06:14:48 32041.77 +36 369 2 24371701950 5.54394E7 8.576150848699297E9 false uQpDcwEZT sd SwzJInNDb 2023-12-05 2023-12-08T15:00:40 8954.61 +469 156 25 41259191749 6.2344956E7 1.5674967382662376E10 true dfyMUJYNppBDDD az lVofKt 2023-12-19 2023-12-09T10:37:48 15427.43 +563 327 1 86402793406 1.4668673E7 2.1932020019521263E10 false uEPywVtgb IN HCcPuRYlwlezseie 2023-12-27 2023-12-01T09:02:48 12840.38 +585 423 69 141894410515 1.7955736E7 8.784239710423233E9 false IsWEZJsPRXIFqapTTb yO qRAEvl 2023-11-29 2023-12-26T04:50:48 46733.25 +618 390 70 40611757422 4.9496784E7 1.90943138552761E9 true cuqniQE dxKv KlxZsrJad 2023-12-05 2023-11-30T13:41:48 13904.80 +687 230 36 65023623256 8.2819664E7 2.059826790149805E10 false QBfgJpvaevEubRI QTP nneEuMZvlVXDlUG 2023-12-01 2023-12-18T05:20:48 35673.65 +744 33 53 133832713020 
6.46669E7 1.909766060768045E10 true eSJGGBBZjGCMxZ gDmD SzRcNftkktGZKa 2023-12-26 2023-12-17T03:57:40 31797.49 +758 90 17 87654906351 7314712.5 9.549600187302872E9 false RAUyeYqsKGBCGrIpMeGP cjeC lbvKaqxQEROGxTGQQ 2023-12-16 2023-12-11T12:13:48 20710.24 +874 172 72 140230596072 7.323136E7 2.8372205443769336E9 true OySCFRGBmgxSmJ Yazj LfZMcWtlxvpp 2023-12-12 2023-12-02T00:51:48 32283.90 + +-- !lzo_8 -- + diff --git a/regression-test/suites/external_table_p2/hive/test_compress_type.groovy b/regression-test/suites/external_table_p2/hive/test_compress_type.groovy index 585e8691690a2c..73efd35834e24e 100644 --- a/regression-test/suites/external_table_p2/hive/test_compress_type.groovy +++ b/regression-test/suites/external_table_p2/hive/test_compress_type.groovy @@ -83,6 +83,52 @@ suite("test_compress_type", "p2,external,hive,external_remote,external_remote_hi order_qt_q48 """ select * from parquet_lz4_compression where col_string != "Random" order by col_int,col_smallint,col_tinyint,col_bigint,col_float,col_double,col_boolean,col_string,col_char,col_varchar,col_date,col_timestamp,col_decimal """ + + order_qt_lzo_1 """ select * from parquet_lzo_compression + order by col_int,col_smallint,col_tinyint,col_bigint,col_float,col_double,col_boolean,col_string,col_char,col_varchar,col_date,col_timestamp,col_decimal + limit 20; + """ + + order_qt_lzo_2 """ select * from parquet_lzo_compression where col_int > 1000 + order by col_int,col_smallint,col_tinyint,col_bigint,col_float,col_double,col_boolean,col_string,col_char,col_varchar,col_date,col_timestamp,col_decimal + limit 10; + """ + + + order_qt_lzo_3 """ select * from parquet_lzo_compression where col_float > 5.1 and col_boolean = 1 + order by col_int,col_smallint,col_tinyint,col_bigint,col_float,col_double,col_boolean,col_string,col_char,col_varchar,col_date,col_timestamp,col_decimal + limit 10; + """ + + order_qt_lzo_4 """ select * from parquet_lzo_compression where col_float > 1000 and col_boolean != 1 + order by 
col_int,col_smallint,col_tinyint,col_bigint,col_float,col_double,col_boolean,col_string,col_char,col_varchar,col_date,col_timestamp,col_decimal + limit 10; + """ + + + order_qt_lzo_5 """ select * from parquet_lzo_compression where col_double < 17672101476 and col_char !='ft' + order by col_int,col_smallint,col_tinyint,col_bigint,col_float,col_double,col_boolean,col_string,col_char,col_varchar,col_date,col_timestamp,col_decimal + limit 10; + """ + + + order_qt_lzo_6 """ select * from parquet_lzo_compression where col_string='nuXBDInOfoaWz' + order by col_int,col_smallint,col_tinyint,col_bigint,col_float,col_double,col_boolean,col_string,col_char,col_varchar,col_date,col_timestamp,col_decimal + limit 10; + """ + + + order_qt_lzo_7 """ select * from parquet_lzo_compression where col_decimal < 50071 and year(col_timestamp) = 2023 + order by col_int,col_smallint,col_tinyint,col_bigint,col_float,col_double,col_boolean,col_string,col_char,col_varchar,col_date,col_timestamp,col_decimal + limit 10; + """ + + + order_qt_lzo_8 """ select * from parquet_lzo_compression where year(col_date)!=2023 and year(col_timestamp) = 2023 + order by col_int,col_smallint,col_tinyint,col_bigint,col_float,col_double,col_boolean,col_string,col_char,col_varchar,col_date,col_timestamp,col_decimal + limit 10; + """ + } } From a64920b89542164a79c66b3a27aa8a26fa135a69 Mon Sep 17 00:00:00 2001 From: changyuwei <2017501503@qq.com> Date: Thu, 30 Nov 2023 11:02:44 +0800 Subject: [PATCH 2/2] fix compile --- be/src/util/block_compression.cpp | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/be/src/util/block_compression.cpp b/be/src/util/block_compression.cpp index 5b53bcb446c1ab..b7e93dbeb536ec 100644 --- a/be/src/util/block_compression.cpp +++ b/be/src/util/block_compression.cpp @@ -38,7 +38,6 @@ #include #include -#include #include #include #include @@ -48,10 +47,24 @@ #include "exec/decompressor.h" #include "gutil/endian.h" #include "gutil/strings/substitute.h" 
+#include "orc/OrcFile.hh" #include "util/bit_util.h" #include "util/defer_op.h" #include "util/faststring.h" +namespace orc { +/** + * Decompress the bytes into the output buffer. + * @param inputAddress the start of the input + * @param inputLimit one past the last byte of the input + * @param outputAddress the start of the output buffer + * @param outputLimit one past the last byte of the output buffer + * @return the number of bytes decompressed + */ +uint64_t lzoDecompress(const char* inputAddress, const char* inputLimit, char* outputAddress, + char* outputLimit); +} // namespace orc + namespace doris { using strings::Substitute;