Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
101 changes: 101 additions & 0 deletions be/src/util/block_compression.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,11 +45,26 @@

#include "common/config.h"
#include "exec/decompressor.h"
#include "gutil/endian.h"
#include "gutil/strings/substitute.h"
#include "orc/OrcFile.hh"
#include "util/bit_util.h"
#include "util/defer_op.h"
#include "util/faststring.h"

namespace orc {
/**
 * Decompress one raw LZO-compressed byte range into the output buffer.
 *
 * This is only a declaration; the definition is not in this file
 * (presumably provided by the ORC library -- TODO confirm it stays exported).
 *
 * @param inputAddress the start of the input
 * @param inputLimit one past the last byte of the input
 * @param outputAddress the start of the output buffer
 * @param outputLimit one past the last byte of the output buffer
 * @return the number of bytes decompressed
 * @note Callers in this file wrap the call in a handler for orc::ParseError,
 *       so the function is expected to throw on malformed input.
 */
uint64_t lzoDecompress(const char* inputAddress, const char* inputLimit, char* outputAddress,
char* outputLimit);
} // namespace orc

namespace doris {

using strings::Substitute;
Expand Down Expand Up @@ -1071,6 +1086,89 @@ class GzipBlockCompressionByLibdeflate final : public GzipBlockCompression {
};
#endif

/// Block codec for block-framed LZO data. Only decompression is implemented;
/// the raw LZO decoding itself is delegated to orc::lzoDecompress.
class LzoBlockCompression final : public BlockCompressionCodec {
public:
    /// Returns the shared codec instance (a function-local static).
    static LzoBlockCompression* instance() {
        static LzoBlockCompression s_instance;
        return &s_instance;
    }

    /// Always fails with InvalidArgument: LZO compression is not implemented.
    Status compress(const Slice& input, faststring* output) override {
        return Status::InvalidArgument("not impl lzo compress.");
    }

    /// Compression is not implemented, so this always reports 0.
    size_t max_compressed_len(size_t len) override { return 0; }

    /// Decompresses every large block found in `input` into `output`.
    ///
    /// @param input  framed compressed bytes (layout described below).
    /// @param output destination buffer; `output->data` / `output->size` give
    ///               its start and capacity. `output->size` is not modified.
    /// @return OK once all input has been consumed;
    ///         InvalidArgument if the input is truncated, the output buffer is
    ///         too small, or a small block inflates beyond the length declared
    ///         by its large block;
    ///         InternalError if orc::lzoDecompress throws orc::ParseError.
    Status decompress(const Slice& input, Slice* output) override {
        auto* input_ptr = input.data;
        auto remain_input_size = input.size;
        auto* output_ptr = output->data;
        auto remain_output_size = output->size;
        auto* output_limit = output->data + output->size;

        // Example:
        // OriginData (the original data is divided into several large data blocks):
        //      large data block1 | large data block2 | large data block3 | ....
        // Each large data block is divided into several small data blocks.
        // Suppose a large data block is divided into three small blocks:
        //      large data block1:  | small block1 | small block2 | small block3 |
        // CompressData:  <A [B1 compress(small block1)] [B2 compress(small block2)] [B3 compress(small block3)]>
        //
        // A : original (uncompressed) length of the current large data block.
        //     sizeof(A) = 4 bytes.
        //     A = length(small block1) + length(small block2) + length(small block3)
        // Bx : compressed length of small data block x.
        //     sizeof(Bx) = 4 bytes.
        //     Bx = length(compress(small blockx))
        try {
            while (remain_input_size > 0) {
                if (remain_input_size < 4) {
                    return Status::InvalidArgument(
                            "Need more input buffer to get large_block_uncompressed_len.");
                }

                uint32_t large_block_uncompressed_len = BigEndian::Load32(input_ptr);
                input_ptr += 4;
                remain_input_size -= 4;

                if (remain_output_size < large_block_uncompressed_len) {
                    return Status::InvalidArgument(
                            "Need more output buffer to get uncompressed data.");
                }

                while (large_block_uncompressed_len > 0) {
                    if (remain_input_size < 4) {
                        return Status::InvalidArgument(
                                "Need more input buffer to get small_block_compressed_len.");
                    }

                    uint32_t small_block_compressed_len = BigEndian::Load32(input_ptr);
                    input_ptr += 4;
                    remain_input_size -= 4;

                    if (remain_input_size < small_block_compressed_len) {
                        return Status::InvalidArgument(
                                "Need more input buffer to decompress small block.");
                    }

                    auto small_block_uncompressed_len =
                            orc::lzoDecompress(input_ptr, input_ptr + small_block_compressed_len,
                                               output_ptr, output_limit);

                    // The decoder is only bounded by the end of the whole output
                    // buffer, so a corrupt small block may inflate past what its
                    // large block declared. Reject it here; otherwise the unsigned
                    // subtraction below would wrap around and the loop would keep
                    // parsing garbage.
                    if (small_block_uncompressed_len > large_block_uncompressed_len) {
                        return Status::InvalidArgument(
                                "Small block uncompressed size exceeds "
                                "large_block_uncompressed_len.");
                    }

                    input_ptr += small_block_compressed_len;
                    remain_input_size -= small_block_compressed_len;

                    output_ptr += small_block_uncompressed_len;
                    // Safe narrowing: the value was just checked to be <= a uint32_t.
                    large_block_uncompressed_len -=
                            static_cast<uint32_t>(small_block_uncompressed_len);
                    remain_output_size -= small_block_uncompressed_len;
                }
            }
        } catch (const orc::ParseError& e) {
            // Prevent BE from hanging when orc::lzoDecompress throws an exception.
            return Status::InternalError("Fail to do LZO decompress, error={}", e.what());
        }
        return Status::OK();
    }
};

Status get_block_compression_codec(segment_v2::CompressionTypePB type,
BlockCompressionCodec** codec) {
switch (type) {
Expand Down Expand Up @@ -1127,6 +1225,9 @@ Status get_block_compression_codec(tparquet::CompressionCodec::type parquet_code
*codec = GzipBlockCompression::instance();
#endif
break;
case tparquet::CompressionCodec::LZO:
*codec = LzoBlockCompression::instance();
break;
default:
return Status::InternalError("unknown compression type({})", parquet_codec);
}
Expand Down
87 changes: 87 additions & 0 deletions regression-test/data/external_table_p2/hive/test_compress_type.out
Original file line number Diff line number Diff line change
Expand Up @@ -484,3 +484,90 @@
8 800 40 8000000000 45.75 55.25 false Eighth H Theta 2023-10-13 2023-10-13T21:45 890.12
9 900 45 9000000000 50.0 60.5 true Ninth I Iota 2023-10-14 2023-10-14T22:15 901.23

-- !lzo_1 --
127 317 22 139027217294 5.8534396E7 1.097115615520323E10 true NxvCOVAHCAzWEFOs VdEf vXxekmctPmPmmbecHgf 2023-12-15 2023-12-28T23:15:48 147638.24
135 194 7 57894842960 1.3718646E7 2.1169820465574505E10 true aseqfHnnrtaL HwV IqXKe 2023-12-28 2023-12-19T06:14:48 32041.77
139 146 4 149816593644 4.9618156E7 1.3744723380110355E10 false sreHCjYoJoBOjUJMMBSQ dD iBaixPwGysIVgkomhg 2023-12-27 2023-12-19T15:51:48 191090.57
167 275 28 46739421643 2.790689E7 5.638235691917528E8 false BDX iY pOrAYVd 2023-12-01 2023-12-09T12:59:48 105181.01
241 496 63 26957970271 3.7214888E7 1.2043262506506804E10 true VJPXXigvP wfZp cwyoMdOxN 2023-12-10 2023-12-18T05:33:48 105023.30
285 43 47 124246184718 2162507.5 1.6279579779299034E10 false gXIEVQzqfokBv raxj NbGVRlQeotLBDWbDqP 2023-12-17 2023-12-24T03:36:48 75425.14
311 44 67 79901279497 3.0787934E7 1.5853816694193293E10 false LTsSxeetbYKCwcJvg BCrf XkuC 2023-12-11 2023-12-05T13:16:48 103792.88
333 390 29 61080978873 2916969.0 1.053228375816898E10 true HcZnbf Wp iHqLLiPhgZ 2023-12-01 2023-12-10T11:31:48 68471.38
36 369 2 24371701950 5.54394E7 8.576150848699297E9 false uQpDcwEZT sd SwzJInNDb 2023-12-05 2023-12-08T15:00:40 8954.61
363 375 1 20494251127 8.9166856E7 2.2005002173871223E10 false hkHvijevoRfHhK szl hwHUAjwqTQOmLEPDFbt 2023-12-05 2023-12-08T05:28:48 96630.28
368 37 42 60649320592 2.3388714E7 1.81031191987985E9 true yXoDmKpjjRsVV Hq MbWlyi 2023-12-19 2023-12-06T00:45:48 197736.91
414 301 63 87524210634 1.2944316E7 3.5428357192711325E9 true piECj tGM pkOyUdxLBFCw 2023-12-19 2023-12-10T15:28:48 112255.75
42 132 39 128076453206 3.1733946E7 2.001312160047691E9 false kzviLgVNqxrDQ kr YhdXGtPun 2023-12-15 2023-12-10T01:42:48 189135.64
427 286 67 78312070726 3.1794338E7 1.7713252925472687E10 true lHoUCBbY LTkc CgMrDWTGppMIaZPk 2023-12-13 2023-12-08T16:02:48 75175.71
438 491 21 66065079309 6.6624016E7 1.5542114222539822E10 false CEbvKZRdvMHxzVOIejq wJ eoTkUlht 2023-12-08 2023-12-17T19:49:48 86666.80
469 156 25 41259191749 6.2344956E7 1.5674967382662376E10 true dfyMUJYNppBDDD az lVofKt 2023-12-19 2023-12-09T10:37:48 15427.43
540 416 70 110655654086 4.9027904E7 1.1345965638449787E10 true gZF oPNx kDYTiiCPhyQqnmPLd 2023-12-26 2023-12-20T22:47:48 177628.27
563 327 1 86402793406 1.4668673E7 2.1932020019521263E10 false uEPywVtgb IN HCcPuRYlwlezseie 2023-12-27 2023-12-01T09:02:48 12840.38
585 423 69 141894410515 1.7955736E7 8.784239710423233E9 false IsWEZJsPRXIFqapTTb yO qRAEvl 2023-11-29 2023-12-26T04:50:48 46733.25
618 390 70 40611757422 4.9496784E7 1.90943138552761E9 true cuqniQE dxKv KlxZsrJad 2023-12-05 2023-11-30T13:41:48 13904.80

-- !lzo_2 --
1078 229 63 79026532317 1.4363472E7 1.193746461651589E10 true znYrIGhEXITIdyiifBPZ BBh klhSDtg 2023-12-14 2023-12-02T06:34:48 80402.53
1105 186 31 129159878912 1.3102703E7 1.6989058048889019E10 false OxhcUomBMLjVjdwgOI Qa eJoODDnkdDd 2023-12-20 2023-12-25T03:33:48 83174.73
1108 223 24 64158736405 1.3341401E7 2.0128416779917E10 false rLXbarkH xU ggGSZGxLwT 2023-11-28 2023-12-23T01:43:48 17986.48
1126 178 38 137633520558 9.6421152E7 1.2075476530488207E10 true vHgghYPQNpzTmYx EKhO Pg 2023-12-03 2023-12-03T18:17:48 119990.49
1215 20 18 15934394806 9.6266544E7 3.303291140952643E8 true zzkAwmKNf RKO VzyGx 2023-12-26 2023-12-14T02:36:48 59236.59
1225 131 17 119517491015 7.868396E7 1.2812171639342154E10 true maOgXoCzsrPVZqxaeS vm AJNnbqdEzk 2023-12-28 2023-12-22T23:18:48 85523.88
1252 142 68 92511639613 5.2273456E7 2.0197789593796345E10 true zFl Avwm Yi 2023-12-24 2023-12-01T22:31:48 181634.60
1262 279 57 63627626380 2.3360408E7 6.674186807593108E9 true wjuW ueO tOWuzwJj 2023-12-24 2023-12-04T17:27:48 112884.97
1266 253 10 139941604087 2.5471874E7 2.6004794480891223E9 true YBx MqsR sLu 2023-12-03 2023-12-23T10:00:48 83930.38
1267 155 54 38456715756 4.2582072E7 3.350085153856542E9 true qFXXKbhqXfSYFXteGF WMH CWZwGCkmg 2023-12-17 2023-12-20T19:06:48 13843.42

-- !lzo_3 --
127 317 22 139027217294 5.8534396E7 1.097115615520323E10 true NxvCOVAHCAzWEFOs VdEf vXxekmctPmPmmbecHgf 2023-12-15 2023-12-28T23:15:48 147638.24
135 194 7 57894842960 1.3718646E7 2.1169820465574505E10 true aseqfHnnrtaL HwV IqXKe 2023-12-28 2023-12-19T06:14:48 32041.77
241 496 63 26957970271 3.7214888E7 1.2043262506506804E10 true VJPXXigvP wfZp cwyoMdOxN 2023-12-10 2023-12-18T05:33:48 105023.30
333 390 29 61080978873 2916969.0 1.053228375816898E10 true HcZnbf Wp iHqLLiPhgZ 2023-12-01 2023-12-10T11:31:48 68471.38
368 37 42 60649320592 2.3388714E7 1.81031191987985E9 true yXoDmKpjjRsVV Hq MbWlyi 2023-12-19 2023-12-06T00:45:48 197736.91
414 301 63 87524210634 1.2944316E7 3.5428357192711325E9 true piECj tGM pkOyUdxLBFCw 2023-12-19 2023-12-10T15:28:48 112255.75
427 286 67 78312070726 3.1794338E7 1.7713252925472687E10 true lHoUCBbY LTkc CgMrDWTGppMIaZPk 2023-12-13 2023-12-08T16:02:48 75175.71
469 156 25 41259191749 6.2344956E7 1.5674967382662376E10 true dfyMUJYNppBDDD az lVofKt 2023-12-19 2023-12-09T10:37:48 15427.43
540 416 70 110655654086 4.9027904E7 1.1345965638449787E10 true gZF oPNx kDYTiiCPhyQqnmPLd 2023-12-26 2023-12-20T22:47:48 177628.27
618 390 70 40611757422 4.9496784E7 1.90943138552761E9 true cuqniQE dxKv KlxZsrJad 2023-12-05 2023-11-30T13:41:48 13904.80

-- !lzo_4 --
139 146 4 149816593644 4.9618156E7 1.3744723380110355E10 false sreHCjYoJoBOjUJMMBSQ dD iBaixPwGysIVgkomhg 2023-12-27 2023-12-19T15:51:48 191090.57
167 275 28 46739421643 2.790689E7 5.638235691917528E8 false BDX iY pOrAYVd 2023-12-01 2023-12-09T12:59:48 105181.01
285 43 47 124246184718 2162507.5 1.6279579779299034E10 false gXIEVQzqfokBv raxj NbGVRlQeotLBDWbDqP 2023-12-17 2023-12-24T03:36:48 75425.14
311 44 67 79901279497 3.0787934E7 1.5853816694193293E10 false LTsSxeetbYKCwcJvg BCrf XkuC 2023-12-11 2023-12-05T13:16:48 103792.88
36 369 2 24371701950 5.54394E7 8.576150848699297E9 false uQpDcwEZT sd SwzJInNDb 2023-12-05 2023-12-08T15:00:40 8954.61
363 375 1 20494251127 8.9166856E7 2.2005002173871223E10 false hkHvijevoRfHhK szl hwHUAjwqTQOmLEPDFbt 2023-12-05 2023-12-08T05:28:48 96630.28
42 132 39 128076453206 3.1733946E7 2.001312160047691E9 false kzviLgVNqxrDQ kr YhdXGtPun 2023-12-15 2023-12-10T01:42:48 189135.64
438 491 21 66065079309 6.6624016E7 1.5542114222539822E10 false CEbvKZRdvMHxzVOIejq wJ eoTkUlht 2023-12-08 2023-12-17T19:49:48 86666.80
563 327 1 86402793406 1.4668673E7 2.1932020019521263E10 false uEPywVtgb IN HCcPuRYlwlezseie 2023-12-27 2023-12-01T09:02:48 12840.38
585 423 69 141894410515 1.7955736E7 8.784239710423233E9 false IsWEZJsPRXIFqapTTb yO qRAEvl 2023-11-29 2023-12-26T04:50:48 46733.25

-- !lzo_5 --
127 317 22 139027217294 5.8534396E7 1.097115615520323E10 true NxvCOVAHCAzWEFOs VdEf vXxekmctPmPmmbecHgf 2023-12-15 2023-12-28T23:15:48 147638.24
139 146 4 149816593644 4.9618156E7 1.3744723380110355E10 false sreHCjYoJoBOjUJMMBSQ dD iBaixPwGysIVgkomhg 2023-12-27 2023-12-19T15:51:48 191090.57
167 275 28 46739421643 2.790689E7 5.638235691917528E8 false BDX iY pOrAYVd 2023-12-01 2023-12-09T12:59:48 105181.01
241 496 63 26957970271 3.7214888E7 1.2043262506506804E10 true VJPXXigvP wfZp cwyoMdOxN 2023-12-10 2023-12-18T05:33:48 105023.30
285 43 47 124246184718 2162507.5 1.6279579779299034E10 false gXIEVQzqfokBv raxj NbGVRlQeotLBDWbDqP 2023-12-17 2023-12-24T03:36:48 75425.14
311 44 67 79901279497 3.0787934E7 1.5853816694193293E10 false LTsSxeetbYKCwcJvg BCrf XkuC 2023-12-11 2023-12-05T13:16:48 103792.88
333 390 29 61080978873 2916969.0 1.053228375816898E10 true HcZnbf Wp iHqLLiPhgZ 2023-12-01 2023-12-10T11:31:48 68471.38
36 369 2 24371701950 5.54394E7 8.576150848699297E9 false uQpDcwEZT sd SwzJInNDb 2023-12-05 2023-12-08T15:00:40 8954.61
368 37 42 60649320592 2.3388714E7 1.81031191987985E9 true yXoDmKpjjRsVV Hq MbWlyi 2023-12-19 2023-12-06T00:45:48 197736.91
42 132 39 128076453206 3.1733946E7 2.001312160047691E9 false kzviLgVNqxrDQ kr YhdXGtPun 2023-12-15 2023-12-10T01:42:48 189135.64

-- !lzo_6 --
9379 258 6 31310350438 3.1661348E7 8.857541516631796E8 false nuXBDInOfoaWz AKyn ggtgZNvWuC 2023-11-28 2023-12-06T03:40:40 50071.94

-- !lzo_7 --
135 194 7 57894842960 1.3718646E7 2.1169820465574505E10 true aseqfHnnrtaL HwV IqXKe 2023-12-28 2023-12-19T06:14:48 32041.77
36 369 2 24371701950 5.54394E7 8.576150848699297E9 false uQpDcwEZT sd SwzJInNDb 2023-12-05 2023-12-08T15:00:40 8954.61
469 156 25 41259191749 6.2344956E7 1.5674967382662376E10 true dfyMUJYNppBDDD az lVofKt 2023-12-19 2023-12-09T10:37:48 15427.43
563 327 1 86402793406 1.4668673E7 2.1932020019521263E10 false uEPywVtgb IN HCcPuRYlwlezseie 2023-12-27 2023-12-01T09:02:48 12840.38
585 423 69 141894410515 1.7955736E7 8.784239710423233E9 false IsWEZJsPRXIFqapTTb yO qRAEvl 2023-11-29 2023-12-26T04:50:48 46733.25
618 390 70 40611757422 4.9496784E7 1.90943138552761E9 true cuqniQE dxKv KlxZsrJad 2023-12-05 2023-11-30T13:41:48 13904.80
687 230 36 65023623256 8.2819664E7 2.059826790149805E10 false QBfgJpvaevEubRI QTP nneEuMZvlVXDlUG 2023-12-01 2023-12-18T05:20:48 35673.65
744 33 53 133832713020 6.46669E7 1.909766060768045E10 true eSJGGBBZjGCMxZ gDmD SzRcNftkktGZKa 2023-12-26 2023-12-17T03:57:40 31797.49
758 90 17 87654906351 7314712.5 9.549600187302872E9 false RAUyeYqsKGBCGrIpMeGP cjeC lbvKaqxQEROGxTGQQ 2023-12-16 2023-12-11T12:13:48 20710.24
874 172 72 140230596072 7.323136E7 2.8372205443769336E9 true OySCFRGBmgxSmJ Yazj LfZMcWtlxvpp 2023-12-12 2023-12-02T00:51:48 32283.90

-- !lzo_8 --

Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,52 @@ suite("test_compress_type", "p2,external,hive,external_remote,external_remote_hi
order_qt_q48 """ select * from parquet_lz4_compression where col_string != "Random"
order by col_int,col_smallint,col_tinyint,col_bigint,col_float,col_double,col_boolean,col_string,col_char,col_varchar,col_date,col_timestamp,col_decimal
"""

// Queries against parquet_lzo_compression. Each statement is tagged lzo_N and
// its expected rows are recorded under the matching `-- !lzo_N --` header in
// test_compress_type.out. Every query orders by all columns before applying
// its limit.

// lzo_1: unfiltered scan, first 20 rows.
order_qt_lzo_1 """ select * from parquet_lzo_compression
order by col_int,col_smallint,col_tinyint,col_bigint,col_float,col_double,col_boolean,col_string,col_char,col_varchar,col_date,col_timestamp,col_decimal
limit 20;
"""

// lzo_2: integer range predicate.
order_qt_lzo_2 """ select * from parquet_lzo_compression where col_int > 1000
order by col_int,col_smallint,col_tinyint,col_bigint,col_float,col_double,col_boolean,col_string,col_char,col_varchar,col_date,col_timestamp,col_decimal
limit 10;
"""


// lzo_3: float predicate combined with boolean = 1.
order_qt_lzo_3 """ select * from parquet_lzo_compression where col_float > 5.1 and col_boolean = 1
order by col_int,col_smallint,col_tinyint,col_bigint,col_float,col_double,col_boolean,col_string,col_char,col_varchar,col_date,col_timestamp,col_decimal
limit 10;
"""

// lzo_4: float predicate combined with boolean != 1.
order_qt_lzo_4 """ select * from parquet_lzo_compression where col_float > 1000 and col_boolean != 1
order by col_int,col_smallint,col_tinyint,col_bigint,col_float,col_double,col_boolean,col_string,col_char,col_varchar,col_date,col_timestamp,col_decimal
limit 10;
"""


// lzo_5: double upper bound combined with a char inequality.
order_qt_lzo_5 """ select * from parquet_lzo_compression where col_double < 17672101476 and col_char !='ft'
order by col_int,col_smallint,col_tinyint,col_bigint,col_float,col_double,col_boolean,col_string,col_char,col_varchar,col_date,col_timestamp,col_decimal
limit 10;
"""


// lzo_6: exact string match; the recorded expected output holds a single row.
order_qt_lzo_6 """ select * from parquet_lzo_compression where col_string='nuXBDInOfoaWz'
order by col_int,col_smallint,col_tinyint,col_bigint,col_float,col_double,col_boolean,col_string,col_char,col_varchar,col_date,col_timestamp,col_decimal
limit 10;
"""


// lzo_7: decimal upper bound combined with a year() filter on the timestamp.
order_qt_lzo_7 """ select * from parquet_lzo_compression where col_decimal < 50071 and year(col_timestamp) = 2023
order by col_int,col_smallint,col_tinyint,col_bigint,col_float,col_double,col_boolean,col_string,col_char,col_varchar,col_date,col_timestamp,col_decimal
limit 10;
"""


// lzo_8: the recorded expected output for this query is empty.
order_qt_lzo_8 """ select * from parquet_lzo_compression where year(col_date)!=2023 and year(col_timestamp) = 2023
order by col_int,col_smallint,col_tinyint,col_bigint,col_float,col_double,col_boolean,col_string,col_char,col_varchar,col_date,col_timestamp,col_decimal
limit 10;
"""


}
}