diff --git a/be/src/util/block_compression.cpp b/be/src/util/block_compression.cpp index a711b1331a6069..8f73ba2412552a 100644 --- a/be/src/util/block_compression.cpp +++ b/be/src/util/block_compression.cpp @@ -1022,7 +1022,10 @@ class Bzip2BlockCompression : public BlockCompressionCodec { size_t max_compressed_len(size_t len) override { // TODO: make sure the max_compressed_len for bzip2 - return len * 2; + // bzip2 documents a worst-case output of 101% of the input plus 600 bytes. + // The fixed 600-byte term keeps BZ2_bzBuffToBuffCompress from returning + // BZ_OUTBUFF_FULL on small inputs; len * 2 already covers the 1% growth. + return len * 2 + 600; } }; diff --git a/regression-test/data/export_p0/test_outfile_csv_compress.out b/regression-test/data/export_p0/test_outfile_csv_compress.out index 48ae4946778124..7d3965e897461c 100644 --- a/regression-test/data/export_p0/test_outfile_csv_compress.out +++ b/regression-test/data/export_p0/test_outfile_csv_compress.out @@ -113,6 +113,66 @@ c2 text Yes false \N NONE c1 text Yes false \N NONE c2 text Yes false \N NONE +-- !select -- +1 2 + +-- !select -- +1 1 + +-- !select -- +c1 text Yes false \N NONE +c2 text Yes false \N NONE + +-- !select -- +1 2 + +-- !select -- +1 1 + +-- !select -- +c1 text Yes false \N NONE +c2 text Yes false \N NONE + +-- !select -- +1 2 + +-- !select -- +1 1 + +-- !select -- +c1 text Yes false \N NONE +c2 text Yes false \N NONE + +-- !select -- +1 2 + +-- !select -- +1 1 + +-- !select -- +c1 text Yes false \N NONE +c2 text Yes false \N NONE + +-- !select -- +1 2 + +-- !select -- +1 1 + +-- !select -- +c1 text Yes false \N NONE +c2 text Yes false \N NONE + +-- !select -- +1 2 + +-- !select -- +1 1 + +-- !select -- +c1 text Yes false \N NONE +c2 text Yes false \N NONE + -- !select -- __dummy_col text Yes false \N NONE diff --git a/regression-test/data/external_table_p0/hive/test_hive_get_schema_from_table.out b/regression-test/data/external_table_p0/hive/test_hive_get_schema_from_table.out index ea3ed7287c465d..59da4bf0ad5653 100644 --- 
a/regression-test/data/external_table_p0/hive/test_hive_get_schema_from_table.out +++ b/regression-test/data/external_table_p0/hive/test_hive_get_schema_from_table.out @@ -1275,7 +1275,7 @@ false 1 1 1 10 1.1 10.1 3951 01/31/10 1 2010-01-31T12:01:13.500 2010 1 true 0 0 0 0 0.0 0 3950 01/31/10 0 2010-01-31T12:00:13.500 2010 1 -- !schema_1 -- -1 7706 1 155190 17.00 21168.23 0.04 0.02 N O 1996-03-13 1996-02-12 1996-03-22 DELIVER IN PERSON TRUCK egular courts above the cn beijing +1 638 6 15635 32.00 49620.16 0.07 0.02 N O 1996-01-30 1996-02-07 1996-02-03 DELIVER IN PERSON MAIL arefully slyly ex cn beijing -- !schema_2 -- 6374628540732951412 -77 -65 -70 -107 -215 65 0 -526 -1309 3750 8827 -19795 34647 57042 -1662 -138248 -890685 -228568 1633079 -2725524 6163040 -10491702 697237 74565050 127767368 93532213 -209675435 -32116110 -3624917040 -2927805617 15581947241 21893441661 24075494509 -116822110531 -59683724667 -146210393388 114424524398 1341560771667 -1638742564263 520137948334 -2927347587131 7415137351179 -7963937754617 52157548982266 140803519083304 -294675355729619 -868076759504942 181128508165910 -91753231238823 -3511241416682881 -11545256318348796 -1952917510863468 -5161099825338866 -59726090170689781 287170105829528178 607326725526282735 1253194074103207461 -162443950414676064 -2964036188567341159 2602201580810990248 5581917084094110764 111739292249520611 -315687754593838642 -2804420462762366976 -2078683524 diff --git a/regression-test/suites/export_p0/test_outfile_csv_compress.groovy b/regression-test/suites/export_p0/test_outfile_csv_compress.groovy index 6bdbb39fe75338..01e5f0664407d6 100644 --- a/regression-test/suites/export_p0/test_outfile_csv_compress.groovy +++ b/regression-test/suites/export_p0/test_outfile_csv_compress.groovy @@ -39,6 +39,17 @@ suite("test_outfile_csv_compress", "p0") { for (int i = 0; i < 20; i++) { sql """ insert into ${table_name} select id + ${i}, concat(name, id + ${i}) from ${table_name};""" } + + // small table + sql """ DROP 
TABLE IF EXISTS small_${table_name} """ + sql """ + CREATE TABLE IF NOT EXISTS small_${table_name} ( + `id` int, + `name` int + ) + DISTRIBUTED BY HASH(name) PROPERTIES("replication_num" = "1"); + """ + sql """INSERT INTO small_${table_name} values(1, 2);""" } def table_name = "test_outfile_csv_compress" @@ -96,6 +107,42 @@ suite("test_outfile_csv_compress", "p0") { """ } + for (String compression_type: ["plain", "gz", "bz2", "snappyblock", "lz4block", "zstd"]) { + def small = "small_${table_name}" + def outfile_url = csv_outfile_result(small, compression_type); + print("http://${bucket}.${s3_endpoint}${outfile_url.substring(5 + bucket.length(), outfile_url.length() - 1)}0.") + qt_select """ select c1, c2 from s3( + "uri" = "http://${bucket}.${s3_endpoint}${outfile_url.substring(5 + bucket.length(), outfile_url.length() - 1)}*", + "ACCESS_KEY"= "${ak}", + "SECRET_KEY" = "${sk}", + "format" = "csv", + "provider" = "${getS3Provider()}", + "region" = "${region}", + "compress_type" = "${compression_type}" + ) order by c1, c2 limit 10; + """ + qt_select """ select count(c1), count(c2) from s3( + "uri" = "http://${bucket}.${s3_endpoint}${outfile_url.substring(5 + bucket.length(), outfile_url.length() - 1)}*", + "ACCESS_KEY"= "${ak}", + "SECRET_KEY" = "${sk}", + "format" = "csv", + "provider" = "${getS3Provider()}", + "region" = "${region}", + "compress_type" = "${compression_type}" + ); + """ + qt_select """desc function s3( + "uri" = "http://${bucket}.${s3_endpoint}${outfile_url.substring(5 + bucket.length(), outfile_url.length() - 1)}*", + "ACCESS_KEY"= "${ak}", + "SECRET_KEY" = "${sk}", + "format" = "csv", + "provider" = "${getS3Provider()}", + "region" = "${region}", + "compress_type" = "${compression_type}" + ); + """ + } + // test invalid compression_type test { sql """ diff --git a/regression-test/suites/external_table_p0/hive/test_hive_get_schema_from_table.groovy b/regression-test/suites/external_table_p0/hive/test_hive_get_schema_from_table.groovy index 
157d53c366ab04..4fbba0fb778137 100644 --- a/regression-test/suites/external_table_p0/hive/test_hive_get_schema_from_table.groovy +++ b/regression-test/suites/external_table_p0/hive/test_hive_get_schema_from_table.groovy @@ -64,10 +64,6 @@ suite("test_hive_get_schema_from_table", "external_docker,hive,external_docker_h test_col_topn("month") } - - - - // test get scheam from table for (String hivePrefix : ["hive2", "hive3"]) { String catalog_name = "test_${hivePrefix}_get_schema" @@ -96,7 +92,7 @@ suite("test_hive_get_schema_from_table", "external_docker,hive,external_docker_h test_topn() test_topn_abs() - order_qt_schema_1 """select * from ${catalog_name}.${ex_db_name}.parquet_partition_table order by l_orderkey limit 1;""" + order_qt_schema_1 """select * from ${catalog_name}.${ex_db_name}.parquet_partition_table order by l_orderkey, l_partkey limit 1;""" order_qt_schema_2 """select * from ${catalog_name}.${ex_db_name}.parquet_delta_binary_packed order by int_value limit 1;""" order_qt_schema_3 """select * from ${catalog_name}.${ex_db_name}.parquet_alltypes_tiny_pages order by id desc limit 5;""" order_qt_schema_4 """select * from ${catalog_name}.${ex_db_name}.orc_all_types_partition order by bigint_col desc limit 3;"""