From 2aeea0fe3f4a821f6c1a344e3f6e5914d84a541f Mon Sep 17 00:00:00 2001 From: zzzzzzzs <1443539042@qq.com> Date: Sat, 29 Jul 2023 12:56:31 +0800 Subject: [PATCH 1/2] [Refactor](load) Extract load public code (#22304) --- be/src/http/action/stream_load.cpp | 66 +------------ be/src/util/load_util.cpp | 84 +++++++++++++++++ be/src/util/load_util.h | 33 +++++++ be/test/util/load_util_test.cpp | 143 +++++++++++++++++++++++++++++ 4 files changed, 264 insertions(+), 62 deletions(-) create mode 100644 be/src/util/load_util.cpp create mode 100644 be/src/util/load_util.h create mode 100644 be/test/util/load_util_test.cpp diff --git a/be/src/http/action/stream_load.cpp b/be/src/http/action/stream_load.cpp index 837a84f5a55871..6f5e1ed91dd006 100644 --- a/be/src/http/action/stream_load.cpp +++ b/be/src/http/action/stream_load.cpp @@ -60,6 +60,7 @@ #include "runtime/stream_load/stream_load_recorder.h" #include "util/byte_buffer.h" #include "util/doris_metrics.h" +#include "util/load_util.h" #include "util/metrics.h" #include "util/string_util.h" #include "util/thrift_rpc_helper.h" @@ -78,65 +79,6 @@ DEFINE_GAUGE_METRIC_PROTOTYPE_2ARG(streaming_load_current_processing, MetricUnit TStreamLoadPutResult k_stream_load_put_result; #endif -static void parse_format(const std::string& format_str, const std::string& compress_type_str, - TFileFormatType::type* format_type, - TFileCompressType::type* compress_type) { - if (format_str.empty()) { - parse_format("CSV", compress_type_str, format_type, compress_type); - return; - } - *compress_type = TFileCompressType::PLAIN; - *format_type = TFileFormatType::FORMAT_UNKNOWN; - if (iequal(format_str, "CSV")) { - if (compress_type_str.empty()) { - *format_type = TFileFormatType::FORMAT_CSV_PLAIN; - } else if (iequal(compress_type_str, "GZ")) { - *format_type = TFileFormatType::FORMAT_CSV_GZ; - *compress_type = TFileCompressType::GZ; - } else if (iequal(compress_type_str, "LZO")) { - *format_type = TFileFormatType::FORMAT_CSV_LZO; - *compress_type = TFileCompressType::LZO; - } else if (iequal(compress_type_str, "BZ2")) { - *format_type = TFileFormatType::FORMAT_CSV_BZ2; - *compress_type = TFileCompressType::BZ2; - } else if (iequal(compress_type_str, "LZ4")) { - *format_type = TFileFormatType::FORMAT_CSV_LZ4FRAME; - *compress_type = TFileCompressType::LZ4FRAME; - } else if (iequal(compress_type_str, "LZOP")) { - *format_type = TFileFormatType::FORMAT_CSV_LZOP; - *compress_type = TFileCompressType::LZO; - } else if (iequal(compress_type_str, "DEFLATE")) { - *format_type = TFileFormatType::FORMAT_CSV_DEFLATE; - *compress_type = TFileCompressType::DEFLATE; - } - } else if (iequal(format_str, "JSON")) { - if (compress_type_str.empty()) { - *format_type = TFileFormatType::FORMAT_JSON; - } - } else if (iequal(format_str, "PARQUET")) { - *format_type = TFileFormatType::FORMAT_PARQUET; - } else if (iequal(format_str, "ORC")) { - *format_type = TFileFormatType::FORMAT_ORC; - } - return; -} - -static bool is_format_support_streaming(TFileFormatType::type format) { - switch (format) { - case TFileFormatType::FORMAT_CSV_PLAIN: - case TFileFormatType::FORMAT_CSV_BZ2: - case TFileFormatType::FORMAT_CSV_DEFLATE: - case TFileFormatType::FORMAT_CSV_GZ: - case TFileFormatType::FORMAT_CSV_LZ4FRAME: - case TFileFormatType::FORMAT_CSV_LZO: - case TFileFormatType::FORMAT_CSV_LZOP: - case TFileFormatType::FORMAT_JSON: - return true; - default: - return false; - } -} - StreamLoadAction::StreamLoadAction(ExecEnv* exec_env) : _exec_env(exec_env) { _stream_load_entity = DorisMetrics::instance()->metric_registry()->register_entity("stream_load"); @@ -290,8 +232,8 @@ Status StreamLoadAction::_on_header(HttpRequest* http_req, std::shared_ptrheader(HTTP_COMPRESS_TYPE), &ctx->format, - &ctx->compress_type); + LoadUtil::parse_format(format_str, http_req->header(HTTP_COMPRESS_TYPE), &ctx->format, + &ctx->compress_type); if (ctx->format == TFileFormatType::FORMAT_UNKNOWN) { return Status::InternalError("unknown data format, format={}", http_req->header(HTTP_FORMAT_KEY)); @@ -392,7 +334,7 @@ void StreamLoadAction::free_handler_ctx(std::shared_ptr param) { Status StreamLoadAction::_process_put(HttpRequest* http_req, std::shared_ptr ctx) { // Now we use stream - ctx->use_streaming = is_format_support_streaming(ctx->format); + ctx->use_streaming = LoadUtil::is_format_support_streaming(ctx->format); // put request TStreamLoadPutRequest request; diff --git a/be/src/util/load_util.cpp b/be/src/util/load_util.cpp new file mode 100644 index 00000000000000..8736561db4f3e1 --- /dev/null +++ b/be/src/util/load_util.cpp @@ -0,0 +1,84 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "util/load_util.h" + +#include + +#include "util/string_util.h" + +namespace doris { +void LoadUtil::parse_format(const std::string& format_str, const std::string& compress_type_str, + TFileFormatType::type* format_type, + TFileCompressType::type* compress_type) { + if (format_str.empty()) { + parse_format("CSV", compress_type_str, format_type, compress_type); + return; + } + *compress_type = TFileCompressType::PLAIN; + *format_type = TFileFormatType::FORMAT_UNKNOWN; + if (iequal(format_str, "CSV")) { + if (compress_type_str.empty()) { + *format_type = TFileFormatType::FORMAT_CSV_PLAIN; + } else if (iequal(compress_type_str, "GZ")) { + *format_type = TFileFormatType::FORMAT_CSV_GZ; + *compress_type = TFileCompressType::GZ; + } else if (iequal(compress_type_str, "LZO")) { + *format_type = TFileFormatType::FORMAT_CSV_LZO; + *compress_type = TFileCompressType::LZO; + } else if (iequal(compress_type_str, "BZ2")) { + *format_type = TFileFormatType::FORMAT_CSV_BZ2; + *compress_type = TFileCompressType::BZ2; + } else if (iequal(compress_type_str, "LZ4")) { + *format_type = TFileFormatType::FORMAT_CSV_LZ4FRAME; + *compress_type = TFileCompressType::LZ4FRAME; + } else if (iequal(compress_type_str, "LZOP")) { + *format_type = TFileFormatType::FORMAT_CSV_LZOP; + *compress_type = TFileCompressType::LZO; + } else if (iequal(compress_type_str, "DEFLATE")) { + *format_type = TFileFormatType::FORMAT_CSV_DEFLATE; + *compress_type = TFileCompressType::DEFLATE; + } + } else if (iequal(format_str, "JSON")) { + if (compress_type_str.empty()) { + *format_type = TFileFormatType::FORMAT_JSON; + } + } else if (iequal(format_str, "PARQUET")) { + *format_type = TFileFormatType::FORMAT_PARQUET; + } else if (iequal(format_str, "ORC")) { + *format_type = TFileFormatType::FORMAT_ORC; + } + return; +} + +bool LoadUtil::is_format_support_streaming(TFileFormatType::type format) { + switch (format) { + case TFileFormatType::FORMAT_CSV_PLAIN: + case TFileFormatType::FORMAT_CSV_BZ2: + case TFileFormatType::FORMAT_CSV_DEFLATE: + case TFileFormatType::FORMAT_CSV_GZ: + case TFileFormatType::FORMAT_CSV_LZ4FRAME: + case TFileFormatType::FORMAT_CSV_LZO: + case TFileFormatType::FORMAT_CSV_LZOP: + case TFileFormatType::FORMAT_JSON: + return true; + default: + return false; + } + return false; +} +} // namespace doris \ No newline at end of file diff --git a/be/src/util/load_util.h b/be/src/util/load_util.h new file mode 100644 index 00000000000000..60bd79ab1bea9b --- /dev/null +++ b/be/src/util/load_util.h @@ -0,0 +1,33 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +#include + +namespace doris { +class LoadUtil { +public: + static void parse_format(const std::string& format_str, const std::string& compress_type_str, + TFileFormatType::type* format_type, + TFileCompressType::type* compress_type); + + static bool is_format_support_streaming(TFileFormatType::type format); +}; +} // namespace doris \ No newline at end of file diff --git a/be/test/util/load_util_test.cpp b/be/test/util/load_util_test.cpp new file mode 100644 index 00000000000000..0b4967befb22b5 --- /dev/null +++ b/be/test/util/load_util_test.cpp @@ -0,0 +1,143 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "util/load_util.h" + +#include +#include + +#include "gtest/gtest_pred_impl.h" + +namespace doris { + +class LoadUtilTest : public testing::Test { +public: + LoadUtilTest() {} + ~LoadUtilTest() override = default; +}; + +TEST_F(LoadUtilTest, StreamingTest) { + { + EXPECT_TRUE(LoadUtil::is_format_support_streaming(TFileFormatType::FORMAT_CSV_PLAIN)); + EXPECT_TRUE(LoadUtil::is_format_support_streaming(TFileFormatType::FORMAT_CSV_BZ2)); + EXPECT_TRUE(LoadUtil::is_format_support_streaming(TFileFormatType::FORMAT_CSV_DEFLATE)); + EXPECT_TRUE(LoadUtil::is_format_support_streaming(TFileFormatType::FORMAT_CSV_GZ)); + EXPECT_TRUE(LoadUtil::is_format_support_streaming(TFileFormatType::FORMAT_CSV_LZ4FRAME)); + EXPECT_TRUE(LoadUtil::is_format_support_streaming(TFileFormatType::FORMAT_CSV_LZO)); + EXPECT_TRUE(LoadUtil::is_format_support_streaming(TFileFormatType::FORMAT_CSV_LZOP)); + EXPECT_TRUE(LoadUtil::is_format_support_streaming(TFileFormatType::FORMAT_JSON)); + EXPECT_FALSE(LoadUtil::is_format_support_streaming(TFileFormatType::FORMAT_PARQUET)); + EXPECT_FALSE(LoadUtil::is_format_support_streaming(TFileFormatType::FORMAT_ORC)); + EXPECT_FALSE(LoadUtil::is_format_support_streaming(TFileFormatType::FORMAT_PROTO)); + EXPECT_FALSE(LoadUtil::is_format_support_streaming(TFileFormatType::FORMAT_JNI)); + EXPECT_FALSE(LoadUtil::is_format_support_streaming(TFileFormatType::FORMAT_AVRO)); + EXPECT_FALSE(LoadUtil::is_format_support_streaming(TFileFormatType::FORMAT_UNKNOWN)); + } +} + +TEST_F(LoadUtilTest, ParseTest) { + { + // "", "" + TFileFormatType::type format_type; + TFileCompressType::type compress_type; + LoadUtil::parse_format("", "", &format_type, &compress_type); + EXPECT_EQ(TFileFormatType::FORMAT_CSV_PLAIN, format_type); + EXPECT_EQ(TFileCompressType::PLAIN, compress_type); + } + { + // CSV, GZ + TFileFormatType::type format_type; + TFileCompressType::type compress_type; + LoadUtil::parse_format("CSV", "GZ", &format_type, &compress_type); + EXPECT_EQ(TFileFormatType::FORMAT_CSV_GZ, format_type); + EXPECT_EQ(TFileCompressType::GZ, compress_type); + } + { + // CSV, LZO + TFileFormatType::type format_type; + TFileCompressType::type compress_type; + LoadUtil::parse_format("CSV", "LZO", &format_type, &compress_type); + EXPECT_EQ(TFileFormatType::FORMAT_CSV_LZO, format_type); + EXPECT_EQ(TFileCompressType::LZO, compress_type); + } + { + // CSV, BZ2 + TFileFormatType::type format_type; + TFileCompressType::type compress_type; + LoadUtil::parse_format("CSV", "BZ2", &format_type, &compress_type); + EXPECT_EQ(TFileFormatType::FORMAT_CSV_BZ2, format_type); + EXPECT_EQ(TFileCompressType::BZ2, compress_type); + } + { + // CSV, LZ4 + TFileFormatType::type format_type; + TFileCompressType::type compress_type; + LoadUtil::parse_format("CSV", "LZ4", &format_type, &compress_type); + EXPECT_EQ(TFileFormatType::FORMAT_CSV_LZ4FRAME, format_type); + EXPECT_EQ(TFileCompressType::LZ4FRAME, compress_type); + } + { + // CSV, LZOP + TFileFormatType::type format_type; + TFileCompressType::type compress_type; + LoadUtil::parse_format("CSV", "LZOP", &format_type, &compress_type); + EXPECT_EQ(TFileFormatType::FORMAT_CSV_LZOP, format_type); + EXPECT_EQ(TFileCompressType::LZO, compress_type); + } + { + // CSV, DEFLATE + TFileFormatType::type format_type; + TFileCompressType::type compress_type; + LoadUtil::parse_format("CSV", "DEFLATE", &format_type, &compress_type); + EXPECT_EQ(TFileFormatType::FORMAT_CSV_DEFLATE, format_type); + EXPECT_EQ(TFileCompressType::DEFLATE, compress_type); + } + { + // JSON, "" + TFileFormatType::type format_type; + TFileCompressType::type compress_type; + LoadUtil::parse_format("JSON", "", &format_type, &compress_type); + EXPECT_EQ(TFileFormatType::FORMAT_JSON, format_type); + EXPECT_EQ(TFileCompressType::PLAIN, compress_type); + } + { + // JSON, GZ + TFileFormatType::type format_type; + TFileCompressType::type compress_type; + LoadUtil::parse_format("JSON", "GZ", &format_type, &compress_type); + EXPECT_EQ(TFileFormatType::FORMAT_UNKNOWN, format_type); + EXPECT_EQ(TFileCompressType::PLAIN, compress_type); + } + { + // PARQUET, "" + TFileFormatType::type format_type; + TFileCompressType::type compress_type; + LoadUtil::parse_format("PARQUET", "", &format_type, &compress_type); + EXPECT_EQ(TFileFormatType::FORMAT_PARQUET, format_type); + EXPECT_EQ(TFileCompressType::PLAIN, compress_type); + } + { + // ORC, "" + TFileFormatType::type format_type; + TFileCompressType::type compress_type; + LoadUtil::parse_format("ORC", "", &format_type, &compress_type); + EXPECT_EQ(TFileFormatType::FORMAT_ORC, format_type); + EXPECT_EQ(TFileCompressType::PLAIN, compress_type); + } +} + +} // namespace doris From 07113e0b6fa113c1df65135bd96c0d4bd50b616c Mon Sep 17 00:00:00 2001 From: Mingyu Chen Date: Sat, 26 Aug 2023 12:59:05 +0800 Subject: [PATCH 2/2] [fix](hive) do not split compress data file and support lz4/snappy block codec (#23245) 1. do not split compress data file Some data file in hive is compressed with gzip, deflate, etc. These kinds of file can not be splitted. 2. Support lz4 block codec for hive scan node, use lz4 block codec instead of lz4 frame codec 4. Support snappy block codec For hadoop snappy 5. Optimize the `count(*)` query of csv file For query like `select count(*) from tbl`, only need to split the line, no need to split the column. Need to pick to branch-2.0 after this PR: #22304 --- be/src/exec/decompressor.cpp | 185 +++++++++++++++++- be/src/exec/decompressor.h | 39 +++- be/src/service/internal_service.cpp | 2 + be/src/util/load_util.cpp | 9 +- be/src/vec/exec/format/csv/csv_reader.cpp | 76 +++++-- be/src/vec/exec/format/csv/csv_reader.h | 6 + .../new_plain_text_line_reader.cpp | 10 +- .../file_reader/new_plain_text_line_reader.h | 1 - be/src/vec/exec/scan/vfile_scanner.cpp | 36 ++-- .../org/apache/doris/common/util/Util.java | 6 + .../datasource/hive/HiveMetaStoreCache.java | 2 + .../doris/external/hive/util/HiveUtil.java | 3 +- .../doris/planner/external/FileScanNode.java | 17 +- .../doris/planner/external/HiveScanNode.java | 11 ++ .../doris/planner/external/HiveSplit.java | 5 - gensrc/thrift/PlanNodes.thrift | 4 + .../hive/test_compress_type.out | 47 +++++ .../hive/test_compress_type.groovy | 61 ++++++ 18 files changed, 464 insertions(+), 56 deletions(-) create mode 100644 regression-test/data/external_table_p2/hive/test_compress_type.out create mode 100644 regression-test/suites/external_table_p2/hive/test_compress_type.groovy diff --git a/be/src/exec/decompressor.cpp b/be/src/exec/decompressor.cpp index af69a896a2b8e4..964654132a3a29 100644 --- a/be/src/exec/decompressor.cpp +++ b/be/src/exec/decompressor.cpp @@ -42,6 +42,12 @@ Status Decompressor::create_decompressor(CompressType type, Decompressor** decom case CompressType::LZ4FRAME: *decompressor = new Lz4FrameDecompressor(); break; + case CompressType::LZ4BLOCK: + *decompressor = new Lz4BlockDecompressor(); + break; + case CompressType::SNAPPYBLOCK: + *decompressor = new SnappyBlockDecompressor(); + break; #ifdef DORIS_WITH_LZO case CompressType::LZOP: *decompressor = new LzopDecompressor(); @@ -59,6 +65,10 @@ Status Decompressor::create_decompressor(CompressType type, Decompressor** decom return st; } +uint32_t Decompressor::_read_int32(uint8_t* buf) { + return (buf[0] << 24) | (buf[1] << 16) | (buf[2] << 8) | buf[3]; +} + std::string Decompressor::debug_info() { return "Decompressor"; } @@ -239,7 +249,7 @@ Status Lz4FrameDecompressor::decompress(uint8_t* input, size_t input_len, size_t size_t* decompressed_len, bool* stream_end, size_t* more_input_bytes, size_t* more_output_bytes) { uint8_t* src = input; - size_t src_size = input_len; + size_t remaining_input_size = input_len; size_t ret = 1; *input_bytes_read = 0; @@ -257,7 +267,7 @@ Status Lz4FrameDecompressor::decompress(uint8_t* input, size_t input_len, size_t } LZ4F_frameInfo_t info; - ret = LZ4F_getFrameInfo(_dctx, &info, (void*)src, &src_size); + ret = LZ4F_getFrameInfo(_dctx, &info, (void*)src, &remaining_input_size); if (LZ4F_isError(ret)) { return Status::InternalError("LZ4F_getFrameInfo error: {}", std::string(LZ4F_getErrorName(ret))); @@ -270,17 +280,17 @@ Status Lz4FrameDecompressor::decompress(uint8_t* input, size_t input_len, size_t std::string(LZ4F_getErrorName(ret))); } - *input_bytes_read = src_size; + *input_bytes_read = remaining_input_size; - src += src_size; - src_size = input_len - src_size; + src += remaining_input_size; + remaining_input_size = input_len - remaining_input_size; LOG(INFO) << "lz4 block size: " << _expect_dec_buf_size; } // decompress size_t output_len = output_max_len; - ret = LZ4F_decompress(_dctx, (void*)output, &output_len, (void*)src, &src_size, + ret = LZ4F_decompress(_dctx, (void*)output, &output_len, (void*)src, &remaining_input_size, /* LZ4F_decompressOptions_t */ nullptr); if (LZ4F_isError(ret)) { return Status::InternalError("Decompression error: {}", @@ -288,7 +298,7 @@ Status Lz4FrameDecompressor::decompress(uint8_t* input, size_t input_len, size_t } // update - *input_bytes_read += src_size; + *input_bytes_read += remaining_input_size; *decompressed_len = output_len; if (ret == 0) { *stream_end = true; @@ -324,4 +334,165 @@ size_t Lz4FrameDecompressor::get_block_size(const LZ4F_frameInfo_t* info) { } } +/// Lz4BlockDecompressor +Status Lz4BlockDecompressor::init() { + return Status::OK(); +} + +Status Lz4BlockDecompressor::decompress(uint8_t* input, size_t input_len, size_t* input_bytes_read, + uint8_t* output, size_t output_max_len, + size_t* decompressed_len, bool* stream_end, + size_t* more_input_bytes, size_t* more_output_bytes) { + uint8_t* src = input; + size_t remaining_input_size = input_len; + int64_t uncompressed_total_len = 0; + *input_bytes_read = 0; + + // The hadoop lz4 codec is as: + // <4 byte big endian uncompressed size> + // <4 byte big endian compressed size> + // + // .... + // <4 byte big endian uncompressed size> + // <4 byte big endian compressed size> + // + // + // See: + // https://github.com/apache/hadoop/blob/trunk/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-nativetask/src/main/native/src/codec/Lz4Codec.cc + while (remaining_input_size > 0) { + // Read uncompressed size + uint32_t uncompressed_block_len = Decompressor::_read_int32(src); + int64_t remaining_output_size = output_max_len - uncompressed_total_len; + if (remaining_output_size < uncompressed_block_len) { + // Need more output buffer + *more_output_bytes = uncompressed_block_len - remaining_output_size; + break; + } + + // Read compressed size + size_t tmp_src_size = remaining_input_size - sizeof(uint32_t); + size_t compressed_len = Decompressor::_read_int32(src + sizeof(uint32_t)); + if (compressed_len == 0 || compressed_len > tmp_src_size) { + // Need more input data + *more_input_bytes = compressed_len - tmp_src_size; + break; + } + + src += 2 * sizeof(uint32_t); + remaining_input_size -= 2 * sizeof(uint32_t); + + // Decompress + int uncompressed_len = LZ4_decompress_safe(reinterpret_cast(src), + reinterpret_cast(output), compressed_len, + remaining_output_size); + if (uncompressed_len < 0 || uncompressed_len != uncompressed_block_len) { + return Status::InternalError( + "lz4 block decompress failed. uncompressed_len: {}, expected: {}", + uncompressed_len, uncompressed_block_len); + } + + output += uncompressed_len; + src += compressed_len; + remaining_input_size -= compressed_len; + uncompressed_total_len += uncompressed_len; + } + + *input_bytes_read += (input_len - remaining_input_size); + *decompressed_len = uncompressed_total_len; + // If no more input and output need, means this is the end of a compressed block + *stream_end = (*more_input_bytes == 0 && *more_output_bytes == 0); + + return Status::OK(); +} + +std::string Lz4BlockDecompressor::debug_info() { + std::stringstream ss; + ss << "Lz4BlockDecompressor."; + return ss.str(); +} + +/// SnappyBlockDecompressor +Status SnappyBlockDecompressor::init() { + return Status::OK(); +} + +Status SnappyBlockDecompressor::decompress(uint8_t* input, size_t input_len, + size_t* input_bytes_read, uint8_t* output, + size_t output_max_len, size_t* decompressed_len, + bool* stream_end, size_t* more_input_bytes, + size_t* more_output_bytes) { + uint8_t* src = input; + size_t remaining_input_size = input_len; + int64_t uncompressed_total_len = 0; + *input_bytes_read = 0; + + // The hadoop snappy codec is as: + // <4 byte big endian uncompressed size> + // <4 byte big endian compressed size> + // + // .... + // <4 byte big endian uncompressed size> + // <4 byte big endian compressed size> + // + // + // See: + // https://github.com/apache/hadoop/blob/trunk/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-nativetask/src/main/native/src/codec/SnappyCodec.cc + while (remaining_input_size > 0) { + // Read uncompressed size + uint32_t uncompressed_block_len = Decompressor::_read_int32(src); + int64_t remaining_output_size = output_max_len - uncompressed_total_len; + if (remaining_output_size < uncompressed_block_len) { + // Need more output buffer + *more_output_bytes = uncompressed_block_len - remaining_output_size; + break; + } + + // Read compressed size + size_t tmp_src_size = remaining_input_size - sizeof(uint32_t); + size_t compressed_len = _read_int32(src + sizeof(uint32_t)); + if (compressed_len == 0 || compressed_len > tmp_src_size) { + // Need more input data + *more_input_bytes = compressed_len - tmp_src_size; + break; + } + + src += 2 * sizeof(uint32_t); + remaining_input_size -= 2 * sizeof(uint32_t); + + // ATTN: the uncompressed len from GetUncompressedLength() is same as + // uncompressed_block_len, so I think it is unnecessary to get it again. + // Get uncompressed len from snappy + // size_t uncompressed_len; + // if (!snappy::GetUncompressedLength(reinterpret_cast(src), + // compressed_len, &uncompressed_len)) { + // return Status::InternalError("snappy block decompress failed to get uncompressed len"); + // } + + // Decompress + if (!snappy::RawUncompress(reinterpret_cast(src), compressed_len, + reinterpret_cast(output))) { + return Status::InternalError("snappy block decompress failed. uncompressed_len: {}", + uncompressed_block_len); + } + + output += uncompressed_block_len; + src += compressed_len; + remaining_input_size -= compressed_len; + uncompressed_total_len += uncompressed_block_len; + } + + *input_bytes_read += (input_len - remaining_input_size); + *decompressed_len = uncompressed_total_len; + // If no more input and output need, means this is the end of a compressed block + *stream_end = (*more_input_bytes == 0 && *more_output_bytes == 0); + + return Status::OK(); +} + +std::string SnappyBlockDecompressor::debug_info() { + std::stringstream ss; + ss << "SnappyBlockDecompressor."; + return ss.str(); +} + } // namespace doris diff --git a/be/src/exec/decompressor.h b/be/src/exec/decompressor.h index af37335f1f7145..2b07e71139fb83 100644 --- a/be/src/exec/decompressor.h +++ b/be/src/exec/decompressor.h @@ -18,7 +18,10 @@ #pragma once #include +#include #include +#include +#include #include #include #include @@ -34,7 +37,7 @@ namespace doris { -enum CompressType { UNCOMPRESSED, GZIP, DEFLATE, BZIP2, LZ4FRAME, LZOP }; +enum CompressType { UNCOMPRESSED, GZIP, DEFLATE, BZIP2, LZ4FRAME, LZOP, LZ4BLOCK, SNAPPYBLOCK }; class Decompressor { public: @@ -68,6 +71,8 @@ class Decompressor { protected: virtual Status init() = 0; + static uint32_t _read_int32(uint8_t* buf); + Decompressor(CompressType ctype) : _ctype(ctype) {} CompressType _ctype; @@ -140,6 +145,38 @@ class Lz4FrameDecompressor : public Decompressor { const static unsigned DORIS_LZ4F_VERSION; }; +class Lz4BlockDecompressor : public Decompressor { +public: + ~Lz4BlockDecompressor() override {} + + Status decompress(uint8_t* input, size_t input_len, size_t* input_bytes_read, uint8_t* output, + size_t output_max_len, size_t* decompressed_len, bool* stream_end, + size_t* more_input_bytes, size_t* more_output_bytes) override; + + std::string debug_info() override; + +private: + friend class Decompressor; + Lz4BlockDecompressor() : Decompressor(CompressType::LZ4FRAME) {} + Status init() override; +}; + +class SnappyBlockDecompressor : public Decompressor { +public: + ~SnappyBlockDecompressor() override {} + + Status decompress(uint8_t* input, size_t input_len, size_t* input_bytes_read, uint8_t* output, + size_t output_max_len, size_t* decompressed_len, bool* stream_end, + size_t* more_input_bytes, size_t* more_output_bytes) override; + + std::string debug_info() override; + +private: + friend class Decompressor; + SnappyBlockDecompressor() : Decompressor(CompressType::SNAPPYBLOCK) {} + Status init() override; +}; + #ifdef DORIS_WITH_LZO class LzopDecompressor : public Decompressor { public: diff --git a/be/src/service/internal_service.cpp b/be/src/service/internal_service.cpp index 2e35ca99efe74a..69ce85bb50bd01 100644 --- a/be/src/service/internal_service.cpp +++ b/be/src/service/internal_service.cpp @@ -569,6 +569,8 @@ void PInternalServiceImpl::fetch_table_schema(google::protobuf::RpcController* c case TFileFormatType::FORMAT_CSV_GZ: case TFileFormatType::FORMAT_CSV_BZ2: case TFileFormatType::FORMAT_CSV_LZ4FRAME: + case TFileFormatType::FORMAT_CSV_LZ4BLOCK: + case TFileFormatType::FORMAT_CSV_SNAPPYBLOCK: case TFileFormatType::FORMAT_CSV_LZOP: case TFileFormatType::FORMAT_CSV_DEFLATE: { // file_slots is no use diff --git a/be/src/util/load_util.cpp b/be/src/util/load_util.cpp index 8736561db4f3e1..1277132378b85a 100644 --- a/be/src/util/load_util.cpp +++ b/be/src/util/load_util.cpp @@ -46,9 +46,15 @@ void LoadUtil::parse_format(const std::string& format_str, const std::string& co } else if (iequal(compress_type_str, "LZ4")) { *format_type = TFileFormatType::FORMAT_CSV_LZ4FRAME; *compress_type = TFileCompressType::LZ4FRAME; + } else if (iequal(compress_type_str, "LZ4_BLOCK")) { + *format_type = TFileFormatType::FORMAT_CSV_LZ4BLOCK; + *compress_type = TFileCompressType::LZ4BLOCK; } else if (iequal(compress_type_str, "LZOP")) { *format_type = TFileFormatType::FORMAT_CSV_LZOP; *compress_type = TFileCompressType::LZO; + } else if (iequal(compress_type_str, "SNAPPY_BLOCK")) { + *format_type = TFileFormatType::FORMAT_CSV_SNAPPYBLOCK; + *compress_type = TFileCompressType::SNAPPYBLOCK; } else if (iequal(compress_type_str, "DEFLATE")) { *format_type = TFileFormatType::FORMAT_CSV_DEFLATE; *compress_type = TFileCompressType::DEFLATE; @@ -72,6 +78,7 @@ bool LoadUtil::is_format_support_streaming(TFileFormatType::type format) { case TFileFormatType::FORMAT_CSV_DEFLATE: case TFileFormatType::FORMAT_CSV_GZ: case TFileFormatType::FORMAT_CSV_LZ4FRAME: + case TFileFormatType::FORMAT_CSV_LZ4BLOCK: case TFileFormatType::FORMAT_CSV_LZO: case TFileFormatType::FORMAT_CSV_LZOP: case TFileFormatType::FORMAT_JSON: @@ -81,4 +88,4 @@ bool LoadUtil::is_format_support_streaming(TFileFormatType::type format) { } return false; } -} // namespace doris \ No newline at end of file +} // namespace doris diff --git a/be/src/vec/exec/format/csv/csv_reader.cpp b/be/src/vec/exec/format/csv/csv_reader.cpp index 65b2319b755e30..10b5ced96826f5 100644 --- a/be/src/vec/exec/format/csv/csv_reader.cpp +++ b/be/src/vec/exec/format/csv/csv_reader.cpp @@ -340,8 +340,12 @@ Status CsvReader::init_reader(bool is_load) { [[fallthrough]]; case TFileFormatType::FORMAT_CSV_LZ4FRAME: [[fallthrough]]; + case TFileFormatType::FORMAT_CSV_LZ4BLOCK: + [[fallthrough]]; case TFileFormatType::FORMAT_CSV_LZOP: [[fallthrough]]; + case TFileFormatType::FORMAT_CSV_SNAPPYBLOCK: + [[fallthrough]]; case TFileFormatType::FORMAT_CSV_DEFLATE: _line_reader = NewPlainTextLineReader::create_unique(_profile, _file_reader, _decompressor.get(), @@ -397,21 +401,51 @@ Status CsvReader::get_next_block(Block* block, size_t* read_rows, bool* eof) { const int batch_size = std::max(_state->batch_size(), (int)_MIN_BATCH_SIZE); size_t rows = 0; - auto columns = block->mutate_columns(); - while (rows < batch_size && !_line_reader_eof) { - const uint8_t* ptr = nullptr; - size_t size = 0; - RETURN_IF_ERROR(_line_reader->read_line(&ptr, &size, &_line_reader_eof, _io_ctx)); - if (_skip_lines > 0) { - _skip_lines--; - continue; + + bool success = false; + if (_push_down_agg_type == TPushAggOp::type::COUNT) { + while (rows < batch_size && !_line_reader_eof) { + const uint8_t* ptr = nullptr; + size_t size = 0; + RETURN_IF_ERROR(_line_reader->read_line(&ptr, &size, &_line_reader_eof, _io_ctx)); + if (_skip_lines > 0) { + _skip_lines--; + continue; + } + if (size == 0) { + // Read empty row, just continue + continue; + } + + RETURN_IF_ERROR(_validate_line(Slice(ptr, size), &success)); + ++rows; } - if (size == 0) { - // Read empty row, just continue - continue; + + for (auto& col : block->mutate_columns()) { + col->resize(rows); } - RETURN_IF_ERROR(_fill_dest_columns(Slice(ptr, size), block, columns, &rows)); + } else { + auto columns = block->mutate_columns(); + while (rows < batch_size && !_line_reader_eof) { + const uint8_t* ptr = nullptr; + size_t size = 0; + RETURN_IF_ERROR(_line_reader->read_line(&ptr, &size, &_line_reader_eof, _io_ctx)); + if (_skip_lines > 0) { + _skip_lines--; + continue; + } + if (size == 0) { + // Read empty row, just continue + continue; + } + + RETURN_IF_ERROR(_validate_line(Slice(ptr, size), &success)); + if (!success) { + continue; + } + RETURN_IF_ERROR(_fill_dest_columns(Slice(ptr, size), block, columns, &rows)); + } } *eof = (rows == 0); @@ -473,9 +507,15 @@ Status CsvReader::_create_decompressor() { case TFileCompressType::LZ4FRAME: compress_type = CompressType::LZ4FRAME; break; + case TFileCompressType::LZ4BLOCK: + compress_type = CompressType::LZ4BLOCK; + break; case TFileCompressType::DEFLATE: compress_type = CompressType::DEFLATE; break; + case TFileCompressType::SNAPPYBLOCK: + compress_type = CompressType::SNAPPYBLOCK; + break; default: return Status::InternalError("unknown compress type: {}", _file_compress_type); } @@ -495,12 +535,18 @@ Status CsvReader::_create_decompressor() { case TFileFormatType::FORMAT_CSV_LZ4FRAME: compress_type = CompressType::LZ4FRAME; break; + case TFileFormatType::FORMAT_CSV_LZ4BLOCK: + compress_type = CompressType::LZ4BLOCK; + break; case TFileFormatType::FORMAT_CSV_LZOP: compress_type = CompressType::LZOP; break; case TFileFormatType::FORMAT_CSV_DEFLATE: compress_type = CompressType::DEFLATE; break; + case TFileFormatType::FORMAT_CSV_SNAPPYBLOCK: + compress_type = CompressType::SNAPPYBLOCK; + break; default: return Status::InternalError("unknown format type: {}", _file_format_type); } @@ -554,7 +600,7 @@ Status CsvReader::_fill_dest_columns(const Slice& line, Block* block, return Status::OK(); } -Status CsvReader::_line_split_to_values(const Slice& line, bool* success) { +Status CsvReader::_validate_line(const Slice& line, bool* success) { if (!_is_proto_format && !validate_utf8(line.data, line.size)) { if (!_is_load) { return Status::InternalError("Only support csv data in utf8 codec"); @@ -572,7 +618,11 @@ Status CsvReader::_line_split_to_values(const Slice& line, bool* success) { return Status::OK(); } } + *success = true; + return Status::OK(); +} +Status CsvReader::_line_split_to_values(const Slice& line, bool* success) { _split_line(line); if (_is_load) { diff --git a/be/src/vec/exec/format/csv/csv_reader.h b/be/src/vec/exec/format/csv/csv_reader.h index 5721bbd9291a38..2659703f8dce6e 100644 --- a/be/src/vec/exec/format/csv/csv_reader.h +++ b/be/src/vec/exec/format/csv/csv_reader.h @@ -221,6 +221,12 @@ class CsvReader : public GenericReader { // TODO(ftw): parse type Status _parse_col_types(size_t col_nums, std::vector* col_types); + // check the utf8 encoding of a line. + // return error status to stop processing. + // If return Status::OK but "success" is false, which means this is load request + // and the line is skipped as unqualified row, and the process should continue. + Status _validate_line(const Slice& line, bool* success); + RuntimeState* _state; RuntimeProfile* _profile; ScannerCounter* _counter; diff --git a/be/src/vec/exec/format/file_reader/new_plain_text_line_reader.cpp b/be/src/vec/exec/format/file_reader/new_plain_text_line_reader.cpp index b59bbef1f1c085..c27aba354f6e1f 100644 --- a/be/src/vec/exec/format/file_reader/new_plain_text_line_reader.cpp +++ b/be/src/vec/exec/format/file_reader/new_plain_text_line_reader.cpp @@ -201,7 +201,6 @@ NewPlainTextLineReader::NewPlainTextLineReader(RuntimeProfile* profile, _output_buf_limit(0), _file_eof(false), _eof(false), - _stream_end(true), _more_input_bytes(0), _more_output_bytes(0), _current_offset(current_offset), @@ -324,6 +323,7 @@ Status NewPlainTextLineReader::read_line(const uint8_t** ptr, size_t* size, bool _line_reader_ctx->refresh(); int found_line_delimiter = 0; size_t offset = 0; + bool stream_end = true; while (!done()) { // find line delimiter in current decompressed data uint8_t* cur_ptr = _output_buf + _output_buf_pos; @@ -379,7 +379,7 @@ Status NewPlainTextLineReader::read_line(const uint8_t** ptr, size_t* size, bool COUNTER_UPDATE(_bytes_read_counter, read_len); } if (_file_eof || read_len == 0) { - if (!_stream_end) { + if (!stream_end) { return Status::InternalError( "Compressed file has been truncated, which is not allowed"); } else { @@ -392,7 +392,7 @@ Status NewPlainTextLineReader::read_line(const uint8_t** ptr, size_t* size, bool if (_decompressor == nullptr) { _output_buf_limit += read_len; - _stream_end = true; + stream_end = true; } else { // only update input limit. // input pos is set at MARK step @@ -418,10 +418,10 @@ Status NewPlainTextLineReader::read_line(const uint8_t** ptr, size_t* size, bool _input_buf_limit - _input_buf_pos, /* input_len */ &input_read_bytes, _output_buf + _output_buf_limit, /* output */ _output_buf_size - _output_buf_limit, /* output_max_len */ - &decompressed_len, &_stream_end, &_more_input_bytes, &_more_output_bytes)); + &decompressed_len, &stream_end, &_more_input_bytes, &_more_output_bytes)); // LOG(INFO) << "after decompress:" - // << " stream_end: " << _stream_end + // << " stream_end: " << stream_end // << " input_read_bytes: " << input_read_bytes // << " decompressed_len: " << decompressed_len // << " more_input_bytes: " << _more_input_bytes diff --git a/be/src/vec/exec/format/file_reader/new_plain_text_line_reader.h b/be/src/vec/exec/format/file_reader/new_plain_text_line_reader.h index 7326812b92bcb3..9947259300dae4 100644 --- a/be/src/vec/exec/format/file_reader/new_plain_text_line_reader.h +++ b/be/src/vec/exec/format/file_reader/new_plain_text_line_reader.h @@ -235,7 +235,6 @@ class NewPlainTextLineReader : public LineReader { bool _file_eof; bool _eof; - bool _stream_end; size_t _more_input_bytes; size_t _more_output_bytes; diff --git a/be/src/vec/exec/scan/vfile_scanner.cpp b/be/src/vec/exec/scan/vfile_scanner.cpp index bc439b73d256e2..9206333d651302 100644 --- a/be/src/vec/exec/scan/vfile_scanner.cpp +++ b/be/src/vec/exec/scan/vfile_scanner.cpp @@ -261,22 +261,26 @@ Status VFileScanner::_get_block_impl(RuntimeState* state, Block* block, bool* eo // use read_rows instead of _src_block_ptr->rows(), because the first column of _src_block_ptr // may not be filled after calling `get_next_block()`, so _src_block_ptr->rows() may return wrong result. if (read_rows > 0) { - // Convert the src block columns type to string in-place. - RETURN_IF_ERROR(_cast_to_input_block(block)); - // FileReader can fill partition and missing columns itself - if (!_cur_reader->fill_all_columns()) { - // Fill rows in src block with partition columns from path. (e.g. Hive partition columns) - RETURN_IF_ERROR(_fill_columns_from_path(read_rows)); - // Fill columns not exist in file with null or default value - RETURN_IF_ERROR(_fill_missing_columns(read_rows)); + // If the push_down_agg_type is COUNT, no need to do the rest, + // because we only save a number in block. + if (_parent->get_push_down_agg_type() != TPushAggOp::type::COUNT) { + // Convert the src block columns type to string in-place. + RETURN_IF_ERROR(_cast_to_input_block(block)); + // FileReader can fill partition and missing columns itself + if (!_cur_reader->fill_all_columns()) { + // Fill rows in src block with partition columns from path. (e.g. Hive partition columns) + RETURN_IF_ERROR(_fill_columns_from_path(read_rows)); + // Fill columns not exist in file with null or default value + RETURN_IF_ERROR(_fill_missing_columns(read_rows)); + } + // Apply _pre_conjunct_ctxs to filter src block. + RETURN_IF_ERROR(_pre_filter_src_block()); + // Convert src block to output block (dest block), string to dest data type and apply filters. + RETURN_IF_ERROR(_convert_to_output_block(block)); + // Truncate char columns or varchar columns if size is smaller than file columns + // or not found in the file column schema. + RETURN_IF_ERROR(_truncate_char_or_varchar_columns(block)); } - // Apply _pre_conjunct_ctxs to filter src block. - RETURN_IF_ERROR(_pre_filter_src_block()); - // Convert src block to output block (dest block), string to dest data type and apply filters. - RETURN_IF_ERROR(_convert_to_output_block(block)); - // Truncate char columns or varchar columns if size is smaller than file columns - // or not found in the file column schema. - RETURN_IF_ERROR(_truncate_char_or_varchar_columns(block)); break; } } while (true); @@ -758,8 +762,10 @@ Status VFileScanner::_get_next_reader() { case TFileFormatType::FORMAT_CSV_GZ: case TFileFormatType::FORMAT_CSV_BZ2: case TFileFormatType::FORMAT_CSV_LZ4FRAME: + case TFileFormatType::FORMAT_CSV_LZ4BLOCK: case TFileFormatType::FORMAT_CSV_LZOP: case TFileFormatType::FORMAT_CSV_DEFLATE: + case TFileFormatType::FORMAT_CSV_SNAPPYBLOCK: case TFileFormatType::FORMAT_PROTO: { _cur_reader = CsvReader::create_unique(_state, _profile, &_counter, *_params, range, _file_slot_descs, _io_ctx.get()); diff --git a/fe/fe-core/src/main/java/org/apache/doris/common/util/Util.java b/fe/fe-core/src/main/java/org/apache/doris/common/util/Util.java index 8d9c2c6b0c07aa..0dff4b6caaba08 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/common/util/Util.java +++ b/fe/fe-core/src/main/java/org/apache/doris/common/util/Util.java @@ -550,6 +550,8 @@ public static TFileFormatType getFileFormatType(String path) { return TFileFormatType.FORMAT_CSV_LZO; } else if (lowerCasePath.endsWith(".deflate")) { return TFileFormatType.FORMAT_CSV_DEFLATE; + } else if (lowerCasePath.endsWith(".snappy")) { + return TFileFormatType.FORMAT_CSV_SNAPPYBLOCK; } else { return TFileFormatType.FORMAT_CSV_PLAIN; } @@ -575,6 +577,8 @@ public static TFileCompressType inferFileCompressTypeByPath(String path) { return TFileCompressType.LZO; } else if (lowerCasePath.endsWith(".deflate")) { return TFileCompressType.DEFLATE; + } else if (lowerCasePath.endsWith(".snappy")) { + return TFileCompressType.SNAPPYBLOCK; } else { return TFileCompressType.PLAIN; } @@ -599,6 +603,8 @@ public static boolean isCsvFormat(TFileFormatType fileFormatType) { || fileFormatType == TFileFormatType.FORMAT_CSV_DEFLATE || fileFormatType == TFileFormatType.FORMAT_CSV_GZ || fileFormatType == TFileFormatType.FORMAT_CSV_LZ4FRAME + || fileFormatType == TFileFormatType.FORMAT_CSV_LZ4BLOCK + || fileFormatType == TFileFormatType.FORMAT_CSV_SNAPPYBLOCK || fileFormatType == TFileFormatType.FORMAT_CSV_LZO || fileFormatType == TFileFormatType.FORMAT_CSV_LZOP || fileFormatType == TFileFormatType.FORMAT_CSV_PLAIN; diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveMetaStoreCache.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveMetaStoreCache.java index e7e621948e24ab..45b84dfb174d41 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveMetaStoreCache.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveMetaStoreCache.java @@ -1000,6 +1000,7 @@ public static class FileCacheValue { // File Cache for self splitter. private final List files = Lists.newArrayList(); // File split cache for old splitter. This is a temp variable. + @Deprecated private final List splits = Lists.newArrayList(); private boolean isSplittable; // The values of partitions. @@ -1021,6 +1022,7 @@ public void addFile(RemoteFile file) { } } + @Deprecated public void addSplit(FileSplit split) { if (isFileVisible(split.getPath())) { splits.add(split); diff --git a/fe/fe-core/src/main/java/org/apache/doris/external/hive/util/HiveUtil.java b/fe/fe-core/src/main/java/org/apache/doris/external/hive/util/HiveUtil.java index 704d0fadf84fee..e1baea3652cd89 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/external/hive/util/HiveUtil.java +++ b/fe/fe-core/src/main/java/org/apache/doris/external/hive/util/HiveUtil.java @@ -190,7 +190,8 @@ public static boolean isSplittable(InputFormat inputFormat, Path path, Job return true; } - // use reflection to get isSplittable method on FileInputFormat + // use reflection to get isSplitable method on FileInputFormat + // ATTN: the method name is actually "isSplitable", but the right spell is "isSplittable" Method method = null; for (Class clazz = inputFormat.getClass(); clazz != null; clazz = clazz.getSuperclass()) { try { diff --git a/fe/fe-core/src/main/java/org/apache/doris/planner/external/FileScanNode.java b/fe/fe-core/src/main/java/org/apache/doris/planner/external/FileScanNode.java index 662aa939ee4b16..8e2c8ed3a4521e 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/planner/external/FileScanNode.java +++ b/fe/fe-core/src/main/java/org/apache/doris/planner/external/FileScanNode.java @@ -25,6 +25,7 @@ import org.apache.doris.catalog.Column; import org.apache.doris.catalog.TableIf; import org.apache.doris.common.UserException; +import org.apache.doris.common.util.Util; import org.apache.doris.planner.PlanNodeId; import org.apache.doris.planner.external.FileSplit.FileSplitCreator; import org.apache.doris.qe.ConnectContext; @@ -32,6 +33,7 @@ import org.apache.doris.statistics.StatisticalType; import org.apache.doris.thrift.TExplainLevel; import org.apache.doris.thrift.TExpr; +import org.apache.doris.thrift.TFileCompressType; import org.apache.doris.thrift.TFileRangeDesc; import org.apache.doris.thrift.TFileScanNode; import org.apache.doris.thrift.TFileScanRangeParams; @@ -221,19 +223,20 @@ protected List splitFile(Path path, long blockSize, BlockLocation[] block if (blockLocations == null) { blockLocations = new BlockLocation[0]; } - long splitSize = ConnectContext.get().getSessionVariable().getFileSplitSize(); - if (splitSize <= 0) { - splitSize = blockSize; - } - // Min split size is DEFAULT_SPLIT_SIZE(128MB). - splitSize = Math.max(splitSize, DEFAULT_SPLIT_SIZE); List result = Lists.newArrayList(); - if (!splittable) { + TFileCompressType compressType = Util.inferFileCompressTypeByPath(path.toString()); + if (!splittable || compressType != TFileCompressType.PLAIN) { LOG.debug("Path {} is not splittable.", path); String[] hosts = blockLocations.length == 0 ? null : blockLocations[0].getHosts(); result.add(splitCreator.create(path, 0, length, length, modificationTime, hosts, partitionValues)); return result; } + long splitSize = ConnectContext.get().getSessionVariable().getFileSplitSize(); + if (splitSize <= 0) { + splitSize = blockSize; + } + // Min split size is DEFAULT_SPLIT_SIZE(128MB). + splitSize = Math.max(splitSize, DEFAULT_SPLIT_SIZE); long bytesRemaining; for (bytesRemaining = length; (double) bytesRemaining / (double) splitSize > 1.1D; bytesRemaining -= splitSize) { diff --git a/fe/fe-core/src/main/java/org/apache/doris/planner/external/HiveScanNode.java b/fe/fe-core/src/main/java/org/apache/doris/planner/external/HiveScanNode.java index 7178a585ff1739..5d0033c90ea806 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/planner/external/HiveScanNode.java +++ b/fe/fe-core/src/main/java/org/apache/doris/planner/external/HiveScanNode.java @@ -51,6 +51,7 @@ import org.apache.doris.spi.Split; import org.apache.doris.statistics.StatisticalType; import org.apache.doris.thrift.TFileAttributes; +import org.apache.doris.thrift.TFileCompressType; import org.apache.doris.thrift.TFileFormatType; import org.apache.doris.thrift.TFileTextScanRangeParams; import org.apache.doris.thrift.TFileType; @@ -386,4 +387,14 @@ public boolean pushDownAggNoGrouping(FunctionCallExpr aggExpr) { public boolean pushDownAggNoGroupingCheckCol(FunctionCallExpr aggExpr, Column col) { return !col.isAllowNull(); } + + @Override + protected TFileCompressType getFileCompressType(FileSplit fileSplit) throws UserException { + TFileCompressType compressType = super.getFileCompressType(fileSplit); + // hadoop use lz4 blocked codec + if (compressType == TFileCompressType.LZ4FRAME) { + compressType = TFileCompressType.LZ4BLOCK; + } + return compressType; + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/planner/external/HiveSplit.java b/fe/fe-core/src/main/java/org/apache/doris/planner/external/HiveSplit.java index 0f230c85f43943..0bc8442760710e 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/planner/external/HiveSplit.java +++ b/fe/fe-core/src/main/java/org/apache/doris/planner/external/HiveSplit.java @@ -32,11 +32,6 @@ public HiveSplit(Path path, long start, long length, long fileLength, this.acidInfo = acidInfo; } - public HiveSplit(Path path, long start, long length, long fileLength, String[] hosts, AcidInfo acidInfo) { - super(path, start, length, fileLength, hosts, null); - this.acidInfo = acidInfo; - } - @Override public Object getInfo() { return acidInfo; diff --git a/gensrc/thrift/PlanNodes.thrift b/gensrc/thrift/PlanNodes.thrift index 72cd772cfc9413..6be2a508f6027b 100644 --- a/gensrc/thrift/PlanNodes.thrift +++ b/gensrc/thrift/PlanNodes.thrift @@ -116,6 +116,8 @@ enum TFileFormatType { FORMAT_PROTO, FORMAT_JNI, FORMAT_AVRO, + FORMAT_CSV_LZ4BLOCK, + FORMAT_CSV_SNAPPYBLOCK, } // In previous versions, the data compression format and file format were stored together, as TFileFormatType, @@ -132,6 +134,8 @@ enum TFileCompressType { LZ4FRAME, DEFLATE, LZOP, + LZ4BLOCK, + SNAPPYBLOCK } struct THdfsConf { diff --git a/regression-test/data/external_table_p2/hive/test_compress_type.out b/regression-test/data/external_table_p2/hive/test_compress_type.out new file mode 100644 index 00000000000000..a95bf1f0dd3c02 --- /dev/null +++ b/regression-test/data/external_table_p2/hive/test_compress_type.out @@ -0,0 +1,47 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !q21 -- +600005 + +-- !q22 -- +1510010 + +-- !q23 -- +4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,25,1] 3137666015 cc184643699dccab8d5d4af796c47449 -1 -1 -1 nD Tp 0 -1 0 0 81 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 07d21f 0 [] 0 15284527577228392792 14270691585016129648 0 0 [] [] [] [] [] \N c1889e2b9ad1e219ed04c0e9624b5139 1404 0 2023-08-21 +4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,25,1] 3137666015 cc184643699dccab8d5d4af796c47449 -1 -1 -1 nD Tp 0 -1 0 0 81 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 07d21f 0 [] 0 15284527577228392792 14270691585016129648 0 0 [] [] [] [] [] \N c1889e2b9ad1e219ed04c0e9624b5139 1404 0 bzip2 +4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,25,1] 3137666015 cc184643699dccab8d5d4af796c47449 -1 -1 -1 nD Tp 0 -1 0 0 81 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 07d21f 0 [] 0 15284527577228392792 14270691585016129648 0 0 [] [] [] [] [] \N c1889e2b9ad1e219ed04c0e9624b5139 1404 0 bzip2 +4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,25,1] 3137666015 cc184643699dccab8d5d4af796c47449 -1 -1 -1 nD Tp 0 -1 0 0 81 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 07d21f 0 [] 0 15284527577228392792 14270691585016129648 0 0 [] [] [] [] [] \N c1889e2b9ad1e219ed04c0e9624b5139 1404 0 deflate +4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,25,1] 3137666015 cc184643699dccab8d5d4af796c47449 -1 -1 -1 nD Tp 0 -1 0 0 81 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 07d21f 0 [] 0 15284527577228392792 14270691585016129648 0 0 [] [] [] [] [] \N c1889e2b9ad1e219ed04c0e9624b5139 1404 0 deflate +4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,25,1] 3137666015 cc184643699dccab8d5d4af796c47449 -1 -1 -1 nD Tp 0 -1 0 0 81 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 07d21f 0 [] 0 15284527577228392792 14270691585016129648 0 0 [] [] [] [] [] \N c1889e2b9ad1e219ed04c0e9624b5139 1404 0 gzip +4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,25,1] 3137666015 cc184643699dccab8d5d4af796c47449 -1 -1 -1 nD Tp 0 -1 0 0 81 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 07d21f 0 [] 0 15284527577228392792 14270691585016129648 0 0 [] [] [] [] [] \N c1889e2b9ad1e219ed04c0e9624b5139 1404 0 gzip +4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,25,1] 3137666015 cc184643699dccab8d5d4af796c47449 -1 -1 -1 nD Tp 0 -1 0 0 81 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 07d21f 0 [] 0 15284527577228392792 14270691585016129648 0 0 [] [] [] [] [] \N c1889e2b9ad1e219ed04c0e9624b5139 1404 0 lz4 +4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,25,1] 3137666015 cc184643699dccab8d5d4af796c47449 -1 -1 -1 nD Tp 0 -1 0 0 81 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 07d21f 0 [] 0 15284527577228392792 14270691585016129648 0 0 [] [] [] [] [] \N c1889e2b9ad1e219ed04c0e9624b5139 1404 0 mix +4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,25,1] 3137666015 cc184643699dccab8d5d4af796c47449 -1 -1 -1 nD Tp 0 -1 0 0 81 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 07d21f 0 [] 0 15284527577228392792 14270691585016129648 0 0 [] [] [] [] [] \N c1889e2b9ad1e219ed04c0e9624b5139 1404 0 mix +4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,25,1] 3137666015 cc184643699dccab8d5d4af796c47449 -1 -1 -1 nD Tp 0 -1 0 0 81 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 07d21f 0 [] 0 15284527577228392792 14270691585016129648 0 0 [] [] [] [] [] \N c1889e2b9ad1e219ed04c0e9624b5139 1404 0 mix +4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,25,1] 3137666015 cc184643699dccab8d5d4af796c47449 -1 -1 -1 nD Tp 0 -1 0 0 81 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 07d21f 0 [] 0 15284527577228392792 14270691585016129648 0 0 [] [] [] [] [] \N c1889e2b9ad1e219ed04c0e9624b5139 1404 0 mix +4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,25,1] 3137666015 cc184643699dccab8d5d4af796c47449 -1 -1 -1 nD Tp 0 -1 0 0 81 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 07d21f 0 [] 0 15284527577228392792 14270691585016129648 0 0 [] [] [] [] [] \N c1889e2b9ad1e219ed04c0e9624b5139 1404 0 plain +4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,25,1] 3137666015 cc184643699dccab8d5d4af796c47449 -1 -1 -1 nD Tp 0 -1 0 0 81 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 07d21f 0 [] 0 15284527577228392792 14270691585016129648 0 0 [] [] [] [] [] \N c1889e2b9ad1e219ed04c0e9624b5139 1404 0 plain +4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,25,1] 3137666015 cc184643699dccab8d5d4af796c47449 -1 -1 -1 nD Tp 0 -1 0 0 81 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 07d21f 0 [] 0 15284527577228392792 14270691585016129648 0 0 [] [] [] [] [] \N c1889e2b9ad1e219ed04c0e9624b5139 1404 0 snappy + +-- !q31 -- +600005 + +-- !q32 -- +1510010 + +-- !q33 -- +4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,25,1] 3137666015 cc184643699dccab8d5d4af796c47449 -1 -1 -1 nD Tp 0 -1 0 0 81 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 07d21f 0 [] 0 15284527577228392792 14270691585016129648 0 0 [] [] [] [] [] \N c1889e2b9ad1e219ed04c0e9624b5139 1404 0 2023-08-21 +4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,25,1] 3137666015 cc184643699dccab8d5d4af796c47449 -1 -1 -1 nD Tp 0 -1 0 0 81 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 07d21f 0 [] 0 15284527577228392792 14270691585016129648 0 0 [] [] [] [] [] \N c1889e2b9ad1e219ed04c0e9624b5139 1404 0 bzip2 +4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,25,1] 3137666015 cc184643699dccab8d5d4af796c47449 -1 -1 -1 nD Tp 0 -1 0 0 81 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 07d21f 0 [] 0 15284527577228392792 14270691585016129648 0 0 [] [] [] [] [] \N c1889e2b9ad1e219ed04c0e9624b5139 1404 0 bzip2 +4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,25,1] 3137666015 cc184643699dccab8d5d4af796c47449 -1 -1 -1 nD Tp 0 -1 0 0 81 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 07d21f 0 [] 0 15284527577228392792 14270691585016129648 0 0 [] [] [] [] [] \N c1889e2b9ad1e219ed04c0e9624b5139 1404 0 deflate +4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,25,1] 3137666015 cc184643699dccab8d5d4af796c47449 -1 -1 -1 nD Tp 0 -1 0 0 81 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 07d21f 0 [] 0 15284527577228392792 14270691585016129648 0 0 [] [] [] [] [] \N c1889e2b9ad1e219ed04c0e9624b5139 1404 0 deflate +4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,25,1] 3137666015 cc184643699dccab8d5d4af796c47449 -1 -1 -1 nD Tp 0 -1 0 0 81 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 07d21f 0 [] 0 15284527577228392792 14270691585016129648 0 0 [] [] [] [] [] \N c1889e2b9ad1e219ed04c0e9624b5139 1404 0 gzip +4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,25,1] 3137666015 cc184643699dccab8d5d4af796c47449 -1 -1 -1 nD Tp 0 -1 0 0 81 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 07d21f 0 [] 0 15284527577228392792 14270691585016129648 0 0 [] [] [] [] [] \N c1889e2b9ad1e219ed04c0e9624b5139 1404 0 gzip +4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,25,1] 3137666015 cc184643699dccab8d5d4af796c47449 -1 -1 -1 nD Tp 0 -1 0 0 81 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 07d21f 0 [] 0 15284527577228392792 14270691585016129648 0 0 [] [] [] [] [] \N c1889e2b9ad1e219ed04c0e9624b5139 1404 0 lz4 +4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,25,1] 3137666015 cc184643699dccab8d5d4af796c47449 -1 -1 -1 nD Tp 0 -1 0 0 81 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 07d21f 0 [] 0 15284527577228392792 14270691585016129648 0 0 [] [] [] [] [] \N c1889e2b9ad1e219ed04c0e9624b5139 1404 0 mix +4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,25,1] 3137666015 cc184643699dccab8d5d4af796c47449 -1 -1 -1 nD Tp 0 -1 0 0 81 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 07d21f 0 [] 0 15284527577228392792 14270691585016129648 0 0 [] [] [] [] [] \N c1889e2b9ad1e219ed04c0e9624b5139 1404 0 mix +4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,25,1] 3137666015 cc184643699dccab8d5d4af796c47449 -1 -1 -1 nD Tp 0 -1 0 0 81 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 07d21f 0 [] 0 15284527577228392792 14270691585016129648 0 0 [] [] [] [] [] \N c1889e2b9ad1e219ed04c0e9624b5139 1404 0 mix +4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,25,1] 3137666015 cc184643699dccab8d5d4af796c47449 -1 -1 -1 nD Tp 0 -1 0 0 81 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 07d21f 0 [] 0 15284527577228392792 14270691585016129648 0 0 [] [] [] [] [] \N c1889e2b9ad1e219ed04c0e9624b5139 1404 0 mix +4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,25,1] 3137666015 cc184643699dccab8d5d4af796c47449 -1 -1 -1 nD Tp 0 -1 0 0 81 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 07d21f 0 [] 0 15284527577228392792 14270691585016129648 0 0 [] [] [] [] [] \N c1889e2b9ad1e219ed04c0e9624b5139 1404 0 plain +4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,25,1] 3137666015 cc184643699dccab8d5d4af796c47449 -1 -1 -1 nD Tp 0 -1 0 0 81 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 07d21f 0 [] 0 15284527577228392792 14270691585016129648 0 0 [] [] [] [] [] \N c1889e2b9ad1e219ed04c0e9624b5139 1404 0 plain +4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,25,1] 3137666015 cc184643699dccab8d5d4af796c47449 -1 -1 -1 nD Tp 0 -1 0 0 81 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 07d21f 0 [] 0 15284527577228392792 14270691585016129648 0 0 [] [] [] [] [] \N c1889e2b9ad1e219ed04c0e9624b5139 1404 0 snappy + diff --git a/regression-test/suites/external_table_p2/hive/test_compress_type.groovy b/regression-test/suites/external_table_p2/hive/test_compress_type.groovy new file mode 100644 index 00000000000000..d02ff3fbd0c47d --- /dev/null +++ b/regression-test/suites/external_table_p2/hive/test_compress_type.groovy @@ -0,0 +1,61 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("test_compress_type", "p2,external,hive,external_remote,external_remote_hive") { + String enabled = context.config.otherConfigs.get("enableExternalHiveTest") + if (enabled != null && enabled.equalsIgnoreCase("true")) { + String extHiveHmsHost = context.config.otherConfigs.get("extHiveHmsHost") + String extHiveHmsPort = context.config.otherConfigs.get("extHiveHmsPort") + String catalog_name = "test_compress_type" + sql """drop catalog if exists ${catalog_name};""" + sql """ + create catalog if not exists ${catalog_name} properties ( + 'type'='hms', + 'hadoop.username' = 'hadoop', + 'hive.metastore.uris' = 'thrift://${extHiveHmsHost}:${extHiveHmsPort}' + ); + """ + logger.info("catalog " + catalog_name + " created") + sql """switch ${catalog_name};""" + logger.info("switched to catalog " + catalog_name) + + sql """ use multi_catalog """ + + // table test_compress_partitioned has 6 partitions with different compressed file: plain, gzip, bzip2, deflate + sql """set file_split_size=0""" + explain { + sql("select count(*) from test_compress_partitioned") + contains "inputSplitNum=16, totalFileSize=734675596, scanRanges=16" + contains "partition=8/8" + } + qt_q21 """select count(*) from test_compress_partitioned where dt="gzip" or dt="mix"""" + qt_q22 """select count(*) from test_compress_partitioned""" + order_qt_q23 """select * from test_compress_partitioned where watchid=4611870011201662970""" + + sql """set file_split_size=8388608""" + explain { + sql("select count(*) from test_compress_partitioned") + contains "inputSplitNum=82, totalFileSize=734675596, scanRanges=82" + contains "partition=8/8" + } + + qt_q31 """select count(*) from test_compress_partitioned where dt="gzip" or dt="mix"""" + qt_q32 """select count(*) from test_compress_partitioned""" + order_qt_q33 """select * from test_compress_partitioned where watchid=4611870011201662970""" + sql """set file_split_size=0""" + } +}