From 78504c7afa05ac8157f9fb4be01f9a3d8d7d0761 Mon Sep 17 00:00:00 2001 From: Socrates Date: Mon, 16 Dec 2024 17:45:32 +0800 Subject: [PATCH 1/6] don't push down schema changed column --- be/src/vec/exec/format/orc/vorc_reader.cpp | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/be/src/vec/exec/format/orc/vorc_reader.cpp b/be/src/vec/exec/format/orc/vorc_reader.cpp index a1ecb1ae0dcf8b..178565be58ac36 100644 --- a/be/src/vec/exec/format/orc/vorc_reader.cpp +++ b/be/src/vec/exec/format/orc/vorc_reader.cpp @@ -593,7 +593,15 @@ std::tuple OrcReader::_make_orc_lite auto literal_data = literal->get_column_ptr()->get_data_at(0); auto* slot = _tuple_descriptor->slots()[slot_ref->column_id()]; auto slot_type = slot->type(); - switch (slot_type.type) { + auto primitive_type = slot_type.type; + auto src_type = OrcReader::convert_to_doris_type(orc_type).type; + if (src_type != slot_type.type && + !(is_string_type(src_type) && is_string_type(primitive_type))) { + LOG(WARNING) << "Unsupported Push Down Schema Changed Column " << slot_type.type << " to " + << src_type; + return std::make_tuple(false, orc::Literal(false), orc::PredicateDataType::LONG); + } + switch (primitive_type) { #define M(NAME) \ case TYPE_##NAME: { \ auto [valid, orc_literal] = convert_to_orc_literal( \ From 6a3cc74038f2e1fd2d1de1588d9f575b2fe86d4b Mon Sep 17 00:00:00 2001 From: Socrates Date: Mon, 16 Dec 2024 19:04:53 +0800 Subject: [PATCH 2/6] dont't push fixed char in orc reader --- be/src/vec/exec/format/orc/vorc_reader.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/be/src/vec/exec/format/orc/vorc_reader.cpp b/be/src/vec/exec/format/orc/vorc_reader.cpp index 178565be58ac36..4b073225e7a6eb 100644 --- a/be/src/vec/exec/format/orc/vorc_reader.cpp +++ b/be/src/vec/exec/format/orc/vorc_reader.cpp @@ -460,7 +460,7 @@ static std::unordered_map TYPEKIND_TO_PRE {orc::TypeKind::DOUBLE, orc::PredicateDataType::FLOAT}, {orc::TypeKind::STRING, orc::PredicateDataType::STRING}, {orc::TypeKind::BINARY, orc::PredicateDataType::STRING}, - {orc::TypeKind::CHAR, orc::PredicateDataType::STRING}, + // {orc::TypeKind::CHAR, orc::PredicateDataType::STRING}, {orc::TypeKind::VARCHAR, orc::PredicateDataType::STRING}, {orc::TypeKind::DATE, orc::PredicateDataType::DATE}, {orc::TypeKind::DECIMAL, orc::PredicateDataType::DECIMAL}, @@ -492,8 +492,8 @@ std::tuple convert_to_orc_literal(const orc::Type* type, [[fallthrough]]; case orc::TypeKind::BINARY: [[fallthrough]]; - case orc::TypeKind::CHAR: - [[fallthrough]]; + // case orc::TypeKind::CHAR: + // [[fallthrough]]; case orc::TypeKind::VARCHAR: { return std::make_tuple(true, orc::Literal(literal_data.data, literal_data.size)); } @@ -614,7 +614,6 @@ std::tuple OrcReader::_make_orc_lite M(INT) \ M(BIGINT) \ M(LARGEINT) \ - M(CHAR) \ M(DATE) \ M(DATETIME) \ M(DATEV2) \ From dfe812bf12f844cda7cee341309dcb62e42de535 Mon Sep 17 00:00:00 2001 From: Socrates Date: Mon, 16 Dec 2024 20:01:31 +0800 Subject: [PATCH 3/6] add cases --- .../orc_predicate/orc_predicate_table.hql | 16 ++++++ .../data/multi_catalog/orc_predicate/run.sh | 9 ++++ .../hive/test_hive_orc_predicate.out | 29 +++++++++++ .../hive/test_hive_orc_predicate.groovy | 50 +++++++++++++++++++ 4 files changed, 104 insertions(+) create mode 100644 docker/thirdparties/docker-compose/hive/scripts/data/multi_catalog/orc_predicate/orc_predicate_table.hql create mode 100755 docker/thirdparties/docker-compose/hive/scripts/data/multi_catalog/orc_predicate/run.sh create mode 100644 regression-test/data/external_table_p0/hive/test_hive_orc_predicate.out create mode 100644 regression-test/suites/external_table_p0/hive/test_hive_orc_predicate.groovy diff --git a/docker/thirdparties/docker-compose/hive/scripts/data/multi_catalog/orc_predicate/orc_predicate_table.hql b/docker/thirdparties/docker-compose/hive/scripts/data/multi_catalog/orc_predicate/orc_predicate_table.hql new file mode 100644 index 00000000000000..a946b25ff1af04 --- /dev/null +++ b/docker/thirdparties/docker-compose/hive/scripts/data/multi_catalog/orc_predicate/orc_predicate_table.hql @@ -0,0 +1,16 @@ +CREATE DATABASE IF NOT EXISTS multi_catalog; +USE multi_catalog; + +create table fixed_char_table ( + i int, + c char(2) +) stored as orc; + +insert into fixed_char_table values(1,'a'),(2,'b '), (3,'cd'); + +create table type_changed_table ( + id int, + name string +) stored as orc; +insert into type_changed_table values (1, 'Alice'), (2, 'Bob'), (3, 'Charlie'); +ALTER TABLE type_changed_table CHANGE COLUMN id id STRING; diff --git a/docker/thirdparties/docker-compose/hive/scripts/data/multi_catalog/orc_predicate/run.sh b/docker/thirdparties/docker-compose/hive/scripts/data/multi_catalog/orc_predicate/run.sh new file mode 100755 index 00000000000000..f934ff3009c6f2 --- /dev/null +++ b/docker/thirdparties/docker-compose/hive/scripts/data/multi_catalog/orc_predicate/run.sh @@ -0,0 +1,9 @@ +#!/bin/bash +set -x + +CUR_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)" + +# create table +hive -f "${CUR_DIR}"/orc_predicate_table.hql + + diff --git a/regression-test/data/external_table_p0/hive/test_hive_orc_predicate.out b/regression-test/data/external_table_p0/hive/test_hive_orc_predicate.out new file mode 100644 index 00000000000000..f42bb629550c88 --- /dev/null +++ b/regression-test/data/external_table_p0/hive/test_hive_orc_predicate.out @@ -0,0 +1,29 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !predicate_fixed_char1 -- +1 a + +-- !predicate_fixed_char2 -- + +-- !predicate_changed_type1 -- +1 Alice + +-- !predicate_changed_type2 -- +2 Bob + +-- !predicate_changed_type3 -- +3 Charlie + +-- !predicate_fixed_char1 -- +1 a + +-- !predicate_fixed_char2 -- + +-- !predicate_changed_type1 -- +1 Alice + +-- !predicate_changed_type2 -- +2 Bob + +-- !predicate_changed_type3 -- +3 Charlie + diff --git a/regression-test/suites/external_table_p0/hive/test_hive_orc_predicate.groovy b/regression-test/suites/external_table_p0/hive/test_hive_orc_predicate.groovy new file mode 100644 index 00000000000000..2dd647aa2c1d8e --- /dev/null +++ b/regression-test/suites/external_table_p0/hive/test_hive_orc_predicate.groovy @@ -0,0 +1,50 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("test_hive_orc_predicate", "p0,external,hive,external_docker,external_docker_hive") { + + String enabled = context.config.otherConfigs.get("enableHiveTest") + if (enabled == null || !enabled.equalsIgnoreCase("true")) { + logger.info("diable Hive test.") + return; + } + + for (String hivePrefix : ["hive2", "hive3"]) { + try { + String hms_port = context.config.otherConfigs.get(hivePrefix + "HmsPort") + String catalog_name = "${hivePrefix}_test_predicate" + String externalEnvIp = context.config.otherConfigs.get("externalEnvIp") + + sql """drop catalog if exists ${catalog_name}""" + sql """create catalog if not exists ${catalog_name} properties ( + "type"="hms", + 'hive.metastore.uris' = 'thrift://${externalEnvIp}:${hms_port}' + );""" + sql """use `${catalog_name}`.`multi_catalog`""" + + qt_predicate_fixed_char1 """ select * from fixed_char_table where c = 'a';""" + qt_predicate_fixed_char2 """ select * from fixed_char_table where c = 'a ';""" + + qt_predicate_changed_type1 """ select * from type_changed_table where id = '1';""" + qt_predicate_changed_type2 """ select * from type_changed_table where id = '2';""" + qt_predicate_changed_type3 """ select * from type_changed_table where id = '3';""" + + sql """drop catalog if exists ${catalog_name}""" + } finally { + } + } +} From d4e433a39c40f5e0889e730441c51014f895e12f Mon Sep 17 00:00:00 2001 From: Socrates Date: Tue, 17 Dec 2024 14:19:07 +0800 Subject: [PATCH 4/6] fix --- be/src/vec/exec/format/orc/vorc_reader.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/be/src/vec/exec/format/orc/vorc_reader.cpp b/be/src/vec/exec/format/orc/vorc_reader.cpp index 4b073225e7a6eb..d8b8f2a3b4cd13 100644 --- a/be/src/vec/exec/format/orc/vorc_reader.cpp +++ b/be/src/vec/exec/format/orc/vorc_reader.cpp @@ -595,9 +595,8 @@ std::tuple OrcReader::_make_orc_lite auto slot_type = slot->type(); auto primitive_type = slot_type.type; auto src_type = OrcReader::convert_to_doris_type(orc_type).type; - if (src_type != slot_type.type && - !(is_string_type(src_type) && is_string_type(primitive_type))) { - LOG(WARNING) << "Unsupported Push Down Schema Changed Column " << slot_type.type << " to " + if (src_type != primitive_type && !is_string_type(src_type) && is_string_type(primitive_type)) { + LOG(WARNING) << "Unsupported Push Down Schema Changed Column " << primitive_type << " to " << src_type; return std::make_tuple(false, orc::Literal(false), orc::PredicateDataType::LONG); } From 562a00ff846d8711c5e5428152f5dd5c8a484e80 Mon Sep 17 00:00:00 2001 From: Socrates Date: Wed, 18 Dec 2024 10:49:36 +0800 Subject: [PATCH 5/6] add comment --- be/src/vec/exec/format/orc/vorc_reader.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/be/src/vec/exec/format/orc/vorc_reader.cpp b/be/src/vec/exec/format/orc/vorc_reader.cpp index d8b8f2a3b4cd13..d8018044bc1e20 100644 --- a/be/src/vec/exec/format/orc/vorc_reader.cpp +++ b/be/src/vec/exec/format/orc/vorc_reader.cpp @@ -460,6 +460,7 @@ static std::unordered_map TYPEKIND_TO_PRE {orc::TypeKind::DOUBLE, orc::PredicateDataType::FLOAT}, {orc::TypeKind::STRING, orc::PredicateDataType::STRING}, {orc::TypeKind::BINARY, orc::PredicateDataType::STRING}, + // should not pust down CHAR type, because CHAR type is fixed length and will be padded // {orc::TypeKind::CHAR, orc::PredicateDataType::STRING}, {orc::TypeKind::VARCHAR, orc::PredicateDataType::STRING}, {orc::TypeKind::DATE, orc::PredicateDataType::DATE}, @@ -492,6 +493,7 @@ std::tuple convert_to_orc_literal(const orc::Type* type, [[fallthrough]]; case orc::TypeKind::BINARY: [[fallthrough]]; + // should not pust down CHAR type, because CHAR type is fixed length and will be padded // case orc::TypeKind::CHAR: // [[fallthrough]]; case orc::TypeKind::VARCHAR: { @@ -595,6 +597,7 @@ std::tuple OrcReader::_make_orc_lite auto slot_type = slot->type(); auto primitive_type = slot_type.type; auto src_type = OrcReader::convert_to_doris_type(orc_type).type; + // should not down predicate for string type change from other type if (src_type != primitive_type && !is_string_type(src_type) && is_string_type(primitive_type)) { LOG(WARNING) << "Unsupported Push Down Schema Changed Column " << primitive_type << " to " << src_type; From 7955b9af7ff08c160c7202ad8365b6585279e7fb Mon Sep 17 00:00:00 2001 From: Socrates Date: Wed, 18 Dec 2024 17:22:46 +0800 Subject: [PATCH 6/6] remove unsupported_pushdown_types for paimon --- be/src/vec/exec/format/orc/vorc_reader.cpp | 5 ++--- be/src/vec/exec/format/orc/vorc_reader.h | 4 +--- be/src/vec/exec/scan/vfile_scanner.cpp | 10 +--------- 3 files changed, 4 insertions(+), 15 deletions(-) diff --git a/be/src/vec/exec/format/orc/vorc_reader.cpp b/be/src/vec/exec/format/orc/vorc_reader.cpp index d8018044bc1e20..4d41830668960c 100644 --- a/be/src/vec/exec/format/orc/vorc_reader.cpp +++ b/be/src/vec/exec/format/orc/vorc_reader.cpp @@ -143,7 +143,7 @@ void ORCFileInputStream::read(void* buf, uint64_t length, uint64_t offset) { OrcReader::OrcReader(RuntimeProfile* profile, RuntimeState* state, const TFileScanRangeParams& params, const TFileRangeDesc& range, size_t batch_size, const std::string& ctz, io::IOContext* io_ctx, - bool enable_lazy_mat, std::vector* unsupported_pushdown_types) + bool enable_lazy_mat) : _profile(profile), _state(state), _scan_params(params), @@ -156,8 +156,7 @@ OrcReader::OrcReader(RuntimeProfile* profile, RuntimeState* state, _enable_lazy_mat(enable_lazy_mat), _enable_filter_by_min_max( state == nullptr ? true : state->query_options().enable_orc_filter_by_min_max), - _dict_cols_has_converted(false), - _unsupported_pushdown_types(unsupported_pushdown_types) { + _dict_cols_has_converted(false) { TimezoneUtils::find_cctz_time_zone(ctz, _time_zone); VecDateTimeValue t; t.from_unixtime(0, ctz); diff --git a/be/src/vec/exec/format/orc/vorc_reader.h b/be/src/vec/exec/format/orc/vorc_reader.h index 0dd19077bcf0af..6bbf3bead1efce 100644 --- a/be/src/vec/exec/format/orc/vorc_reader.h +++ b/be/src/vec/exec/format/orc/vorc_reader.h @@ -129,8 +129,7 @@ class OrcReader : public GenericReader { OrcReader(RuntimeProfile* profile, RuntimeState* state, const TFileScanRangeParams& params, const TFileRangeDesc& range, size_t batch_size, const std::string& ctz, - io::IOContext* io_ctx, bool enable_lazy_mat = true, - std::vector* unsupported_pushdown_types = nullptr); + io::IOContext* io_ctx, bool enable_lazy_mat = true); OrcReader(const TFileScanRangeParams& params, const TFileRangeDesc& range, const std::string& ctz, io::IOContext* io_ctx, bool enable_lazy_mat = true); @@ -639,7 +638,6 @@ class OrcReader : public GenericReader { std::unique_ptr _string_dict_filter; bool _dict_cols_has_converted = false; bool _has_complex_type = false; - std::vector* _unsupported_pushdown_types; // resolve schema change std::unordered_map> _converters; diff --git a/be/src/vec/exec/scan/vfile_scanner.cpp b/be/src/vec/exec/scan/vfile_scanner.cpp index 76639e4bed4a28..93a22d1a94bf52 100644 --- a/be/src/vec/exec/scan/vfile_scanner.cpp +++ b/be/src/vec/exec/scan/vfile_scanner.cpp @@ -879,17 +879,9 @@ Status VFileScanner::_get_next_reader() { break; } case TFileFormatType::FORMAT_ORC: { - std::vector* unsupported_pushdown_types = nullptr; - if (range.__isset.table_format_params && - range.table_format_params.table_format_type == "paimon") { - static std::vector paimon_unsupport_type = - std::vector {orc::TypeKind::CHAR}; - unsupported_pushdown_types = &paimon_unsupport_type; - } std::unique_ptr orc_reader = OrcReader::create_unique( _profile, _state, *_params, range, _state->query_options().batch_size, - _state->timezone(), _io_ctx.get(), _state->query_options().enable_orc_lazy_mat, - unsupported_pushdown_types); + _state->timezone(), _io_ctx.get(), _state->query_options().enable_orc_lazy_mat); orc_reader->set_push_down_agg_type(_get_push_down_agg_type()); if (push_down_predicates) { RETURN_IF_ERROR(_process_late_arrival_conjuncts());