Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions be/src/runtime/exec_env.h
Original file line number Diff line number Diff line change
Expand Up @@ -322,6 +322,7 @@ class ExecEnv {
// Sets the class-level memory-tracking flag.
// The new value is written to _s_tracking_memory with std::memory_order_release.
// NOTE(review): readers presumably load the flag with acquire ordering; the
// matching getter is not visible here — confirm.
static void set_tracking_memory(bool tracking_memory) {
_s_tracking_memory.store(tracking_memory, std::memory_order_release);
}
// Records `pool` in _orc_memory_pool. Only the raw pointer is copied.
// NOTE(review): ownership and required lifetime of the pool are not visible
// here — confirm with the caller that installs it.
void set_orc_memory_pool(orc::MemoryPool* pool) { _orc_memory_pool = pool; }
// Move-assigns `pool` into _non_block_close_thread_pool, so the caller's
// unique_ptr is left in a moved-from state after the call.
// NOTE(review): the member is presumably a std::unique_ptr<ThreadPool>, in which
// case any previously held pool is released by the assignment — confirm.
void set_non_block_close_thread_pool(std::unique_ptr<ThreadPool>&& pool) {
_non_block_close_thread_pool = std::move(pool);
}
Expand Down
539 changes: 375 additions & 164 deletions be/src/vec/exec/format/orc/vorc_reader.cpp

Large diffs are not rendered by default.

45 changes: 33 additions & 12 deletions be/src/vec/exec/format/orc/vorc_reader.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,9 @@
#pragma once

#include <cctz/time_zone.h>
#include <stddef.h>
#include <stdint.h>

#include <cstddef>
#include <cstdint>
#include <list>
#include <memory>
#include <orc/OrcFile.hh>
Expand All @@ -41,6 +41,7 @@
#include "orc/Reader.hh"
#include "orc/Type.hh"
#include "orc/Vector.hh"
#include "orc/sargs/Literal.hh"
#include "runtime/types.h"
#include "util/runtime_profile.h"
#include "vec/aggregate_functions/aggregate_function.h"
Expand All @@ -51,6 +52,8 @@
#include "vec/exec/format/format_common.h"
#include "vec/exec/format/generic_reader.h"
#include "vec/exec/format/table/transactional_hive_reader.h"
#include "vec/exprs/vliteral.h"
#include "vec/exprs/vslot_ref.h"

namespace doris {
class RuntimeState;
Expand Down Expand Up @@ -80,13 +83,6 @@ namespace doris::vectorized {

class ORCFileInputStream;

// Plain aggregate that groups a column name, an ORC predicate data type, a list
// of ORC literals and a SQLFilterOp.
// NOTE(review): presumably describes one single-column predicate to be pushed
// down to the ORC reader — confirm against its users.
struct OrcPredicate {
// Name of the column the entry refers to.
std::string col_name;
// ORC-side data type associated with the entry.
orc::PredicateDataType data_type;
// ORC literal values carried by the entry (how many depends on `op` — TODO confirm).
std::vector<orc::Literal> literals;
// Filter operator applied to the column and literals.
SQLFilterOp op;
};

struct LazyReadContext {
VExprContextSPtrs conjuncts;
bool can_lazy_read = false;
Expand Down Expand Up @@ -227,6 +223,8 @@ class OrcReader : public GenericReader {
RuntimeProfile::Counter* decode_value_time = nullptr;
RuntimeProfile::Counter* decode_null_map_time = nullptr;
RuntimeProfile::Counter* filter_block_time = nullptr;
RuntimeProfile::Counter* selected_row_group_count = nullptr;
RuntimeProfile::Counter* evaluated_row_group_count = nullptr;
};

class ORCFilterImpl : public orc::ORCFilter {
Expand Down Expand Up @@ -287,11 +285,30 @@ class OrcReader : public GenericReader {
void _init_orc_cols(const orc::Type& type, std::vector<std::string>& orc_cols,
std::vector<std::string>& orc_cols_lower_case,
std::unordered_map<std::string, const orc::Type*>& type_map,
bool* is_hive1_orc);
bool* is_hive1_orc, bool should_add_acid_prefix) const;
static bool _check_acid_schema(const orc::Type& type);
static const orc::Type& _remove_acid(const orc::Type& type);
bool _init_search_argument(
const std::unordered_map<std::string, ColumnValueRangeType>* colname_to_value_range);

// The functions below, down to _init_search_argument, are used to build the ORC search argument.
std::tuple<bool, orc::Literal, orc::PredicateDataType> _make_orc_literal(
const VSlotRef* slot_ref, const VLiteral* literal);
bool _check_slot_can_push_down(const VExprSPtr& expr);
bool _check_literal_can_push_down(const VExprSPtr& expr, uint16_t child_id);
bool _check_rest_children_can_push_down(const VExprSPtr& expr);
bool _check_expr_can_push_down(const VExprSPtr& expr);
void _build_less_than(const VExprSPtr& expr,
std::unique_ptr<orc::SearchArgumentBuilder>& builder);
void _build_less_than_equals(const VExprSPtr& expr,
std::unique_ptr<orc::SearchArgumentBuilder>& builder);
void _build_equals(const VExprSPtr& expr, std::unique_ptr<orc::SearchArgumentBuilder>& builder);
void _build_filter_in(const VExprSPtr& expr,
std::unique_ptr<orc::SearchArgumentBuilder>& builder);
void _build_is_null(const VExprSPtr& expr,
std::unique_ptr<orc::SearchArgumentBuilder>& builder);
bool _build_search_argument(const VExprSPtr& expr,
std::unique_ptr<orc::SearchArgumentBuilder>& builder);
bool _init_search_argument(const VExprContextSPtrs& conjuncts);

void _init_bloom_filter(
std::unordered_map<std::string, ColumnValueRangeType>* colname_to_value_range);
void _init_system_properties();
Expand Down Expand Up @@ -584,6 +601,7 @@ class OrcReader : public GenericReader {
std::unique_ptr<ORCFileInputStream> _file_input_stream;
Statistics _statistics;
OrcProfile _orc_profile;
orc::ReaderMetrics _reader_metrics;

std::unique_ptr<orc::ColumnVectorBatch> _batch;
std::unique_ptr<orc::Reader> _reader;
Expand Down Expand Up @@ -629,6 +647,9 @@ class OrcReader : public GenericReader {
std::unordered_map<std::string, std::string> _table_col_to_file_col;
// Supports Iceberg position deletes.
std::vector<int64_t>* _position_delete_ordered_rowids = nullptr;
std::unordered_map<const VSlotRef*, orc::PredicateDataType>
_vslot_ref_to_orc_predicate_data_type;
std::unordered_map<const VLiteral*, orc::Literal> _vliteral_to_orc_literal;

// Setting "orc_tiny_stripe_threshold_bytes" to 0 disables the tiny-stripe merge IO optimization.
int64_t _orc_tiny_stripe_threshold_bytes = 8L * 1024L * 1024L;
Expand Down
Binary file added be/test/exec/test_data/orc_scanner/orders.orc
Binary file not shown.
29 changes: 11 additions & 18 deletions be/test/testutil/desc_tbl_builder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,20 +17,9 @@

#include "testutil/desc_tbl_builder.h"

#include <glog/logging.h>
#include <gtest/gtest-message.h>
#include <gtest/gtest-test-part.h>
#include <gtest/gtest.h>

#include <vector>

#include "common/object_pool.h"
#include "common/status.h"
#include "gtest/gtest_pred_impl.h"
#include "runtime/define_primitive_type.h"
#include "runtime/descriptors.h"
#include "util/bit_util.h"

using std::vector;

namespace doris {

Expand All @@ -44,7 +33,7 @@ TupleDescBuilder& DescriptorTblBuilder::declare_tuple() {

// item_id of -1 indicates no itemTupleId
static TSlotDescriptor make_slot_descriptor(int id, int parent_id, const TypeDescriptor& type,
int slot_idx, int item_id) {
const std::string& name, int slot_idx, int item_id) {
int null_byte = slot_idx / 8;
int null_bit = slot_idx % 8;
TSlotDescriptor slot_desc;
Expand All @@ -58,6 +47,7 @@ static TSlotDescriptor make_slot_descriptor(int id, int parent_id, const TypeDes
slot_desc.__set_nullIndicatorBit(null_bit);
slot_desc.__set_slotIdx(slot_idx);
slot_desc.__set_isMaterialized(true);
slot_desc.__set_colName(name);
// if (item_id != -1) {
// slot_desc.__set_itemTupleId(item_id);
// }
Expand All @@ -78,24 +68,27 @@ DescriptorTbl* DescriptorTblBuilder::build() {
int tuple_id = 0;
int slot_id = 0;

for (int i = 0; i < _tuples_descs.size(); ++i) {
build_tuple(_tuples_descs[i]->slot_types(), &thrift_desc_tbl, &tuple_id, &slot_id);
for (auto& _tuples_desc : _tuples_descs) {
build_tuple(_tuples_desc->slot_types(), _tuples_desc->slot_names(), &thrift_desc_tbl,
&tuple_id, &slot_id);
}

Status status = DescriptorTbl::create(_obj_pool, thrift_desc_tbl, &desc_tbl);
EXPECT_TRUE(status.ok());
return desc_tbl;
}

TTupleDescriptor DescriptorTblBuilder::build_tuple(const vector<TypeDescriptor>& slot_types,
TTupleDescriptor DescriptorTblBuilder::build_tuple(const std::vector<TypeDescriptor>& slot_types,
const std::vector<std::string>& slot_names,
TDescriptorTable* thrift_desc_tbl,
int* next_tuple_id, int* slot_id) {
// We never materialize struct slots (there's no in-memory representation of structs,
// instead the materialized fields appear directly in the tuple), but array types can
// still have a struct item type. In this case, the array item tuple contains the
// "inlined" struct fields.
if (slot_types.size() == 1 && slot_types[0].type == TYPE_STRUCT) {
return build_tuple(slot_types[0].children, thrift_desc_tbl, next_tuple_id, slot_id);
return build_tuple(slot_types[0].children, slot_types[0].field_names, thrift_desc_tbl,
next_tuple_id, slot_id);
}

int tuple_id = *next_tuple_id;
Expand All @@ -111,7 +104,7 @@ TTupleDescriptor DescriptorTblBuilder::build_tuple(const vector<TypeDescriptor>&
// }

thrift_desc_tbl->slotDescriptors.push_back(
make_slot_descriptor(*slot_id, tuple_id, slot_types[i], i, item_id));
make_slot_descriptor(*slot_id, tuple_id, slot_types[i], slot_names[i], i, item_id));
thrift_desc_tbl->__isset.slotDescriptors = true;
++(*slot_id);
}
Expand Down
17 changes: 15 additions & 2 deletions be/test/testutil/desc_tbl_builder.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,15 +20,16 @@

#include <gen_cpp/Descriptors_types.h>

#include <tuple>
#include <vector>

#include "common/object_pool.h"
#include "runtime/descriptors.h"
#include "runtime/types.h"

namespace doris {

class ObjectPool;
class TupleDescBuilder;
class DescriptorTbl;

// Aids in the construction of a DescriptorTbl by declaring tuples and slots
// associated with those tuples.
Expand All @@ -40,6 +41,7 @@ class DescriptorTbl;
// DescriptorTblBuilder builder;
// builder.declare_tuple() << TYPE_TINYINT << TYPE_TIMESTAMP; // gets TupleId 0
// builder.declare_tuple() << TYPE_FLOAT; // gets TupleId 1
// builder.declare_tuple() << std::make_tuple(TYPE_INT, "col1") << std::make_tuple(TYPE_STRING, "col2"); // gets Tuple with type and name
// DescriptorTbl desc_tbl = builder.build();
class DescriptorTblBuilder {
public:
Expand All @@ -57,20 +59,31 @@ class DescriptorTblBuilder {
std::vector<TupleDescBuilder*> _tuples_descs;

TTupleDescriptor build_tuple(const std::vector<TypeDescriptor>& slot_types,
const std::vector<std::string>& slot_names,
TDescriptorTable* thrift_desc_tbl, int* tuple_id, int* slot_id);
};

class TupleDescBuilder {
public:
using SlotType = std::tuple<TypeDescriptor, std::string>;
TupleDescBuilder& operator<<(const SlotType& slot) {
_slot_types.push_back(std::get<0>(slot));
_slot_names.push_back(std::get<1>(slot));
return *this;
}

TupleDescBuilder& operator<<(const TypeDescriptor& slot_type) {
_slot_types.push_back(slot_type);
_slot_names.emplace_back("");
return *this;
}

std::vector<TypeDescriptor> slot_types() const { return _slot_types; }
std::vector<std::string> slot_names() const { return _slot_names; }

private:
std::vector<TypeDescriptor> _slot_types;
std::vector<std::string> _slot_names;
};

} // end namespace doris
Expand Down
Loading
Loading