Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion be/src/exec/rowid_fetcher.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
#include "common/status.h"
#include "exec/tablet_info.h" // DorisNodesInfo
#include "olap/storage_engine.h"
#include "vec/columns/column_string.h"
#include "vec/core/block.h"
#include "vec/data_types/data_type.h"

Expand All @@ -37,7 +38,6 @@ class RuntimeState;
class TupleDescriptor;

namespace vectorized {
class ColumnString;
class MutableBlock;
} // namespace vectorized

Expand Down
37 changes: 25 additions & 12 deletions be/src/exprs/bloom_filter_func.h
Original file line number Diff line number Diff line change
Expand Up @@ -379,25 +379,38 @@ struct CommonFindOp {
struct StringFindOp : CommonFindOp<StringRef> {
static void insert_batch(BloomFilterAdaptor& bloom_filter, const vectorized::ColumnPtr& column,
size_t start) {
// Feeds rows [start, end) of `str_col` into the bloom filter. A row flagged in `null_flags`
// only marks the filter as containing null; a null `null_flags` pointer means no row is null.
auto _insert_batch_col_str = [&](const auto& str_col, const uint8_t* __restrict null_flags,
                                 size_t start, size_t end) {
    for (size_t row = start; row < end; row++) {
        const bool row_is_null = null_flags != nullptr && null_flags[row];
        if (row_is_null) {
            bloom_filter.set_contain_null();
            continue;
        }
        bloom_filter.add_element(str_col.get_data_at(row));
    }
};

if (column->is_nullable()) {
const auto* nullable = assert_cast<const vectorized::ColumnNullable*>(column.get());
const auto& col =
assert_cast<const vectorized::ColumnString&>(nullable->get_nested_column());
const auto& nullmap =
assert_cast<const vectorized::ColumnUInt8&>(nullable->get_null_map_column())
.get_data();

for (size_t i = start; i < col.size(); i++) {
if (!nullmap[i]) {
bloom_filter.add_element(col.get_data_at(i));
} else {
bloom_filter.set_contain_null();
}
if (nullable->get_nested_column().is_column_string64()) {
_insert_batch_col_str(assert_cast<const vectorized::ColumnString64&>(
nullable->get_nested_column()),
nullmap.data(), start, nullmap.size());
} else {
_insert_batch_col_str(
assert_cast<const vectorized::ColumnString&>(nullable->get_nested_column()),
nullmap.data(), start, nullmap.size());
}
} else {
const auto& col = assert_cast<const vectorized::ColumnString*>(column.get());
for (size_t i = start; i < col->size(); i++) {
bloom_filter.add_element(col->get_data_at(i));
if (column->is_column_string64()) {
_insert_batch_col_str(assert_cast<const vectorized::ColumnString64&>(*column),
nullptr, start, column->size());
} else {
_insert_batch_col_str(assert_cast<const vectorized::ColumnString&>(*column),
nullptr, start, column->size());
}
}
}
Expand Down
74 changes: 50 additions & 24 deletions be/src/exprs/hybrid_set.h
Original file line number Diff line number Diff line change
Expand Up @@ -418,26 +418,39 @@ class StringSet : public HybridSetBase {
}
}

/// Inserts rows [start, end) of a string column into the set.
///
/// @param col      string column; only `get_data_at(i)` is used (the visible callers pass
///                 ColumnString or ColumnString64).
/// @param nullmap  per-row null flags, or nullptr when the column is not nullable.
/// @param start    index of the first row to insert.
/// @param end      one past the index of the last row to insert.
///
/// Rows flagged in the null map are not inserted; they set `_contains_null` instead.
void _insert_fixed_len_string(const auto& col, const uint8_t* __restrict nullmap, size_t start,
                              size_t end) {
    for (size_t i = start; i < end; i++) {
        // A missing null map means no row is null. The pointer has to be compared with `==`:
        // with `!=` every row of a nullable column would be inserted (null rows included) and
        // a non-nullable column would dereference the null pointer.
        if (nullmap == nullptr || !nullmap[i]) {
            _set.insert(col.get_data_at(i).to_string());
        } else {
            _contains_null = true;
        }
    }
}

void insert_fixed_len(const vectorized::ColumnPtr& column, size_t start) override {
if (column->is_nullable()) {
const auto* nullable = assert_cast<const vectorized::ColumnNullable*>(column.get());
const auto& col =
assert_cast<const vectorized::ColumnString&>(nullable->get_nested_column());
const auto& nullmap =
assert_cast<const vectorized::ColumnUInt8&>(nullable->get_null_map_column())
.get_data();

for (size_t i = start; i < nullable->size(); i++) {
if (!nullmap[i]) {
_set.insert(col.get_data_at(i).to_string());
} else {
_contains_null = true;
}
if (nullable->get_nested_column().is_column_string64()) {
_insert_fixed_len_string(assert_cast<const vectorized::ColumnString64&>(
nullable->get_nested_column()),
nullmap.data(), start, nullmap.size());
} else {
_insert_fixed_len_string(
assert_cast<const vectorized::ColumnString&>(nullable->get_nested_column()),
nullmap.data(), start, nullmap.size());
}
} else {
const auto& col = assert_cast<const vectorized::ColumnString*>(column.get());
for (size_t i = start; i < col->size(); i++) {
_set.insert(col->get_data_at(i).to_string());
if (column->is_column_string64()) {
_insert_fixed_len_string(assert_cast<const vectorized::ColumnString64&>(*column),
nullptr, start, column->size());
} else {
_insert_fixed_len_string(assert_cast<const vectorized::ColumnString&>(*column),
nullptr, start, column->size());
}
}
}
Expand Down Expand Up @@ -567,26 +580,39 @@ class StringValueSet : public HybridSetBase {
}
}

/// Inserts rows [start, end) of a string column into the set.
///
/// @param col      string column; only `get_data_at(i)` is used (the visible callers pass
///                 ColumnString or ColumnString64).
/// @param nullmap  per-row null flags, or nullptr when the column is not nullable.
/// @param start    index of the first row to insert.
/// @param end      one past the index of the last row to insert.
///
/// Rows flagged in the null map are not inserted; they set `_contains_null` instead.
void _insert_fixed_len_string(const auto& col, const uint8_t* __restrict nullmap, size_t start,
                              size_t end) {
    for (size_t i = start; i < end; i++) {
        // A missing null map means no row is null. The pointer has to be compared with `==`:
        // with `!=` every row of a nullable column would be inserted (null rows included) and
        // a non-nullable column would dereference the null pointer.
        if (nullmap == nullptr || !nullmap[i]) {
            _set.insert(col.get_data_at(i));
        } else {
            _contains_null = true;
        }
    }
}

void insert_fixed_len(const vectorized::ColumnPtr& column, size_t start) override {
if (column->is_nullable()) {
const auto* nullable = assert_cast<const vectorized::ColumnNullable*>(column.get());
const auto& col =
assert_cast<const vectorized::ColumnString&>(nullable->get_nested_column());
const auto& nullmap =
assert_cast<const vectorized::ColumnUInt8&>(nullable->get_null_map_column())
.get_data();

for (size_t i = start; i < nullable->size(); i++) {
if (!nullmap[i]) {
_set.insert(col.get_data_at(i));
} else {
_contains_null = true;
}
if (nullable->get_nested_column().is_column_string64()) {
_insert_fixed_len_string(assert_cast<const vectorized::ColumnString64&>(
nullable->get_nested_column()),
nullmap.data(), start, nullmap.size());
} else {
_insert_fixed_len_string(
assert_cast<const vectorized::ColumnString&>(nullable->get_nested_column()),
nullmap.data(), start, nullmap.size());
}
} else {
const auto& col = assert_cast<const vectorized::ColumnString*>(column.get());
for (size_t i = start; i < col->size(); i++) {
_set.insert(col->get_data_at(i));
if (column->is_column_string64()) {
_insert_fixed_len_string(assert_cast<const vectorized::ColumnString64&>(*column),
nullptr, start, column->size());
} else {
_insert_fixed_len_string(assert_cast<const vectorized::ColumnString&>(*column),
nullptr, start, column->size());
}
}
}
Expand Down
41 changes: 24 additions & 17 deletions be/src/exprs/minmax_predicate.h
Original file line number Diff line number Diff line change
Expand Up @@ -70,19 +70,31 @@ class MinMaxNumFunc : public MinMaxFuncBase {
}
}

void update_batch(const vectorized::ColumnPtr& column, size_t start) {
const auto size = column->size();
if constexpr (std::is_same_v<T, StringRef>) {
const auto& column_string = assert_cast<const vectorized::ColumnString&>(*column);
for (size_t i = start; i < size; i++) {
void _update_batch_string(const auto& column_string, const uint8_t* __restrict nullmap,
size_t start, size_t size) {
for (size_t i = start; i < size; i++) {
if (nullmap == nullptr || !nullmap[i]) {
if constexpr (NeedMin) {
_min = std::min(_min, column_string.get_data_at(i));
}
if constexpr (NeedMax) {
_max = std::max(_max, column_string.get_data_at(i));
}
}
store_string_ref();
}
store_string_ref();
}

void update_batch(const vectorized::ColumnPtr& column, size_t start) {
const auto size = column->size();
if constexpr (std::is_same_v<T, StringRef>) {
if (column->is_column_string64()) {
_update_batch_string(assert_cast<const vectorized::ColumnString64&>(*column),
nullptr, start, size);
} else {
_update_batch_string(assert_cast<const vectorized::ColumnString&>(*column), nullptr,
start, size);
}
} else {
const T* data = (T*)column->get_raw_data().data;
for (size_t i = start; i < size; i++) {
Expand All @@ -100,18 +112,13 @@ class MinMaxNumFunc : public MinMaxFuncBase {
size_t start) {
const auto size = column->size();
if constexpr (std::is_same_v<T, StringRef>) {
const auto& column_string = assert_cast<const vectorized::ColumnString&>(*column);
for (size_t i = start; i < size; i++) {
if (!nullmap[i]) {
if constexpr (NeedMin) {
_min = std::min(_min, column_string.get_data_at(i));
}
if constexpr (NeedMax) {
_max = std::max(_max, column_string.get_data_at(i));
}
}
if (column->is_column_string64()) {
_update_batch_string(assert_cast<const vectorized::ColumnString64&>(*column),
nullmap.data(), start, size);
} else {
_update_batch_string(assert_cast<const vectorized::ColumnString&>(*column),
nullmap.data(), start, size);
}
store_string_ref();
} else {
const T* data = (T*)column->get_raw_data().data;
for (size_t i = start; i < size; i++) {
Expand Down
3 changes: 2 additions & 1 deletion be/src/pipeline/exec/hashjoin_build_sink.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -253,6 +253,7 @@ Status HashJoinBuildSinkLocalState::process_build_block(RuntimeState* state,
return Status::OK();
}
COUNTER_UPDATE(_build_rows_counter, rows);
block.replace_if_overflow();

vectorized::ColumnRawPtrs raw_ptrs(_build_expr_ctxs.size());

Expand Down Expand Up @@ -519,7 +520,7 @@ Status HashJoinBuildSinkOperatorX::sink(RuntimeState* state, vectorized::Block*
res_col_ids));

SCOPED_TIMER(local_state._build_side_merge_block_timer);
RETURN_IF_ERROR(local_state._build_side_mutable_block.merge(*in_block));
RETURN_IF_ERROR(local_state._build_side_mutable_block.merge_ignore_overflow(*in_block));
COUNTER_UPDATE(local_state._build_blocks_memory_usage, in_block->bytes());
local_state._mem_tracker->consume(in_block->bytes());
if (local_state._build_side_mutable_block.rows() >
Expand Down
5 changes: 1 addition & 4 deletions be/src/runtime/primitive_type.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
#include "olap/decimal12.h"
#include "runtime/define_primitive_type.h"
#include "vec/columns/column_decimal.h"
#include "vec/columns/column_string.h"
#include "vec/columns/column_vector.h"
#include "vec/columns/columns_number.h"
#include "vec/core/types.h"
Expand All @@ -36,10 +37,6 @@

namespace doris {

namespace vectorized {
class ColumnString;
} // namespace vectorized

class DecimalV2Value;
struct StringRef;
struct JsonBinaryValue;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
#include "util/bitmap_value.h"
#include "vec/aggregate_functions/aggregate_function.h"
#include "vec/columns/column_complex.h"
#include "vec/columns/column_string.h"
#include "vec/columns/column_vector.h"
#include "vec/core/types.h"
#include "vec/data_types/data_type_bitmap.h"
Expand All @@ -44,7 +45,6 @@ namespace vectorized {
class Arena;
class BufferReadable;
class BufferWritable;
class ColumnString;
class IColumn;
} // namespace vectorized
} // namespace doris
Expand Down
20 changes: 16 additions & 4 deletions be/src/vec/columns/column.h
Original file line number Diff line number Diff line change
Expand Up @@ -77,14 +77,13 @@ class IColumn : public COW<IColumn> {
/// If you want to copy column for modification, look at 'mutate' method.
virtual MutablePtr clone() const = 0;

protected:
public:
// 64bit offsets now only Array type used, so we make it protected
// to avoid use IColumn::Offset64 directly.
// please use ColumnArray::Offset64 instead if we need.
using Offset64 = UInt64;
using Offsets64 = PaddedPODArray<Offset64>;

public:
// 32bit offsets for string
using Offset = UInt32;
using Offsets = PaddedPODArray<Offset>;
Expand All @@ -100,6 +99,11 @@ class IColumn : public COW<IColumn> {
*/
virtual Ptr convert_to_full_column_if_const() const { return get_ptr(); }

/** In a join, the size of a ColumnString may overflow uint32_t; in that case the column needs
* to be converted to the uint64_t-based ColumnString64.
* ColumnString, ColumnNullable, ColumnArray and ColumnStruct need to implement this method.
* The default implementation performs no conversion and returns get_ptr().
*/
virtual Ptr convert_column_if_overflow() { return get_ptr(); }

/// If column isn't ColumnLowCardinality, return itself.
/// If column is ColumnLowCardinality, transforms is to full column.
virtual Ptr convert_to_full_column_if_low_cardinality() const { return get_ptr(); }
Expand Down Expand Up @@ -222,6 +226,14 @@ class IColumn : public COW<IColumn> {
/// TODO: we need `insert_range_from_const` for every column type.
virtual void insert_range_from(const IColumn& src, size_t start, size_t length) = 0;

/// Appends a range of elements from another column of the same type.
/// This variant does not need to throw an exception when a ColumnString overflows uint32;
/// it is only used in joins.
/// The default implementation forwards to insert_range_from.
virtual void insert_range_from_ignore_overflow(const IColumn& src, size_t start,
size_t length) {
insert_range_from(src, start, length);
}

/// Appends one element from other column with the same type multiple times.
virtual void insert_many_from(const IColumn& src, size_t position, size_t length) {
for (size_t i = 0; i < length; ++i) {
Expand Down Expand Up @@ -640,6 +652,8 @@ class IColumn : public COW<IColumn> {

virtual bool is_column_string() const { return false; }

virtual bool is_column_string64() const { return false; }

virtual bool is_column_decimal() const { return false; }

virtual bool is_column_dictionary() const { return false; }
Expand All @@ -653,8 +667,6 @@ class IColumn : public COW<IColumn> {
/// If the only value column can contain is NULL.
virtual bool only_null() const { return false; }

virtual bool low_cardinality() const { return false; }

virtual void sort_column(const ColumnSorter* sorter, EqualFlags& flags,
IColumn::Permutation& perms, EqualRange& range,
bool last_column) const;
Expand Down
34 changes: 34 additions & 0 deletions be/src/vec/columns/column_array.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -514,6 +514,40 @@ void ColumnArray::insert_range_from(const IColumn& src, size_t start, size_t len
}
}

// Appends `length` rows of the array column `src`, beginning at row `start`, to this column.
// The nested elements are appended through insert_range_from_ignore_overflow on the nested data
// column, and the copied offsets are rebased onto this column's current last offset.
// Throws doris::Exception(INTERNAL_ERROR) when start + length exceeds the number of source offsets.
void ColumnArray::insert_range_from_ignore_overflow(const IColumn& src, size_t start,
size_t length) {
const ColumnArray& src_concrete = assert_cast<const ColumnArray&>(src);

// NOTE(review): the message below names insert_range_from, not this method — confirm whether
// that is intentional.
if (start + length > src_concrete.get_offsets().size()) {
throw doris::Exception(doris::ErrorCode::INTERNAL_ERROR,
"Parameter out of bound in ColumnArray::insert_range_from method. "
"[start({}) + length({}) > offsets.size({})]",
std::to_string(start), std::to_string(length),
std::to_string(src_concrete.get_offsets().size()));
}

// Range of nested elements that belongs to source rows [start, start + length).
size_t nested_offset = src_concrete.offset_at(start);
size_t nested_length = src_concrete.get_offsets()[start + length - 1] - nested_offset;

get_data().insert_range_from_ignore_overflow(src_concrete.get_data(), nested_offset,
nested_length);

auto& cur_offsets = get_offsets();
const auto& src_offsets = src_concrete.get_offsets();

// Copying a prefix of the source into an empty column: the source offsets can be reused as-is.
if (start == 0 && cur_offsets.empty()) {
cur_offsets.assign(src_offsets.begin(), src_offsets.begin() + length);
} else {
size_t old_size = cur_offsets.size();
// -1 is ok, because PaddedPODArray pads zeros on the left.
size_t prev_max_offset = cur_offsets.back();
cur_offsets.resize(old_size + length);

// Shift each source offset so it continues from this column's previous last offset.
for (size_t i = 0; i < length; ++i)
cur_offsets[old_size + i] = src_offsets[start + i] - nested_offset + prev_max_offset;
Comment on lines +546 to +547
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

warning: statement should be inside braces [readability-braces-around-statements]

Suggested change
for (size_t i = 0; i < length; ++i)
cur_offsets[old_size + i] = src_offsets[start + i] - nested_offset + prev_max_offset;
for (size_t i = 0; i < length; ++i) {
cur_offsets[old_size + i] = src_offsets[start + i] - nested_offset + prev_max_offset;
}

}
}

double ColumnArray::get_ratio_of_default_rows(double sample_ratio) const {
return get_ratio_of_default_rows_impl<ColumnArray>(sample_ratio);
}
Expand Down
Loading