From 7896f60dae334ff7afacc335c2c4bef666af6c3c Mon Sep 17 00:00:00 2001 From: cambyzju Date: Thu, 13 Oct 2022 21:57:41 +0800 Subject: [PATCH 1/2] bugfix for array column with delete condition --- .../rowset/segment_v2/segment_iterator.cpp | 6 +++- be/src/vec/columns/column_array.cpp | 32 +++++++++++++++++++ be/src/vec/columns/column_array.h | 2 ++ .../delete_p0/test_array_column_delete.out | 5 +++ .../delete_p0/test_array_column_delete.groovy | 29 +++++++++++++++++ 5 files changed, 73 insertions(+), 1 deletion(-) create mode 100644 regression-test/data/delete_p0/test_array_column_delete.out create mode 100644 regression-test/suites/delete_p0/test_array_column_delete.groovy diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.cpp b/be/src/olap/rowset/segment_v2/segment_iterator.cpp index 61d369fb692048..b1c033b1103e40 100644 --- a/be/src/olap/rowset/segment_v2/segment_iterator.cpp +++ b/be/src/olap/rowset/segment_v2/segment_iterator.cpp @@ -1080,7 +1080,11 @@ Status SegmentIterator::next_batch(vectorized::Block* block) { for (size_t i = 0; i < _schema.num_column_ids(); i++) { auto cid = _schema.column_id(i); auto column_desc = _schema.column(cid); - if (_is_pred_column[cid]) { + if (column_desc->type() == OLAP_FIELD_TYPE_ARRAY) { + _current_return_columns[cid] = + Schema::get_data_type_ptr(*column_desc)->create_column(); + _current_return_columns[cid]->reserve(_opts.block_row_max); + } else if (_is_pred_column[cid]) { _current_return_columns[cid] = Schema::get_predicate_column_nullable_ptr( column_desc->type(), column_desc->is_nullable()); _current_return_columns[cid]->reserve(_opts.block_row_max); diff --git a/be/src/vec/columns/column_array.cpp b/be/src/vec/columns/column_array.cpp index 413c490b964f81..4afa62e36d3a1f 100644 --- a/be/src/vec/columns/column_array.cpp +++ b/be/src/vec/columns/column_array.cpp @@ -503,6 +503,38 @@ void ColumnArray::insert_indices_from(const IColumn& src, const int* indices_beg } } +Status ColumnArray::filter_by_selector(const 
uint16_t* sel, size_t sel_size, IColumn* col_ptr) { + auto to = reinterpret_cast(col_ptr); + auto& to_offsets = to->get_offsets(); + + size_t element_size = 0; + size_t max_offset = 0; + for (size_t i = 0; i < sel_size; ++i) { + element_size += size_at(sel[i]); + max_offset = std::max(max_offset, offset_at(sel[i])); + } + if (max_offset > std::numeric_limits::max()) { + return Status::IOError("array elements too large than uint16_t::max"); + } + + to_offsets.reserve(to_offsets.size() + sel_size); + auto nested_sel = std::make_unique(element_size); + size_t nested_sel_size = 0; + for (size_t i = 0; i < sel_size; ++i) { + auto row_off = offset_at(sel[i]); + auto row_size = size_at(sel[i]); + to_offsets.push_back(to_offsets.back() + row_size); + for (auto j = 0; j < row_size; ++j) { + nested_sel[nested_sel_size++] = row_off + j; + } + } + + if (nested_sel_size > 0) { + return data->filter_by_selector(nested_sel.get(), nested_sel_size, &to->get_data()); + } + return Status::OK(); +} + ColumnPtr ColumnArray::replicate(const IColumn::Offsets& replicate_offsets) const { if (replicate_offsets.empty()) return clone_empty(); diff --git a/be/src/vec/columns/column_array.h b/be/src/vec/columns/column_array.h index 043cf5f629213a..824cca8b2389db 100644 --- a/be/src/vec/columns/column_array.h +++ b/be/src/vec/columns/column_array.h @@ -176,6 +176,8 @@ class ColumnArray final : public COWHelper { offsets->clear(); } + Status filter_by_selector(const uint16_t* sel, size_t sel_size, IColumn* col_ptr) override; + private: WrappedPtr data; WrappedPtr offsets; diff --git a/regression-test/data/delete_p0/test_array_column_delete.out b/regression-test/data/delete_p0/test_array_column_delete.out new file mode 100644 index 00000000000000..8324b608c9502f --- /dev/null +++ b/regression-test/data/delete_p0/test_array_column_delete.out @@ -0,0 +1,5 @@ +-- This file is automatically generated. 
You should know what you did if you want to edit this +-- !sql -- +2 [12, 3] +3 [] + diff --git a/regression-test/suites/delete_p0/test_array_column_delete.groovy b/regression-test/suites/delete_p0/test_array_column_delete.groovy new file mode 100644 index 00000000000000..21455285a49f6d --- /dev/null +++ b/regression-test/suites/delete_p0/test_array_column_delete.groovy @@ -0,0 +1,29 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +suite("test_array_column_delete") { + def tableName = "test_array_column_delete" + + sql """ SET enable_vectorized_engine = TRUE; """ + sql "ADMIN SET FRONTEND CONFIG ('enable_array_type' = 'true')" + + sql """ DROP TABLE IF EXISTS ${tableName}; """ + sql """ CREATE TABLE ${tableName} (id INT NULL, c_array ARRAY NULL) ENGINE=OLAP DUPLICATE KEY(id) DISTRIBUTED BY HASH(id) BUCKETS 4 PROPERTIES ( "replication_allocation" = "tag.location.default: 1","in_memory" = "false","storage_format" = "V2") """ + sql """ insert into ${tableName} values(1, NULL),(2,[12,3]),(3,[]),(4,NULL),(5,NULL) """ + sql """ DELETE FROM ${tableName} WHERE c_array is NULL """ + qt_sql """ SELECT * FROM ${tableName} order by id """ +} From 4de58b0729206b25a358de8fb793be4e7b25ecdd Mon Sep 17 00:00:00 2001 From: cambyzju Date: Tue, 18 Oct 2022 09:38:29 +0800 Subject: [PATCH 2/2] refactor to improve performance --- be/src/olap/rowset/segment_v2/segment_iterator.cpp | 8 ++------ be/src/olap/schema.cpp | 11 +++++++---- be/src/olap/schema.h | 3 +-- 3 files changed, 10 insertions(+), 12 deletions(-) diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.cpp b/be/src/olap/rowset/segment_v2/segment_iterator.cpp index b1c033b1103e40..1f8e24a2b28be7 100644 --- a/be/src/olap/rowset/segment_v2/segment_iterator.cpp +++ b/be/src/olap/rowset/segment_v2/segment_iterator.cpp @@ -1080,13 +1080,9 @@ Status SegmentIterator::next_batch(vectorized::Block* block) { for (size_t i = 0; i < _schema.num_column_ids(); i++) { auto cid = _schema.column_id(i); auto column_desc = _schema.column(cid); - if (column_desc->type() == OLAP_FIELD_TYPE_ARRAY) { + if (_is_pred_column[cid]) { _current_return_columns[cid] = - Schema::get_data_type_ptr(*column_desc)->create_column(); - _current_return_columns[cid]->reserve(_opts.block_row_max); - } else if (_is_pred_column[cid]) { - _current_return_columns[cid] = Schema::get_predicate_column_nullable_ptr( - column_desc->type(), column_desc->is_nullable()); + 
Schema::get_predicate_column_nullable_ptr(*column_desc); _current_return_columns[cid]->reserve(_opts.block_row_max); } else if (i >= block->columns()) { // if i >= block->columns means the column and not the pred_column means `column i` is diff --git a/be/src/olap/schema.cpp b/be/src/olap/schema.cpp index d6352b0cd1e71a..01f03433179fc8 100644 --- a/be/src/olap/schema.cpp +++ b/be/src/olap/schema.cpp @@ -114,10 +114,13 @@ vectorized::DataTypePtr Schema::get_data_type_ptr(const Field& field) { return vectorized::DataTypeFactory::instance().create_data_type(field); } -vectorized::IColumn::MutablePtr Schema::get_predicate_column_nullable_ptr(FieldType type, - bool is_null) { - vectorized::IColumn::MutablePtr ptr = Schema::get_predicate_column_ptr(type); - if (is_null) { +vectorized::IColumn::MutablePtr Schema::get_predicate_column_nullable_ptr(const Field& field) { + if (UNLIKELY(field.type() == OLAP_FIELD_TYPE_ARRAY)) { + return get_data_type_ptr(field)->create_column(); + } + + vectorized::IColumn::MutablePtr ptr = Schema::get_predicate_column_ptr(field.type()); + if (field.is_nullable()) { return doris::vectorized::ColumnNullable::create(std::move(ptr), doris::vectorized::ColumnUInt8::create()); } diff --git a/be/src/olap/schema.h b/be/src/olap/schema.h index f3f09ffe3cadef..7c578f4f278e87 100644 --- a/be/src/olap/schema.h +++ b/be/src/olap/schema.h @@ -112,8 +112,7 @@ class Schema { static vectorized::IColumn::MutablePtr get_predicate_column_ptr(FieldType type); - static vectorized::IColumn::MutablePtr get_predicate_column_nullable_ptr(FieldType type, - bool is_null = false); + static vectorized::IColumn::MutablePtr get_predicate_column_nullable_ptr(const Field& field); const std::vector& columns() const { return _cols; }