From 652f6a390337aa8f19e7c02f9fa452689fd52eaf Mon Sep 17 00:00:00 2001 From: Pxl Date: Wed, 8 Jan 2025 11:51:50 +0800 Subject: [PATCH 1/3] [Bug](join) fix columnstr64's offset overflow on serialize_value_into_arena (#46461) ```sql mysql [test]>select /*+ LEADING(a,b) */ count(*) from d_table as a, d_table2 as b where a.k4=b.k4 and a.k1=b.k1; +----------+ | count(*) | +----------+ | 50000000 | +----------+ 1 row in set (4.87 sec) mysql [test]>select /*+ LEADING(b,a) */ count(*) from d_table as a, d_table2 as b where a.k4=b.k4 and a.k1=b.k1; +----------+ | count(*) | +----------+ | 42949673 | +----------+ 1 row in set (21.32 sec) ``` None - Test - [x] Regression test - [ ] Unit Test - [ ] Manual test (add detailed scripts or steps below) - [ ] No need to test or manual test. Explain why: - [ ] This is a refactor/code format and no logic has been changed. - [ ] Previous test can cover this change. - [ ] No code files have been changed. - [ ] Other reason - Behavior changed: - [x] No. - [ ] Yes. - Does this need documentation? - [x] No. - [ ] Yes. - [ ] Confirm the release note - [ ] Confirm test cases - [ ] Confirm document - [ ] Add branch pick label --- be/src/vec/columns/column_string.cpp | 20 +++---- be/src/vec/columns/column_string.h | 10 +++- .../str64_serialize/str64_serialize.out | 7 +++ .../str64_serialize/str64_serialize.groovy | 57 +++++++++++++++++++ 4 files changed, 81 insertions(+), 13 deletions(-) create mode 100644 regression-test/data/query_p1/str64_serialize/str64_serialize.out create mode 100644 regression-test/suites/query_p1/str64_serialize/str64_serialize.groovy diff --git a/be/src/vec/columns/column_string.cpp b/be/src/vec/columns/column_string.cpp index 6eb3e45b2e015a..52484fdaef49a2 100644 --- a/be/src/vec/columns/column_string.cpp +++ b/be/src/vec/columns/column_string.cpp @@ -380,8 +380,8 @@ ColumnPtr ColumnStr::permute(const IColumn::Permutation& perm, size_t limit) template StringRef ColumnStr::serialize_value_into_arena(size_t n, Arena& arena, char const*& begin) const { - uint32_t string_size(size_at(n)); - uint32_t offset(offset_at(n)); + auto string_size(size_at(n)); + auto offset(offset_at(n)); StringRef res; res.size = sizeof(string_size) + string_size; @@ -395,7 +395,7 @@ StringRef ColumnStr::serialize_value_into_arena(size_t n, Arena& arena, template const char* ColumnStr::deserialize_and_insert_from_arena(const char* pos) { - const uint32_t string_size = unaligned_load(pos); + const auto string_size = unaligned_load(pos); pos += sizeof(string_size); const size_t old_size = chars.size(); @@ -413,7 +413,7 @@ size_t ColumnStr::get_max_row_byte_size() const { size_t max_size = 0; size_t num_rows = offsets.size(); for (size_t i = 0; i < num_rows; ++i) { - max_size = std::max(max_size, size_at(i)); + max_size = std::max(max_size, T(size_at(i))); } return max_size + sizeof(uint32_t); @@ -423,8 +423,8 @@ template void ColumnStr::serialize_vec(std::vector& keys, size_t num_rows, size_t max_row_byte_size) const { for (size_t i = 0; i < num_rows; ++i) { - uint32_t offset(offset_at(i)); - uint32_t string_size(size_at(i)); + auto offset(offset_at(i)); + auto string_size(size_at(i)); auto* ptr = const_cast(keys[i].data + keys[i].size); memcpy_fixed(ptr, (char*)&string_size); @@ -450,7 +450,7 @@ void ColumnStr::serialize_vec_with_null_map(std::vector& keys, siz UInt32 offset(offset_at(i)); UInt32 string_size(size_at(i)); - memcpy_fixed(dest + 1, (char*)&string_size); + memcpy_fixed(dest + 1, (char*)&string_size); memcpy(dest + 1 + sizeof(string_size), &chars[offset], string_size); keys[i].size += sizeof(string_size) + string_size + sizeof(UInt8); } else { @@ -467,7 +467,7 @@ void ColumnStr::serialize_vec_with_null_map(std::vector& keys, siz UInt32 offset(offset_at(i)); UInt32 string_size(size_at(i)); - memcpy_fixed(dest + 1, (char*)&string_size); + memcpy_fixed(dest + 1, (char*)&string_size); memcpy(dest + 1 + sizeof(string_size), &chars[offset], string_size); keys[i].size += sizeof(string_size) + string_size + sizeof(UInt8); } @@ -477,7 +477,7 @@ void ColumnStr::serialize_vec_with_null_map(std::vector& keys, siz template void ColumnStr::deserialize_vec(std::vector& keys, const size_t num_rows) { for (size_t i = 0; i != num_rows; ++i) { - auto original_ptr = keys[i].data; + const auto* original_ptr = keys[i].data; keys[i].data = deserialize_and_insert_from_arena(original_ptr); keys[i].size -= keys[i].data - original_ptr; } @@ -488,7 +488,7 @@ void ColumnStr::deserialize_vec_with_null_map(std::vector& keys, const size_t num_rows, const uint8_t* null_map) { for (size_t i = 0; i != num_rows; ++i) { if (null_map[i] == 0) { - auto original_ptr = keys[i].data; + const auto* original_ptr = keys[i].data; keys[i].data = deserialize_and_insert_from_arena(original_ptr); keys[i].size -= keys[i].data - original_ptr; } else { diff --git a/be/src/vec/columns/column_string.h b/be/src/vec/columns/column_string.h index 906f62b52aaca2..edbc63d2872bb3 100644 --- a/be/src/vec/columns/column_string.h +++ b/be/src/vec/columns/column_string.h @@ -29,6 +29,7 @@ #include #include +#include "common/cast_set.h" #include "common/compiler_util.h" // IWYU pragma: keep #include "common/exception.h" #include "common/status.h" @@ -87,8 +88,11 @@ class ColumnStr final : public COWHelper> { size_t ALWAYS_INLINE offset_at(ssize_t i) const { return offsets[i - 1]; } - /// Size of i-th element, including terminating zero. - size_t ALWAYS_INLINE size_at(ssize_t i) const { return offsets[i] - offsets[i - 1]; } + // Size of i-th element, including terminating zero. + // assume that the length of a single element is less than 32-bit + uint32_t ALWAYS_INLINE size_at(ssize_t i) const { + return uint32_t(offsets[i] - offsets[i - 1]); + } template struct less; @@ -373,7 +377,7 @@ class ColumnStr final : public COWHelper> { for (size_t i = start_index; i < start_index + num; i++) { int32_t codeword = data_array[i]; - auto& src = dict[codeword]; + const auto& src = dict[codeword]; memcpy(chars.data() + old_size, src.data, src.size); old_size += src.size; } diff --git a/regression-test/data/query_p1/str64_serialize/str64_serialize.out b/regression-test/data/query_p1/str64_serialize/str64_serialize.out new file mode 100644 index 00000000000000..99c168b99d4c2e --- /dev/null +++ b/regression-test/data/query_p1/str64_serialize/str64_serialize.out @@ -0,0 +1,7 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !test -- +50000000 + +-- !test -- +50000000 + diff --git a/regression-test/suites/query_p1/str64_serialize/str64_serialize.groovy b/regression-test/suites/query_p1/str64_serialize/str64_serialize.groovy new file mode 100644 index 00000000000000..b0e3ffa99e937f --- /dev/null +++ b/regression-test/suites/query_p1/str64_serialize/str64_serialize.groovy @@ -0,0 +1,57 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("str64_serialize") { + + sql """ DROP TABLE IF EXISTS d_table; """ + sql """ DROP TABLE IF EXISTS d_table2; """ + + + sql """ + create table d_table ( + k1 int null, + k2 int not null, + k3 bigint null, + k4 varchar(100) null + ) + duplicate key (k1,k2,k3) + distributed BY hash(k1) buckets 3 + properties("replication_num" = "1"); + """ + sql """ + create table d_table2 ( + k1 int null, + k2 int not null, + k3 bigint null, + k4 varchar(100) null + ) + duplicate key (k1,k2,k3) + distributed BY hash(k1) buckets 3 + properties("replication_num" = "1"); + """ + + sql """insert into d_table select 1,1,1,'1234567890abcdefghigalsdhaluihdicandejionxaoxwdeuhwenudzmwoedxneiowdxiowedjxneiowdjixoneiiexdnuiexef' from (select 1 k1) as t lateral view explode_numbers(50000000) tmp1 as e1; +""" + + sql """insert into d_table2 select 1,1,1,'1234567890abcdefghigalsdhaluihdicandejionxaoxwdeuhwenudzmwoedxneiowdxiowedjxneiowdjixoneiiexdnuiexef'; +""" + sql "set parallel_pipeline_task_num=1;" + + qt_test "select /*+ LEADING(a,b) */ count(*) from d_table as a, d_table2 as b where a.k4=b.k4 and a.k1=b.k1;" + qt_test "select /*+ LEADING(b,a) */ count(*) from d_table as a, d_table2 as b where a.k4=b.k4 and a.k1=b.k1;" +} + From b91d0b82871f63a32ccf3bb6f5f5ca06511f9bfe Mon Sep 17 00:00:00 2001 From: BiteTheDDDDt Date: Tue, 14 Jan 2025 10:53:59 +0800 Subject: [PATCH 2/3] fix --- be/src/vec/columns/column_string.h | 1 - 1 file changed, 1 deletion(-) diff --git a/be/src/vec/columns/column_string.h b/be/src/vec/columns/column_string.h index edbc63d2872bb3..e696b6f0764e28 100644 --- a/be/src/vec/columns/column_string.h +++ b/be/src/vec/columns/column_string.h @@ -29,7 +29,6 @@ #include #include -#include "common/cast_set.h" #include "common/compiler_util.h" // IWYU pragma: keep #include "common/exception.h" #include "common/status.h" From c04ab7e5b3e036a5a5ef9cfb6162dfb7e0ffc530 Mon Sep 17 00:00:00 2001 From: BiteTheDDDDt Date: Tue, 14 Jan 2025 12:41:36 +0800 Subject: [PATCH 3/3] fix --- be/src/vec/columns/column_string.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/be/src/vec/columns/column_string.cpp b/be/src/vec/columns/column_string.cpp index 52484fdaef49a2..8bcc6565467a63 100644 --- a/be/src/vec/columns/column_string.cpp +++ b/be/src/vec/columns/column_string.cpp @@ -413,7 +413,7 @@ size_t ColumnStr::get_max_row_byte_size() const { size_t max_size = 0; size_t num_rows = offsets.size(); for (size_t i = 0; i < num_rows; ++i) { - max_size = std::max(max_size, T(size_at(i))); + max_size = std::max(max_size, size_t(size_at(i))); } return max_size + sizeof(uint32_t);