From adf71e19982471db714eaf0aa221f815962c102d Mon Sep 17 00:00:00 2001 From: zzzxl1993 Date: Mon, 11 Nov 2024 23:31:50 +0800 Subject: [PATCH] [feature](function) add approx_top_sum aggregation function --- .../aggregate_functions/aggregate_function.h | 2 +- .../aggregate_function_approx_top.h | 80 ++++++ .../aggregate_function_approx_top_k.cpp | 48 +--- .../aggregate_function_approx_top_k.h | 71 +---- .../aggregate_function_approx_top_sum.cpp | 71 +++++ .../aggregate_function_approx_top_sum.h | 245 ++++++++++++++++++ .../aggregate_function_simple_factory.cpp | 2 + be/src/vec/exprs/vectorized_agg_fn.cpp | 10 +- be/test/common/space_saving_test.cpp | 22 +- .../catalog/BuiltinAggregateFunctions.java | 2 + .../expressions/functions/agg/ApproxTopK.java | 14 +- .../functions/agg/ApproxTopSum.java | 94 +++++++ .../visitor/AggregateFunctionVisitor.java | 5 + .../test_index_approx_top_k.out | 24 -- .../test_index_approx_top_sum.out | 109 ++++++++ .../test_index_approx_top_k.groovy | 8 - .../test_index_approx_top_sum.groovy | 153 +++++++++++ 17 files changed, 798 insertions(+), 162 deletions(-) create mode 100644 be/src/vec/aggregate_functions/aggregate_function_approx_top_sum.cpp create mode 100644 be/src/vec/aggregate_functions/aggregate_function_approx_top_sum.h create mode 100644 fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/agg/ApproxTopSum.java create mode 100644 regression-test/data/inverted_index_p0/test_index_approx_top_sum.out create mode 100644 regression-test/suites/inverted_index_p0/test_index_approx_top_sum.groovy diff --git a/be/src/vec/aggregate_functions/aggregate_function.h b/be/src/vec/aggregate_functions/aggregate_function.h index e9148716f99f35..32fc9d5efce771 100644 --- a/be/src/vec/aggregate_functions/aggregate_function.h +++ b/be/src/vec/aggregate_functions/aggregate_function.h @@ -43,7 +43,7 @@ class IDataType; struct AggregateFunctionAttr { bool enable_decimal256 {false}; - std::vector> column_infos; + std::vector column_names; }; template diff --git a/be/src/vec/aggregate_functions/aggregate_function_approx_top.h b/be/src/vec/aggregate_functions/aggregate_function_approx_top.h index 7885321bba3e11..399af84f43cf20 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_approx_top.h +++ b/be/src/vec/aggregate_functions/aggregate_function_approx_top.h @@ -18,12 +18,92 @@ #pragma once #include "vec/core/types.h" +#include "vec/data_types/data_type.h" +#include "vec/data_types/data_type_nullable.h" namespace doris::vectorized { class AggregateFunctionApproxTop { public: + AggregateFunctionApproxTop(const std::vector& column_names) + : _column_names(column_names) {} + + static int32_t is_valid_const_columns(const std::vector& is_const_columns) { + int32_t true_count = 0; + bool found_false_after_true = false; + for (int32_t i = is_const_columns.size() - 1; i >= 0; --i) { + if (is_const_columns[i]) { + true_count++; + if (found_false_after_true) { + return false; + } + } else { + if (true_count > 2) { + return false; + } + found_false_after_true = true; + } + } + if (true_count > 2) { + throw Exception(ErrorCode::INVALID_ARGUMENT, "Invalid is_const_columns configuration"); + } + return true_count; + } + +protected: + void lazy_init(const IColumn** columns, ssize_t row_num, + const DataTypes& argument_types) const { + auto get_param = [](size_t idx, const DataTypes& data_types, + const IColumn** columns) -> uint64_t { + const auto& data_type = data_types.at(idx); + const IColumn* column = columns[idx]; + + const auto* type = data_type.get(); + if (type->is_nullable()) { + type = assert_cast(type) + ->get_nested_type() + .get(); + } + int64_t value = 0; + WhichDataType which(type); + if (which.idx == TypeIndex::Int8) { + value = assert_cast(column) + ->get_element(0); + } else if (which.idx == TypeIndex::Int16) { + value = assert_cast(column) + ->get_element(0); + } else if (which.idx == TypeIndex::Int32) { + value = assert_cast(column) + ->get_element(0); + } + if (value <= 0) { + throw Exception(ErrorCode::INVALID_ARGUMENT, + "The parameter cannot be less than or equal to 0."); + } + return value; + }; + + _threshold = + std::min(get_param(_column_names.size(), argument_types, columns), (uint64_t)4096); + _reserved = std::min( + std::max(get_param(_column_names.size() + 1, argument_types, columns), _threshold), + (uint64_t)4096); + + if (_threshold == 0 || _reserved == 0 || _threshold > 4096 || _reserved > 4096) { + throw Exception(ErrorCode::INTERNAL_ERROR, + "approx_top_sum param error, _threshold: {}, _reserved: {}", _threshold, + _reserved); + } + + _init_flag = true; + } + static inline constexpr UInt64 TOP_K_MAX_SIZE = 0xFFFFFF; + + mutable std::vector _column_names; + mutable bool _init_flag = false; + mutable uint64_t _threshold = 10; + mutable uint64_t _reserved = 30; }; } // namespace doris::vectorized \ No newline at end of file diff --git a/be/src/vec/aggregate_functions/aggregate_function_approx_top_k.cpp b/be/src/vec/aggregate_functions/aggregate_function_approx_top_k.cpp index d6298881a90630..0aa7adc253da0f 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_approx_top_k.cpp +++ b/be/src/vec/aggregate_functions/aggregate_function_approx_top_k.cpp @@ -24,58 +24,16 @@ namespace doris::vectorized { -int32_t is_valid_const_columns(const std::vector& is_const_columns) { - int32_t true_count = 0; - bool found_false_after_true = false; - for (int32_t i = is_const_columns.size() - 1; i >= 0; --i) { - if (is_const_columns[i]) { - true_count++; - if (found_false_after_true) { - return false; - } - } else { - if (true_count > 2) { - return false; - } - found_false_after_true = true; - } - } - if (true_count > 2) { - throw Exception(ErrorCode::INVALID_ARGUMENT, "Invalid is_const_columns configuration"); - } - return true_count; -} - AggregateFunctionPtr create_aggregate_function_approx_top_k(const std::string& name, const DataTypes& argument_types, const bool result_is_nullable, const AggregateFunctionAttr& attr) { - if (argument_types.empty()) { + if (argument_types.size() < 3) { return nullptr; } - std::vector is_const_columns; - std::vector column_names; - for (const auto& [name, is_const] : attr.column_infos) { - is_const_columns.push_back(is_const); - if (!is_const) { - column_names.push_back(name); - } - } - - int32_t true_count = is_valid_const_columns(is_const_columns); - if (true_count == 0) { - return creator_without_type::create>( - argument_types, result_is_nullable, column_names); - } else if (true_count == 1) { - return creator_without_type::create>( - argument_types, result_is_nullable, column_names); - } else if (true_count == 2) { - return creator_without_type::create>( - argument_types, result_is_nullable, column_names); - } else { - return nullptr; - } + return creator_without_type::create( + argument_types, result_is_nullable, attr.column_names); } void register_aggregate_function_approx_top_k(AggregateFunctionSimpleFactory& factory) { diff --git a/be/src/vec/aggregate_functions/aggregate_function_approx_top_k.h b/be/src/vec/aggregate_functions/aggregate_function_approx_top_k.h index 7253ae8a96e200..6a9b1636fa0d94 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_approx_top_k.h +++ b/be/src/vec/aggregate_functions/aggregate_function_approx_top_k.h @@ -45,28 +45,25 @@ namespace doris::vectorized { -inline constexpr UInt64 TOP_K_MAX_SIZE = 0xFFFFFF; - struct AggregateFunctionTopKGenericData { using Set = SpaceSaving; Set value; }; -template class AggregateFunctionApproxTopK final : public IAggregateFunctionDataHelper>, + AggregateFunctionApproxTopK>, AggregateFunctionApproxTop { private: using State = AggregateFunctionTopKGenericData; public: - AggregateFunctionApproxTopK(std::vector column_names, + AggregateFunctionApproxTopK(const std::vector& column_names, const DataTypes& argument_types_) : IAggregateFunctionDataHelper>(argument_types_), - _column_names(std::move(column_names)) {} + AggregateFunctionApproxTopK>(argument_types_), + AggregateFunctionApproxTop(column_names) {} String get_name() const override { return "approx_top_k"; } @@ -141,7 +138,7 @@ class AggregateFunctionApproxTopK final void add(AggregateDataPtr __restrict place, const IColumn** columns, ssize_t row_num, Arena* arena) const override { if (!_init_flag) { - lazy_init(columns, row_num); + lazy_init(columns, row_num, this->get_argument_types()); } auto& set = this->data(place).value; @@ -227,64 +224,6 @@ class AggregateFunctionApproxTopK final std::string res = buffer.GetString(); data_to.insert_data(res.data(), res.size()); } - -private: - void lazy_init(const IColumn** columns, ssize_t row_num) const { - auto get_param = [](size_t idx, const DataTypes& data_types, - const IColumn** columns) -> uint64_t { - const auto& data_type = data_types.at(idx); - const IColumn* column = columns[idx]; - - const auto* type = data_type.get(); - if (type->is_nullable()) { - type = assert_cast(type) - ->get_nested_type() - .get(); - } - int64_t value = 0; - WhichDataType which(type); - if (which.idx == TypeIndex::Int8) { - value = assert_cast(column) - ->get_element(0); - } else if (which.idx == TypeIndex::Int16) { - value = assert_cast(column) - ->get_element(0); - } else if (which.idx == TypeIndex::Int32) { - value = assert_cast(column) - ->get_element(0); - } - if (value <= 0) { - throw Exception(ErrorCode::INVALID_ARGUMENT, - "The parameter cannot be less than or equal to 0."); - } - return value; - }; - - const auto& data_types = this->get_argument_types(); - if (ArgsSize == 1) { - _threshold = - std::min(get_param(_column_names.size(), data_types, columns), (uint64_t)1000); - } else if (ArgsSize == 2) { - _threshold = - std::min(get_param(_column_names.size(), data_types, columns), (uint64_t)1000); - _reserved = std::min( - std::max(get_param(_column_names.size() + 1, data_types, columns), _threshold), - (uint64_t)1000); - } - - if (_threshold == 0 || _reserved == 0 || _threshold > 1000 || _reserved > 1000) { - throw Exception(ErrorCode::INTERNAL_ERROR, - "approx_top_k param error, _threshold: {}, _reserved: {}", _threshold, - _reserved); - } - - _init_flag = true; - } - - mutable std::vector _column_names; - mutable bool _init_flag = false; - mutable uint64_t _threshold = 10; - mutable uint64_t _reserved = 300; }; } // namespace doris::vectorized \ No newline at end of file diff --git a/be/src/vec/aggregate_functions/aggregate_function_approx_top_sum.cpp b/be/src/vec/aggregate_functions/aggregate_function_approx_top_sum.cpp new file mode 100644 index 00000000000000..7325651d141c13 --- /dev/null +++ b/be/src/vec/aggregate_functions/aggregate_function_approx_top_sum.cpp @@ -0,0 +1,71 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "vec/aggregate_functions/aggregate_function_approx_top_sum.h" + +#include "common/exception.h" +#include "vec/aggregate_functions/aggregate_function_simple_factory.h" +#include "vec/aggregate_functions/helpers.h" +#include "vec/data_types/data_type.h" + +namespace doris::vectorized { + +template +AggregateFunctionPtr create_aggregate_function_multi_top_sum_impl( + const DataTypes& argument_types, const bool result_is_nullable, + const std::vector& column_names) { + if (N == argument_types.size() - 3) { + return creator_with_type_base::template create< + AggregateFunctionApproxTopSumSimple>(argument_types, result_is_nullable, + column_names); + } else { + return create_aggregate_function_multi_top_sum_impl( + argument_types, result_is_nullable, column_names); + } +} + +template <> +AggregateFunctionPtr create_aggregate_function_multi_top_sum_impl<0>( + const DataTypes& argument_types, const bool result_is_nullable, + const std::vector& column_names) { + return creator_with_type_base::template create< + AggregateFunctionApproxTopSumSimple>(argument_types, result_is_nullable, column_names); +} + +AggregateFunctionPtr create_aggregate_function_approx_top_sum(const std::string& name, + const DataTypes& argument_types, + const bool result_is_nullable, + const AggregateFunctionAttr& attr) { + if (argument_types.size() < 3) { + return nullptr; + } + + constexpr size_t max_param_value = 10; + if (argument_types.size() > max_param_value) { + throw Exception(ErrorCode::INTERNAL_ERROR, + "Argument types size exceeds the supported limit."); + } + + return create_aggregate_function_multi_top_sum_impl( + argument_types, result_is_nullable, attr.column_names); +} + +void register_aggregate_function_approx_top_sum(AggregateFunctionSimpleFactory& factory) { + factory.register_function_both("approx_top_sum", create_aggregate_function_approx_top_sum); +} + +} // namespace doris::vectorized \ No newline at end of file diff --git a/be/src/vec/aggregate_functions/aggregate_function_approx_top_sum.h b/be/src/vec/aggregate_functions/aggregate_function_approx_top_sum.h new file mode 100644 index 00000000000000..9b3ba6a965091a --- /dev/null +++ b/be/src/vec/aggregate_functions/aggregate_function_approx_top_sum.h @@ -0,0 +1,245 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include +#include + +#include +#include + +#include "vec/aggregate_functions/aggregate_function.h" +#include "vec/aggregate_functions/aggregate_function_approx_top.h" +#include "vec/columns/column.h" +#include "vec/columns/column_array.h" +#include "vec/columns/column_string.h" +#include "vec/columns/column_struct.h" +#include "vec/columns/column_vector.h" +#include "vec/columns/columns_number.h" +#include "vec/common/assert_cast.h" +#include "vec/common/space_saving.h" +#include "vec/common/string_ref.h" +#include "vec/core/types.h" +#include "vec/data_types/data_type_array.h" +#include "vec/data_types/data_type_ipv4.h" +#include "vec/data_types/data_type_struct.h" +#include "vec/io/io_helper.h" + +namespace doris::vectorized { + +struct AggregateFunctionTopKGenericData { + using Set = SpaceSaving; + + Set value; +}; + +template +class AggregateFunctionApproxTopSum final + : public IAggregateFunctionDataHelper>, + AggregateFunctionApproxTop { +private: + using State = AggregateFunctionTopKGenericData; + + using ResultDataType = DataTypeNumber; + using ColVecType = ColumnVector; + using ColVecResult = ColumnVector; + +public: + AggregateFunctionApproxTopSum(const std::vector& column_names, + const DataTypes& argument_types_) + : IAggregateFunctionDataHelper>( + argument_types_), + AggregateFunctionApproxTop(column_names) {} + + String get_name() const override { return "approx_top_sum"; } + + DataTypePtr get_return_type() const override { return std::make_shared(); } + + // Serializes the aggregate function's state (including the SpaceSaving structure and threshold) into a buffer. + void serialize(ConstAggregateDataPtr __restrict place, BufferWritable& buf) const override { + this->data(place).value.write(buf); + + write_var_uint(_column_names.size(), buf); + for (const auto& column_name : _column_names) { + write_string_binary(column_name, buf); + } + write_var_uint(_threshold, buf); + write_var_uint(_reserved, buf); + } + + // Deserializes the aggregate function's state from a buffer (including the SpaceSaving structure and threshold). + void deserialize(AggregateDataPtr __restrict place, BufferReadable& buf, + Arena* arena) const override { + auto readStringBinaryInto = [](Arena& arena, BufferReadable& buf) { + size_t size = 0; + read_var_uint(size, buf); + + if (UNLIKELY(size > DEFAULT_MAX_STRING_SIZE)) { + throw Exception(ErrorCode::INTERNAL_ERROR, "Too large string size."); + } + + char* data = arena.alloc(size); + buf.read(data, size); + + return StringRef(data, size); + }; + + auto& set = this->data(place).value; + set.clear(); + + size_t size = 0; + read_var_uint(size, buf); + if (UNLIKELY(size > TOP_K_MAX_SIZE)) { + throw Exception(ErrorCode::INTERNAL_ERROR, + "Too large size ({}) for aggregate function '{}' state (maximum is {})", + size, get_name(), TOP_K_MAX_SIZE); + } + + set.resize(size); + for (size_t i = 0; i < size; ++i) { + auto ref = readStringBinaryInto(*arena, buf); + uint64_t count = 0; + uint64_t error = 0; + read_var_uint(count, buf); + read_var_uint(error, buf); + set.insert(ref, count, error); + arena->rollback(ref.size); + } + + set.read_alpha_map(buf); + + uint64_t column_size = 0; + read_var_uint(column_size, buf); + _column_names.clear(); + for (uint64_t i = 0; i < column_size; i++) { + std::string column_name; + read_string_binary(column_name, buf); + _column_names.emplace_back(std::move(column_name)); + } + read_var_uint(_threshold, buf); + read_var_uint(_reserved, buf); + } + + // Adds a new row of data to the aggregate function (inserts a new value into the SpaceSaving structure). + void add(AggregateDataPtr __restrict place, const IColumn** columns, ssize_t row_num, + Arena* arena) const override { + if (!_init_flag) { + lazy_init(columns, row_num, this->get_argument_types()); + } + + auto& set = this->data(place).value; + if (set.capacity() != _reserved) { + set.resize(_reserved); + } + + auto all_serialize_value_into_arena = + [](size_t i, size_t keys_size, const IColumn** columns, Arena* arena) -> StringRef { + const char* begin = nullptr; + + size_t sum_size = 0; + for (size_t j = 0; j < keys_size; ++j) { + sum_size += columns[j]->serialize_value_into_arena(i, *arena, begin).size; + } + + return {begin, sum_size}; + }; + + StringRef str_serialized = + all_serialize_value_into_arena(row_num, _column_names.size(), columns, arena); + const auto& column = assert_cast( + *columns[_column_names.size() - 1]); + set.insert(str_serialized, TResult(column.get_data()[row_num])); + arena->rollback(str_serialized.size); + } + + void add_many(AggregateDataPtr __restrict place, const IColumn** columns, + std::vector& rows, Arena* arena) const override { + for (auto row : rows) { + add(place, columns, row, arena); + } + } + + void reset(AggregateDataPtr __restrict place) const override { + this->data(place).value.clear(); + } + + // Merges the state of another aggregate function into the current one (merges two SpaceSaving sets). + void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, + Arena*) const override { + auto& rhs_set = this->data(rhs).value; + if (!rhs_set.size()) { + return; + } + + auto& set = this->data(place).value; + if (set.capacity() != _reserved) { + set.resize(_reserved); + } + set.merge(rhs_set); + } + + void insert_result_into(ConstAggregateDataPtr __restrict place, IColumn& to) const override { + auto& data_to = assert_cast(to); + + const typename State::Set& set = this->data(place).value; + auto result_vec = set.top_k(_threshold); + + rapidjson::StringBuffer buffer; + rapidjson::PrettyWriter writer(buffer); + writer.StartArray(); + for (auto& result : result_vec) { + auto argument_types = this->get_argument_types(); + MutableColumns argument_columns(_column_names.size()); + for (size_t i = 0; i < _column_names.size(); ++i) { + argument_columns[i] = argument_types[i]->create_column(); + } + rapidjson::StringBuffer sub_buffer; + rapidjson::Writer sub_writer(sub_buffer); + sub_writer.StartObject(); + const char* begin = result.key.data; + for (size_t i = 0; i < _column_names.size(); i++) { + begin = argument_columns[i]->deserialize_and_insert_from_arena(begin); + std::string row_str = argument_types[i]->to_string(*argument_columns[i], 0); + sub_writer.Key(_column_names[i].data(), _column_names[i].size()); + sub_writer.String(row_str.data(), row_str.size()); + } + sub_writer.Key("sum"); + sub_writer.String(std::to_string(result.count).c_str()); + sub_writer.EndObject(); + writer.RawValue(sub_buffer.GetString(), sub_buffer.GetSize(), rapidjson::kObjectType); + } + writer.EndArray(); + std::string res = buffer.GetString(); + data_to.insert_data(res.data(), res.size()); + } +}; + +template +struct TopSumSimple { + using ResultType = T; + using AggregateDataType = AggregateFunctionTopKGenericData; + using Function = AggregateFunctionApproxTopSum; +}; + +template +using AggregateFunctionApproxTopSumSimple = typename TopSumSimple::Function; + +} // namespace doris::vectorized \ No newline at end of file diff --git a/be/src/vec/aggregate_functions/aggregate_function_simple_factory.cpp b/be/src/vec/aggregate_functions/aggregate_function_simple_factory.cpp index dd379a41b19983..90bdab70522cf1 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_simple_factory.cpp +++ b/be/src/vec/aggregate_functions/aggregate_function_simple_factory.cpp @@ -73,6 +73,7 @@ void register_aggregate_function_covar_samp(AggregateFunctionSimpleFactory& fact void register_aggregate_function_skewness(AggregateFunctionSimpleFactory& factory); void register_aggregate_function_kurtosis(AggregateFunctionSimpleFactory& factory); void register_aggregate_function_approx_top_k(AggregateFunctionSimpleFactory& factory); +void register_aggregate_function_approx_top_sum(AggregateFunctionSimpleFactory& factory); AggregateFunctionSimpleFactory& AggregateFunctionSimpleFactory::instance() { static std::once_flag oc; @@ -126,6 +127,7 @@ AggregateFunctionSimpleFactory& AggregateFunctionSimpleFactory::instance() { register_aggregate_function_skewness(instance); register_aggregate_function_kurtosis(instance); register_aggregate_function_approx_top_k(instance); + register_aggregate_function_approx_top_sum(instance); }); return instance; } diff --git a/be/src/vec/exprs/vectorized_agg_fn.cpp b/be/src/vec/exprs/vectorized_agg_fn.cpp index cac93a0c992727..5f181311e20e60 100644 --- a/be/src/vec/exprs/vectorized_agg_fn.cpp +++ b/be/src/vec/exprs/vectorized_agg_fn.cpp @@ -140,11 +140,11 @@ Status AggFnEvaluator::prepare(RuntimeState* state, const RowDescriptor& desc, child_expr_name.emplace_back(_input_exprs_ctx->root()->expr_name()); } - std::vector> column_infos; + std::vector column_names; for (const auto& expr_ctx : _input_exprs_ctxs) { const auto& root = expr_ctx->root(); - if (!root->expr_name().empty()) { - column_infos.emplace_back(root->expr_name(), root->is_constant()); + if (!root->expr_name().empty() && !root->is_constant()) { + column_names.emplace_back(root->expr_name()); } } @@ -212,13 +212,13 @@ Status AggFnEvaluator::prepare(RuntimeState* state, const RowDescriptor& desc, AggregateFunctionSimpleFactory::result_nullable_by_foreach(_data_type), state->be_exec_version(), {.enable_decimal256 = state->enable_decimal256(), - .column_infos = std::move(column_infos)}); + .column_names = std::move(column_names)}); } else { _function = AggregateFunctionSimpleFactory::instance().get( _fn.name.function_name, argument_types, _data_type->is_nullable(), state->be_exec_version(), {.enable_decimal256 = state->enable_decimal256(), - .column_infos = std::move(column_infos)}); + .column_names = std::move(column_names)}); } } if (_function == nullptr) { diff --git a/be/test/common/space_saving_test.cpp b/be/test/common/space_saving_test.cpp index 6b390017997982..c4bf943171895b 100644 --- a/be/test/common/space_saving_test.cpp +++ b/be/test/common/space_saving_test.cpp @@ -31,13 +31,13 @@ namespace doris::vectorized { -class SapceSavingTest : public testing::Test { +class SpaceSavingTest : public testing::Test { public: void SetUp() override {} void TearDown() override {} - SapceSavingTest() = default; - ~SapceSavingTest() override = default; + SpaceSavingTest() = default; + ~SpaceSavingTest() override = default; }; int getDaySeed() { @@ -63,7 +63,7 @@ int32_t generateRandomNumber() { return rand() % 256; } -TEST_F(SapceSavingTest, test_space_saving_ip) { +TEST_F(SpaceSavingTest, test_space_saving_ip) { int seed = getDaySeed(); std::srand(seed); @@ -93,7 +93,7 @@ TEST_F(SapceSavingTest, test_space_saving_ip) { EXPECT_EQ(i, j); } -TEST_F(SapceSavingTest, test_space_saving_number) { +TEST_F(SpaceSavingTest, test_space_saving_number) { int seed = getDaySeed(); std::srand(seed); @@ -121,7 +121,7 @@ TEST_F(SapceSavingTest, test_space_saving_number) { EXPECT_EQ(i, j); } -TEST_F(SapceSavingTest, test_space_saving_merge) { +TEST_F(SpaceSavingTest, test_space_saving_merge) { int seed = getDaySeed(); std::srand(seed); @@ -193,7 +193,7 @@ TEST_F(SapceSavingTest, test_space_saving_merge) { } // Test inserting beyond capacity -TEST_F(SapceSavingTest, test_space_saving_exceed_capacity) { +TEST_F(SpaceSavingTest, test_space_saving_exceed_capacity) { int seed = getDaySeed(); std::srand(seed); @@ -208,7 +208,7 @@ TEST_F(SapceSavingTest, test_space_saving_exceed_capacity) { } // Test merging two SpaceSaving instances -TEST_F(SapceSavingTest, test_space_saving_merge_behavior) { +TEST_F(SpaceSavingTest, test_space_saving_merge_behavior) { int seed = getDaySeed(); std::srand(seed); @@ -231,7 +231,7 @@ TEST_F(SapceSavingTest, test_space_saving_merge_behavior) { } // Test that top_k returns elements in correct order -TEST_F(SapceSavingTest, test_space_saving_top_k_order) { +TEST_F(SpaceSavingTest, test_space_saving_top_k_order) { int seed = getDaySeed(); std::srand(seed); @@ -250,7 +250,7 @@ TEST_F(SapceSavingTest, test_space_saving_top_k_order) { } // Test that the merging does not lose counts -TEST_F(SapceSavingTest, test_space_saving_merge_counts) { +TEST_F(SpaceSavingTest, test_space_saving_merge_counts) { int seed = getDaySeed(); std::srand(seed); @@ -276,7 +276,7 @@ TEST_F(SapceSavingTest, test_space_saving_merge_counts) { } // Test with string keys and check behavior -TEST_F(SapceSavingTest, test_space_saving_string_keys) { +TEST_F(SpaceSavingTest, test_space_saving_string_keys) { int seed = getDaySeed(); std::srand(seed); diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinAggregateFunctions.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinAggregateFunctions.java index 22534930cbad35..1db9cf6477e8fb 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinAggregateFunctions.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinAggregateFunctions.java @@ -19,6 +19,7 @@ import org.apache.doris.nereids.trees.expressions.functions.agg.AnyValue; import org.apache.doris.nereids.trees.expressions.functions.agg.ApproxTopK; +import org.apache.doris.nereids.trees.expressions.functions.agg.ApproxTopSum; import org.apache.doris.nereids.trees.expressions.functions.agg.ArrayAgg; import org.apache.doris.nereids.trees.expressions.functions.agg.Avg; import org.apache.doris.nereids.trees.expressions.functions.agg.AvgWeighted; @@ -97,6 +98,7 @@ public class BuiltinAggregateFunctions implements FunctionHelper { public final List aggregateFunctions = ImmutableList.of( agg(AnyValue.class, "any", "any_value"), agg(ApproxTopK.class, "approx_top_k"), + agg(ApproxTopSum.class, "approx_top_sum"), agg(ArrayAgg.class, "array_agg"), agg(Avg.class, "avg"), agg(AvgWeighted.class, "avg_weighted"), diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/agg/ApproxTopK.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/agg/ApproxTopK.java index 56f580efdec61f..5fa37479a6185b 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/agg/ApproxTopK.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/agg/ApproxTopK.java @@ -55,9 +55,19 @@ public ApproxTopK(boolean distinct, boolean alwaysNullable, Expression... varArg @Override public void checkLegalityBeforeTypeCoercion() { - if (arity() < 1) { + if (arity() < 3) { throw new AnalysisException( - "Function requires at least 1 parameter: " + this.toSql()); + "Function requires at least 3 parameters: " + this.toSql()); + } + + if (!getArgument(arity() - 2).isConstant() || !getArgumentType(arity() - 2).isIntegerLikeType()) { + throw new AnalysisException( + "The second to last parameter must be a constant Integer Type: " + this.toSql()); + } + + if (!getArgument(arity() - 1).isConstant() || !getArgumentType(arity() - 1).isIntegerLikeType()) { + throw new AnalysisException( + "The last parameter must be a constant Integer Type: " + this.toSql()); } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/agg/ApproxTopSum.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/agg/ApproxTopSum.java new file mode 100644 index 00000000000000..517cb055aace62 --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/agg/ApproxTopSum.java @@ -0,0 +1,94 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.nereids.trees.expressions.functions.agg; + +import org.apache.doris.catalog.FunctionSignature; +import org.apache.doris.nereids.exceptions.AnalysisException; +import org.apache.doris.nereids.trees.expressions.Expression; +import org.apache.doris.nereids.trees.expressions.functions.ExplicitlyCastableSignature; +import org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor; +import org.apache.doris.nereids.types.StringType; +import org.apache.doris.nereids.types.coercion.AnyDataType; + +import com.google.common.base.Preconditions; +import com.google.common.collect.ImmutableList; + +import java.util.List; + +/** + * AggregateFunction 'approx_top_sum'. This class is generated by GenerateFunction. + */ +public class ApproxTopSum extends NullableAggregateFunction + implements ExplicitlyCastableSignature { + + public static final List SIGNATURES = ImmutableList.of( + FunctionSignature.ret(StringType.INSTANCE) + .varArgs(AnyDataType.INSTANCE_WITHOUT_INDEX) + ); + + public ApproxTopSum(Expression... varArgs) { + this(false, varArgs); + } + + public ApproxTopSum(boolean distinct, Expression... varArgs) { + this(distinct, false, varArgs); + } + + public ApproxTopSum(boolean distinct, boolean alwaysNullable, Expression... varArgs) { + super("approx_top_sum", distinct, alwaysNullable, varArgs); + } + + @Override + public void checkLegalityBeforeTypeCoercion() { + if (arity() < 3) { + throw new AnalysisException( + "Function requires at least 3 parameters: " + this.toSql()); + } + + if (!getArgument(arity() - 2).isConstant() || !getArgumentType(arity() - 2).isIntegerLikeType()) { + throw new AnalysisException( + "The second to last parameter must be a constant Integer Type: " + this.toSql()); + } + + if (!getArgument(arity() - 1).isConstant() || !getArgumentType(arity() - 1).isIntegerLikeType()) { + throw new AnalysisException( + "The last parameter must be a constant Integer Type: " + this.toSql()); + } + } + + @Override + public ApproxTopSum withDistinctAndChildren(boolean distinct, List children) { + Preconditions.checkArgument(children.size() >= 1); + return new ApproxTopSum(distinct, alwaysNullable, children.toArray(new Expression[0])); + } + + @Override + public NullableAggregateFunction withAlwaysNullable(boolean alwaysNullable) { + return new ApproxTopSum(distinct, alwaysNullable, children.toArray(new Expression[0])); + } + + @Override + public R accept(ExpressionVisitor visitor, C context) { + return visitor.visitApproxTopSum(this, context); + } + + @Override + public List getSignatures() { + return SIGNATURES; + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/AggregateFunctionVisitor.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/AggregateFunctionVisitor.java index 3a93a2164bff4f..50ca233cdbc703 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/AggregateFunctionVisitor.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/AggregateFunctionVisitor.java @@ -20,6 +20,7 @@ import org.apache.doris.nereids.trees.expressions.functions.agg.AggregateFunction; import org.apache.doris.nereids.trees.expressions.functions.agg.AnyValue; import org.apache.doris.nereids.trees.expressions.functions.agg.ApproxTopK; +import org.apache.doris.nereids.trees.expressions.functions.agg.ApproxTopSum; import org.apache.doris.nereids.trees.expressions.functions.agg.ArrayAgg; import org.apache.doris.nereids.trees.expressions.functions.agg.Avg; import org.apache.doris.nereids.trees.expressions.functions.agg.AvgWeighted; @@ -364,4 +365,8 @@ default R visitApproxTopK(ApproxTopK approxTopK, C context) { return visitNullableAggregateFunction(approxTopK, context); } + default R visitApproxTopSum(ApproxTopSum approxTopSum, C context) { + return visitNullableAggregateFunction(approxTopSum, context); + } + } diff --git a/regression-test/data/inverted_index_p0/test_index_approx_top_k.out b/regression-test/data/inverted_index_p0/test_index_approx_top_k.out index 0ed4fc0879eaf0..4e7ac3cd07ce21 100644 --- a/regression-test/data/inverted_index_p0/test_index_approx_top_k.out +++ b/regression-test/data/inverted_index_p0/test_index_approx_top_k.out @@ -20,12 +20,6 @@ -- !sql -- [\n {"clientip":"141.78.0.0","count":"67"},\n {"clientip":"47.135.0.0","count":"62"},\n {"clientip":"12.0.0.0","count":"54"},\n {"clientip":"47.0.0.0","count":"44"},\n {"clientip":"23.0.0.0","count":"40"},\n {"clientip":"44.135.0.0","count":"37"},\n {"clientip":"8.0.0.0","count":"31"},\n {"clientip":"45.135.0.0","count":"29"},\n {"clientip":"3.0.0.0","count":"29"},\n {"clientip":"65.0.0.0","count":"27"}\n] --- !sql -- -[\n {"clientip":"141.78.0.0","count":"67"},\n {"clientip":"47.135.0.0","count":"62"},\n {"clientip":"12.0.0.0","count":"54"},\n {"clientip":"47.0.0.0","count":"44"},\n {"clientip":"23.0.0.0","count":"40"},\n {"clientip":"44.135.0.0","count":"37"},\n {"clientip":"8.0.0.0","count":"31"},\n {"clientip":"45.135.0.0","count":"29"},\n {"clientip":"3.0.0.0","count":"29"},\n {"clientip":"65.0.0.0","count":"27"}\n] - --- !sql -- -[\n {"clientip":"141.78.0.0","count":"67"},\n {"clientip":"47.135.0.0","count":"62"},\n {"clientip":"12.0.0.0","count":"54"},\n {"clientip":"47.0.0.0","count":"44"},\n {"clientip":"23.0.0.0","count":"40"},\n {"clientip":"44.135.0.0","count":"37"},\n {"clientip":"8.0.0.0","count":"31"},\n {"clientip":"45.135.0.0","count":"29"},\n {"clientip":"3.0.0.0","count":"29"},\n {"clientip":"65.0.0.0","count":"27"}\n] - -- !sql -- 47.0.0.0 304 0 41 44.135.0.0 304 0 25 @@ -47,12 +41,6 @@ -- !sql -- [\n {"clientip":"47.0.0.0","status":"304","size":"0","count":"41"},\n {"clientip":"44.135.0.0","status":"304","size":"0","count":"25"},\n {"clientip":"141.78.0.0","status":"304","size":"0","count":"24"},\n {"clientip":"3.0.0.0","status":"304","size":"0","count":"20"},\n {"clientip":"55.0.0.0","status":"200","size":"985","count":"12"},\n {"clientip":"247.37.0.0","status":"304","size":"0","count":"8"},\n {"clientip":"141.78.0.0","status":"200","size":"169","count":"8"},\n {"clientip":"8.0.0.0","status":"304","size":"0","count":"6"},\n {"clientip":"46.0.0.0","status":"304","size":"0","count":"6"},\n {"clientip":"41.0.0.0","status":"304","size":"0","count":"5"}\n] --- !sql -- -[\n {"clientip":"47.0.0.0","status":"304","size":"0","count":"41"},\n {"clientip":"44.135.0.0","status":"304","size":"0","count":"25"},\n {"clientip":"141.78.0.0","status":"304","size":"0","count":"24"},\n {"clientip":"3.0.0.0","status":"304","size":"0","count":"20"},\n {"clientip":"55.0.0.0","status":"200","size":"985","count":"12"},\n {"clientip":"247.37.0.0","status":"304","size":"0","count":"8"},\n {"clientip":"141.78.0.0","status":"200","size":"169","count":"8"},\n {"clientip":"8.0.0.0","status":"304","size":"0","count":"6"},\n {"clientip":"46.0.0.0","status":"304","size":"0","count":"6"},\n {"clientip":"41.0.0.0","status":"304","size":"0","count":"5"}\n] - --- !sql -- -[\n {"clientip":"47.0.0.0","status":"304","size":"0","count":"41"},\n {"clientip":"44.135.0.0","status":"304","size":"0","count":"25"},\n {"clientip":"141.78.0.0","status":"304","size":"0","count":"24"},\n {"clientip":"3.0.0.0","status":"304","size":"0","count":"20"},\n {"clientip":"55.0.0.0","status":"200","size":"985","count":"12"},\n {"clientip":"247.37.0.0","status":"304","size":"0","count":"8"},\n {"clientip":"141.78.0.0","status":"200","size":"169","count":"8"},\n {"clientip":"8.0.0.0","status":"304","size":"0","count":"6"},\n {"clientip":"46.0.0.0","status":"304","size":"0","count":"6"},\n {"clientip":"41.0.0.0","status":"304","size":"0","count":"5"}\n] - -- !sql -- {"clientip":"47.0.0.0","status":"304","size":"0","count":"41"} {"clientip":"44.135.0.0","status":"304","size":"0","count":"25"} @@ -86,12 +74,6 @@ -- !sql -- [\n {"clientip":"141.78.0.0","count":"67"},\n {"clientip":"47.135.0.0","count":"62"},\n {"clientip":"12.0.0.0","count":"54"},\n {"clientip":"47.0.0.0","count":"44"},\n {"clientip":"23.0.0.0","count":"40"},\n {"clientip":"44.135.0.0","count":"37"},\n {"clientip":"8.0.0.0","count":"31"},\n {"clientip":"45.135.0.0","count":"29"},\n {"clientip":"3.0.0.0","count":"29"},\n {"clientip":"65.0.0.0","count":"27"}\n] --- !sql -- -[\n {"clientip":"141.78.0.0","count":"67"},\n {"clientip":"47.135.0.0","count":"62"},\n {"clientip":"12.0.0.0","count":"54"},\n {"clientip":"47.0.0.0","count":"44"},\n {"clientip":"23.0.0.0","count":"40"},\n {"clientip":"44.135.0.0","count":"37"},\n {"clientip":"8.0.0.0","count":"31"},\n {"clientip":"45.135.0.0","count":"29"},\n {"clientip":"3.0.0.0","count":"29"},\n {"clientip":"65.0.0.0","count":"27"}\n] - --- !sql -- -[\n {"clientip":"141.78.0.0","count":"67"},\n {"clientip":"47.135.0.0","count":"62"},\n {"clientip":"12.0.0.0","count":"54"},\n {"clientip":"47.0.0.0","count":"44"},\n {"clientip":"23.0.0.0","count":"40"},\n {"clientip":"44.135.0.0","count":"37"},\n {"clientip":"8.0.0.0","count":"31"},\n {"clientip":"45.135.0.0","count":"29"},\n {"clientip":"3.0.0.0","count":"29"},\n {"clientip":"65.0.0.0","count":"27"}\n] - -- !sql -- 47.0.0.0 304 0 41 44.135.0.0 304 0 25 @@ -113,12 +95,6 @@ -- !sql -- [\n {"clientip":"47.0.0.0","status":"304","size":"0","count":"41"},\n {"clientip":"44.135.0.0","status":"304","size":"0","count":"25"},\n {"clientip":"141.78.0.0","status":"304","size":"0","count":"24"},\n {"clientip":"3.0.0.0","status":"304","size":"0","count":"20"},\n {"clientip":"55.0.0.0","status":"200","size":"985","count":"12"},\n {"clientip":"247.37.0.0","status":"304","size":"0","count":"8"},\n {"clientip":"141.78.0.0","status":"200","size":"169","count":"8"},\n {"clientip":"8.0.0.0","status":"304","size":"0","count":"6"},\n {"clientip":"46.0.0.0","status":"304","size":"0","count":"6"},\n {"clientip":"41.0.0.0","status":"304","size":"0","count":"5"}\n] --- !sql -- -[\n {"clientip":"47.0.0.0","status":"304","size":"0","count":"41"},\n {"clientip":"44.135.0.0","status":"304","size":"0","count":"25"},\n {"clientip":"141.78.0.0","status":"304","size":"0","count":"24"},\n {"clientip":"3.0.0.0","status":"304","size":"0","count":"20"},\n {"clientip":"55.0.0.0","status":"200","size":"985","count":"12"},\n {"clientip":"247.37.0.0","status":"304","size":"0","count":"8"},\n {"clientip":"141.78.0.0","status":"200","size":"169","count":"8"},\n {"clientip":"8.0.0.0","status":"304","size":"0","count":"6"},\n {"clientip":"46.0.0.0","status":"304","size":"0","count":"6"},\n {"clientip":"41.0.0.0","status":"304","size":"0","count":"5"}\n] - --- !sql -- -[\n {"clientip":"47.0.0.0","status":"304","size":"0","count":"41"},\n {"clientip":"44.135.0.0","status":"304","size":"0","count":"25"},\n {"clientip":"141.78.0.0","status":"304","size":"0","count":"24"},\n {"clientip":"3.0.0.0","status":"304","size":"0","count":"20"},\n {"clientip":"55.0.0.0","status":"200","size":"985","count":"12"},\n {"clientip":"247.37.0.0","status":"304","size":"0","count":"8"},\n {"clientip":"141.78.0.0","status":"200","size":"169","count":"8"},\n {"clientip":"8.0.0.0","status":"304","size":"0","count":"6"},\n {"clientip":"46.0.0.0","status":"304","size":"0","count":"6"},\n {"clientip":"41.0.0.0","status":"304","size":"0","count":"5"}\n] - -- !sql -- {"clientip":"47.0.0.0","status":"304","size":"0","count":"41"} {"clientip":"44.135.0.0","status":"304","size":"0","count":"25"} diff --git a/regression-test/data/inverted_index_p0/test_index_approx_top_sum.out b/regression-test/data/inverted_index_p0/test_index_approx_top_sum.out new file mode 100644 index 00000000000000..2711e78af2c500 --- /dev/null +++ b/regression-test/data/inverted_index_p0/test_index_approx_top_sum.out @@ -0,0 +1,109 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !sql -- +24736 2448864 +60349 482792 +33665 201990 +8712 78408 +8389 67112 +25676 51352 +12800 51200 +16606 49818 +49256 49256 +15609 46827 + +-- !sql -- +[\n {"size":"24736","sum":"2448864"},\n {"size":"60349","sum":"482792"},\n {"size":"33665","sum":"201990"},\n {"size":"8712","sum":"78408"},\n {"size":"8389","sum":"67112"},\n {"size":"25676","sum":"51352"},\n {"size":"12800","sum":"51200"},\n {"size":"16606","sum":"49818"},\n {"size":"49256","sum":"49256"},\n {"size":"15609","sum":"46827"}\n] + +-- !sql -- +[\n {"size":"24736","sum":"2448864"},\n {"size":"60349","sum":"482792"},\n {"size":"33665","sum":"201990"},\n {"size":"8712","sum":"78408"},\n {"size":"8389","sum":"67112"},\n {"size":"25676","sum":"51352"},\n {"size":"12800","sum":"51200"},\n {"size":"16606","sum":"49818"},\n {"size":"49256","sum":"49256"},\n {"size":"15609","sum":"46827"}\n] + +-- !sql -- +[\n {"size":"24736","sum":"2448864"},\n {"size":"60349","sum":"482792"},\n {"size":"33665","sum":"201990"},\n {"size":"8712","sum":"78408"},\n {"size":"8389","sum":"67112"},\n {"size":"25676","sum":"51352"},\n {"size":"12800","sum":"51200"},\n {"size":"16606","sum":"49818"},\n {"size":"49256","sum":"49256"},\n {"size":"15609","sum":"46827"}\n] + +-- !sql -- +232.0.0.0 200 24736 74208 +40.135.0.0 200 24736 74208 +0.0.0.0 200 60349 60349 +1.0.0.0 200 60349 60349 +23.0.0.0 200 60349 60349 +32.0.0.0 200 60349 60349 +56.0.0.0 200 60349 60349 +64.0.0.0 200 60349 60349 +66.0.0.0 200 60349 60349 +73.0.0.0 200 60349 60349 + +-- !sql -- +[\n {"clientip":"40.135.0.0","status":"200","size":"24736","sum":"74208"},\n {"clientip":"232.0.0.0","status":"200","size":"24736","sum":"74208"},\n {"clientip":"0.0.0.0","status":"200","size":"60349","sum":"60349"},\n {"clientip":"1.0.0.0","status":"200","size":"60349","sum":"60349"},\n {"clientip":"23.0.0.0","status":"200","size":"60349","sum":"60349"},\n {"clientip":"32.0.0.0","status":"200","size":"60349","sum":"60349"},\n {"clientip":"56.0.0.0","status":"200","size":"60349","sum":"60349"},\n {"clientip":"64.0.0.0","status":"200","size":"60349","sum":"60349"},\n {"clientip":"73.0.0.0","status":"200","size":"60349","sum":"60349"},\n {"clientip":"66.0.0.0","status":"200","size":"60349","sum":"60349"}\n] + +-- !sql -- +[\n {"clientip":"40.135.0.0","status":"200","size":"24736","sum":"74208"},\n {"clientip":"232.0.0.0","status":"200","size":"24736","sum":"74208"},\n {"clientip":"0.0.0.0","status":"200","size":"60349","sum":"60349"},\n {"clientip":"1.0.0.0","status":"200","size":"60349","sum":"60349"},\n {"clientip":"23.0.0.0","status":"200","size":"60349","sum":"60349"},\n {"clientip":"32.0.0.0","status":"200","size":"60349","sum":"60349"},\n {"clientip":"56.0.0.0","status":"200","size":"60349","sum":"60349"},\n {"clientip":"64.0.0.0","status":"200","size":"60349","sum":"60349"},\n {"clientip":"73.0.0.0","status":"200","size":"60349","sum":"60349"},\n {"clientip":"66.0.0.0","status":"200","size":"60349","sum":"60349"}\n] + +-- !sql -- +[\n {"clientip":"40.135.0.0","status":"200","size":"24736","sum":"74208"},\n {"clientip":"232.0.0.0","status":"200","size":"24736","sum":"74208"},\n {"clientip":"0.0.0.0","status":"200","size":"60349","sum":"60349"},\n {"clientip":"1.0.0.0","status":"200","size":"60349","sum":"60349"},\n {"clientip":"23.0.0.0","status":"200","size":"60349","sum":"60349"},\n {"clientip":"32.0.0.0","status":"200","size":"60349","sum":"60349"},\n {"clientip":"56.0.0.0","status":"200","size":"60349","sum":"60349"},\n {"clientip":"64.0.0.0","status":"200","size":"60349","sum":"60349"},\n {"clientip":"73.0.0.0","status":"200","size":"60349","sum":"60349"},\n {"clientip":"66.0.0.0","status":"200","size":"60349","sum":"60349"}\n] + +-- !sql -- +{"clientip":"40.135.0.0","status":"200","size":"24736","sum":"74208"} +{"clientip":"232.0.0.0","status":"200","size":"24736","sum":"74208"} +{"clientip":"0.0.0.0","status":"200","size":"60349","sum":"60349"} +{"clientip":"1.0.0.0","status":"200","size":"60349","sum":"60349"} +{"clientip":"23.0.0.0","status":"200","size":"60349","sum":"60349"} +{"clientip":"32.0.0.0","status":"200","size":"60349","sum":"60349"} +{"clientip":"56.0.0.0","status":"200","size":"60349","sum":"60349"} +{"clientip":"64.0.0.0","status":"200","size":"60349","sum":"60349"} +{"clientip":"73.0.0.0","status":"200","size":"60349","sum":"60349"} +{"clientip":"66.0.0.0","status":"200","size":"60349","sum":"60349"} + +-- !sql -- +24736 2448864 +60349 482792 +33665 201990 +8712 78408 +8389 67112 +25676 51352 +12800 51200 +16606 49818 +49256 49256 +15609 46827 + +-- !sql -- +[\n {"size":"24736","sum":"2448864"},\n {"size":"60349","sum":"482792"},\n {"size":"33665","sum":"201990"},\n {"size":"8712","sum":"78408"},\n {"size":"8389","sum":"67112"},\n {"size":"25676","sum":"51352"},\n {"size":"12800","sum":"51200"},\n {"size":"16606","sum":"49818"},\n {"size":"49256","sum":"49256"},\n {"size":"15609","sum":"46827"}\n] + +-- !sql -- +[\n {"size":"24736","sum":"2448864"},\n {"size":"60349","sum":"482792"},\n {"size":"33665","sum":"201990"},\n {"size":"8712","sum":"78408"},\n {"size":"8389","sum":"67112"},\n {"size":"25676","sum":"51352"},\n {"size":"12800","sum":"51200"},\n {"size":"16606","sum":"49818"},\n {"size":"49256","sum":"49256"},\n {"size":"15609","sum":"46827"}\n] + +-- !sql -- +[\n {"size":"24736","sum":"2448864"},\n {"size":"60349","sum":"482792"},\n {"size":"33665","sum":"201990"},\n {"size":"8712","sum":"78408"},\n {"size":"8389","sum":"67112"},\n {"size":"25676","sum":"51352"},\n {"size":"12800","sum":"51200"},\n {"size":"16606","sum":"49818"},\n {"size":"49256","sum":"49256"},\n {"size":"15609","sum":"46827"}\n] + +-- !sql -- +232.0.0.0 200 24736 74208 +40.135.0.0 200 24736 74208 +0.0.0.0 200 60349 60349 +1.0.0.0 200 60349 60349 +23.0.0.0 200 60349 60349 +32.0.0.0 200 60349 60349 +56.0.0.0 200 60349 60349 +64.0.0.0 200 60349 60349 +66.0.0.0 200 60349 60349 +73.0.0.0 200 60349 60349 + +-- !sql -- +[\n {"clientip":"40.135.0.0","status":"200","size":"24736","sum":"74208"},\n {"clientip":"232.0.0.0","status":"200","size":"24736","sum":"74208"},\n {"clientip":"0.0.0.0","status":"200","size":"60349","sum":"60349"},\n {"clientip":"1.0.0.0","status":"200","size":"60349","sum":"60349"},\n {"clientip":"23.0.0.0","status":"200","size":"60349","sum":"60349"},\n {"clientip":"32.0.0.0","status":"200","size":"60349","sum":"60349"},\n {"clientip":"56.0.0.0","status":"200","size":"60349","sum":"60349"},\n {"clientip":"64.0.0.0","status":"200","size":"60349","sum":"60349"},\n {"clientip":"73.0.0.0","status":"200","size":"60349","sum":"60349"},\n {"clientip":"66.0.0.0","status":"200","size":"60349","sum":"60349"}\n] + +-- !sql -- +[\n {"clientip":"40.135.0.0","status":"200","size":"24736","sum":"74208"},\n {"clientip":"232.0.0.0","status":"200","size":"24736","sum":"74208"},\n {"clientip":"0.0.0.0","status":"200","size":"60349","sum":"60349"},\n {"clientip":"1.0.0.0","status":"200","size":"60349","sum":"60349"},\n {"clientip":"23.0.0.0","status":"200","size":"60349","sum":"60349"},\n {"clientip":"32.0.0.0","status":"200","size":"60349","sum":"60349"},\n {"clientip":"56.0.0.0","status":"200","size":"60349","sum":"60349"},\n {"clientip":"64.0.0.0","status":"200","size":"60349","sum":"60349"},\n {"clientip":"73.0.0.0","status":"200","size":"60349","sum":"60349"},\n {"clientip":"66.0.0.0","status":"200","size":"60349","sum":"60349"}\n] + +-- !sql -- +[\n {"clientip":"40.135.0.0","status":"200","size":"24736","sum":"74208"},\n {"clientip":"232.0.0.0","status":"200","size":"24736","sum":"74208"},\n {"clientip":"0.0.0.0","status":"200","size":"60349","sum":"60349"},\n {"clientip":"1.0.0.0","status":"200","size":"60349","sum":"60349"},\n {"clientip":"23.0.0.0","status":"200","size":"60349","sum":"60349"},\n {"clientip":"32.0.0.0","status":"200","size":"60349","sum":"60349"},\n {"clientip":"56.0.0.0","status":"200","size":"60349","sum":"60349"},\n {"clientip":"64.0.0.0","status":"200","size":"60349","sum":"60349"},\n {"clientip":"73.0.0.0","status":"200","size":"60349","sum":"60349"},\n {"clientip":"66.0.0.0","status":"200","size":"60349","sum":"60349"}\n] + +-- !sql -- +{"clientip":"40.135.0.0","status":"200","size":"24736","sum":"74208"} +{"clientip":"232.0.0.0","status":"200","size":"24736","sum":"74208"} +{"clientip":"0.0.0.0","status":"200","size":"60349","sum":"60349"} +{"clientip":"1.0.0.0","status":"200","size":"60349","sum":"60349"} +{"clientip":"23.0.0.0","status":"200","size":"60349","sum":"60349"} +{"clientip":"32.0.0.0","status":"200","size":"60349","sum":"60349"} +{"clientip":"56.0.0.0","status":"200","size":"60349","sum":"60349"} +{"clientip":"64.0.0.0","status":"200","size":"60349","sum":"60349"} +{"clientip":"73.0.0.0","status":"200","size":"60349","sum":"60349"} +{"clientip":"66.0.0.0","status":"200","size":"60349","sum":"60349"} + diff --git a/regression-test/suites/inverted_index_p0/test_index_approx_top_k.groovy b/regression-test/suites/inverted_index_p0/test_index_approx_top_k.groovy index 638d2316c463a0..4dffd80be86e16 100644 --- a/regression-test/suites/inverted_index_p0/test_index_approx_top_k.groovy +++ b/regression-test/suites/inverted_index_p0/test_index_approx_top_k.groovy @@ -88,15 +88,11 @@ suite("test_index_approx_top_k", "p0"){ sql """ set debug_skip_fold_constant = true; """ qt_sql """ select clientip, count(*) as count from ${tableName} group by clientip order by count desc, clientip asc limit 10; """ - qt_sql """ select approx_top_k(clientip) from ${tableName}; """ - qt_sql """ select approx_top_k(clientip, 10) from ${tableName}; """ qt_sql """ select approx_top_k(clientip, 10, 300) from ${tableName}; """ qt_sql """ select approx_top_k(clientip, 5 + 5, 300) from ${tableName}; """ qt_sql """ select approx_top_k(clientip, abs(-10), 300) from ${tableName}; """ qt_sql """ select clientip, status, size, count(*) as count from ${tableName} group by clientip, status, size order by count desc, clientip asc limit 10; """ - qt_sql """ select approx_top_k(clientip, status, size) from ${tableName}; """ - qt_sql """ select approx_top_k(clientip, status, size, 10) from ${tableName}; """ qt_sql """ select approx_top_k(clientip, status, size, 10, 300) from ${tableName}; """ qt_sql """ select approx_top_k(clientip, status, size, 5 + 5, 300) from ${tableName}; """ qt_sql """ select approx_top_k(clientip, status, size, abs(-10), 300) from ${tableName}; """ @@ -123,15 +119,11 @@ suite("test_index_approx_top_k", "p0"){ sql """ set debug_skip_fold_constant = true; """ qt_sql """ select clientip, count(*) as count from ${tableName} group by clientip order by count desc, clientip asc limit 10; """ - qt_sql """ select approx_top_k(clientip) from ${tableName}; """ - qt_sql """ select approx_top_k(clientip, 10) from ${tableName}; """ qt_sql """ select approx_top_k(clientip, 10, 300) from ${tableName}; """ qt_sql """ select approx_top_k(clientip, 5 + 5, 300) from ${tableName}; """ qt_sql """ select approx_top_k(clientip, abs(-10), 300) from ${tableName}; """ qt_sql """ select clientip, status, size, count(*) as count from ${tableName} group by clientip, status, size order by count desc, clientip asc limit 10; """ - qt_sql """ select approx_top_k(clientip, status, size) from ${tableName}; """ - qt_sql """ select approx_top_k(clientip, status, size, 10) from ${tableName}; """ qt_sql """ select approx_top_k(clientip, status, size, 10, 300) from ${tableName}; """ qt_sql """ select approx_top_k(clientip, status, size, 5 + 5, 300) from ${tableName}; """ qt_sql """ select approx_top_k(clientip, status, size, abs(-10), 300) from ${tableName}; """ diff --git a/regression-test/suites/inverted_index_p0/test_index_approx_top_sum.groovy b/regression-test/suites/inverted_index_p0/test_index_approx_top_sum.groovy new file mode 100644 index 00000000000000..5d764f95b1a0ac --- /dev/null +++ b/regression-test/suites/inverted_index_p0/test_index_approx_top_sum.groovy @@ -0,0 +1,153 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + + +suite("test_index_approx_top_sum", "p0"){ + def tableName = "test_index_approx_top_sum" + + sql "DROP TABLE IF EXISTS ${tableName}" + + def create_table = {table_name -> + sql """ + CREATE TABLE ${table_name} ( + `@timestamp` int(11) NULL COMMENT "", + `clientip` text NULL COMMENT "", + `request` text NULL COMMENT "", + `status` int NULL COMMENT "", + `size` int NULL COMMENT "" + ) ENGINE=OLAP + DUPLICATE KEY(`@timestamp`) + COMMENT "OLAP" + DISTRIBUTED BY RANDOM BUCKETS 1 + PROPERTIES ( + "replication_allocation" = "tag.location.default: 1", + "disable_auto_compaction" = "true" + ); + """ + } + + def load_httplogs_data = {table_name, label, read_flag, format_flag, file_name, ignore_failure=false, + expected_succ_rows = -1, load_to_single_tablet = 'true' -> + + // load the json data + streamLoad { + table "${table_name}" + + // set http request header params + set 'label', label + "_" + UUID.randomUUID().toString() + set 'read_json_by_line', read_flag + set 'format', format_flag + file file_name // import json file + time 10000 // limit inflight 10s + if (expected_succ_rows >= 0) { + set 'max_filter_ratio', '1' + } + + // if declared a check callback, the default check condition will ignore. + // So you must check all condition + check { result, exception, startTime, endTime -> + if (ignore_failure && expected_succ_rows < 0) { return } + if (exception != null) { + throw exception + } + log.info("Stream load result: ${result}".toString()) + def json = parseJson(result) + assertEquals("success", json.Status.toLowerCase()) + if (expected_succ_rows >= 0) { + assertEquals(json.NumberLoadedRows, expected_succ_rows) + } else { + assertEquals(json.NumberTotalRows, json.NumberLoadedRows + json.NumberUnselectedRows) + assertTrue(json.NumberLoadedRows > 0 && json.LoadBytes > 0) + } + } + } + } + + try { + create_table(tableName) + + load_httplogs_data.call(tableName, 'test_index_approx_top_sum', 'true', 'json', 'documents-1000.json') + + sql "sync" + + sql """ set enable_common_expr_pushdown = true """ + + sql """ set debug_skip_fold_constant = true; """ + qt_sql """ select size, sum(size) as sum from ${tableName} group by size order by sum desc, size asc limit 10; """ + qt_sql """ select approx_top_sum(size, 10, 300) from ${tableName}; """ + qt_sql """ select approx_top_sum(size, 5 + 5, 300) from ${tableName}; """ + qt_sql """ select approx_top_sum(size, abs(-10), 300) from ${tableName}; """ + + qt_sql """ select clientip, status, size, sum(size) as sum from ${tableName} group by clientip, status, size order by sum desc, clientip asc limit 10; """ + qt_sql """ select approx_top_sum(clientip, status, size, 10, 300) from ${tableName}; """ + qt_sql """ select approx_top_sum(clientip, status, size, 5 + 5, 300) from ${tableName}; """ + qt_sql """ select approx_top_sum(clientip, status, size, abs(-10), 300) from ${tableName}; """ + + def result1 = "fail" + try { + drop_result = sql " select approx_top_sum(size, -10, 300) from ${tableName}; " + result1 = 'success' + } catch(Exception ex) { + logger.info("error msg: " + ex) + } + assertEquals(result1, 'fail') + + qt_sql """ + WITH tmp AS ( + SELECT approx_top_sum(clientip, status, size, 10, 300) AS json_output FROM ${tableName} + ) + SELECT + e1 + FROM + tmp + LATERAL VIEW explode_json_array_json(json_output) tmp1 AS e1; + """ + + sql """ set debug_skip_fold_constant = true; """ + qt_sql """ select size, sum(size) as sum from ${tableName} group by size order by sum desc, size asc limit 10; """ + qt_sql """ select approx_top_sum(size, 10, 300) from ${tableName}; """ + qt_sql """ select approx_top_sum(size, 5 + 5, 300) from ${tableName}; """ + qt_sql """ select approx_top_sum(size, abs(-10), 300) from ${tableName}; """ + + qt_sql """ select clientip, status, size, sum(size) as sum from ${tableName} group by clientip, status, size order by sum desc, clientip asc limit 10; """ + qt_sql """ select approx_top_sum(clientip, status, size, 10, 300) from ${tableName}; """ + qt_sql """ select approx_top_sum(clientip, status, size, 5 + 5, 300) from ${tableName}; """ + qt_sql """ select approx_top_sum(clientip, status, size, abs(-10), 300) from ${tableName}; """ + + def result2 = "fail" + try { + drop_result = sql " select approx_top_sum(size, -10, 300) from ${tableName}; " + result2 = 'success' + } catch(Exception ex) { + logger.info("error msg: " + ex) + } + assertEquals(result2, 'fail') + + qt_sql """ + WITH tmp AS ( + SELECT approx_top_sum(clientip, status, size, 10, 300) AS json_output FROM ${tableName} + ) + SELECT + e1 + FROM + tmp + LATERAL VIEW explode_json_array_json(json_output) tmp1 AS e1; + """ + } finally { + //try_sql("DROP TABLE IF EXISTS ${testTable}") + } +}