diff --git a/be/src/vec/aggregate_functions/aggregate_function_histogram.h b/be/src/vec/aggregate_functions/aggregate_function_histogram.h index a1a8a346e9c405..19e718787530be 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_histogram.h +++ b/be/src/vec/aggregate_functions/aggregate_function_histogram.h @@ -28,6 +28,8 @@ #include #include +#include "common/exception.h" +#include "common/status.h" #include "vec/aggregate_functions/aggregate_function.h" #include "vec/aggregate_functions/aggregate_function_simple_factory.h" #include "vec/columns/column.h" @@ -57,12 +59,10 @@ template struct AggregateFunctionHistogramData { using ColVecType = std::conditional_t, ColumnDecimal, ColumnVector>; + const static size_t DEFAULT_BUCKET_NUM = 128; + const static size_t BUCKET_NUM_INIT_VALUE = 0; - void set_parameters(int input_max_num_buckets) { - if (input_max_num_buckets > 0) { - max_num_buckets = (size_t)input_max_num_buckets; - } - } + void set_parameters(size_t input_max_num_buckets) { max_num_buckets = input_max_num_buckets; } void reset() { ordered_map.clear(); } @@ -86,6 +86,8 @@ struct AggregateFunctionHistogramData { } void merge(const AggregateFunctionHistogramData& rhs) { + // if rhs.max_num_buckets == 0, it means the input block for serialization is all null + // we should discard this data, because histogram only fouce on the not-null data if (!rhs.max_num_buckets) { return; } @@ -104,7 +106,6 @@ struct AggregateFunctionHistogramData { void write(BufferWritable& buf) const { write_binary(max_num_buckets, buf); - size_t element_number = (size_t)ordered_map.size(); write_binary(element_number, buf); @@ -148,7 +149,13 @@ struct AggregateFunctionHistogramData { std::string get(const DataTypePtr& data_type) const { std::vector> buckets; rapidjson::StringBuffer buffer; - build_histogram(buckets, ordered_map, max_num_buckets); + // NOTE: We need an extral branch for to handle max_num_buckets == 0, + // when target column is nullable, and input block is all null, + // set_parameters will not be called because of the short-circuit in + // AggregateFunctionNullVariadicInline, so max_num_buckets will be 0 in this situation. + build_histogram( + buckets, ordered_map, + max_num_buckets == BUCKET_NUM_INIT_VALUE ? DEFAULT_BUCKET_NUM : max_num_buckets); histogram_to_json(buffer, buckets, data_type); return std::string(buffer.GetString()); } @@ -162,7 +169,7 @@ struct AggregateFunctionHistogramData { } private: - size_t max_num_buckets = 128; + size_t max_num_buckets = BUCKET_NUM_INIT_VALUE; std::map ordered_map; }; @@ -186,13 +193,17 @@ class AggregateFunctionHistogram final void add(AggregateDataPtr __restrict place, const IColumn** columns, size_t row_num, Arena* arena) const override { - if (columns[0]->is_null_at(row_num)) { - return; - } - - if (has_input_param) { - this->data(place).set_parameters( - assert_cast(columns[1])->get_element(row_num)); + if constexpr (has_input_param) { + Int32 input_max_num_buckets = + assert_cast(columns[1])->get_element(row_num); + if (input_max_num_buckets <= 0) { + throw doris::Exception(ErrorCode::INVALID_ARGUMENT, + "Invalid max_num_buckets {}, row_num {}", + input_max_num_buckets, row_num); + } + this->data(place).set_parameters(input_max_num_buckets); + } else { + this->data(place).set_parameters(Data::DEFAULT_BUCKET_NUM); } if constexpr (std::is_same_v) { diff --git a/regression-test/data/query_p0/sql_functions/aggregate_functions/test_aggregate_all_functions2.out b/regression-test/data/query_p0/sql_functions/aggregate_functions/test_aggregate_all_functions2.out index 26e3963e4418ca..c9c7cd167d2d70 100644 --- a/regression-test/data/query_p0/sql_functions/aggregate_functions/test_aggregate_all_functions2.out +++ b/regression-test/data/query_p0/sql_functions/aggregate_functions/test_aggregate_all_functions2.out @@ -118,3 +118,75 @@ -- !select_minmax4 -- 243 +-- !select_histogram_k0 -- +{"num_buckets":2,"buckets":[{"lower":"0","upper":"0","ndv":1,"count":7,"pre_sum":0},{"lower":"1","upper":"1","ndv":1,"count":8,"pre_sum":7}]} + +-- !select_histogram_k1 -- +{"num_buckets":1,"buckets":[{"lower":"1","upper":"15","ndv":15,"count":15,"pre_sum":0}]} + +-- !select_histogram_k2 -- +{"num_buckets":2,"buckets":[{"lower":"-32767","upper":"1985","ndv":9,"count":11,"pre_sum":0},{"lower":"1986","upper":"32767","ndv":5,"count":10,"pre_sum":11}]} + +-- !select_histogram_k3 -- +{"num_buckets":3,"buckets":[{"lower":"-2147483647","upper":"1001","ndv":3,"count":5,"pre_sum":0},{"lower":"1002","upper":"3021","ndv":2,"count":5,"pre_sum":5},{"lower":"5014","upper":"2147483647","ndv":3,"count":5,"pre_sum":10}]} + +-- !select_histogram_k4 -- +{"num_buckets":4,"buckets":[{"lower":"-9223372036854775807","upper":"123456","ndv":4,"count":5,"pre_sum":0},{"lower":"7210457","upper":"11011903","ndv":3,"count":5,"pre_sum":5},{"lower":"11011905","upper":"11011920","ndv":2,"count":3,"pre_sum":10},{"lower":"9223372036854775807","upper":"9223372036854775807","ndv":1,"count":2,"pre_sum":13}]} + +-- !select_histogram_k5 -- +{"num_buckets":5,"buckets":[{"lower":"-654.654","upper":"-0.123","ndv":3,"count":3,"pre_sum":0},{"lower":"0.000","upper":"0.666","ndv":2,"count":3,"pre_sum":3},{"lower":"3.141","upper":"123.123","ndv":3,"count":3,"pre_sum":6},{"lower":"243.325","upper":"1243.500","ndv":2,"count":3,"pre_sum":9},{"lower":"24453.325","upper":"604587.000","ndv":3,"count":3,"pre_sum":12}]} + +-- !select_histogram_k6 -- +{"num_buckets":2,"buckets":[{"lower":"false","upper":"false","ndv":1,"count":8,"pre_sum":0},{"lower":"true","upper":"true","ndv":1,"count":7,"pre_sum":8}]} + +-- !select_histogram_k7 -- +{"num_buckets":7,"buckets":[{"lower":"","upper":"du3lnvl","ndv":3,"count":3,"pre_sum":0},{"lower":"jiw3n4","upper":"lifsno","ndv":2,"count":2,"pre_sum":3},{"lower":"wangjuoo4","upper":"wangjuoo5","ndv":2,"count":3,"pre_sum":5},{"lower":"wangynnsf","upper":"wenlsfnl","ndv":2,"count":3,"pre_sum":8},{"lower":"yanavnd","upper":"yanavnd","ndv":1,"count":1,"pre_sum":11},{"lower":"yanvjldjlll","upper":"yanvjldjlll","ndv":1,"count":1,"pre_sum":12},{"lower":"yunlj8@nk","upper":"yunlj8@nk","ndv":1,"count":2,"pre_sum":13}]} + +-- !select_histogram_k10 -- +{"num_buckets":10,"buckets":[{"lower":"1901-12-31","upper":"1901-12-31","ndv":1,"count":1,"pre_sum":0},{"lower":"1988-03-21","upper":"1988-03-21","ndv":1,"count":1,"pre_sum":1},{"lower":"1989-03-21","upper":"1989-03-21","ndv":1,"count":2,"pre_sum":2},{"lower":"1991-08-11","upper":"1991-08-11","ndv":1,"count":2,"pre_sum":4},{"lower":"2012-03-14","upper":"2012-03-14","ndv":1,"count":1,"pre_sum":6},{"lower":"2014-11-11","upper":"2014-11-11","ndv":1,"count":1,"pre_sum":7},{"lower":"2015-01-01","upper":"2015-01-01","ndv":1,"count":1,"pre_sum":8},{"lower":"2015-04-02","upper":"2015-04-02","ndv":1,"count":4,"pre_sum":9},{"lower":"3124-10-10","upper":"3124-10-10","ndv":1,"count":1,"pre_sum":13},{"lower":"9999-12-12","upper":"9999-12-12","ndv":1,"count":1,"pre_sum":14}]} + +-- !select_histogram_k11 -- +{"num_buckets":9,"buckets":[{"lower":"1901-01-01 00:00:00","upper":"1901-01-01 00:00:00","ndv":1,"count":1,"pre_sum":0},{"lower":"1989-03-21 13:00:00","upper":"1989-03-21 13:00:00","ndv":1,"count":2,"pre_sum":1},{"lower":"1989-03-21 13:11:00","upper":"1989-03-21 13:11:00","ndv":1,"count":2,"pre_sum":3},{"lower":"2000-01-01 00:00:00","upper":"2000-01-01 00:00:00","ndv":1,"count":1,"pre_sum":5},{"lower":"2013-04-02 15:16:52","upper":"2013-04-02 15:16:52","ndv":1,"count":2,"pre_sum":6},{"lower":"2015-03-13 10:30:00","upper":"2015-03-13 10:30:00","ndv":1,"count":1,"pre_sum":8},{"lower":"2015-03-13 12:36:38","upper":"2015-03-13 12:36:38","ndv":1,"count":2,"pre_sum":9},{"lower":"2015-04-02 00:00:00","upper":"2015-04-02 00:00:00","ndv":1,"count":3,"pre_sum":11},{"lower":"9999-11-11 12:12:00","upper":"9999-11-11 12:12:00","ndv":1,"count":1,"pre_sum":14}]} + +-- !select_histogram_k12 -- +{"num_buckets":1,"buckets":[{"lower":"string12345","upper":"string12345","ndv":1,"count":15,"pre_sum":0}]} + +-- !select_histogram_k13 -- +{"num_buckets":13,"buckets":[{"lower":"-170141183460469231731687303715884105727","upper":"-20220101","ndv":2,"count":2,"pre_sum":0},{"lower":"-11011903","upper":"-2022","ndv":2,"count":2,"pre_sum":2},{"lower":"0","upper":"0","ndv":1,"count":1,"pre_sum":4},{"lower":"11011903","upper":"11011903","ndv":1,"count":1,"pre_sum":5},{"lower":"20220101","upper":"20220101","ndv":1,"count":1,"pre_sum":6},{"lower":"20220102","upper":"20220102","ndv":1,"count":1,"pre_sum":7},{"lower":"20220104","upper":"20220104","ndv":1,"count":1,"pre_sum":8},{"lower":"701411834604692317","upper":"701411834604692317","ndv":1,"count":1,"pre_sum":9},{"lower":"701411834604692317316873","upper":"701411834604692317316873","ndv":1,"count":1,"pre_sum":10},{"lower":"1701604692317316873037158","upper":"1701604692317316873037158","ndv":1,"count":1,"pre_sum":11},{"lower":"701411834604692317316873037158","upper":"701411834604692317316873037158","ndv":1,"count":1,"pre_sum":12},{"lower":"1701411834604692317316873037158","upper":"1701411834604692317316873037158","ndv":1,"count":1,"pre_sum":13},{"lower":"170141183460469231731687303715884105727","upper":"170141183460469231731687303715884105727","ndv":1,"count":1,"pre_sum":14}]} + +-- !select_histogram_k0_all_null -- +{"num_buckets":0,"buckets":[]} + +-- !select_histogram_k1_all_null -- +{"num_buckets":0,"buckets":[]} + +-- !select_histogram_k2 -- +{"num_buckets":2,"buckets":[{"lower":"10","upper":"12","ndv":3,"count":3,"pre_sum":0},{"lower":"13","upper":"15","ndv":3,"count":3,"pre_sum":3}]} + +-- !select_histogram_k3_all_null -- +{"num_buckets":0,"buckets":[]} + +-- !select_histogram_k4_all_null -- +{"num_buckets":0,"buckets":[]} + +-- !select_histogram_k5_all_null -- +{"num_buckets":0,"buckets":[]} + +-- !select_histogram_k6_all_null -- +{"num_buckets":0,"buckets":[]} + +-- !select_histogram_k7_all_null -- +{"num_buckets":0,"buckets":[]} + +-- !select_histogram_k10_all_null -- +{"num_buckets":0,"buckets":[]} + +-- !select_histogram_k11_all_null -- +{"num_buckets":0,"buckets":[]} + +-- !select_histogram_k12_all_null -- +{"num_buckets":0,"buckets":[]} + +-- !select_histogram_k13_all_null -- +{"num_buckets":0,"buckets":[]} + diff --git a/regression-test/suites/query_p0/sql_functions/aggregate_functions/test_aggregate_all_functions2.groovy b/regression-test/suites/query_p0/sql_functions/aggregate_functions/test_aggregate_all_functions2.groovy index 823a73e700d89d..03986516048c49 100644 --- a/regression-test/suites/query_p0/sql_functions/aggregate_functions/test_aggregate_all_functions2.groovy +++ b/regression-test/suites/query_p0/sql_functions/aggregate_functions/test_aggregate_all_functions2.groovy @@ -113,4 +113,99 @@ suite("test_aggregate_all_functions2") { qt_select_minmax2 """ select max_by(datekey,hour) from metric_table; """ qt_select_minmax3 """ select bitmap_to_string(max_by(device_id,hour)) from metric_table; """ qt_select_minmax4 """ select bitmap_to_string(min_by(device_id,hour)) from metric_table; """ + + sql """ + INSERT INTO baseall values + (NULL, NULL, 10, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL) + """ + + sql """ + INSERT INTO baseall values + (NULL, NULL, 11, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL) + """ + + sql """ + INSERT INTO baseall values + (NULL, NULL, 12, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL) + """ + + sql """ + INSERT INTO baseall values + (NULL, NULL, 13, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL) + """ + + sql """ + INSERT INTO baseall values + (NULL, NULL, 14, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL) + """ + + sql """ + INSERT INTO baseall values + (NULL, NULL, 15, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL) + """ + + qt_select_histogram_k0 """SELECT histogram(k0) FROM baseall""" + qt_select_histogram_k1 """SELECT histogram(k1, 1) FROM baseall""" + qt_select_histogram_k2 """SELECT histogram(k2, 2) FROM baseall""" + qt_select_histogram_k3 """SELECT histogram(k3, 3) FROM baseall""" + qt_select_histogram_k4 """SELECT histogram(k4, 4) FROM baseall""" + qt_select_histogram_k5 """SELECT histogram(k5, 5) FROM baseall""" + qt_select_histogram_k6 """SELECT histogram(k6, 6) FROM baseall""" + qt_select_histogram_k7 """SELECT histogram(k7, 7) FROM baseall""" + // the test case for double and float is removed, becase the result is not stable since we have + // 0 and -0 in column k8, both of them are valid but we can not make both of them stand in out file. +// qt_select_histogram_k8 """SELECT histogram(k8, 8) FROM baseall""" +// qt_select_histogram_k9 """SELECT histogram(k9, 9) FROM baseall""" + qt_select_histogram_k10 """SELECT histogram(k10, 10) FROM baseall""" + qt_select_histogram_k11 """SELECT histogram(k11, 11) FROM baseall""" + qt_select_histogram_k12 """SELECT histogram(k12, 12) FROM baseall""" + qt_select_histogram_k13 """SELECT histogram(k13, 13) FROM baseall""" + + sql """ + TRUNCATE TABLE baseall; + """ + + sql """ + INSERT INTO baseall values + (NULL, NULL, 10, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL) + """ + + sql """ + INSERT INTO baseall values + (NULL, NULL, 11, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL) + """ + + sql """ + INSERT INTO baseall values + (NULL, NULL, 12, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL) + """ + + sql """ + INSERT INTO baseall values + (NULL, NULL, 13, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL) + """ + + sql """ + INSERT INTO baseall values + (NULL, NULL, 14, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL) + """ + + sql """ + INSERT INTO baseall values + (NULL, NULL, 15, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL) + """ + qt_select_histogram_k0_all_null """SELECT histogram(k0) FROM baseall""" + qt_select_histogram_k1_all_null """SELECT histogram(k1, 1) FROM baseall""" + qt_select_histogram_k2 """SELECT histogram(k2, 2) FROM baseall""" + qt_select_histogram_k3_all_null """SELECT histogram(k3, 3) FROM baseall""" + qt_select_histogram_k4_all_null """SELECT histogram(k4, 4) FROM baseall""" + qt_select_histogram_k5_all_null """SELECT histogram(k5, 5) FROM baseall""" + qt_select_histogram_k6_all_null """SELECT histogram(k6, 6) FROM baseall""" + qt_select_histogram_k7_all_null """SELECT histogram(k7, 7) FROM baseall""" +// qt_select_histogram_k8_all_null """SELECT histogram(k8, 8) FROM baseall""" +// qt_select_histogram_k9_all_null """SELECT histogram(k9, 9) FROM baseall""" + qt_select_histogram_k10_all_null """SELECT histogram(k10, 10) FROM baseall""" + qt_select_histogram_k11_all_null """SELECT histogram(k11, 11) FROM baseall""" + qt_select_histogram_k12_all_null """SELECT histogram(k12, 12) FROM baseall""" + qt_select_histogram_k13_all_null """SELECT histogram(k13, 13) FROM baseall""" }