-
Notifications
You must be signed in to change notification settings - Fork 3.7k
Encapsulate HLL logic #1756
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Encapsulate HLL logic #1756
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,110 @@ | ||
| // Licensed to the Apache Software Foundation (ASF) under one | ||
| // or more contributor license agreements. See the NOTICE file | ||
| // distributed with this work for additional information | ||
| // regarding copyright ownership. The ASF licenses this file | ||
| // to you under the Apache License, Version 2.0 (the | ||
| // "License"); you may not use this file except in compliance | ||
| // with the License. You may obtain a copy of the License at | ||
| // | ||
| // http://www.apache.org/licenses/LICENSE-2.0 | ||
| // | ||
| // Unless required by applicable law or agreed to in writing, | ||
| // software distributed under the License is distributed on an | ||
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
| // KIND, either express or implied. See the License for the | ||
| // specific language governing permissions and limitations | ||
| // under the License. | ||
|
|
||
| #include "exprs/hll_function.h" | ||
|
|
||
| #include "exprs/anyval_util.h" | ||
| #include "util/hash_util.hpp" | ||
|
|
||
| namespace doris { | ||
|
|
||
| using doris_udf::BigIntVal; | ||
| using doris_udf::StringVal; | ||
|
|
||
| // Initialization entry point for the HLL functions; it currently performs no work. | ||
| void HllFunctions::init() {} | ||
|
|
||
| // Builds a HyperLogLog from a single string value and returns its serialized bytes. | ||
| // A non-null input is hashed with HashUtil::murmur_hash64A and wrapped in a HyperLogLog | ||
| // constructed from that hash; a null input yields a default-constructed HyperLogLog. | ||
| // The serialized buffer is handed back through AnyValUtil::from_string_temp(ctx, buf). | ||
| StringVal HllFunctions::hll_hash(FunctionContext* ctx, const StringVal& input) { | ||
| // Buffer sizes used for the two cases below (presumably the serialized sizes of a | ||
| // single-value and an empty HLL -- confirm against HyperLogLog::serialize). | ||
| const int HLL_SINGLE_VALUE_SIZE = 10; | ||
| const int HLL_EMPTY_SIZE = 1; | ||
| std::string buf; | ||
| std::unique_ptr<HyperLogLog> hll; | ||
| if (!input.is_null) { | ||
| uint64_t hash_value = HashUtil::murmur_hash64A(input.ptr, input.len, HashUtil::MURMUR_SEED); | ||
| hll.reset(new HyperLogLog(hash_value)); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. For this case we create HyperLogLog two times
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I will Fix |
||
| buf.resize(HLL_SINGLE_VALUE_SIZE); | ||
| } else { | ||
| hll.reset(new HyperLogLog()); | ||
| buf.resize(HLL_EMPTY_SIZE); | ||
| } | ||
| // Write through &buf[0] rather than casting away const from c_str(): the array returned | ||
| // by c_str() must not be modified. buf is non-empty here, so &buf[0] is valid. | ||
| hll->serialize(&buf[0]); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why not serialize(std::string*) ?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Because slice.data type is char* and many HLL methods use char*. |
||
| return AnyValUtil::from_string_temp(ctx, buf); | ||
| } | ||
|
|
||
| // Aggregate init: marks dst as non-null and points it at a newly allocated HyperLogLog. | ||
| // dst->ptr holds the object address itself and dst->len is sizeof(HyperLogLog). | ||
| // The object is released later by hll_finalize or hll_serialize. | ||
| void HllFunctions::hll_init(FunctionContext *, StringVal* dst) { | ||
| dst->is_null = false; | ||
| dst->len = sizeof(HyperLogLog); | ||
| auto* state = new HyperLogLog(); | ||
| dst->ptr = reinterpret_cast<uint8_t*>(state); | ||
| } | ||
|
|
||
| // Aggregate update: folds one input value into the HyperLogLog that dst->ptr points to. | ||
| // Null inputs, and values whose 64-bit murmur hash is 0, leave the state untouched. | ||
| template <typename T> | ||
| void HllFunctions::hll_update(FunctionContext *, const T &src, StringVal* dst) { | ||
| if (src.is_null) { | ||
| return; | ||
| } | ||
|
|
||
| uint64_t hashed = AnyValUtil::hash64_murmur(src, HashUtil::MURMUR_SEED); | ||
| if (hashed == 0) { | ||
| return; | ||
| } | ||
| HyperLogLog* state = reinterpret_cast<HyperLogLog*>(dst->ptr); | ||
| state->update(hashed); | ||
| } | ||
| // Aggregate merge: constructs a temporary HyperLogLog from the bytes at src.ptr and merges | ||
| // it into the HyperLogLog that dst->ptr points to. A null src is ignored, like in hll_update. | ||
| void HllFunctions::hll_merge(FunctionContext*, const StringVal &src, StringVal* dst) { | ||
| // A null value carries no usable buffer, so there is nothing to merge. | ||
| if (src.is_null) { | ||
| return; | ||
| } | ||
| HyperLogLog src_hll = HyperLogLog((char*)src.ptr); | ||
| auto* dst_hll = reinterpret_cast<HyperLogLog*>(dst->ptr); | ||
| dst_hll->merge(src_hll); | ||
| } | ||
|
|
||
| // Aggregate finalize: returns the cardinality estimate of the HyperLogLog that src.ptr | ||
| // points to, then deletes that object. src must not be used after this call. | ||
| BigIntVal HllFunctions::hll_finalize(FunctionContext*, const StringVal &src) { | ||
| HyperLogLog* state = reinterpret_cast<HyperLogLog*>(src.ptr); | ||
| BigIntVal cardinality(state->estimate_cardinality()); | ||
| delete state; | ||
| return cardinality; | ||
| } | ||
|
|
||
| // Scalar function: returns the cardinality estimate for one HLL value, or NULL for NULL input. | ||
| // Reuses the aggregate steps: init a state, merge the input into it, then finalize, | ||
| // which also frees the state allocated by hll_init. | ||
| BigIntVal HllFunctions::hll_cardinality(FunctionContext* ctx, const StringVal& input) { | ||
| if (!input.is_null) { | ||
| StringVal state; | ||
| hll_init(ctx, &state); | ||
| hll_merge(ctx, input, &state); | ||
| return hll_finalize(ctx, state); | ||
| } | ||
| return BigIntVal::null(); | ||
| } | ||
|
|
||
| // Serializes the HyperLogLog that src.ptr points to into a StringVal allocated through ctx, | ||
| // then deletes the HyperLogLog. src must not be used after this call. | ||
| StringVal HllFunctions::hll_serialize(FunctionContext *ctx, const StringVal &src) { | ||
| HyperLogLog* state = reinterpret_cast<HyperLogLog*>(src.ptr); | ||
| StringVal serialized(ctx, HLL_COLUMN_DEFAULT_LEN); | ||
| // The value returned by serialize() becomes the final length of the result. | ||
| int written = state->serialize((char*)serialized.ptr); | ||
| serialized.resize(ctx, written); | ||
| delete state; | ||
| return serialized; | ||
| } | ||
|
|
||
| // Explicit instantiations of hll_update for every supported input type. The template is | ||
| // defined in this file, so each specialization used elsewhere must be instantiated here. | ||
| template void HllFunctions::hll_update<BooleanVal>(FunctionContext*, const BooleanVal&, StringVal*); | ||
| template void HllFunctions::hll_update<TinyIntVal>(FunctionContext*, const TinyIntVal&, StringVal*); | ||
| template void HllFunctions::hll_update<SmallIntVal>(FunctionContext*, const SmallIntVal&, StringVal*); | ||
| template void HllFunctions::hll_update<IntVal>(FunctionContext*, const IntVal&, StringVal*); | ||
| template void HllFunctions::hll_update<BigIntVal>(FunctionContext*, const BigIntVal&, StringVal*); | ||
| template void HllFunctions::hll_update<FloatVal>(FunctionContext*, const FloatVal&, StringVal*); | ||
| template void HllFunctions::hll_update<DoubleVal>(FunctionContext*, const DoubleVal&, StringVal*); | ||
| template void HllFunctions::hll_update<StringVal>(FunctionContext*, const StringVal&, StringVal*); | ||
| template void HllFunctions::hll_update<DateTimeVal>(FunctionContext*, const DateTimeVal&, StringVal*); | ||
| template void HllFunctions::hll_update<LargeIntVal>(FunctionContext*, const LargeIntVal&, StringVal*); | ||
| template void HllFunctions::hll_update<DecimalVal>(FunctionContext*, const DecimalVal&, StringVal*); | ||
| template void HllFunctions::hll_update<DecimalV2Val>(FunctionContext*, const DecimalV2Val&, StringVal*); | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,44 @@ | ||
| // Licensed to the Apache Software Foundation (ASF) under one | ||
| // or more contributor license agreements. See the NOTICE file | ||
| // distributed with this work for additional information | ||
| // regarding copyright ownership. The ASF licenses this file | ||
| // to you under the Apache License, Version 2.0 (the | ||
| // "License"); you may not use this file except in compliance | ||
| // with the License. You may obtain a copy of the License at | ||
| // | ||
| // http://www.apache.org/licenses/LICENSE-2.0 | ||
| // | ||
| // Unless required by applicable law or agreed to in writing, | ||
| // software distributed under the License is distributed on an | ||
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
| // KIND, either express or implied. See the License for the | ||
| // specific language governing permissions and limitations | ||
| // under the License. | ||
|
|
||
| #ifndef DORIS_BE_SRC_QUERY_EXPRS_HLL_FUNCTION_H | ||
| #define DORIS_BE_SRC_QUERY_EXPRS_HLL_FUNCTION_H | ||
|
|
||
| #include "udf/udf.h" | ||
|
|
||
| namespace doris { | ||
|
|
||
| // Static entry points for the HyperLogLog (HLL) scalar and aggregate functions. | ||
| // In the aggregate functions, the StringVal state carries a pointer to a heap-allocated | ||
| // HyperLogLog object in its ptr field rather than serialized bytes. | ||
| class HllFunctions { | ||
| public: | ||
| // Initialization entry point. | ||
| static void init(); | ||
| // Returns the serialized HLL built from one string value. | ||
| static StringVal hll_hash(FunctionContext* ctx, const StringVal& input); | ||
| // Aggregate init: allocates the HyperLogLog state and stores it in dst. | ||
| static void hll_init(FunctionContext*, StringVal* dst); | ||
|
|
||
| // Aggregate update: adds the hash of src to the state in dst; null src is skipped. | ||
| template <typename T> | ||
| static void hll_update(FunctionContext*, const T& src, StringVal* dst); | ||
|
|
||
| // Aggregate merge: merges the HLL bytes in src into the state in dst. | ||
| static void hll_merge(FunctionContext*, const StringVal& src, StringVal* dst); | ||
|
|
||
| // Aggregate finalize: returns the cardinality estimate of the state in src and frees it. | ||
| static BigIntVal hll_finalize(FunctionContext*, const StringVal& src); | ||
|
|
||
| // Serializes the state in src into a new StringVal and frees the state. | ||
| static StringVal hll_serialize(FunctionContext* ctx, const StringVal& src); | ||
|
|
||
| // Scalar function: cardinality estimate of one HLL value; NULL for NULL input. | ||
| static BigIntVal hll_cardinality(FunctionContext* ctx, const StringVal& src); | ||
| }; | ||
| } | ||
|
|
||
| #endif |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We should use a named constant or a macro here rather than a magic number.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I see. I used a magic number only for simplicity,
because I think we can delete this class in Doris version 0.12.