diff --git a/be/src/exprs/hll_function.cpp b/be/src/exprs/hll_function.cpp index 5894d0c057795d..e91f947b345d28 100644 --- a/be/src/exprs/hll_function.cpp +++ b/be/src/exprs/hll_function.cpp @@ -48,6 +48,9 @@ void HllFunctions::hll_init(FunctionContext *, StringVal* dst) { dst->len = sizeof(HyperLogLog); dst->ptr = (uint8_t*)new HyperLogLog(); } +StringVal HllFunctions::empty_hll(FunctionContext* ctx) { + return AnyValUtil::from_string_temp(ctx, HyperLogLog::empty()); +} template void HllFunctions::hll_update(FunctionContext *, const T &src, StringVal* dst) { diff --git a/be/src/exprs/hll_function.h b/be/src/exprs/hll_function.h index e08cbff7bea716..da0e5e9d2bd9fd 100644 --- a/be/src/exprs/hll_function.h +++ b/be/src/exprs/hll_function.h @@ -26,6 +26,7 @@ class HllFunctions { public: static void init(); static StringVal hll_hash(FunctionContext* ctx, const StringVal& dest_base); + static StringVal empty_hll(FunctionContext* ctx); static void hll_init(FunctionContext*, StringVal* dst); template diff --git a/be/src/olap/hll.h b/be/src/olap/hll.h index 21807f4f075b69..e6c88d5e9ce1b9 100644 --- a/be/src/olap/hll.h +++ b/be/src/olap/hll.h @@ -103,6 +103,14 @@ class HyperLogLog { int64_t estimate_cardinality(); + static std::string empty() { + static HyperLogLog hll; + std::string buf; + buf.resize(HLL_EMPTY_SIZE); + hll.serialize((uint8_t*)buf.c_str()); + return buf; + } + // only for debug std::string to_string() { switch (_type) { diff --git a/docs/documentation/cn/sql-reference/sql-statements/Data Definition/HLL.md b/docs/documentation/cn/sql-reference/sql-statements/Data Definition/HLL.md index 16b40d7fbb444e..de26c468ac9840 100644 --- a/docs/documentation/cn/sql-reference/sql-statements/Data Definition/HLL.md +++ b/docs/documentation/cn/sql-reference/sql-statements/Data Definition/HLL.md @@ -18,6 +18,9 @@ HLL_HASH(column_name) 生成HLL列类型,用于insert或导入的时候,导入的使用见相关说明 + + EMPTY_HLL() + 生成空HLL列,用于insert或导入的时候补充默认值,导入的使用见相关说明 ## example 1. 
首先创建一张含有hll列的表 diff --git a/docs/documentation/cn/sql-reference/sql-statements/Data Manipulation/BROKER LOAD.md b/docs/documentation/cn/sql-reference/sql-statements/Data Manipulation/BROKER LOAD.md index fd1e4991ebe51c..5dc7ec024bc1c5 100644 --- a/docs/documentation/cn/sql-reference/sql-statements/Data Manipulation/BROKER LOAD.md +++ b/docs/documentation/cn/sql-reference/sql-statements/Data Manipulation/BROKER LOAD.md @@ -286,8 +286,8 @@ 7. 导入数据到含有HLL列的表,可以是表中的列或者数据里面的列 - 如果表中有三列分别是(id,v1,v2)。其中v1和v2列是hll列。导入的源文件有3列。则(column_list)中声明第一列为id,第二三列为一个临时命名的k1,k2。 - 在SET中必须给表中的hll列特殊声明 hll_hash。表中的v1列等于原始数据中的hll_hash(k1)列。 + 如果表中有四列分别是(id,v1,v2,v3)。其中v1、v2和v3列是hll列。导入的源文件有3列。则(column_list)中声明第一列为id,第二三列为一个临时命名的k1,k2。 + 在SET中必须给表中的hll列特殊声明 hll_hash。表中的v1列等于原始数据中的hll_hash(k1)列, 表中的v3列在原始数据中并没有对应的值,使用empty_hll补充默认值。 LOAD LABEL example_db.label7 ( DATA INFILE("hdfs://hdfs_host:hdfs_port/user/palo/data/input/file") @@ -297,7 +297,8 @@ (id, k1, k2) SET ( v1 = hll_hash(k1), - v2 = hll_hash(k2) + v2 = hll_hash(k2), + v3 = empty_hll() ) ) WITH BROKER hdfs ("username"="hdfs_user", "password"="hdfs_password"); diff --git a/docs/documentation/cn/sql-reference/sql-statements/Data Manipulation/STREAM LOAD.md b/docs/documentation/cn/sql-reference/sql-statements/Data Manipulation/STREAM LOAD.md index f1d02d6400b0b4..507373bf637100 100644 --- a/docs/documentation/cn/sql-reference/sql-statements/Data Manipulation/STREAM LOAD.md +++ b/docs/documentation/cn/sql-reference/sql-statements/Data Manipulation/STREAM LOAD.md @@ -90,8 +90,8 @@ 6. 使用streaming方式导入(用户是defalut_cluster中的) seq 1 10 | awk '{OFS="\t"}{print $1, $1 * 10}' | curl --location-trusted -u root -T - http://host:port/api/testDb/testTbl/_stream_load - 7. 导入含有HLL列的表,可以是表中的列或者数据中的列用于生成HLL列 - curl --location-trusted -u root -H "columns: k1, k2, v1=hll_hash(k1)" -T testData http://host:port/api/testDb/testTbl/_stream_load + 7. 
导入含有HLL列的表,可以是表中的列或者数据中的列用于生成HLL列,也可使用empty_hll补充数据中没有的列 + curl --location-trusted -u root -H "columns: k1, k2, v1=hll_hash(k1), v2=empty_hll()" -T testData http://host:port/api/testDb/testTbl/_stream_load 8. 导入数据进行严格模式过滤,并设置时区为 Africa/Abidjan curl --location-trusted -u root -H "strict_mode: true" -H "timezone: Africa/Abidjan" -T testData http://host:port/api/testDb/testTbl/_stream_load diff --git a/docs/documentation/en/sql-reference/sql-statements/Data Definition/HLL_EN.md b/docs/documentation/en/sql-reference/sql-statements/Data Definition/HLL_EN.md index 4061060964542b..b6c7044d037a1e 100644 --- a/docs/documentation/en/sql-reference/sql-statements/Data Definition/HLL_EN.md +++ b/docs/documentation/en/sql-reference/sql-statements/Data Definition/HLL_EN.md @@ -19,6 +19,9 @@ This function is used to estimate the cardinality of a single HLL sequence HLL_HASH(column_name) Generate HLL column types for insert or import, see the instructions for the use of imports +EMPTY_HLL() +Generate empty HLL column types for insert or import, see the instructions for the use of imports + ## example 1. First create a table with HLL columns create table test( diff --git a/docs/documentation/en/sql-reference/sql-statements/Data Manipulation/BROKER LOAD_EN.md b/docs/documentation/en/sql-reference/sql-statements/Data Manipulation/BROKER LOAD_EN.md index 7eefcfabde8322..2b820848de0785 100644 --- a/docs/documentation/en/sql-reference/sql-statements/Data Manipulation/BROKER LOAD_EN.md +++ b/docs/documentation/en/sql-reference/sql-statements/Data Manipulation/BROKER LOAD_EN.md @@ -302,9 +302,9 @@ 7. Load data into tables containing HLL columns, which can be columns in tables or columns in data - If there are three columns in the table (id, v1, v2). The V1 and V2 columns are HLL columns. The imported source file has three columns. Then (column_list) declares that the first column is id, and the second and third columns are temporarily named k1, k2. 
+ If there are four columns in the table (id, v1, v2, v3). The V1, V2 and V3 columns are HLL columns. The imported source file has three columns. Then (column_list) declares that the first column is id, and the second and third columns are temporarily named k1, k2. - In SET, the HLL column in the table must be specifically declared hll_hash. The V1 column in the table is equal to the hll_hash (k1) column in the original data. + In SET, the HLL column in the table must be specifically declared hll_hash. The V1 column in the table is equal to the hll_hash (k1) column in the original data. The v3 column in the table does not have a corresponding value in the original data, and empty_hll is used to supplement the default value. LOAD LABEL example_db.label7 ( @@ -315,7 +315,8 @@ (id, k1, k2) SET ( v1 = hll_hash(k1), - v2 = hll_hash(k2) + v2 = hll_hash(k2), + v3 = empty_hll() ) ) WITH BROKER hdfs ("username"="hdfs_user", "password"="hdfs_password"); diff --git a/docs/documentation/en/sql-reference/sql-statements/Data Manipulation/STREAM LOAD_EN.md b/docs/documentation/en/sql-reference/sql-statements/Data Manipulation/STREAM LOAD_EN.md index a294ae25a3a646..531c3bd3c83bc6 100644 --- a/docs/documentation/en/sql-reference/sql-statements/Data Manipulation/STREAM LOAD_EN.md +++ b/docs/documentation/en/sql-reference/sql-statements/Data Manipulation/STREAM LOAD_EN.md @@ -145,9 +145,9 @@ Where url is the url given by ErrorURL. ```Seq 1 10 | awk '{OFS="\t"}{print $1, $1 * 10}' | curl --location-trusted -u root -T - http://host:port/api/testDb/testTbl/_stream_load``` -7. load a table with HLL columns, which can be columns in the table or columns in the data used to generate HLL columns +7. 
load a table with HLL columns, which can be columns in the table or columns in the data used to generate HLL columns, you can also use empty_hll to supplement columns that are not in the data - ```Curl --location-trusted -u root -H "columns: k1, k2, v1=hll_hash(k1)" -T testData http://host:port/api/testDb/testTbl/_stream_load``` + ```Curl --location-trusted -u root -H "columns: k1, k2, v1=hll_hash(k1), v2=empty_hll()" -T testData http://host:port/api/testDb/testTbl/_stream_load``` 8. load data for strict mode filtering and set the time zone to Africa/Abidjan diff --git a/fe/src/main/java/org/apache/doris/planner/BrokerScanNode.java b/fe/src/main/java/org/apache/doris/planner/BrokerScanNode.java index 5aab4a8e4db71f..3dff76965766fc 100644 --- a/fe/src/main/java/org/apache/doris/planner/BrokerScanNode.java +++ b/fe/src/main/java/org/apache/doris/planner/BrokerScanNode.java @@ -277,9 +277,9 @@ private void finalizeParams(ParamCreateContext context) throws UserException, An + destSlotDesc.getColumn().getName() + "=hll_hash(xxx)"); } FunctionCallExpr fn = (FunctionCallExpr) expr; - if (!fn.getFnName().getFunction().equalsIgnoreCase("hll_hash")) { + if (!fn.getFnName().getFunction().equalsIgnoreCase("hll_hash") && !fn.getFnName().getFunction().equalsIgnoreCase("empty_hll")) { throw new AnalysisException("HLL column must use hll_hash function, like " - + destSlotDesc.getColumn().getName() + "=hll_hash(xxx)"); + + destSlotDesc.getColumn().getName() + "=hll_hash(xxx) or " + destSlotDesc.getColumn().getName() + "=empty_hll()"); } expr.setType(Type.HLL); } diff --git a/fe/src/main/java/org/apache/doris/planner/StreamLoadScanNode.java b/fe/src/main/java/org/apache/doris/planner/StreamLoadScanNode.java index 440b13a4df8b3e..d15e49667710b5 100644 --- a/fe/src/main/java/org/apache/doris/planner/StreamLoadScanNode.java +++ b/fe/src/main/java/org/apache/doris/planner/StreamLoadScanNode.java @@ -179,6 +179,7 @@ private void finalizeParams() throws UserException { } } } + // check 
hll_hash if (dstSlotDesc.getType().getPrimitiveType() == PrimitiveType.HLL) { if (!(expr instanceof FunctionCallExpr)) { @@ -186,9 +187,9 @@ private void finalizeParams() throws UserException { + dstSlotDesc.getColumn().getName() + "=hll_hash(xxx)"); } FunctionCallExpr fn = (FunctionCallExpr) expr; - if (!fn.getFnName().getFunction().equalsIgnoreCase("hll_hash")) { + if (!fn.getFnName().getFunction().equalsIgnoreCase("hll_hash") && !fn.getFnName().getFunction().equalsIgnoreCase("empty_hll")) { throw new AnalysisException("HLL column must use hll_hash function, like " - + dstSlotDesc.getColumn().getName() + "=hll_hash(xxx)"); + + dstSlotDesc.getColumn().getName() + "=hll_hash(xxx) or " + dstSlotDesc.getColumn().getName() + "=empty_hll()"); } expr.setType(Type.HLL); } diff --git a/gensrc/script/doris_builtins_functions.py b/gensrc/script/doris_builtins_functions.py index 9fc365107682d6..f8a87e6d54edac 100755 --- a/gensrc/script/doris_builtins_functions.py +++ b/gensrc/script/doris_builtins_functions.py @@ -587,6 +587,8 @@ '_ZN5doris12HllFunctions15hll_cardinalityEPN9doris_udf15FunctionContextERKNS1_9StringValE'], [['hll_hash'], 'VARCHAR', ['VARCHAR'], '_ZN5doris12HllFunctions8hll_hashEPN9doris_udf15FunctionContextERKNS1_9StringValE'], + [['empty_hll'], 'VARCHAR', [], + '_ZN5doris12HllFunctions9empty_hllEPN9doris_udf15FunctionContextE'], #bitmap function