From 545ee80c887c7164647719083208669173eb33bb Mon Sep 17 00:00:00 2001 From: lixueyan07 Date: Tue, 22 Sep 2020 16:43:19 +0800 Subject: [PATCH 1/2] update data types doc and fix some typo --- docs/.vuepress/sidebar/en.js | 3 +- docs/.vuepress/sidebar/zh-CN.js | 1 + .../sql-statements/Data Types/BITMAP.md | 48 ++++++++++++++++++ .../Data Types/HLL(HyperLogLog).md | 35 ------------- .../sql-statements/Data Types/HLL.md | 49 +++++++++++++++++++ .../sql-statements/Data Types/VARCHAR.md | 4 +- .../sql-statements/Data Types/BITMAP.md | 48 ++++++++++++++++++ .../sql-statements/Data Types/HLL.md | 20 ++++++-- .../sql-statements/Data Types/VARCHAR.md | 4 +- 9 files changed, 170 insertions(+), 42 deletions(-) create mode 100644 docs/en/sql-reference/sql-statements/Data Types/BITMAP.md delete mode 100644 docs/en/sql-reference/sql-statements/Data Types/HLL(HyperLogLog).md create mode 100644 docs/en/sql-reference/sql-statements/Data Types/HLL.md create mode 100644 docs/zh-CN/sql-reference/sql-statements/Data Types/BITMAP.md diff --git a/docs/.vuepress/sidebar/en.js b/docs/.vuepress/sidebar/en.js index 0d07657e08b09c..49572b780ebb9b 100644 --- a/docs/.vuepress/sidebar/en.js +++ b/docs/.vuepress/sidebar/en.js @@ -483,6 +483,7 @@ module.exports = [ directoryPath: "Data Types/", children: [ "BIGINT", + "BITMAP", "BOOLEAN", "CHAR", "DATE", @@ -490,7 +491,7 @@ module.exports = [ "DECIMAL", "DOUBLE", "FLOAT", - "HLL(HyperLogLog)", + "HLL", "INT", "SMALLINT", "TINYINT", diff --git a/docs/.vuepress/sidebar/zh-CN.js b/docs/.vuepress/sidebar/zh-CN.js index 2d694e79873f36..cf5301ccd22435 100644 --- a/docs/.vuepress/sidebar/zh-CN.js +++ b/docs/.vuepress/sidebar/zh-CN.js @@ -489,6 +489,7 @@ module.exports = [ directoryPath: "Data Types/", children: [ "BIGINT", + "BITMAP", "BOOLEAN", "CHAR", "DATE", diff --git a/docs/en/sql-reference/sql-statements/Data Types/BITMAP.md b/docs/en/sql-reference/sql-statements/Data Types/BITMAP.md new file mode 100644 index 00000000000000..f1c1774071b88c --- 
/dev/null +++ b/docs/en/sql-reference/sql-statements/Data Types/BITMAP.md @@ -0,0 +1,48 @@ +--- +{ + "title": "BITMAP", + "language": "en" +} +--- + + + +# BITMAP +## Description +BITMAP + +BITMAP cannot be used as a key column, and the aggregation type is BITMAP_UNION when building the table. +The user does not need to specify the length and default value. The length is controlled within the system according to the degree of data aggregation. +And the BITMAP column can only be queried or used by supporting functions such as bitmap_union_count, bitmap_union, and bitmap_hash. + +The use of BITMAP in offline scenarios will affect the import speed. In the case of a large amount of data, the query speed will be slower than HLL and better than Count Distinct. +In the real-time scene, BITMAP cannot achieve complete and accurate deduplication, and the error is usually less than one thousandth. + +## example + + select hour, BITMAP_UNION_COUNT(pv) over(order by hour) uv from( + select hour, BITMAP_UNION(device_id) as pv + from metric_table -- Query the accumulated UV per hour + where datekey=20200922 + group by hour order by 1 + ) final; + +## keyword +BITMAP diff --git a/docs/en/sql-reference/sql-statements/Data Types/HLL(HyperLogLog).md b/docs/en/sql-reference/sql-statements/Data Types/HLL(HyperLogLog).md deleted file mode 100644 index 7a511e288ebc95..00000000000000 --- a/docs/en/sql-reference/sql-statements/Data Types/HLL(HyperLogLog).md +++ /dev/null @@ -1,35 +0,0 @@ ---- -{ - "title": "HLL (Hyloglog)", - "language": "en" -} ---- - - - -#HLL (Hyloglog) -## Description -MARKETING (M) -A variable length string, M represents the length of a variable length string. The range of M is 1-16385. -Users do not need to specify length and default values. Length is controlled within the system according to the aggregation degree of data -And HLL columns can only be queried or used by matching hll_union_agg, hll_raw_agg, hll_cardinality, hll_hash. 
- -## keyword -High loglog, hll, hyloglog diff --git a/docs/en/sql-reference/sql-statements/Data Types/HLL.md b/docs/en/sql-reference/sql-statements/Data Types/HLL.md new file mode 100644 index 00000000000000..999a897a88f88a --- /dev/null +++ b/docs/en/sql-reference/sql-statements/Data Types/HLL.md @@ -0,0 +1,49 @@ +--- +{ + "title": "HLL (HyperLogLog)", + "language": "en" +} +--- + + + +# HLL (HyperLogLog) +## Description +HLL + +HLL cannot be used as a key column, and the aggregation type is HLL_UNION when creating the table. +The user does not need to specify the length and default value. +The length is controlled within the system according to the degree of data aggregation. +And HLL columns can only be queried or used through the matching hll_union_agg, hll_raw_agg, hll_cardinality, and hll_hash. + +HLL provides an approximate count of distinct elements, and its performance is better than Count Distinct when the amount of data is large. +The error of HLL is usually around 1%, sometimes up to 2%. + +## example + + select hour, HLL_UNION_AGG(pv) over(order by hour) uv from( + select hour, HLL_RAW_AGG(device_id) as pv + from metric_table -- Query the accumulated UV per hour + where datekey=20200922 + group by hour order by 1 + ) final; + +## keyword +HLL,HYPERLOGLOG diff --git a/docs/en/sql-reference/sql-statements/Data Types/VARCHAR.md b/docs/en/sql-reference/sql-statements/Data Types/VARCHAR.md index 268a56222df954..ae7fa8599d3e93 100644 --- a/docs/en/sql-reference/sql-statements/Data Types/VARCHAR.md +++ b/docs/en/sql-reference/sql-statements/Data Types/VARCHAR.md @@ -27,7 +27,9 @@ under the License. # VARCHAR ## Description MARKETING (M) -A variable length string, M represents the length of a variable length string. The range of M is 1-65535. +A variable length string, M represents the length of a variable length string. The range of M is 1-65533. 
+ +Note: Variable length strings are stored in UTF-8 encoding, so usually English characters occupy 1 byte, and Chinese characters occupy 3 bytes. ## keyword VARCHAR diff --git a/docs/zh-CN/sql-reference/sql-statements/Data Types/BITMAP.md b/docs/zh-CN/sql-reference/sql-statements/Data Types/BITMAP.md new file mode 100644 index 00000000000000..2482229bad3270 --- /dev/null +++ b/docs/zh-CN/sql-reference/sql-statements/Data Types/BITMAP.md @@ -0,0 +1,48 @@ +--- +{ + "title": "BITMAP", + "language": "zh-CN" +} +--- + + + +# BITMAP +## description + BITMAP + BITMAP不能作为key列使用,建表时配合聚合类型为BITMAP_UNION。 + 用户不需要指定长度和默认值。长度根据数据的聚合程度系统内控制。 + 并且BITMAP列只能通过配套的bitmap_union_count、bitmap_union、bitmap_hash等函数进行查询或使用。 + + 离线场景下使用BITMAP会影响导入速度,在数据量大的情况下查询速度会慢于HLL,并优于Count Distinct。 + 实时场景下BITMAP也不能做到完全的精确去重,通常误差小于千分之一。 + +## example + + select hour, BITMAP_UNION_COUNT(pv) over(order by hour) uv from( + select hour, BITMAP_UNION(device_id) as pv + from metric_table -- 查询每小时的累计UV + where datekey=20200622 + group by hour order by 1 + ) final; + +## keyword + + BITMAP diff --git a/docs/zh-CN/sql-reference/sql-statements/Data Types/HLL.md b/docs/zh-CN/sql-reference/sql-statements/Data Types/HLL.md index 7357f4e1e3130a..b261495d7ec384 100644 --- a/docs/zh-CN/sql-reference/sql-statements/Data Types/HLL.md +++ b/docs/zh-CN/sql-reference/sql-statements/Data Types/HLL.md @@ -26,10 +26,22 @@ under the License. 
# HLL(HyperLogLog) ## description - VARCHAR(M) - 变长字符串,M代表的是变长字符串的长度。M的范围是1-16385 - 用户不需要指定长度和默认值。长度根据数据的聚合程度系统内控制 - 并且HLL列只能通过配套的hll_union_agg、hll_raw_agg、hll_cardinality、hll_hash进行查询或使用 + HLL + HLL不能作为key列使用,建表时配合聚合类型为HLL_UNION。 + 用户不需要指定长度和默认值。长度根据数据的聚合程度系统内控制。 + 并且HLL列只能通过配套的hll_union_agg、hll_raw_agg、hll_cardinality、hll_hash进行查询或使用。 + + HLL是模糊去重,在数据量大的情况下性能优于Count Distinct。 + HLL的误差通常在1%左右,有时会达到2%。 + +## example + + select hour, HLL_UNION_AGG(pv) over(order by hour) uv from( + select hour, HLL_RAW_AGG(device_id) as pv + from metric_table -- 查询每小时的累计UV + where datekey=20200622 + group by hour order by 1 + ) final; ## keyword diff --git a/docs/zh-CN/sql-reference/sql-statements/Data Types/VARCHAR.md b/docs/zh-CN/sql-reference/sql-statements/Data Types/VARCHAR.md index 59416788a3ec14..178b56fb56f308 100644 --- a/docs/zh-CN/sql-reference/sql-statements/Data Types/VARCHAR.md +++ b/docs/zh-CN/sql-reference/sql-statements/Data Types/VARCHAR.md @@ -27,7 +27,9 @@ under the License. # VARCHAR ## description VARCHAR(M) - 变长字符串,M代表的是变长字符串的长度。M的范围是1-65535 + 变长字符串,M代表的是变长字符串的长度。M的范围是1-65533。 + + 注意:变长字符串是以UTF-8编码存储的,因此通常英文字符占1个字节,中文字符占3个字节。 ## keyword From 8d09aaab7c762e013b023a3c2141f47d00a230d7 Mon Sep 17 00:00:00 2001 From: lixueyan07 Date: Mon, 12 Oct 2020 14:13:25 +0800 Subject: [PATCH 2/2] update data types doc and fix some typo --- docs/en/sql-reference/sql-statements/Data Types/BITMAP.md | 2 +- docs/zh-CN/sql-reference/sql-statements/Data Types/BITMAP.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/en/sql-reference/sql-statements/Data Types/BITMAP.md b/docs/en/sql-reference/sql-statements/Data Types/BITMAP.md index f1c1774071b88c..29d8a75331e74b 100644 --- a/docs/en/sql-reference/sql-statements/Data Types/BITMAP.md +++ b/docs/en/sql-reference/sql-statements/Data Types/BITMAP.md @@ -33,7 +33,7 @@ The user does not need to specify the length and default value. 
The length is co And the BITMAP column can only be queried or used by supporting functions such as bitmap_union_count, bitmap_union, and bitmap_hash. The use of BITMAP in offline scenarios will affect the import speed. In the case of a large amount of data, the query speed will be slower than HLL and better than Count Distinct. -In the real-time scene, BITMAP cannot achieve complete and accurate deduplication, and the error is usually less than one thousandth. +Note: If BITMAP does not use a global dictionary in real-time scenarios, using bitmap_hash() may cause an error of about one-thousandth. ## example diff --git a/docs/zh-CN/sql-reference/sql-statements/Data Types/BITMAP.md b/docs/zh-CN/sql-reference/sql-statements/Data Types/BITMAP.md index 2482229bad3270..c92e20b99407b5 100644 --- a/docs/zh-CN/sql-reference/sql-statements/Data Types/BITMAP.md +++ b/docs/zh-CN/sql-reference/sql-statements/Data Types/BITMAP.md @@ -32,7 +32,7 @@ under the License. 并且BITMAP列只能通过配套的bitmap_union_count、bitmap_union、bitmap_hash等函数进行查询或使用。 离线场景下使用BITMAP会影响导入速度,在数据量大的情况下查询速度会慢于HLL,并优于Count Distinct。 - 实时场景下BITMAP也不能做到完全的精确去重,通常误差小于千分之一。 + 注意:实时场景下BITMAP如果不使用全局字典,使用了bitmap_hash()可能会导致有千分之一左右的误差。 ## example