From a233e0798de9069eda5b087a112741102779f0de Mon Sep 17 00:00:00 2001 From: AKIRA <33112463+Kikyou1997@users.noreply.github.com> Date: Tue, 5 Dec 2023 20:40:38 +0800 Subject: [PATCH] [fix](stats) truncate min/max if too long (#27955) For some string value the max/min might be a very long string which might take too many memory of FE, so we truncate to 1024 chars if it's too long --- .../doris/statistics/AnalysisManager.java | 4 +- .../doris/statistics/BaseAnalysisTask.java | 16 ++++---- .../doris/statistics/JdbcAnalysisTask.java | 4 +- .../doris/statistics/OlapAnalysisTask.java | 4 +- .../statistics/OlapAnalysisTaskTest.java | 39 +++++++++++++++++-- 5 files changed, 51 insertions(+), 16 deletions(-) diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisManager.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisManager.java index f5c14f59fa5450..0bf24e0c288071 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisManager.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisManager.java @@ -542,7 +542,9 @@ public void createTaskForEachColumns(AnalysisInfo jobInfo, Map> colToParts = new HashMap<>(); + colToParts.put(colName, entry.getValue()); + colTaskInfoBuilder.setColToPartitions(colToParts); } AnalysisInfo analysisInfo = colTaskInfoBuilder.setColName(colName).setIndexId(indexId) .setTaskId(taskId).setLastExecTimeInMs(System.currentTimeMillis()).build(); diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java index cdd5c3fc69f139..27101a1d66fa1f 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java @@ -58,8 +58,8 @@ public abstract class BaseAnalysisTask { + " COUNT(1) AS `row_count`, " + " NDV(`${colName}`) AS `ndv`, " + " COUNT(1) - COUNT(`${colName}`) AS `null_count`, " - + " CAST(MIN(`${colName}`) AS STRING) AS `min`, " - + " CAST(MAX(`${colName}`) AS STRING) AS `max`, " + + " SUBSTRING(CAST(MIN(`${colName}`) AS STRING), 1, 1024) AS `min`, " + + " SUBSTRING(CAST(MAX(`${colName}`) AS STRING), 1, 1024) AS `max`, " + " ${dataSizeFunction} AS `data_size`, " + " NOW() AS `update_time` " + " FROM `${catalogName}`.`${dbName}`.`${tblName}`"; @@ -75,8 +75,8 @@ public abstract class BaseAnalysisTask { + "${rowCount} AS `row_count`, " + "${ndvFunction} as `ndv`, " + "ROUND(SUM(CASE WHEN `${colName}` IS NULL THEN 1 ELSE 0 END) * ${scaleFactor}) AS `null_count`, " - + "${min} AS `min`, " - + "${max} AS `max`, " + + "SUBSTRING(CAST(${min} AS STRING), 1, 1024) AS `min`, " + + "SUBSTRING(CAST(${max} AS STRING), 1, 1024) AS `max`, " + "${dataSizeFunction} * ${scaleFactor} AS `data_size`, " + "NOW() " + "FROM `${catalogName}`.`${dbName}`.`${tblName}` ${sampleHints} ${limit}"; @@ -92,8 +92,8 @@ public abstract class BaseAnalysisTask { + "${rowCount} AS `row_count`, " + "${ndvFunction} as `ndv`, " + "IFNULL(SUM(IF(`t1`.`column_key` IS NULL, `t1`.`count`, 0)), 0) * ${scaleFactor} as `null_count`, " - + "${min} AS `min`, " - + "${max} AS `max`, " + + "SUBSTRING(CAST(${min} AS STRING), 1, 1024) AS `min`, " + + "SUBSTRING(CAST(${max} AS STRING), 1, 1024) AS `max`, " + "${dataSizeFunction} * ${scaleFactor} AS `data_size`, " + "NOW() " + "FROM ( " @@ -115,8 +115,8 @@ public abstract class BaseAnalysisTask { + "${row_count} AS `row_count`, " + "${ndv} AS `ndv`, " + "${null_count} AS `null_count`, " - + "${min} AS `min`, " - + "${max} AS `max`, " + + "SUBSTRING(CAST(${min} AS STRING), 1, 1024) AS `min`, " + + "SUBSTRING(CAST(${max} AS STRING), 1, 1024) AS `max`, " + "${data_size} AS `data_size`, " + "NOW() "; diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/JdbcAnalysisTask.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/JdbcAnalysisTask.java index 50c437fa8f9945..2bf72843a71f5e 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/JdbcAnalysisTask.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/JdbcAnalysisTask.java @@ -41,8 +41,8 @@ public class JdbcAnalysisTask extends BaseAnalysisTask { + "COUNT(1) AS row_count, " + "NDV(`${colName}`) AS ndv, " + "SUM(CASE WHEN `${colName}` IS NULL THEN 1 ELSE 0 END) AS null_count, " - + "MIN(`${colName}`) AS min, " - + "MAX(`${colName}`) AS max, " + + "SUBSTRING(CAST(MIN(`${colName}`) AS STRING), 1, 1024) AS min, " + + "SUBSTRING(CAST(MAX(`${colName}`) AS STRING), 1, 1024) AS max, " + "${dataSizeFunction} AS data_size, " + "NOW() " + "FROM `${catalogName}`.`${dbName}`.`${tblName}`"; diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java index 396e25b5a1664f..50042e4610f492 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java @@ -47,8 +47,8 @@ public class OlapAnalysisTask extends BaseAnalysisTask { private static final String BASIC_STATS_TEMPLATE = "SELECT " - + "MIN(`${colName}`) as min, " - + "MAX(`${colName}`) as max " + + "SUBSTRING(CAST(MIN(`${colName}`) AS STRING), 1, 1024) as min, " + + "SUBSTRING(CAST(MAX(`${colName}`) AS STRING), 1, 1024) as max " + "FROM `${dbName}`.`${tblName}`"; @VisibleForTesting diff --git a/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java b/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java index ed9122f70b6560..5b27c79c8633ed 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java @@ -150,7 +150,20 @@ public ResultRow collectBasicStat(AutoCloseConnectContext context) { @Mock public void runQuery(String sql) { - Assertions.assertEquals("SELECT CONCAT('30001', '-', '-1', '-', 'null') AS `id`, 10001 AS `catalog_id`, 20001 AS `db_id`, 30001 AS `tbl_id`, -1 AS `idx_id`, 'null' AS `col_id`, NULL AS `part_id`, 500 AS `row_count`, SUM(`t1`.`count`) * COUNT(1) / (SUM(`t1`.`count`) - SUM(IF(`t1`.`count` = 1, 1, 0)) + SUM(IF(`t1`.`count` = 1, 1, 0)) * SUM(`t1`.`count`) / 500) as `ndv`, IFNULL(SUM(IF(`t1`.`column_key` IS NULL, `t1`.`count`, 0)), 0) * 5.0 as `null_count`, '1' AS `min`, '2' AS `max`, SUM(LENGTH(`column_key`) * count) * 5.0 AS `data_size`, NOW() FROM ( SELECT t0.`${colName}` as `column_key`, COUNT(1) as `count` FROM (SELECT `${colName}` FROM `catalogName`.`${dbName}`.`${tblName}` limit 100) as `t0` GROUP BY `t0`.`${colName}` ) as `t1` ", sql); + Assertions.assertEquals("SELECT CONCAT('30001', '-', '-1', '-', 'null') " + + "AS `id`, 10001 AS `catalog_id`, 20001 AS `db_id`, 30001 AS `tbl_id`, " + + "-1 AS `idx_id`, 'null' AS `col_id`, NULL AS `part_id`, 500 AS" + + " `row_count`, SUM(`t1`.`count`) * COUNT(1) / (SUM(`t1`.`count`)" + + " - SUM(IF(`t1`.`count` = 1, 1, 0)) + SUM(IF(`t1`.`count` = 1, 1, 0))" + + " * SUM(`t1`.`count`) / 500) as `ndv`, IFNULL(SUM(IF(`t1`.`column_key`" + + " IS NULL, `t1`.`count`, 0)), 0) * 5.0 as `null_count`, " + + "SUBSTRING(CAST('1' AS STRING), 1, 1024) AS `min`," + + " SUBSTRING(CAST('2' AS STRING), 1, 1024) AS `max`, " + + "SUM(LENGTH(`column_key`) * count) * 5.0 AS `data_size`, NOW() " + + "FROM ( SELECT t0.`${colName}` as `column_key`, COUNT(1) " + + "as `count` FROM (SELECT `${colName}` FROM " + + "`catalogName`.`${dbName}`.`${tblName}` " + + " limit 100) as `t0` GROUP BY `t0`.`${colName}` ) as `t1` ", sql); return; } }; @@ -216,7 +229,16 @@ public ResultRow collectBasicStat(AutoCloseConnectContext context) { @Mock public void runQuery(String sql) { - Assertions.assertEquals(" SELECT CONCAT(30001, '-', -1, '-', 'null') AS `id`, 10001 AS `catalog_id`, 20001 AS `db_id`, 30001 AS `tbl_id`, -1 AS `idx_id`, 'null' AS `col_id`, NULL AS `part_id`, 500 AS `row_count`, ROUND(NDV(`${colName}`) * 5.0) as `ndv`, ROUND(SUM(CASE WHEN `${colName}` IS NULL THEN 1 ELSE 0 END) * 5.0) AS `null_count`, '1' AS `min`, '2' AS `max`, SUM(LENGTH(`${colName}`)) * 5.0 AS `data_size`, NOW() FROM `catalogName`.`${dbName}`.`${tblName}` limit 100", sql); + Assertions.assertEquals(" " + + "SELECT CONCAT(30001, '-', -1, '-', 'null') AS `id`, " + + "10001 AS `catalog_id`, 20001 AS `db_id`, 30001 AS `tbl_id`, " + + "-1 AS `idx_id`, 'null' AS `col_id`, NULL AS `part_id`, " + + "500 AS `row_count`, ROUND(NDV(`${colName}`) * 5.0) as `ndv`, " + + "ROUND(SUM(CASE WHEN `${colName}` IS NULL THEN 1 ELSE 0 END) * 5.0) " + + "AS `null_count`, SUBSTRING(CAST('1' AS STRING), 1, 1024) AS `min`, " + + "SUBSTRING(CAST('2' AS STRING), 1, 1024) AS `max`, " + + "SUM(LENGTH(`${colName}`)) * 5.0 AS `data_size`, NOW() " + + "FROM `catalogName`.`${dbName}`.`${tblName}` limit 100", sql); return; } }; @@ -289,7 +311,18 @@ public ResultRow collectBasicStat(AutoCloseConnectContext context) { @Mock public void runQuery(String sql) { - Assertions.assertEquals("SELECT CONCAT('30001', '-', '-1', '-', 'null') AS `id`, 10001 AS `catalog_id`, 20001 AS `db_id`, 30001 AS `tbl_id`, -1 AS `idx_id`, 'null' AS `col_id`, NULL AS `part_id`, 500 AS `row_count`, SUM(`t1`.`count`) * COUNT(1) / (SUM(`t1`.`count`) - SUM(IF(`t1`.`count` = 1, 1, 0)) + SUM(IF(`t1`.`count` = 1, 1, 0)) * SUM(`t1`.`count`) / 500) as `ndv`, IFNULL(SUM(IF(`t1`.`column_key` IS NULL, `t1`.`count`, 0)), 0) * 5.0 as `null_count`, '1' AS `min`, '2' AS `max`, SUM(LENGTH(`column_key`) * count) * 5.0 AS `data_size`, NOW() FROM ( SELECT t0.`${colName}` as `column_key`, COUNT(1) as `count` FROM (SELECT `${colName}` FROM `catalogName`.`${dbName}`.`${tblName}` limit 100) as `t0` GROUP BY `t0`.`${colName}` ) as `t1` ", sql); + System.out.println(sql); + Assertions.assertEquals("SELECT CONCAT('30001', '-', '-1', '-', 'null') " + + "AS `id`, 10001 AS `catalog_id`, 20001 AS `db_id`, 30001 AS `tbl_id`, " + + "-1 AS `idx_id`, 'null' AS `col_id`, NULL AS `part_id`," + + " 500 AS `row_count`, SUM(`t1`.`count`) * COUNT(1) / (SUM(`t1`.`count`) " + + "- SUM(IF(`t1`.`count` = 1, 1, 0)) + SUM(IF(`t1`.`count` = 1, 1, 0)) * " + + "SUM(`t1`.`count`) / 500) as `ndv`, IFNULL(SUM(IF(`t1`.`column_key` " + + "IS NULL, `t1`.`count`, 0)), 0) * 5.0 as `null_count`, " + + "SUBSTRING(CAST('1' AS STRING), 1, 1024) AS `min`, " + + "SUBSTRING(CAST('2' AS STRING), 1, 1024) AS `max`, " + + "SUM(LENGTH(`column_key`) * count) * 5.0 AS `data_size`, NOW() " + + "FROM ( SELECT t0.`${colName}` as `column_key`, COUNT(1) as `count` FROM (SELECT `${colName}` FROM `catalogName`.`${dbName}`.`${tblName}` limit 100) as `t0` GROUP BY `t0`.`${colName}` ) as `t1` ", sql); return; } };