From f67239687a67f154270cd1c684913956bfcdf51c Mon Sep 17 00:00:00 2001 From: Jibing Li Date: Mon, 23 Sep 2024 14:46:52 +0800 Subject: [PATCH] Reduce partition column sample BE memory consumption. --- .../doris/statistics/BaseAnalysisTask.java | 42 ++++++------------- .../doris/statistics/HMSAnalysisTask.java | 7 +--- .../doris/statistics/OlapAnalysisTask.java | 7 +--- .../statistics/BaseAnalysisTaskTest.java | 2 +- .../statistics/OlapAnalysisTaskTest.java | 17 ++++---- .../test_hive_statistics_all_type_p0.groovy | 29 ++++++++++++- 6 files changed, 55 insertions(+), 49 deletions(-) diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java index 329231f360487c..3d944715b8c352 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java @@ -96,30 +96,6 @@ public abstract class BaseAnalysisTask { + "NOW() " + "FROM `${catalogName}`.`${dbName}`.`${tblName}` ${index} ${sampleHints} ${limit}"; - protected static final String DUJ1_ANALYZE_STRING_TEMPLATE = "SELECT " - + "CONCAT('${tblId}', '-', '${idxId}', '-', '${colId}') AS `id`, " - + "${catalogId} AS `catalog_id`, " - + "${dbId} AS `db_id`, " - + "${tblId} AS `tbl_id`, " - + "${idxId} AS `idx_id`, " - + "'${colId}' AS `col_id`, " - + "NULL AS `part_id`, " - + "${rowCount} AS `row_count`, " - + "${ndvFunction} as `ndv`, " - + "IFNULL(SUM(IF(`t1`.`column_key` IS NULL, `t1`.`count`, 0)), 0) * ${scaleFactor} as `null_count`, " - + "SUBSTRING(CAST(${min} AS STRING), 1, 1024) AS `min`, " - + "SUBSTRING(CAST(${max} AS STRING), 1, 1024) AS `max`, " - + "${dataSizeFunction} * ${scaleFactor} AS `data_size`, " - + "NOW() " - + "FROM ( " - + " SELECT t0.`colValue` as `column_key`, COUNT(1) as `count` " - + " FROM " - + " (SELECT SUBSTRING(CAST(`${colName}` AS STRING), 1, 1024) AS `colValue` " - + " FROM `${catalogName}`.`${dbName}`.`${tblName}` ${index} " - + " ${sampleHints} ${limit}) as `t0` " - + " GROUP BY `t0`.`colValue` " - + ") as `t1` "; - protected static final String DUJ1_ANALYZE_TEMPLATE = "SELECT " + "CONCAT('${tblId}', '-', '${idxId}', '-', '${colId}') AS `id`, " + "${catalogId} AS `catalog_id`, " @@ -136,11 +112,11 @@ public abstract class BaseAnalysisTask { + "${dataSizeFunction} * ${scaleFactor} AS `data_size`, " + "NOW() " + "FROM ( " - + " SELECT t0.`${colName}` as `column_key`, COUNT(1) as `count` " + + " SELECT t0.`colValue` as `column_key`, COUNT(1) as `count`, SUM(`len`) as `column_length` " + " FROM " - + " (SELECT `${colName}` FROM `${catalogName}`.`${dbName}`.`${tblName}` ${index} " - + " ${sampleHints} ${limit}) as `t0` " - + " GROUP BY `t0`.`${colName}` " + + " (SELECT ${subStringColName} AS `colValue`, LENGTH(`${colName}`) as `len` " + + " FROM `${catalogName}`.`${dbName}`.`${tblName}` ${index} ${sampleHints} ${limit}) as `t0` " + + " GROUP BY `t0`.`colValue` " + ") as `t1` "; protected static final String ANALYZE_PARTITION_COLUMN_TEMPLATE = " SELECT " @@ -281,7 +257,7 @@ public long getJobId() { protected String getDataSizeFunction(Column column, boolean useDuj1) { if (useDuj1) { if (column.getType().isStringType()) { - return "SUM(LENGTH(`column_key`) * count)"; + return "SUM(`column_length`)"; } else { return "SUM(t1.count) * " + column.getType().getSlotSize(); } @@ -294,6 +270,14 @@ protected String getDataSizeFunction(Column column, boolean useDuj1) { } } + protected String getStringTypeColName(Column column) { + if (column.getType().isStringType()) { + return "xxhash_64(SUBSTRING(CAST(`${colName}` AS STRING), 1, 1024))"; + } else { + return "`${colName}`"; + } + } + protected String getMinFunction() { if (tableSample == null) { return "CAST(MIN(`${colName}`) as ${type}) "; diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java index 91b3d98cbcd764..7b807b4661cdc2 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java @@ -249,11 +249,8 @@ protected void doSample() { params.put("ndvFunction", "ROUND(NDV(`${colName}`) * ${scaleFactor})"); params.put("rowCount", "ROUND(count(1) * ${scaleFactor})"); } else { - if (col.getType().isStringType()) { - sb.append(DUJ1_ANALYZE_STRING_TEMPLATE); - } else { - sb.append(DUJ1_ANALYZE_TEMPLATE); - } + sb.append(DUJ1_ANALYZE_TEMPLATE); + params.put("subStringColName", getStringTypeColName(col)); params.put("dataSizeFunction", getDataSizeFunction(col, true)); params.put("ndvFunction", getNdvFunction("ROUND(SUM(t1.count) * ${scaleFactor})")); params.put("rowCount", "ROUND(SUM(t1.count) * ${scaleFactor})"); diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java index ab81851cf6cecf..5abf6120f01746 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java @@ -156,11 +156,8 @@ protected void doSample() { sql = stringSubstitutor.replace(LINEAR_ANALYZE_TEMPLATE); } else { params.put("dataSizeFunction", getDataSizeFunction(col, true)); - if (col.getType().isStringType()) { - sql = stringSubstitutor.replace(DUJ1_ANALYZE_STRING_TEMPLATE); - } else { - sql = stringSubstitutor.replace(DUJ1_ANALYZE_TEMPLATE); - } + params.put("subStringColName", getStringTypeColName(col)); + sql = stringSubstitutor.replace(DUJ1_ANALYZE_TEMPLATE); } LOG.info("Sample for column [{}]. Total rows [{}], rows to sample [{}], scale factor [{}], " + "limited [{}], distribute column [{}], partition column [{}], key column [{}], " diff --git a/fe/fe-core/src/test/java/org/apache/doris/statistics/BaseAnalysisTaskTest.java b/fe/fe-core/src/test/java/org/apache/doris/statistics/BaseAnalysisTaskTest.java index 187c4d207dfcad..86ccfff26e0375 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/statistics/BaseAnalysisTaskTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/statistics/BaseAnalysisTaskTest.java @@ -31,7 +31,7 @@ public void testGetFunctions() { OlapAnalysisTask olapAnalysisTask = new OlapAnalysisTask(); Column column = new Column("string_column", PrimitiveType.STRING); String dataSizeFunction = olapAnalysisTask.getDataSizeFunction(column, true); - Assertions.assertEquals("SUM(LENGTH(`column_key`) * count)", dataSizeFunction); + Assertions.assertEquals("SUM(`column_length`)", dataSizeFunction); dataSizeFunction = olapAnalysisTask.getDataSizeFunction(column, false); Assertions.assertEquals("SUM(LENGTH(`${colName}`))", dataSizeFunction); diff --git a/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java b/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java index 7b4ce3dfca530e..b4e835c1bc59dc 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java @@ -137,9 +137,11 @@ public void runQuery(String sql) { + "SUBSTRING(CAST('1' AS STRING), 1, 1024) AS `min`," + " SUBSTRING(CAST('2' AS STRING), 1, 1024) AS `max`, " + "SUM(t1.count) * 4 * 5.0 AS `data_size`, NOW() " - + "FROM ( SELECT t0.`null` as `column_key`, COUNT(1) " - + "as `count` FROM (SELECT `null` FROM `catalogName`.`${dbName}`.`null`" - + " limit 100) as `t0` GROUP BY `t0`.`null` ) as `t1` ", sql); + + "FROM ( SELECT t0.`colValue` as `column_key`, COUNT(1) " + + "as `count`, SUM(`len`) as `column_length` FROM " + + "(SELECT `null` AS `colValue`, LENGTH(`null`) as `len` " + + "FROM `catalogName`.`${dbName}`.`null`" + + " limit 100) as `t0` GROUP BY `t0`.`colValue` ) as `t1` ", sql); return; } }; @@ -296,10 +298,11 @@ public void runQuery(String sql) { + "IS NULL, `t1`.`count`, 0)), 0) * 5.0 as `null_count`, " + "SUBSTRING(CAST('1' AS STRING), 1, 1024) AS `min`, " + "SUBSTRING(CAST('2' AS STRING), 1, 1024) AS `max`, " - + "SUM(LENGTH(`column_key`) * count) * 5.0 AS `data_size`, NOW() " - + "FROM ( SELECT t0.`colValue` as `column_key`, COUNT(1) as `count` FROM " - + "(SELECT SUBSTRING(CAST(`null` AS STRING), 1, 1024) AS `colValue` " - + "FROM `catalogName`.`${dbName}`.`null` limit 100) as `t0` " + + "SUM(`column_length`) * 5.0 AS `data_size`, NOW() " + + "FROM ( SELECT t0.`colValue` as `column_key`, COUNT(1) as `count`, SUM(`len`) as " + + "`column_length` FROM (SELECT xxhash_64(SUBSTRING(CAST(`null` AS STRING), 1, 1024)) " + + "AS `colValue`, LENGTH(`null`) as `len`" + + " FROM `catalogName`.`${dbName}`.`null` limit 100) as `t0` " + "GROUP BY `t0`.`colValue` ) as `t1` ", sql); return; } diff --git a/regression-test/suites/external_table_p0/hive/test_hive_statistics_all_type_p0.groovy b/regression-test/suites/external_table_p0/hive/test_hive_statistics_all_type_p0.groovy index 496cab8825a713..7d85f288bd7b94 100644 --- a/regression-test/suites/external_table_p0/hive/test_hive_statistics_all_type_p0.groovy +++ b/regression-test/suites/external_table_p0/hive/test_hive_statistics_all_type_p0.groovy @@ -18,7 +18,7 @@ suite("test_hive_statistics_all_type_p0", "all_types,p0,external,hive,external_docker,external_docker_hive") { String enabled = context.config.otherConfigs.get("enableHiveTest") if (enabled == null || !enabled.equalsIgnoreCase("true")) { - logger.info("diable Hive test.") + logger.info("disable Hive test.") return; } @@ -34,10 +34,35 @@ suite("test_hive_statistics_all_type_p0", "all_types,p0,external,hive,external_d 'hive.metastore.uris' = 'thrift://${externalEnvIp}:${hms_port}' );""" sql """use `${catalog_name}`.`default`""" - sql """analyze table orc_all_types with sync""" + sql """analyze table orc_all_types with sync with sample rows 4000000""" def result = sql """show column stats orc_all_types;""" assertEquals(16, result.size()) + result = sql """show column stats orc_all_types (int_col);""" + assertEquals("int_col", result[0][0]) + assertEquals("3600.0", result[0][2]) + assertEquals("3240.0", result[0][3]) + assertEquals("361.0", result[0][4]) + assertEquals("14400.0", result[0][5]) + + result = sql """show column stats orc_all_types (string_col);""" + assertEquals("string_col", result[0][0]) + assertEquals("3600.0", result[0][2]) + assertEquals("3254.0", result[0][3]) + assertEquals("347.0", result[0][4]) + assertEquals("453634.0", result[0][5]) + + result = sql """show column stats orc_all_types (varchar_col);""" + assertEquals("varchar_col", result[0][0]) + assertEquals("3600.0", result[0][2]) + assertEquals("6.0", result[0][3]) + assertEquals("0.0", result[0][4]) + assertEquals("35950.0", result[0][5]) + + sql """drop stats orc_all_types""" + sql """analyze table orc_all_types with sync""" + result = sql """show column stats orc_all_types;""" + assertEquals(16, result.size()) result = sql """show column stats orc_all_types (int_col);""" assertEquals("int_col", result[0][0]) assertEquals("3600.0", result[0][2])