From 27905118f6f9e5db65ad94638a30f5bd3a60e08c Mon Sep 17 00:00:00 2001 From: Jibing-Li <64681310+Jibing-Li@users.noreply.github.com> Date: Thu, 26 Sep 2024 16:47:08 +0800 Subject: [PATCH] [improvement](statistics)Reduce partition column sample BE memory consumption. (#41203) For string type columns, use xxhash_64 to transfer column value to an integer, and then calculate the NDV based on the integer hash value. In this case, we can reduce the memory cost of sample analyze and improve the performance. For example, l_comment column of TPCH 100G lineitem table. The memory cost to calculate its NDV is reduced to 8GB from 22GB --- .../doris/statistics/BaseAnalysisTask.java | 42 ++++++------------- .../doris/statistics/HMSAnalysisTask.java | 7 +--- .../doris/statistics/OlapAnalysisTask.java | 13 +++--- .../statistics/BaseAnalysisTaskTest.java | 2 +- .../statistics/OlapAnalysisTaskTest.java | 25 ++++++----- .../test_hive_statistics_all_type_p0.groovy | 27 +++++++++++- 6 files changed, 62 insertions(+), 54 deletions(-) diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java index 52c8ce932a54b4..6ab65677e7ace1 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java @@ -82,30 +82,6 @@ public abstract class BaseAnalysisTask { + "NOW() " + "FROM `${catalogName}`.`${dbName}`.`${tblName}` ${index} ${sampleHints} ${limit}"; - protected static final String DUJ1_ANALYZE_STRING_TEMPLATE = "SELECT " - + "CONCAT('${tblId}', '-', '${idxId}', '-', '${colId}') AS `id`, " - + "${catalogId} AS `catalog_id`, " - + "${dbId} AS `db_id`, " - + "${tblId} AS `tbl_id`, " - + "${idxId} AS `idx_id`, " - + "'${colId}' AS `col_id`, " - + "NULL AS `part_id`, " - + "${rowCount} AS `row_count`, " - + "${ndvFunction} as `ndv`, " - + "IFNULL(SUM(IF(`t1`.`column_key` IS NULL, `t1`.`count`, 0)), 0) * ${scaleFactor} as `null_count`, " - + "SUBSTRING(CAST(${min} AS STRING), 1, 1024) AS `min`, " - + "SUBSTRING(CAST(${max} AS STRING), 1, 1024) AS `max`, " - + "${dataSizeFunction} * ${scaleFactor} AS `data_size`, " - + "NOW() " - + "FROM ( " - + " SELECT t0.`colValue` as `column_key`, COUNT(1) as `count` " - + " FROM " - + " (SELECT SUBSTRING(CAST(`${colName}` AS STRING), 1, 1024) AS `colValue` " - + " FROM `${catalogName}`.`${dbName}`.`${tblName}` ${index} " - + " ${sampleHints} ${limit}) as `t0` " - + " GROUP BY `t0`.`colValue` " - + ") as `t1` "; - protected static final String DUJ1_ANALYZE_TEMPLATE = "SELECT " + "CONCAT('${tblId}', '-', '${idxId}', '-', '${colId}') AS `id`, " + "${catalogId} AS `catalog_id`, " @@ -122,11 +98,11 @@ public abstract class BaseAnalysisTask { + "${dataSizeFunction} * ${scaleFactor} AS `data_size`, " + "NOW() " + "FROM ( " - + " SELECT t0.`${colName}` as `column_key`, COUNT(1) as `count` " + + " SELECT t0.`colValue` as `column_key`, COUNT(1) as `count`, SUM(`len`) as `column_length` " + " FROM " - + " (SELECT `${colName}` FROM `${catalogName}`.`${dbName}`.`${tblName}` ${index} " - + " ${sampleHints} ${limit}) as `t0` " - + " GROUP BY `t0`.`${colName}` " + + " (SELECT ${subStringColName} AS `colValue`, LENGTH(`${colName}`) as `len` " + + " FROM `${catalogName}`.`${dbName}`.`${tblName}` ${index} ${sampleHints} ${limit}) as `t0` " + + " GROUP BY `t0`.`colValue` " + ") as `t1` "; protected static final String ANALYZE_PARTITION_COLUMN_TEMPLATE = " SELECT " @@ -230,7 +206,7 @@ public long getJobId() { protected String getDataSizeFunction(Column column, boolean useDuj1) { if (useDuj1) { if (column.getType().isStringType()) { - return "SUM(LENGTH(`column_key`) * count)"; + return "SUM(`column_length`)"; } else { return "SUM(t1.count) * " + column.getType().getSlotSize(); } @@ -243,6 +219,14 @@ protected String getDataSizeFunction(Column column, boolean useDuj1) { } } + protected String getStringTypeColName(Column column) { + if (column.getType().isStringType()) { + return "murmur_hash3_64(SUBSTRING(CAST(`${colName}` AS STRING), 1, 1024))"; + } else { + return "`${colName}`"; + } + } + protected String getMinFunction() { if (tableSample == null) { return "CAST(MIN(`${colName}`) as ${type}) "; diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java index 42fb10fc449d10..015796ff8f3aef 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java @@ -151,11 +151,8 @@ private void getOrdinaryColumnStats() throws Exception { params.put("ndvFunction", "ROUND(NDV(`${colName}`) * ${scaleFactor})"); params.put("rowCount", "ROUND(count(1) * ${scaleFactor})"); } else { - if (col.getType().isStringType()) { - sb.append(DUJ1_ANALYZE_STRING_TEMPLATE); - } else { - sb.append(DUJ1_ANALYZE_TEMPLATE); - } + sb.append(DUJ1_ANALYZE_TEMPLATE); + params.put("subStringColName", getStringTypeColName(col)); params.put("dataSizeFunction", getDataSizeFunction(col, true)); params.put("ndvFunction", getNdvFunction("ROUND(SUM(t1.count) * ${scaleFactor})")); params.put("rowCount", "ROUND(SUM(t1.count) * ${scaleFactor})"); diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java index 352a21aca83b00..34fb339564abed 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java @@ -133,8 +133,8 @@ protected void doSample() throws Exception { params.put("colId", StatisticsUtil.escapeSQL(String.valueOf(info.colName))); params.put("dataSizeFunction", getDataSizeFunction(col, false)); params.put("dbName", db.getFullName()); - params.put("colName", StatisticsUtil.escapeColumnName(info.colName)); - params.put("tblName", tbl.getName()); + params.put("colName", StatisticsUtil.escapeColumnName(String.valueOf(info.colName))); + params.put("tblName", String.valueOf(tbl.getName())); params.put("scaleFactor", String.valueOf(scaleFactor)); params.put("sampleHints", tabletStr.isEmpty() ? "" : String.format("TABLET(%s)", tabletStr)); params.put("ndvFunction", getNdvFunction(String.valueOf(totalRowCount))); @@ -167,11 +167,8 @@ protected void doSample() throws Exception { sql = stringSubstitutor.replace(LINEAR_ANALYZE_TEMPLATE); } else { params.put("dataSizeFunction", getDataSizeFunction(col, true)); - if (col.getType().isStringType()) { - sql = stringSubstitutor.replace(DUJ1_ANALYZE_STRING_TEMPLATE); - } else { - sql = stringSubstitutor.replace(DUJ1_ANALYZE_TEMPLATE); - } + params.put("subStringColName", getStringTypeColName(col)); + sql = stringSubstitutor.replace(DUJ1_ANALYZE_TEMPLATE); } LOG.info("Sample for column [{}]. Total rows [{}], rows to sample [{}], scale factor [{}], " + "limited [{}], distribute column [{}], partition column [{}], key column [{}], " @@ -195,7 +192,7 @@ protected ResultRow collectBasicStat(AutoCloseConnectContext context) { Map params = new HashMap<>(); params.put("dbName", db.getFullName()); params.put("colName", StatisticsUtil.escapeColumnName(info.colName)); - params.put("tblName", tbl.getName()); + params.put("tblName", String.valueOf(tbl.getName())); params.put("index", getIndex()); StringSubstitutor stringSubstitutor = new StringSubstitutor(params); String sql = stringSubstitutor.replace(BASIC_STATS_TEMPLATE); diff --git a/fe/fe-core/src/test/java/org/apache/doris/statistics/BaseAnalysisTaskTest.java b/fe/fe-core/src/test/java/org/apache/doris/statistics/BaseAnalysisTaskTest.java index 187c4d207dfcad..86ccfff26e0375 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/statistics/BaseAnalysisTaskTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/statistics/BaseAnalysisTaskTest.java @@ -31,7 +31,7 @@ public void testGetFunctions() { OlapAnalysisTask olapAnalysisTask = new OlapAnalysisTask(); Column column = new Column("string_column", PrimitiveType.STRING); String dataSizeFunction = olapAnalysisTask.getDataSizeFunction(column, true); - Assertions.assertEquals("SUM(LENGTH(`column_key`) * count)", dataSizeFunction); + Assertions.assertEquals("SUM(`column_length`)", dataSizeFunction); dataSizeFunction = olapAnalysisTask.getDataSizeFunction(column, false); Assertions.assertEquals("SUM(LENGTH(`${colName}`))", dataSizeFunction); diff --git a/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java b/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java index 75506b1c85a014..a78bc81cf6b5a1 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java @@ -160,9 +160,11 @@ public void runQuery(String sql) { + "SUBSTRING(CAST('1' AS STRING), 1, 1024) AS `min`," + " SUBSTRING(CAST('2' AS STRING), 1, 1024) AS `max`, " + "SUM(t1.count) * 4 * 5.0 AS `data_size`, NOW() " - + "FROM ( SELECT t0.`${colName}` as `column_key`, COUNT(1) " - + "as `count` FROM (SELECT `${colName}` FROM `catalogName`.`${dbName}`.`${tblName}`" - + " limit 100) as `t0` GROUP BY `t0`.`${colName}` ) as `t1` ", sql); + + "FROM ( SELECT t0.`colValue` as `column_key`, COUNT(1) " + + "as `count`, SUM(`len`) as `column_length` FROM " + + "(SELECT `null` AS `colValue`, LENGTH(`null`) as `len` " + + "FROM `catalogName`.`${dbName}`.`null`" + + " limit 100) as `t0` GROUP BY `t0`.`colValue` ) as `t1` ", sql); return; } }; @@ -232,12 +234,12 @@ public void runQuery(String sql) { + "SELECT CONCAT(30001, '-', -1, '-', 'null') AS `id`, " + "10001 AS `catalog_id`, 20001 AS `db_id`, 30001 AS `tbl_id`, " + "-1 AS `idx_id`, 'null' AS `col_id`, NULL AS `part_id`, " - + "500 AS `row_count`, ROUND(NDV(`${colName}`) * 5.0) as `ndv`, " - + "ROUND(SUM(CASE WHEN `${colName}` IS NULL THEN 1 ELSE 0 END) * 5.0) " + + "500 AS `row_count`, ROUND(NDV(`null`) * 5.0) as `ndv`, " + + "ROUND(SUM(CASE WHEN `null` IS NULL THEN 1 ELSE 0 END) * 5.0) " + "AS `null_count`, SUBSTRING(CAST('1' AS STRING), 1, 1024) AS `min`, " + "SUBSTRING(CAST('2' AS STRING), 1, 1024) AS `max`, " - + "SUM(LENGTH(`${colName}`)) * 5.0 AS `data_size`, NOW() " - + "FROM `catalogName`.`${dbName}`.`${tblName}` limit 100", sql); + + "SUM(LENGTH(`null`)) * 5.0 AS `data_size`, NOW() " + + "FROM `catalogName`.`${dbName}`.`null` limit 100", sql); return; } }; @@ -320,9 +322,12 @@ public void runQuery(String sql) { + "IS NULL, `t1`.`count`, 0)), 0) * 5.0 as `null_count`, " + "SUBSTRING(CAST('1' AS STRING), 1, 1024) AS `min`, " + "SUBSTRING(CAST('2' AS STRING), 1, 1024) AS `max`, " - + "SUM(LENGTH(`column_key`) * count) * 5.0 AS `data_size`, NOW() " - + "FROM ( SELECT t0.`colValue` as `column_key`, COUNT(1) as `count` FROM " - + "(SELECT SUBSTRING(CAST(`${colName}` AS STRING), 1, 1024) AS `colValue` FROM `catalogName`.`${dbName}`.`${tblName}` limit 100) as `t0` GROUP BY `t0`.`colValue` ) as `t1` ", sql); + + "SUM(`column_length`) * 5.0 AS `data_size`, NOW() " + + "FROM ( SELECT t0.`colValue` as `column_key`, COUNT(1) as `count`, SUM(`len`) as " + + "`column_length` FROM (SELECT murmur_hash3_64(SUBSTRING(CAST(`null` AS STRING), 1, 1024)) " + + "AS `colValue`, LENGTH(`null`) as `len`" + + " FROM `catalogName`.`${dbName}`.`null` limit 100) as `t0` " + + "GROUP BY `t0`.`colValue` ) as `t1` ", sql); return; } }; diff --git a/regression-test/suites/external_table_p0/hive/test_hive_statistics_all_type_p0.groovy b/regression-test/suites/external_table_p0/hive/test_hive_statistics_all_type_p0.groovy index a37a55351163e2..6ce76af588f3d1 100644 --- a/regression-test/suites/external_table_p0/hive/test_hive_statistics_all_type_p0.groovy +++ b/regression-test/suites/external_table_p0/hive/test_hive_statistics_all_type_p0.groovy @@ -29,10 +29,35 @@ suite("test_hive_statistics_all_type_p0", "all_types,p0,external,hive,external_d 'hive.metastore.uris' = 'thrift://${externalEnvIp}:${hms_port}' );""" sql """use `${catalog_name}`.`default`""" - sql """analyze table orc_all_types with sync""" + sql """analyze table orc_all_types with sync with sample rows 4000000""" def result = sql """show column stats orc_all_types;""" assertEquals(16, result.size()) + result = sql """show column stats orc_all_types (int_col);""" + assertEquals("int_col", result[0][0]) + assertEquals("3600.0", result[0][2]) + assertEquals("3240.0", result[0][3]) + assertEquals("361.0", result[0][4]) + assertEquals("14400.0", result[0][5]) + + result = sql """show column stats orc_all_types (string_col);""" + assertEquals("string_col", result[0][0]) + assertEquals("3600.0", result[0][2]) + assertEquals("3254.0", result[0][3]) + assertEquals("347.0", result[0][4]) + assertEquals("453634.0", result[0][5]) + + result = sql """show column stats orc_all_types (varchar_col);""" + assertEquals("varchar_col", result[0][0]) + assertEquals("3600.0", result[0][2]) + assertEquals("6.0", result[0][3]) + assertEquals("0.0", result[0][4]) + assertEquals("35950.0", result[0][5]) + + sql """drop stats orc_all_types""" + sql """analyze table orc_all_types with sync""" + result = sql """show column stats orc_all_types;""" + assertEquals(16, result.size()) result = sql """show column stats orc_all_types (int_col);""" assertEquals("int_col", result[0][0]) assertEquals("3600.0", result[0][2])