From be1d3b39a0ce983c5b5f2b8dc38aa31378834f6b Mon Sep 17 00:00:00 2001 From: Jibing-Li <64681310+Jibing-Li@users.noreply.github.com> Date: Thu, 26 Sep 2024 16:47:08 +0800 Subject: [PATCH] [improvement](statistics)Reduce partition column sample BE memory consumption. (#41203) For string type columns, use xxhash_64 to transfer column value to an integer, and then calculate the NDV based on the integer hash value. In this case, we can reduce the memory cost of sample analyze and improve the performance. For example, l_comment column of TPCH 100G lineitem table. The memory cost to calculate its NDV is reduced to 8GB from 22GB --- .../doris/statistics/BaseAnalysisTask.java | 42 ++++++------------- .../statistics/ExternalAnalysisTask.java | 7 +--- .../doris/statistics/OlapAnalysisTask.java | 15 +++---- .../statistics/BaseAnalysisTaskTest.java | 2 +- .../statistics/OlapAnalysisTaskTest.java | 25 ++++++----- .../test_hive_statistics_all_type_p0.groovy | 29 ++++++++++++- 6 files changed, 64 insertions(+), 56 deletions(-) diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java index 52c8ce932a54b4..a7aaaf2c037c07 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java @@ -82,30 +82,6 @@ public abstract class BaseAnalysisTask { + "NOW() " + "FROM `${catalogName}`.`${dbName}`.`${tblName}` ${index} ${sampleHints} ${limit}"; - protected static final String DUJ1_ANALYZE_STRING_TEMPLATE = "SELECT " - + "CONCAT('${tblId}', '-', '${idxId}', '-', '${colId}') AS `id`, " - + "${catalogId} AS `catalog_id`, " - + "${dbId} AS `db_id`, " - + "${tblId} AS `tbl_id`, " - + "${idxId} AS `idx_id`, " - + "'${colId}' AS `col_id`, " - + "NULL AS `part_id`, " - + "${rowCount} AS `row_count`, " - + "${ndvFunction} as `ndv`, " - + "IFNULL(SUM(IF(`t1`.`column_key` IS NULL, `t1`.`count`, 0)), 0) * ${scaleFactor} as `null_count`, " - + "SUBSTRING(CAST(${min} AS STRING), 1, 1024) AS `min`, " - + "SUBSTRING(CAST(${max} AS STRING), 1, 1024) AS `max`, " - + "${dataSizeFunction} * ${scaleFactor} AS `data_size`, " - + "NOW() " - + "FROM ( " - + " SELECT t0.`colValue` as `column_key`, COUNT(1) as `count` " - + " FROM " - + " (SELECT SUBSTRING(CAST(`${colName}` AS STRING), 1, 1024) AS `colValue` " - + " FROM `${catalogName}`.`${dbName}`.`${tblName}` ${index} " - + " ${sampleHints} ${limit}) as `t0` " - + " GROUP BY `t0`.`colValue` " - + ") as `t1` "; - protected static final String DUJ1_ANALYZE_TEMPLATE = "SELECT " + "CONCAT('${tblId}', '-', '${idxId}', '-', '${colId}') AS `id`, " + "${catalogId} AS `catalog_id`, " @@ -122,11 +98,11 @@ public abstract class BaseAnalysisTask { + "${dataSizeFunction} * ${scaleFactor} AS `data_size`, " + "NOW() " + "FROM ( " - + " SELECT t0.`${colName}` as `column_key`, COUNT(1) as `count` " + + " SELECT t0.`colValue` as `column_key`, COUNT(1) as `count`, SUM(`len`) as `column_length` " + " FROM " - + " (SELECT `${colName}` FROM `${catalogName}`.`${dbName}`.`${tblName}` ${index} " - + " ${sampleHints} ${limit}) as `t0` " - + " GROUP BY `t0`.`${colName}` " + + " (SELECT ${subStringColName} AS `colValue`, LENGTH(`${colName}`) as `len` " + + " FROM `${catalogName}`.`${dbName}`.`${tblName}` ${index} ${sampleHints} ${limit}) as `t0` " + + " GROUP BY `t0`.`colValue` " + ") as `t1` "; protected static final String ANALYZE_PARTITION_COLUMN_TEMPLATE = " SELECT " @@ -230,7 +206,7 @@ public long getJobId() { protected String getDataSizeFunction(Column column, boolean useDuj1) { if (useDuj1) { if (column.getType().isStringType()) { - return "SUM(LENGTH(`column_key`) * count)"; + return "SUM(`column_length`)"; } else { return "SUM(t1.count) * " + column.getType().getSlotSize(); } @@ -243,6 +219,14 @@ protected String getDataSizeFunction(Column column, boolean useDuj1) { } } + protected String getStringTypeColName(Column column) { + if (column.getType().isStringType()) { + return "xxhash_64(SUBSTRING(CAST(`${colName}` AS STRING), 1, 1024))"; + } else { + return "`${colName}`"; + } + } + protected String getMinFunction() { if (tableSample == null) { return "CAST(MIN(`${colName}`) as ${type}) "; diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/ExternalAnalysisTask.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/ExternalAnalysisTask.java index c054c7bc5b075e..979ff04f79e68d 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/ExternalAnalysisTask.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/ExternalAnalysisTask.java @@ -132,11 +132,8 @@ protected void getColumnStats() throws Exception { params.put("ndvFunction", "ROUND(NDV(`${colName}`) * ${scaleFactor})"); params.put("rowCount", "ROUND(count(1) * ${scaleFactor})"); } else { - if (col.getType().isStringType()) { - sb.append(DUJ1_ANALYZE_STRING_TEMPLATE); - } else { - sb.append(DUJ1_ANALYZE_TEMPLATE); - } + sb.append(DUJ1_ANALYZE_TEMPLATE); + params.put("subStringColName", getStringTypeColName(col)); params.put("dataSizeFunction", getDataSizeFunction(col, true)); params.put("ndvFunction", getNdvFunction("ROUND(SUM(t1.count) * ${scaleFactor})")); params.put("rowCount", "ROUND(SUM(t1.count) * ${scaleFactor})"); diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java index 7f96c52e81b248..5e5d6ac474042f 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java @@ -134,8 +134,8 @@ protected void doSample() throws Exception { params.put("colId", StatisticsUtil.escapeSQL(String.valueOf(info.colName))); params.put("dataSizeFunction", getDataSizeFunction(col, false)); params.put("dbName", db.getFullName()); - params.put("colName", StatisticsUtil.escapeColumnName(info.colName)); - params.put("tblName", tbl.getName()); + params.put("colName", StatisticsUtil.escapeColumnName(String.valueOf(info.colName))); + params.put("tblName", String.valueOf(tbl.getName())); params.put("scaleFactor", String.valueOf(scaleFactor)); params.put("sampleHints", tabletStr.isEmpty() ? "" : String.format("TABLET(%s)", tabletStr)); params.put("ndvFunction", getNdvFunction(String.valueOf(totalRowCount))); @@ -168,11 +168,8 @@ protected void doSample() throws Exception { sql = stringSubstitutor.replace(LINEAR_ANALYZE_TEMPLATE); } else { params.put("dataSizeFunction", getDataSizeFunction(col, true)); - if (col.getType().isStringType()) { - sql = stringSubstitutor.replace(DUJ1_ANALYZE_STRING_TEMPLATE); - } else { - sql = stringSubstitutor.replace(DUJ1_ANALYZE_TEMPLATE); - } + params.put("subStringColName", getStringTypeColName(col)); + sql = stringSubstitutor.replace(DUJ1_ANALYZE_TEMPLATE); } LOG.info("Sample for column [{}]. Total rows [{}], rows to sample [{}], scale factor [{}], " + "limited [{}], distribute column [{}], partition column [{}], key column [{}], " @@ -195,8 +192,8 @@ protected ResultRow collectBasicStat(AutoCloseConnectContext context) { long startTime = System.currentTimeMillis(); Map params = new HashMap<>(); params.put("dbName", db.getFullName()); - params.put("colName", StatisticsUtil.escapeColumnName(info.colName)); - params.put("tblName", tbl.getName()); + params.put("colName", StatisticsUtil.escapeColumnName(String.valueOf(info.colName))); + params.put("tblName", String.valueOf(tbl.getName())); params.put("index", getIndex()); StringSubstitutor stringSubstitutor = new StringSubstitutor(params); String sql = stringSubstitutor.replace(BASIC_STATS_TEMPLATE); diff --git a/fe/fe-core/src/test/java/org/apache/doris/statistics/BaseAnalysisTaskTest.java b/fe/fe-core/src/test/java/org/apache/doris/statistics/BaseAnalysisTaskTest.java index 187c4d207dfcad..86ccfff26e0375 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/statistics/BaseAnalysisTaskTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/statistics/BaseAnalysisTaskTest.java @@ -31,7 +31,7 @@ public void testGetFunctions() { OlapAnalysisTask olapAnalysisTask = new OlapAnalysisTask(); Column column = new Column("string_column", PrimitiveType.STRING); String dataSizeFunction = olapAnalysisTask.getDataSizeFunction(column, true); - Assertions.assertEquals("SUM(LENGTH(`column_key`) * count)", dataSizeFunction); + Assertions.assertEquals("SUM(`column_length`)", dataSizeFunction); dataSizeFunction = olapAnalysisTask.getDataSizeFunction(column, false); Assertions.assertEquals("SUM(LENGTH(`${colName}`))", dataSizeFunction); diff --git a/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java b/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java index 75506b1c85a014..5bb0920e4338b8 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java @@ -160,9 +160,11 @@ public void runQuery(String sql) { + "SUBSTRING(CAST('1' AS STRING), 1, 1024) AS `min`," + " SUBSTRING(CAST('2' AS STRING), 1, 1024) AS `max`, " + "SUM(t1.count) * 4 * 5.0 AS `data_size`, NOW() " - + "FROM ( SELECT t0.`${colName}` as `column_key`, COUNT(1) " - + "as `count` FROM (SELECT `${colName}` FROM `catalogName`.`${dbName}`.`${tblName}`" - + " limit 100) as `t0` GROUP BY `t0`.`${colName}` ) as `t1` ", sql); + + "FROM ( SELECT t0.`colValue` as `column_key`, COUNT(1) " + + "as `count`, SUM(`len`) as `column_length` FROM " + + "(SELECT `null` AS `colValue`, LENGTH(`null`) as `len` " + + "FROM `catalogName`.`${dbName}`.`null`" + + " limit 100) as `t0` GROUP BY `t0`.`colValue` ) as `t1` ", sql); return; } }; @@ -232,12 +234,12 @@ public void runQuery(String sql) { + "SELECT CONCAT(30001, '-', -1, '-', 'null') AS `id`, " + "10001 AS `catalog_id`, 20001 AS `db_id`, 30001 AS `tbl_id`, " + "-1 AS `idx_id`, 'null' AS `col_id`, NULL AS `part_id`, " - + "500 AS `row_count`, ROUND(NDV(`${colName}`) * 5.0) as `ndv`, " - + "ROUND(SUM(CASE WHEN `${colName}` IS NULL THEN 1 ELSE 0 END) * 5.0) " + + "500 AS `row_count`, ROUND(NDV(`null`) * 5.0) as `ndv`, " + + "ROUND(SUM(CASE WHEN `null` IS NULL THEN 1 ELSE 0 END) * 5.0) " + "AS `null_count`, SUBSTRING(CAST('1' AS STRING), 1, 1024) AS `min`, " + "SUBSTRING(CAST('2' AS STRING), 1, 1024) AS `max`, " - + "SUM(LENGTH(`${colName}`)) * 5.0 AS `data_size`, NOW() " - + "FROM `catalogName`.`${dbName}`.`${tblName}` limit 100", sql); + + "SUM(LENGTH(`null`)) * 5.0 AS `data_size`, NOW() " + + "FROM `catalogName`.`${dbName}`.`null` limit 100", sql); return; } }; @@ -320,9 +322,12 @@ public void runQuery(String sql) { + "IS NULL, `t1`.`count`, 0)), 0) * 5.0 as `null_count`, " + "SUBSTRING(CAST('1' AS STRING), 1, 1024) AS `min`, " + "SUBSTRING(CAST('2' AS STRING), 1, 1024) AS `max`, " - + "SUM(LENGTH(`column_key`) * count) * 5.0 AS `data_size`, NOW() " - + "FROM ( SELECT t0.`colValue` as `column_key`, COUNT(1) as `count` FROM " - + "(SELECT SUBSTRING(CAST(`${colName}` AS STRING), 1, 1024) AS `colValue` FROM `catalogName`.`${dbName}`.`${tblName}` limit 100) as `t0` GROUP BY `t0`.`colValue` ) as `t1` ", sql); + + "SUM(`column_length`) * 5.0 AS `data_size`, NOW() " + + "FROM ( SELECT t0.`colValue` as `column_key`, COUNT(1) as `count`, SUM(`len`) as " + + "`column_length` FROM (SELECT xxhash_64(SUBSTRING(CAST(`null` AS STRING), 1, 1024)) " + + "AS `colValue`, LENGTH(`null`) as `len`" + + " FROM `catalogName`.`${dbName}`.`null` limit 100) as `t0` " + + "GROUP BY `t0`.`colValue` ) as `t1` ", sql); return; } }; diff --git a/regression-test/suites/external_table_p0/hive/test_hive_statistics_all_type_p0.groovy b/regression-test/suites/external_table_p0/hive/test_hive_statistics_all_type_p0.groovy index 496cab8825a713..7d85f288bd7b94 100644 --- a/regression-test/suites/external_table_p0/hive/test_hive_statistics_all_type_p0.groovy +++ b/regression-test/suites/external_table_p0/hive/test_hive_statistics_all_type_p0.groovy @@ -18,7 +18,7 @@ suite("test_hive_statistics_all_type_p0", "all_types,p0,external,hive,external_docker,external_docker_hive") { String enabled = context.config.otherConfigs.get("enableHiveTest") if (enabled == null || !enabled.equalsIgnoreCase("true")) { - logger.info("diable Hive test.") + logger.info("disable Hive test.") return; } @@ -34,10 +34,35 @@ suite("test_hive_statistics_all_type_p0", "all_types,p0,external,hive,external_d 'hive.metastore.uris' = 'thrift://${externalEnvIp}:${hms_port}' );""" sql """use `${catalog_name}`.`default`""" - sql """analyze table orc_all_types with sync""" + sql """analyze table orc_all_types with sync with sample rows 4000000""" def result = sql """show column stats orc_all_types;""" assertEquals(16, result.size()) + result = sql """show column stats orc_all_types (int_col);""" + assertEquals("int_col", result[0][0]) + assertEquals("3600.0", result[0][2]) + assertEquals("3240.0", result[0][3]) + assertEquals("361.0", result[0][4]) + assertEquals("14400.0", result[0][5]) + + result = sql """show column stats orc_all_types (string_col);""" + assertEquals("string_col", result[0][0]) + assertEquals("3600.0", result[0][2]) + assertEquals("3254.0", result[0][3]) + assertEquals("347.0", result[0][4]) + assertEquals("453634.0", result[0][5]) + + result = sql """show column stats orc_all_types (varchar_col);""" + assertEquals("varchar_col", result[0][0]) + assertEquals("3600.0", result[0][2]) + assertEquals("6.0", result[0][3]) + assertEquals("0.0", result[0][4]) + assertEquals("35950.0", result[0][5]) + + sql """drop stats orc_all_types""" + sql """analyze table orc_all_types with sync""" + result = sql """show column stats orc_all_types;""" + assertEquals(16, result.size()) result = sql """show column stats orc_all_types (int_col);""" assertEquals("int_col", result[0][0]) assertEquals("3600.0", result[0][2])