From 18bfa0bc3074a1426cd6b914a2425052ca9b0e44 Mon Sep 17 00:00:00 2001 From: kikyo Date: Wed, 20 Sep 2023 10:57:35 +0800 Subject: [PATCH] 1. disable page cache 2. single replica for data size calc 3. move auto analyze related options to global session variable --- docs/en/docs/query-acceleration/statistics.md | 4 +-- .../docs/query-acceleration/statistics.md | 4 +-- .../java/org/apache/doris/common/Config.java | 15 ++-------- .../org/apache/doris/catalog/OlapTable.java | 8 ++++-- .../org/apache/doris/catalog/TableIf.java | 2 +- .../org/apache/doris/qe/SessionVariable.java | 11 ++++++-- .../doris/statistics/AnalysisTaskWrapper.java | 2 +- .../doris/statistics/BaseAnalysisTask.java | 3 +- .../doris/statistics/StatisticConstants.java | 3 ++ .../statistics/StatisticsAutoCollector.java | 10 +++---- .../doris/statistics/StatisticsCollector.java | 5 ++++ .../doris/statistics/util/StatisticsUtil.java | 28 +++++++++++++------ .../statistics/OlapAnalysisTaskTest.java | 6 ++-- 13 files changed, 61 insertions(+), 40 deletions(-) diff --git a/docs/en/docs/query-acceleration/statistics.md b/docs/en/docs/query-acceleration/statistics.md index f22e15b8f7b634..127240626e6ef7 100644 --- a/docs/en/docs/query-acceleration/statistics.md +++ b/docs/en/docs/query-acceleration/statistics.md @@ -237,16 +237,16 @@ Automatic collection tasks do not support viewing the completion status and fail |full_auto_analyze_end_time|End time for automatic statistics collection|02:00:00| |enable_auto_sample|Enable automatic sampling for large tables. When enabled, statistics will be automatically collected through sampling for tables larger than the `huge_table_lower_bound_size_in_bytes` threshold.| false| |auto_analyze_job_record_count|Controls the persistence of records for automatically triggered statistics collection jobs.|20000| -|huge_table_default_sample_rows|Defines the number of sample rows for large tables when automatic sampling is enabled.|200000| +|huge_table_default_sample_rows|Defines the number of sample rows for large tables when automatic sampling is enabled.|4194304| |huge_table_lower_bound_size_in_bytes|Defines the lower size threshold for large tables. When `enable_auto_sample` is enabled, statistics will be automatically collected through sampling for tables larger than this value.|5368 709120| |huge_table_auto_analyze_interval_in_millis|Controls the minimum time interval for automatic ANALYZE on large tables. Within this interval, tables larger than `huge_table_lower_bound_size_in_bytes` will only be analyzed once.|43200000| |table_stats_health_threshold|Takes a value between 0-100. When the data update volume reaches (100 - table_stats_health_threshold)% since the last statistics collection operation, the statistics for the table are considered outdated.|80| -|enable_full_auto_analyze|Enable automatic collection functionality|true| |Session Variable|Description|Default Value| |---|---|---| |full_auto_analyze_start_time|Start time for automatic statistics collection|00:00:00| |full_auto_analyze_end_time|End time for automatic statistics collection|02:00:00| +|enable_full_auto_analyze|Enable automatic collection functionality|true| Please note that when both FE configuration and global session variables are configured for the same parameter, the value of the global session variable takes precedence. diff --git a/docs/zh-CN/docs/query-acceleration/statistics.md b/docs/zh-CN/docs/query-acceleration/statistics.md index 0fbbf4c979fb6e..9bd69f944829a6 100644 --- a/docs/zh-CN/docs/query-acceleration/statistics.md +++ b/docs/zh-CN/docs/query-acceleration/statistics.md @@ -249,16 +249,16 @@ SHOW AUTO ANALYZE [ptable_name] |full_auto_analyze_end_time|自动统计信息收集结束时间|02:00:00| |enable_auto_sample|是否开启大表自动sample,开启后对于大小超过huge_table_lower_bound_size_in_bytes会自动通过采样收集| false| |auto_analyze_job_record_count|控制统计信息的自动触发作业执行记录的持久化行数|20000| -|huge_table_default_sample_rows|定义开启开启大表自动sample后,对大表的采样行数|200000| +|huge_table_default_sample_rows|定义开启开启大表自动sample后,对大表的采样行数|4194304| |huge_table_lower_bound_size_in_bytes|定义大表的大小下界,在开启enable_auto_sample的情况下,大小超过该值的表将会自动通过采样收集统计信息|5368 709120| |huge_table_auto_analyze_interval_in_millis|控制对大表的自动ANALYZE的最小时间间隔,在该时间间隔内大小超过huge_table_lower_bound_size_in_bytes的表仅ANALYZE一次|43200000| |table_stats_health_threshold|取值在0-100之间,当自上次统计信息收集操作之后,数据更新量达到 (100 - table_stats_health_threshold)% ,认为该表的统计信息已过时|80| -|enable_full_auto_analyze|开启自动收集功能|true| |会话变量|说明|默认值| |---|---|---| |full_auto_analyze_start_time|自动统计信息收集开始时间|00:00:00| |full_auto_analyze_end_time|自动统计信息收集结束时间|02:00:00| +|enable_full_auto_analyze|开启自动收集功能|true| 注意,对于fe配置和全局会话变量中均可配置的参数都设置的情况下,优先使用全局会话变量参数值。 diff --git a/fe/fe-common/src/main/java/org/apache/doris/common/Config.java b/fe/fe-common/src/main/java/org/apache/doris/common/Config.java index a2aa07ba24b7fe..1c4c58723a6d67 100644 --- a/fe/fe-common/src/main/java/org/apache/doris/common/Config.java +++ b/fe/fe-common/src/main/java/org/apache/doris/common/Config.java @@ -2079,15 +2079,6 @@ public class Config extends ConfigBase { "Sample size for hive row count estimation."}) public static int hive_stats_partition_sample_size = 3000; - @ConfField - public static boolean enable_full_auto_analyze = true; - - @ConfField - public static String full_auto_analyze_start_time = "00:00:00"; - - @ConfField - public static String full_auto_analyze_end_time = "02:00:00"; - @ConfField public static int statistics_sql_parallel_exec_instance_num = 1; @@ -2184,10 +2175,10 @@ public class Config extends ConfigBase { + "statistics through sampling"}) public static long huge_table_lower_bound_size_in_bytes = 5L * 1024 * 1024 * 1024; - @ConfField(description = {"定义开启开启大表自动sample后,对大表的采样行数", - "This defines the number of sample rows for large tables when automatic sampling for" + @ConfField(description = {"定义开启开启大表自动sample后,对大表的采样比例", + "This defines the number of sample percent for large tables when automatic sampling for" + "large tables is enabled"}) - public static int huge_table_default_sample_rows = 20_0000; + public static int huge_table_default_sample_rows = 4194304; @ConfField(description = {"是否开启大表自动sample,开启后对于大小超过huge_table_lower_bound_size_in_bytes会自动通过采样收集" + "统计信息", "Whether to enable automatic sampling for large tables, which, when enabled, automatically" diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/OlapTable.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/OlapTable.java index 10d5d542737245..90daf38b64bbfc 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/OlapTable.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/OlapTable.java @@ -1568,14 +1568,18 @@ public Partition replacePartition(Partition newPartition) { return oldPartition; } - public long getDataSize() { + public long getDataSize(boolean singleReplica) { long dataSize = 0; for (Partition partition : getAllPartitions()) { - dataSize += partition.getDataSize(false); + dataSize += partition.getDataSize(singleReplica); } return dataSize; } + public long getDataSize() { + return getDataSize(false); + } + public long getRemoteDataSize() { long remoteDataSize = 0; for (Partition partition : getAllPartitions()) { diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/TableIf.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/TableIf.java index d3e5beabf2e86d..5c9ea40d3ff144 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/TableIf.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/TableIf.java @@ -246,7 +246,7 @@ default long getLastUpdateTime() { return -1L; } - default long getDataSize() { + default long getDataSize(boolean singleReplica) { // TODO: Each tableIf should impl it by itself. return 0; } diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java index 4044c3e0e05722..0dace46e73e601 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java +++ b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java @@ -413,6 +413,8 @@ public class SessionVariable implements Serializable, Writable { public static final String TEST_QUERY_CACHE_HIT = "test_query_cache_hit"; + public static final String ENABLE_FULL_AUTO_ANALYZE = "enable_full_auto_analyze"; + public static final List DEBUG_VARIABLES = ImmutableList.of( SKIP_DELETE_PREDICATE, SKIP_DELETE_BITMAP, @@ -1198,13 +1200,13 @@ public void setMaxJoinNumberOfReorder(int maxJoinNumberOfReorder) { description = {"该参数定义自动ANALYZE例程的开始时间", "This parameter defines the start time for the automatic ANALYZE routine."}, flag = VariableMgr.GLOBAL) - public String fullAutoAnalyzeStartTime = ""; + public String fullAutoAnalyzeStartTime = "00:00:00"; @VariableMgr.VarAttr(name = FULL_AUTO_ANALYZE_END_TIME, needForward = true, checker = "checkAnalyzeTimeFormat", description = {"该参数定义自动ANALYZE例程的结束时间", "This parameter defines the end time for the automatic ANALYZE routine."}, flag = VariableMgr.GLOBAL) - public String fullAutoAnalyzeEndTime = ""; + public String fullAutoAnalyzeEndTime = "02:00:00"; @VariableMgr.VarAttr(name = ENABLE_UNIQUE_KEY_PARTIAL_UPDATE, needForward = false) public boolean enableUniqueKeyPartialUpdate = false; @@ -1216,6 +1218,11 @@ public void setMaxJoinNumberOfReorder(int maxJoinNumberOfReorder) { options = {"none", "sql_cache", "partition_cache"}) public String testQueryCacheHit = "none"; + @VariableMgr.VarAttr(name = ENABLE_FULL_AUTO_ANALYZE, + description = {"该参数控制是否开启自动收集", "Set false to disable auto analyze"}, + flag = VariableMgr.GLOBAL) + public boolean enableFullAutoAnalyze = true; + // If this fe is in fuzzy mode, then will use initFuzzyModeVariables to generate some variables, // not the default value set in the code. public void initFuzzyModeVariables() { diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisTaskWrapper.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisTaskWrapper.java index 800c4465764902..9aa3d85992b32c 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisTaskWrapper.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisTaskWrapper.java @@ -56,7 +56,7 @@ public void run() { if (task.killed) { return; } - if (task.info.scheduleType.equals(ScheduleType.AUTOMATIC) && !StatisticsUtil.checkAnalyzeTime( + if (task.info.scheduleType.equals(ScheduleType.AUTOMATIC) && !StatisticsUtil.inAnalyzeTime( LocalTime.now(TimeUtils.getTimeZone().toZoneId()))) { // TODO: Do we need a separate AnalysisState here? Env.getCurrentEnv().getAnalysisManager() diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java index edadf4c17bf4a5..196e2277cb3228 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java @@ -225,7 +225,8 @@ protected String getSampleExpression() { } int sampleRows = info.sampleRows; if (info.analysisMethod == AnalysisMethod.FULL) { - if (Config.enable_auto_sample && tbl.getDataSize() > Config.huge_table_lower_bound_size_in_bytes) { + if (Config.enable_auto_sample + && tbl.getDataSize(true) > Config.huge_table_lower_bound_size_in_bytes) { sampleRows = Config.huge_table_default_sample_rows; } else { return ""; diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticConstants.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticConstants.java index 4bb7030753f989..e6b8297d0c0b01 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticConstants.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticConstants.java @@ -81,6 +81,9 @@ public class StatisticConstants { // union more relation than 512 may cause StackOverFlowException in the future. public static final int UNION_ALL_LIMIT = 512; + public static final String FULL_AUTO_ANALYZE_START_TIME = "00:00:00"; + public static final String FULL_AUTO_ANALYZE_END_TIME = "23:59:59"; + static { SYSTEM_DBS.add(SystemInfoService.DEFAULT_CLUSTER + ClusterNamespace.CLUSTER_DELIMITER + FeConstants.INTERNAL_DB_NAME); diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsAutoCollector.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsAutoCollector.java index 3b8b7840aa56ac..9bbed0f417064e 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsAutoCollector.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsAutoCollector.java @@ -58,11 +58,11 @@ public StatisticsAutoCollector() { @Override protected void collect() { - if (!StatisticsUtil.checkAnalyzeTime(LocalTime.now(TimeUtils.getTimeZone().toZoneId()))) { + if (!StatisticsUtil.inAnalyzeTime(LocalTime.now(TimeUtils.getTimeZone().toZoneId()))) { analysisTaskExecutor.clear(); return; } - if (Config.enable_full_auto_analyze) { + if (StatisticsUtil.enableAutoAnalyze()) { analyzeAll(); } } @@ -115,7 +115,7 @@ protected boolean skip(TableIf table) { if (!(table instanceof OlapTable || table instanceof ExternalTable)) { return true; } - if (table.getDataSize() < Config.huge_table_lower_bound_size_in_bytes) { + if (table.getDataSize(true) < Config.huge_table_lower_bound_size_in_bytes) { return false; } TableStats tableStats = Env.getCurrentEnv().getAnalysisManager().findTableStatsStatus(table.getId()); @@ -124,7 +124,7 @@ protected boolean skip(TableIf table) { protected void createAnalyzeJobForTbl(DatabaseIf db, List analysisInfos, TableIf table) { - AnalysisMethod analysisMethod = table.getDataSize() > Config.huge_table_lower_bound_size_in_bytes + AnalysisMethod analysisMethod = table.getDataSize(true) > Config.huge_table_lower_bound_size_in_bytes ? AnalysisMethod.SAMPLE : AnalysisMethod.FULL; TableName tableName = new TableName(db.getCatalog().getName(), db.getFullName(), table.getName()); @@ -141,7 +141,7 @@ protected void createAnalyzeJobForTbl(DatabaseIf db, .setAnalysisType(AnalysisInfo.AnalysisType.FUNDAMENTALS) .setAnalysisMode(AnalysisInfo.AnalysisMode.INCREMENTAL) .setAnalysisMethod(analysisMethod) - .setSamplePercent(Config.huge_table_default_sample_rows) + .setSampleRows(Config.huge_table_default_sample_rows) .setScheduleType(ScheduleType.AUTOMATIC) .setState(AnalysisState.PENDING) .setTaskIds(new ArrayList<>()) diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsCollector.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsCollector.java index f65829e748b4b4..2d5c48168357fc 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsCollector.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsCollector.java @@ -23,12 +23,15 @@ import org.apache.doris.statistics.util.StatisticsUtil; import org.apache.hudi.common.util.VisibleForTesting; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; import java.util.HashMap; import java.util.Map; public abstract class StatisticsCollector extends MasterDaemon { + private static final Logger LOG = LogManager.getLogger(StatisticsCollector.class); protected final AnalysisTaskExecutor analysisTaskExecutor; @@ -45,6 +48,7 @@ protected void runAfterCatalogReady() { return; } if (!StatisticsUtil.statsTblAvailable()) { + LOG.info("Stats table not available, skip"); return; } if (Env.isCheckpointThread()) { @@ -52,6 +56,7 @@ protected void runAfterCatalogReady() { } if (!analysisTaskExecutor.idle()) { + LOG.info("Analyze tasks those submitted in last time is not finished, skip"); return; } collect(); diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/util/StatisticsUtil.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/util/StatisticsUtil.java index 1cb131099c5b4b..dd4e4f8515de06 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/util/StatisticsUtil.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/util/StatisticsUtil.java @@ -174,6 +174,7 @@ public static AutoCloseConnectContext buildConnectContext(boolean limitScan) { sessionVariable.setMaxExecMemByte(Config.statistics_sql_mem_limit_in_bytes); sessionVariable.cpuResourceLimit = Config.cpu_resource_limit_per_analyze_task; sessionVariable.setEnableInsertStrict(true); + sessionVariable.enablePageCache = false; sessionVariable.parallelExecInstanceNum = Config.statistics_sql_parallel_exec_instance_num; sessionVariable.parallelPipelineTaskNum = Config.statistics_sql_parallel_exec_instance_num; sessionVariable.setEnableNereidsPlanner(false); @@ -729,15 +730,14 @@ public static boolean isExternalTable(String catalogName, String dbName, String return table instanceof ExternalTable; } - public static boolean checkAnalyzeTime(LocalTime now) { + public static boolean inAnalyzeTime(LocalTime now) { try { Pair range = findRangeFromGlobalSessionVar(); - DateTimeFormatter timeFormatter = DateTimeFormatter.ofPattern("HH:mm:ss"); - LocalTime start = range == null - ? LocalTime.parse(Config.full_auto_analyze_start_time, timeFormatter) : range.first; - LocalTime end = range == null - ? LocalTime.parse(Config.full_auto_analyze_end_time, timeFormatter) : range.second; - + if (range == null) { + return false; + } + LocalTime start = range.first; + LocalTime end = range.second; if (start.isAfter(end) && (now.isAfter(start) || now.isBefore(end))) { return true; } else { @@ -754,13 +754,14 @@ private static Pair findRangeFromGlobalSessionVar() { String startTime = findRangeFromGlobalSessionVar(SessionVariable.FULL_AUTO_ANALYZE_START_TIME) .fullAutoAnalyzeStartTime; + // For compatibility if (StringUtils.isEmpty(startTime)) { - return null; + startTime = StatisticConstants.FULL_AUTO_ANALYZE_START_TIME; } String endTime = findRangeFromGlobalSessionVar(SessionVariable.FULL_AUTO_ANALYZE_END_TIME) .fullAutoAnalyzeEndTime; if (StringUtils.isEmpty(startTime)) { - return null; + endTime = StatisticConstants.FULL_AUTO_ANALYZE_END_TIME; } DateTimeFormatter timeFormatter = DateTimeFormatter.ofPattern("HH:mm:ss"); return Pair.of(LocalTime.parse(startTime, timeFormatter), LocalTime.parse(endTime, timeFormatter)); @@ -775,4 +776,13 @@ private static SessionVariable findRangeFromGlobalSessionVar(String varName) thr VariableMgr.getValue(sessionVariable, variableExpr); return sessionVariable; } + + public static boolean enableAutoAnalyze() { + try { + return findRangeFromGlobalSessionVar(SessionVariable.ENABLE_FULL_AUTO_ANALYZE).enableFullAutoAnalyze; + } catch (Exception e) { + LOG.warn("Fail to get value of enable auto analyze, return false by default", e); + } + return false; + } } diff --git a/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java b/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java index 185ae1f6f8a626..37fef3ebc8582f 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java @@ -34,7 +34,7 @@ public class OlapAnalysisTaskTest { public void testAutoSample(@Mocked CatalogIf catalogIf, @Mocked DatabaseIf databaseIf, @Mocked TableIf tableIf) { new Expectations() { { - tableIf.getDataSize(); + tableIf.getDataSize(true); result = 60_0000_0000L; } }; @@ -46,11 +46,11 @@ public void testAutoSample(@Mocked CatalogIf catalogIf, @Mocked DatabaseIf datab olapAnalysisTask.tbl = tableIf; Config.enable_auto_sample = true; String sampleExpr = olapAnalysisTask.getSampleExpression(); - Assertions.assertEquals("TABLESAMPLE(200000 ROWS)", sampleExpr); + Assertions.assertEquals("TABLESAMPLE(4194304 ROWS)", sampleExpr); new Expectations() { { - tableIf.getDataSize(); + tableIf.getDataSize(true); result = 1_0000_0000L; } };