diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/OlapTable.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/OlapTable.java index 97a88aad40f910..5f58ab277ff850 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/OlapTable.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/OlapTable.java @@ -56,6 +56,8 @@ import org.apache.doris.statistics.HistogramTask; import org.apache.doris.statistics.MVAnalysisTask; import org.apache.doris.statistics.OlapAnalysisTask; +import org.apache.doris.statistics.TableStats; +import org.apache.doris.statistics.util.StatisticsUtil; import org.apache.doris.system.Backend; import org.apache.doris.system.SystemInfoService; import org.apache.doris.thrift.TColumn; @@ -1122,6 +1124,32 @@ public BaseAnalysisTask createAnalysisTask(AnalysisInfo info) { return new MVAnalysisTask(info); } + @Override + public boolean needReAnalyzeTable(TableStats tblStats) { + long rowCount = getRowCount(); + // TODO: Do we need to analyze an empty table? + if (rowCount == 0) { + return false; + } + long updateRows = tblStats.updatedRows.get(); + int tblHealth = StatisticsUtil.getTableHealth(rowCount, updateRows); + return tblHealth < Config.table_stats_health_threshold; + } + + @Override + public Set findReAnalyzeNeededPartitions(TableStats tableStats) { + if (tableStats == null) { + return getPartitionNames().stream().map(this::getPartition) + .filter(Partition::hasData).map(Partition::getName).collect(Collectors.toSet()); + } + return getPartitionNames().stream() + .map(this::getPartition) + .filter(Partition::hasData) + .filter(partition -> + partition.getVisibleVersionTime() >= tableStats.updatedTime).map(Partition::getName) + .collect(Collectors.toSet()); + } + @Override public long getRowCount() { long rowCount = 0; diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/Table.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/Table.java index 24d9dcc8008c2b..12689894b4835a 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/Table.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/Table.java @@ -30,6 +30,7 @@ import org.apache.doris.statistics.AnalysisInfo; import org.apache.doris.statistics.BaseAnalysisTask; import org.apache.doris.statistics.ColumnStatistic; +import org.apache.doris.statistics.TableStats; import org.apache.doris.thrift.TTableDescriptor; import com.google.common.base.Preconditions; @@ -571,6 +572,15 @@ public Optional getColumnStatistic(String colName) { return Optional.empty(); } - public void analyze(String dbName) { + public void analyze(String dbName) {} + + @Override + public boolean needReAnalyzeTable(TableStats tblStats) { + return true; + } + + @Override + public Set findReAnalyzeNeededPartitions(TableStats tableStats) { + return Collections.emptySet(); } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/TableIf.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/TableIf.java index f23f839898a0be..21e2ddd154b9ee 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/TableIf.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/TableIf.java @@ -23,6 +23,7 @@ import org.apache.doris.statistics.AnalysisInfo; import org.apache.doris.statistics.BaseAnalysisTask; import org.apache.doris.statistics.ColumnStatistic; +import org.apache.doris.statistics.TableStats; import org.apache.doris.thrift.TTableDescriptor; import com.google.common.collect.Lists; @@ -136,6 +137,10 @@ default int getBaseColumnIdxByName(String colName) { Optional getColumnStatistic(String colName); + boolean needReAnalyzeTable(TableStats tblStats); + + Set findReAnalyzeNeededPartitions(TableStats tableStats); + void write(DataOutput out) throws IOException; /** diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/external/ExternalTable.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/external/ExternalTable.java index 0f9ec3f56433c7..6f31ac18d77ddb 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/external/ExternalTable.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/external/ExternalTable.java @@ -35,8 +35,10 @@ import org.apache.doris.statistics.AnalysisInfo; import org.apache.doris.statistics.BaseAnalysisTask; import org.apache.doris.statistics.ColumnStatistic; +import org.apache.doris.statistics.TableStats; import org.apache.doris.thrift.TTableDescriptor; +import com.google.common.collect.Sets; import com.google.gson.annotations.SerializedName; import lombok.Getter; import org.apache.commons.lang3.NotImplementedException; @@ -46,8 +48,10 @@ import java.io.DataInput; import java.io.DataOutput; import java.io.IOException; +import java.util.HashSet; import java.util.List; import java.util.Optional; +import java.util.Set; import java.util.concurrent.TimeUnit; import java.util.concurrent.locks.ReentrantReadWriteLock; @@ -375,4 +379,19 @@ public void gsonPostProcess() throws IOException { rwLock = new ReentrantReadWriteLock(true); objectCreated = false; } + + @Override + public boolean needReAnalyzeTable(TableStats tblStats) { + // TODO: Find a way to decide if this external table need to be reanalyzed. + // For now, simply return true for all external tables. + return true; + } + + @Override + public Set findReAnalyzeNeededPartitions(TableStats tableStats) { + HashSet partitions = Sets.newHashSet(); + // TODO: Find a way to collect external table partitions that need to be analyzed. + partitions.add("Dummy Partition"); + return partitions; + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/external/HMSExternalTable.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/external/HMSExternalTable.java index 0459c3ef2abd31..c1de1ea98d7555 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/external/HMSExternalTable.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/external/HMSExternalTable.java @@ -153,7 +153,7 @@ protected synchronized void makeSureInitialized() { } } objectCreated = true; - estimatedRowCount = getRowCountFromExternalSource(); + estimatedRowCount = getRowCountFromExternalSource(true); } } @@ -277,7 +277,7 @@ public long getUpdateTime() { @Override public long getRowCount() { makeSureInitialized(); - long rowCount = getRowCountFromExternalSource(); + long rowCount = getRowCountFromExternalSource(false); if (rowCount == -1) { LOG.debug("Will estimate row count from file list."); rowCount = StatisticsUtil.getRowCountFromFileList(this); @@ -285,11 +285,11 @@ public long getRowCount() { return rowCount; } - private long getRowCountFromExternalSource() { + private long getRowCountFromExternalSource(boolean isInit) { long rowCount; switch (dlaType) { case HIVE: - rowCount = StatisticsUtil.getHiveRowCount(this); + rowCount = StatisticsUtil.getHiveRowCount(this, isInit); break; case ICEBERG: rowCount = StatisticsUtil.getIcebergRowCount(this); diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/CatalogIf.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/CatalogIf.java index b84c769eacff5e..d135018e7541ba 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/CatalogIf.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/CatalogIf.java @@ -170,4 +170,6 @@ default CatalogLog constructEditLog() { // Return a copy of all db collection. @SuppressWarnings({"rawtypes", "unchecked"}) public Collection getAllDbs(); + + public boolean enableAutoAnalyze(); } diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/ExternalCatalog.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/ExternalCatalog.java index f213f9a302bd65..35d03dfabcfbf4 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/ExternalCatalog.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/ExternalCatalog.java @@ -72,6 +72,8 @@ public abstract class ExternalCatalog implements CatalogIf>, Writable, GsonPostProcessable { private static final Logger LOG = LogManager.getLogger(ExternalCatalog.class); + public static final String ENABLE_AUTO_ANALYZE = "enable.auto.analyze"; + // Unique id of this catalog, will be assigned after catalog is loaded. @SerializedName(value = "id") protected long id; @@ -587,6 +589,20 @@ public boolean useSelfSplitter() { @Override public Collection getAllDbs() { + makeSureInitialized(); return new HashSet<>(idToDb.values()); } + + @Override + public boolean enableAutoAnalyze() { + // By default, external catalog disables auto analyze, uses could set catalog property to enable it: + // "enable.auto.analyze" = true + Map properties = catalogProperty.getProperties(); + boolean ret = false; + if (properties.containsKey(ENABLE_AUTO_ANALYZE) + && properties.get(ENABLE_AUTO_ANALYZE).equalsIgnoreCase("true")) { + ret = true; + } + return ret; + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/InternalCatalog.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/InternalCatalog.java index 68a819a2dd0b32..a85d892734b6b0 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/InternalCatalog.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/InternalCatalog.java @@ -3134,4 +3134,9 @@ public void replayAutoIncrementIdUpdateLog(AutoIncrementIdUpdateLog log) throws OlapTable olapTable = (OlapTable) db.getTableOrMetaException(log.getTableId(), TableType.OLAP); olapTable.getAutoIncrementGenerator().applyChange(log.getColumnId(), log.getBatchEndId()); } + + @Override + public boolean enableAutoAnalyze() { + return true; + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsAutoAnalyzer.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsAutoAnalyzer.java index ad70769427446a..5d704e4f3b9f17 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsAutoAnalyzer.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsAutoAnalyzer.java @@ -21,7 +21,6 @@ import org.apache.doris.catalog.Column; import org.apache.doris.catalog.DatabaseIf; import org.apache.doris.catalog.Env; -import org.apache.doris.catalog.Partition; import org.apache.doris.catalog.TableIf; import org.apache.doris.catalog.View; import org.apache.doris.common.Config; @@ -88,7 +87,9 @@ protected void runAfterCatalogReady() { private void analyzeAll() { Set catalogs = Env.getCurrentEnv().getCatalogMgr().getCopyOfCatalog(); for (CatalogIf ctl : catalogs) { - + if (!ctl.enableAutoAnalyze()) { + continue; + } Collection dbs = ctl.getAllDbs(); for (DatabaseIf databaseIf : dbs) { if (StatisticConstants.STATISTICS_DB_BLACK_LIST.contains(databaseIf.getFullName())) { @@ -158,11 +159,11 @@ protected AnalysisInfo getReAnalyzeRequiredPart(AnalysisInfo jobInfo) { .findTable(jobInfo.catalogName, jobInfo.dbName, jobInfo.tblName); TableStats tblStats = Env.getCurrentEnv().getAnalysisManager().findTableStatsStatus(table.getId()); - if (!(tblStats == null || needReanalyzeTable(table, tblStats))) { + if (!(tblStats == null || table.needReAnalyzeTable(tblStats))) { return null; } - Set needRunPartitions = findReAnalyzeNeededPartitions(table, tblStats); + Set needRunPartitions = table.findReAnalyzeNeededPartitions(tblStats); if (needRunPartitions.isEmpty()) { return null; @@ -171,31 +172,6 @@ protected AnalysisInfo getReAnalyzeRequiredPart(AnalysisInfo jobInfo) { return getAnalysisJobInfo(jobInfo, table, needRunPartitions); } - @VisibleForTesting - protected Set findReAnalyzeNeededPartitions(TableIf table, TableStats tableStats) { - if (tableStats == null) { - return table.getPartitionNames().stream().map(table::getPartition) - .filter(Partition::hasData).map(Partition::getName).collect(Collectors.toSet()); - } - return table.getPartitionNames().stream() - .map(table::getPartition) - .filter(Partition::hasData) - .filter(partition -> - partition.getVisibleVersionTime() >= tableStats.updatedTime).map(Partition::getName) - .collect(Collectors.toSet()); - } - - private boolean needReanalyzeTable(TableIf table, TableStats tblStats) { - long rowCount = table.getRowCount(); - // TODO: Do we need to analyze an empty table? - if (rowCount == 0) { - return false; - } - long updateRows = tblStats.updatedRows.get(); - int tblHealth = StatisticsUtil.getTableHealth(rowCount, updateRows); - return tblHealth < Config.table_stats_health_threshold; - } - @VisibleForTesting public AnalysisInfo getAnalysisJobInfo(AnalysisInfo jobInfo, TableIf table, Set needRunPartitions) { @@ -247,6 +223,9 @@ protected void createSystemAnalysisJob(AnalysisInfo jobInfo) Map analysisTaskInfos = new HashMap<>(); AnalysisManager analysisManager = Env.getCurrentEnv().getAnalysisManager(); analysisManager.createTaskForEachColumns(jobInfo, analysisTaskInfos, false); + if (StatisticsUtil.isExternalTable(jobInfo.catalogName, jobInfo.dbName, jobInfo.tblName)) { + analysisManager.createTableLevelTaskForExternalTable(jobInfo, analysisTaskInfos, false); + } Env.getCurrentEnv().getAnalysisManager().registerSysJob(jobInfo, analysisTaskInfos); analysisTaskInfos.values().forEach(analysisTaskExecutor::submitTask); } diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/util/StatisticsUtil.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/util/StatisticsUtil.java index 876be46ab60541..87d8a0ba15c9c9 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/util/StatisticsUtil.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/util/StatisticsUtil.java @@ -489,9 +489,10 @@ public static int getTableHealth(long totalRows, long updatedRows) { * First get it from remote table parameters. If not found, estimate it : totalSize/estimatedRowSize * * @param table Hive HMSExternalTable to estimate row count. + * @param isInit Flag to indicate if this is called during init. To avoid recursively get schema. * @return estimated row count */ - public static long getHiveRowCount(HMSExternalTable table) { + public static long getHiveRowCount(HMSExternalTable table, boolean isInit) { Map parameters = table.getRemoteTable().getParameters(); if (parameters == null) { return -1; @@ -500,7 +501,7 @@ public static long getHiveRowCount(HMSExternalTable table) { if (parameters.containsKey(NUM_ROWS)) { return Long.parseLong(parameters.get(NUM_ROWS)); } - if (!parameters.containsKey(TOTAL_SIZE)) { + if (!parameters.containsKey(TOTAL_SIZE) || isInit) { return -1; } // Table parameters doesn't contain row count but contain total size. Estimate row count : totalSize/rowSize diff --git a/fe/fe-core/src/test/java/org/apache/doris/statistics/StatisticsAutoAnalyzerTest.java b/fe/fe-core/src/test/java/org/apache/doris/statistics/StatisticsAutoAnalyzerTest.java index 3368a5a6692b70..fff649a4473eb7 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/statistics/StatisticsAutoAnalyzerTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/statistics/StatisticsAutoAnalyzerTest.java @@ -39,7 +39,6 @@ import mockit.Injectable; import mockit.Mock; import mockit.MockUp; -import mockit.Mocked; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; @@ -140,14 +139,25 @@ public List getBaseSchema() { } @Test - public void testGetReAnalyzeRequiredPart0(@Mocked TableIf tableIf) { + public void testGetReAnalyzeRequiredPart0() { - new Expectations() { - { - tableIf.getRowCount(); - result = 100; + TableIf tableIf = new OlapTable(); + + new MockUp() { + @Mock + protected Set findReAnalyzeNeededPartitions(TableStats tableStats) { + Set partitionNames = new HashSet<>(); + partitionNames.add("p1"); + partitionNames.add("p2"); + return partitionNames; + } + + @Mock + public long getRowCount() { + return 100; } }; + new MockUp() { @Mock public TableIf findTable(String catalogName, String dbName, String tblName) { @@ -176,14 +186,6 @@ public TableStats findTableStatsStatus(long tblId) { }; new MockUp() { - @Mock - protected Set findReAnalyzeNeededPartitions(TableIf table, TableStats tableStats) { - Set partitionNames = new HashSet<>(); - partitionNames.add("p1"); - partitionNames.add("p2"); - return partitionNames; - } - @Mock public AnalysisInfo getAnalysisJobInfo(AnalysisInfo jobInfo, TableIf table, Set needRunPartitions) { diff --git a/regression-test/suites/external_table_p2/hive/test_hive_statistic.groovy b/regression-test/suites/external_table_p2/hive/test_hive_statistic.groovy index 2366267a27ea24..8465536a8c79fd 100644 --- a/regression-test/suites/external_table_p2/hive/test_hive_statistic.groovy +++ b/regression-test/suites/external_table_p2/hive/test_hive_statistic.groovy @@ -30,6 +30,10 @@ suite("test_hive_statistic", "p2,external,hive,external_remote,external_remote_h ); """ logger.info("catalog " + catalog_name + " created") + + // Test analyze table without init. + sql """analyze table ${catalog_name}.tpch_1000_parquet.region with sync""" + sql """switch ${catalog_name};""" logger.info("switched to catalog " + catalog_name) sql """use statistics;"""