From 804b2c1d9661b9b59b04a99937f4ab2244d33c6c Mon Sep 17 00:00:00 2001 From: Jibing Li Date: Mon, 25 Sep 2023 19:58:32 +0800 Subject: [PATCH] Sample analyze improvement. --- .../apache/doris/analysis/AnalyzeTblStmt.java | 6 - .../org/apache/doris/analysis/TableRef.java | 5 - .../java/org/apache/doris/catalog/Table.java | 5 + .../org/apache/doris/catalog/TableIf.java | 4 + .../doris/catalog/external/ExternalTable.java | 5 + .../catalog/external/HMSExternalTable.java | 31 ++++ .../translator/PhysicalPlanTranslator.java | 7 +- .../nereids/rules/analysis/BindRelation.java | 6 +- .../LogicalFileScanToPhysicalFileScan.java | 3 +- .../trees/copier/LogicalPlanDeepCopier.java | 2 +- .../trees/plans/logical/LogicalFileScan.java | 19 ++- .../plans/physical/PhysicalFileScan.java | 16 +- .../planner/external/FileQueryScanNode.java | 2 + .../doris/planner/external/HiveScanNode.java | 14 +- .../doris/statistics/AnalysisInfoBuilder.java | 2 - .../doris/statistics/BaseAnalysisTask.java | 22 ++- .../doris/statistics/HMSAnalysisTask.java | 83 ++++++---- .../doris/statistics/util/StatisticsUtil.java | 91 +++++++---- .../hive/test_hive_statistic_sample.groovy | 150 ++++++++++++++++++ 19 files changed, 371 insertions(+), 102 deletions(-) create mode 100644 regression-test/suites/external_table_p2/hive/test_hive_statistic_sample.groovy diff --git a/fe/fe-core/src/main/java/org/apache/doris/analysis/AnalyzeTblStmt.java b/fe/fe-core/src/main/java/org/apache/doris/analysis/AnalyzeTblStmt.java index cbc66f367f260f..185bee1d132551 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/analysis/AnalyzeTblStmt.java +++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/AnalyzeTblStmt.java @@ -21,7 +21,6 @@ import org.apache.doris.catalog.Database; import org.apache.doris.catalog.DatabaseIf; import org.apache.doris.catalog.Env; -import org.apache.doris.catalog.OlapTable; import org.apache.doris.catalog.TableIf; import org.apache.doris.catalog.View; import org.apache.doris.catalog.external.ExternalTable; @@ -167,11 +166,6 @@ public void check() throws AnalysisException { } analyzeProperties.check(); - // TODO support external table - if (analyzeProperties.isSampleRows() && !(table instanceof OlapTable)) { - throw new AnalysisException("Sampling statistics " - + "collection of external tables is not supported with rows, use percent instead."); - } if (analyzeProperties.isSync() && (analyzeProperties.isAutomatic() || analyzeProperties.getPeriodTimeInMs() != 0)) { throw new AnalysisException("Automatic/Period statistics collection " diff --git a/fe/fe-core/src/main/java/org/apache/doris/analysis/TableRef.java b/fe/fe-core/src/main/java/org/apache/doris/analysis/TableRef.java index 42370a0c53b9c3..b6c47e1cbfc366 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/analysis/TableRef.java +++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/TableRef.java @@ -476,11 +476,6 @@ protected void analyzeSample() throws AnalysisException { throw new AnalysisException("Sample table " + desc.getTable().getName() + " type " + desc.getTable().getType() + " is not supported"); } - if (tableSample != null && TableIf.TableType.HMS_EXTERNAL_TABLE.equals(desc.getTable().getType())) { - if (!tableSample.isPercent()) { - throw new AnalysisException("HMS table doesn't support sample rows, use percent instead."); - } - } } /** diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/Table.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/Table.java index 53ba6fe7fe4a3c..cdd179a34723b4 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/Table.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/Table.java @@ -583,4 +583,9 @@ public boolean needReAnalyzeTable(TableStatsMeta tblStats) { public Map> findReAnalyzeNeededPartitions() { return Collections.emptyMap(); } + + @Override + public List getChunkSizes() { + throw new NotImplementedException("getChunkSized not implemented"); + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/TableIf.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/TableIf.java index 5e0e59fb265cf1..16aa4a6d370979 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/TableIf.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/TableIf.java @@ -142,6 +142,10 @@ default int getBaseColumnIdxByName(String colName) { Map> findReAnalyzeNeededPartitions(); + // Get all the chunk sizes of this table. Now, only HMS external table implemented this interface. + // For HMS external table, the return result is a list of all the files' size. + List getChunkSizes(); + void write(DataOutput out) throws IOException; /** diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/external/ExternalTable.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/external/ExternalTable.java index 36dbe88967e56a..a915136193c891 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/external/ExternalTable.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/external/ExternalTable.java @@ -399,4 +399,9 @@ public Map> findReAnalyzeNeededPartitions() { partitions.add("Dummy Partition"); return getBaseSchema().stream().collect(Collectors.toMap(Column::getName, k -> partitions)); } + + @Override + public List getChunkSizes() { + throw new NotImplementedException("getChunkSized not implemented"); + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/external/HMSExternalTable.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/external/HMSExternalTable.java index 5aa55c97dd15a6..1f98ebad1312d0 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/external/HMSExternalTable.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/external/HMSExternalTable.java @@ -26,6 +26,7 @@ import org.apache.doris.catalog.Type; import org.apache.doris.common.AnalysisException; import org.apache.doris.datasource.HMSExternalCatalog; +import org.apache.doris.datasource.hive.HiveMetaStoreCache; import org.apache.doris.datasource.hive.PooledHiveMetaStoreClient; import org.apache.doris.statistics.AnalysisInfo; import org.apache.doris.statistics.BaseAnalysisTask; @@ -644,6 +645,36 @@ public void gsonPostProcess() throws IOException { super.gsonPostProcess(); estimatedRowCount = -1; } + + @Override + public List getChunkSizes() { + HiveMetaStoreCache.HivePartitionValues partitionValues = StatisticsUtil.getPartitionValuesForTable(this); + List filesByPartitions + = StatisticsUtil.getFilesForPartitions(this, partitionValues, 0); + List result = Lists.newArrayList(); + for (HiveMetaStoreCache.FileCacheValue files : filesByPartitions) { + for (HiveMetaStoreCache.HiveFileStatus file : files.getFiles()) { + result.add(file.getLength()); + } + } + return result; + } + + @Override + public long getDataSize(boolean singleReplica) { + long totalSize = StatisticsUtil.getTotalSizeFromHMS(this); + // Usually, we can get total size from HMS parameter. + if (totalSize > 0) { + return totalSize; + } + // If not found the size in HMS, calculate it by sum all files' size in table. + List chunkSizes = getChunkSizes(); + long total = 0; + for (long size : chunkSizes) { + total += size; + } + return total; + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/glue/translator/PhysicalPlanTranslator.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/glue/translator/PhysicalPlanTranslator.java index 8dd0ad742ebfa7..768f8ddf4b1aae 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/glue/translator/PhysicalPlanTranslator.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/glue/translator/PhysicalPlanTranslator.java @@ -479,7 +479,12 @@ public PlanFragment visitPhysicalFileScan(PhysicalFileScan fileScan, PlanTransla break; case HIVE: scanNode = new HiveScanNode(fileScan.translatePlanNodeId(), tupleDescriptor, false); - ((HiveScanNode) scanNode).setSelectedPartitions(fileScan.getSelectedPartitions()); + HiveScanNode hiveScanNode = (HiveScanNode) scanNode; + hiveScanNode.setSelectedPartitions(fileScan.getSelectedPartitions()); + if (fileScan.getTableSample().isPresent()) { + hiveScanNode.setTableSample(new TableSample(fileScan.getTableSample().get().isPercent, + fileScan.getTableSample().get().sampleValue, fileScan.getTableSample().get().seek)); + } break; default: throw new RuntimeException("do not support DLA type " + ((HMSExternalTable) table).getDlaType()); diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/analysis/BindRelation.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/analysis/BindRelation.java index 8a35ac4b4d9990..f97057e105ddcd 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/analysis/BindRelation.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/analysis/BindRelation.java @@ -230,11 +230,13 @@ private LogicalPlan getLogicalPlan(TableIf table, UnboundRelation unboundRelatio Plan hiveViewPlan = parseAndAnalyzeHiveView(hiveCatalog, ddlSql, cascadesContext); return new LogicalSubQueryAlias<>(tableQualifier, hiveViewPlan); } - return new LogicalFileScan(unboundRelation.getRelationId(), (HMSExternalTable) table, tableQualifier); + return new LogicalFileScan(unboundRelation.getRelationId(), (HMSExternalTable) table, tableQualifier, + unboundRelation.getTableSample()); case ICEBERG_EXTERNAL_TABLE: case PAIMON_EXTERNAL_TABLE: case MAX_COMPUTE_EXTERNAL_TABLE: - return new LogicalFileScan(unboundRelation.getRelationId(), (ExternalTable) table, tableQualifier); + return new LogicalFileScan(unboundRelation.getRelationId(), (ExternalTable) table, tableQualifier, + unboundRelation.getTableSample()); case SCHEMA: return new LogicalSchemaScan(unboundRelation.getRelationId(), table, tableQualifier); case JDBC_EXTERNAL_TABLE: diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/implementation/LogicalFileScanToPhysicalFileScan.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/implementation/LogicalFileScanToPhysicalFileScan.java index f53ce1553aed1a..d86e1d1667e18a 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/implementation/LogicalFileScanToPhysicalFileScan.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/implementation/LogicalFileScanToPhysicalFileScan.java @@ -39,7 +39,8 @@ public Rule build() { Optional.empty(), fileScan.getLogicalProperties(), fileScan.getConjuncts(), - fileScan.getSelectedPartitions()) + fileScan.getSelectedPartitions(), + fileScan.getTableSample()) ).toRule(RuleType.LOGICAL_FILE_SCAN_TO_PHYSICAL_FILE_SCAN_RULE); } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/copier/LogicalPlanDeepCopier.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/copier/LogicalPlanDeepCopier.java index 17b2f0d2d3022d..22530cabea687d 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/copier/LogicalPlanDeepCopier.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/copier/LogicalPlanDeepCopier.java @@ -212,7 +212,7 @@ public Plan visitLogicalFileScan(LogicalFileScan fileScan, DeepCopierContext con return context.getRelationReplaceMap().get(fileScan.getRelationId()); } LogicalFileScan newFileScan = new LogicalFileScan(StatementScopeIdGenerator.newRelationId(), - fileScan.getTable(), fileScan.getQualifier()); + fileScan.getTable(), fileScan.getQualifier(), fileScan.getTableSample()); updateLeadingRelationIdMap(newFileScan.getRelationId(), fileScan.getTable().getName(), newFileScan); updateReplaceMapWithOutput(fileScan, newFileScan, context.exprIdReplaceMap); context.putRelation(fileScan.getRelationId(), newFileScan); diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/logical/LogicalFileScan.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/logical/LogicalFileScan.java index c25a25efcecfcd..390fb1c97e0224 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/logical/LogicalFileScan.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/logical/LogicalFileScan.java @@ -21,6 +21,7 @@ import org.apache.doris.catalog.external.ExternalTable; import org.apache.doris.nereids.memo.GroupExpression; import org.apache.doris.nereids.properties.LogicalProperties; +import org.apache.doris.nereids.trees.TableSample; import org.apache.doris.nereids.trees.expressions.Expression; import org.apache.doris.nereids.trees.plans.Plan; import org.apache.doris.nereids.trees.plans.PlanType; @@ -49,22 +50,26 @@ public class LogicalFileScan extends LogicalCatalogRelation { private final Set conjuncts; @Getter private final SelectedPartitions selectedPartitions; + @Getter + private final Optional tableSample; /** * Constructor for LogicalFileScan. */ public LogicalFileScan(RelationId id, ExternalTable table, List qualifier, Optional groupExpression, Optional logicalProperties, - Set conjuncts, SelectedPartitions selectedPartitions) { + Set conjuncts, SelectedPartitions selectedPartitions, Optional tableSample) { super(id, PlanType.LOGICAL_FILE_SCAN, table, qualifier, groupExpression, logicalProperties); this.conjuncts = conjuncts; this.selectedPartitions = selectedPartitions; + this.tableSample = tableSample; } - public LogicalFileScan(RelationId id, ExternalTable table, List qualifier) { + public LogicalFileScan(RelationId id, ExternalTable table, List qualifier, + Optional tableSample) { this(id, table, qualifier, Optional.empty(), Optional.empty(), - Sets.newHashSet(), SelectedPartitions.NOT_PRUNED); + Sets.newHashSet(), SelectedPartitions.NOT_PRUNED, tableSample); } @Override @@ -85,24 +90,24 @@ public String toString() { @Override public LogicalFileScan withGroupExpression(Optional groupExpression) { return new LogicalFileScan(relationId, (ExternalTable) table, qualifier, groupExpression, - Optional.of(getLogicalProperties()), conjuncts, selectedPartitions); + Optional.of(getLogicalProperties()), conjuncts, selectedPartitions, tableSample); } @Override public Plan withGroupExprLogicalPropChildren(Optional groupExpression, Optional logicalProperties, List children) { return new LogicalFileScan(relationId, (ExternalTable) table, qualifier, - groupExpression, logicalProperties, conjuncts, selectedPartitions); + groupExpression, logicalProperties, conjuncts, selectedPartitions, tableSample); } public LogicalFileScan withConjuncts(Set conjuncts) { return new LogicalFileScan(relationId, (ExternalTable) table, qualifier, groupExpression, - Optional.of(getLogicalProperties()), conjuncts, selectedPartitions); + Optional.of(getLogicalProperties()), conjuncts, selectedPartitions, tableSample); } public LogicalFileScan withSelectedPartitions(SelectedPartitions selectedPartitions) { return new LogicalFileScan(relationId, (ExternalTable) table, qualifier, groupExpression, - Optional.of(getLogicalProperties()), conjuncts, selectedPartitions); + Optional.of(getLogicalProperties()), conjuncts, selectedPartitions, tableSample); } @Override diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/physical/PhysicalFileScan.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/physical/PhysicalFileScan.java index be4f5b179839dd..3dc83c7f43d300 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/physical/PhysicalFileScan.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/physical/PhysicalFileScan.java @@ -22,6 +22,7 @@ import org.apache.doris.nereids.properties.DistributionSpec; import org.apache.doris.nereids.properties.LogicalProperties; import org.apache.doris.nereids.properties.PhysicalProperties; +import org.apache.doris.nereids.trees.TableSample; import org.apache.doris.nereids.trees.expressions.Expression; import org.apache.doris.nereids.trees.plans.Plan; import org.apache.doris.nereids.trees.plans.PlanType; @@ -47,6 +48,8 @@ public class PhysicalFileScan extends PhysicalCatalogRelation { private final Set conjuncts; @Getter private final SelectedPartitions selectedPartitions; + @Getter + private final Optional tableSample; /** * Constructor for PhysicalFileScan. @@ -54,11 +57,12 @@ public class PhysicalFileScan extends PhysicalCatalogRelation { public PhysicalFileScan(RelationId id, ExternalTable table, List qualifier, DistributionSpec distributionSpec, Optional groupExpression, LogicalProperties logicalProperties, Set conjuncts, - SelectedPartitions selectedPartitions) { + SelectedPartitions selectedPartitions, Optional tableSample) { super(id, PlanType.PHYSICAL_FILE_SCAN, table, qualifier, groupExpression, logicalProperties); this.distributionSpec = distributionSpec; this.conjuncts = conjuncts; this.selectedPartitions = selectedPartitions; + this.tableSample = tableSample; } /** @@ -67,12 +71,14 @@ public PhysicalFileScan(RelationId id, ExternalTable table, List qualifi public PhysicalFileScan(RelationId id, ExternalTable table, List qualifier, DistributionSpec distributionSpec, Optional groupExpression, LogicalProperties logicalProperties, PhysicalProperties physicalProperties, - Statistics statistics, Set conjuncts, SelectedPartitions selectedPartitions) { + Statistics statistics, Set conjuncts, SelectedPartitions selectedPartitions, + Optional tableSample) { super(id, PlanType.PHYSICAL_FILE_SCAN, table, qualifier, groupExpression, logicalProperties, physicalProperties, statistics); this.distributionSpec = distributionSpec; this.conjuncts = conjuncts; this.selectedPartitions = selectedPartitions; + this.tableSample = tableSample; } @Override @@ -95,14 +101,14 @@ public R accept(PlanVisitor visitor, C context) { @Override public PhysicalFileScan withGroupExpression(Optional groupExpression) { return new PhysicalFileScan(relationId, getTable(), qualifier, distributionSpec, - groupExpression, getLogicalProperties(), conjuncts, selectedPartitions); + groupExpression, getLogicalProperties(), conjuncts, selectedPartitions, tableSample); } @Override public Plan withGroupExprLogicalPropChildren(Optional groupExpression, Optional logicalProperties, List children) { return new PhysicalFileScan(relationId, getTable(), qualifier, distributionSpec, - groupExpression, logicalProperties.get(), conjuncts, selectedPartitions); + groupExpression, logicalProperties.get(), conjuncts, selectedPartitions, tableSample); } @Override @@ -115,6 +121,6 @@ public PhysicalFileScan withPhysicalPropertiesAndStats(PhysicalProperties physic Statistics statistics) { return new PhysicalFileScan(relationId, getTable(), qualifier, distributionSpec, groupExpression, getLogicalProperties(), physicalProperties, statistics, conjuncts, - selectedPartitions); + selectedPartitions, tableSample); } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/planner/external/FileQueryScanNode.java b/fe/fe-core/src/main/java/org/apache/doris/planner/external/FileQueryScanNode.java index 5fe573ab5b82d3..7f7be8f51b60ff 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/planner/external/FileQueryScanNode.java +++ b/fe/fe-core/src/main/java/org/apache/doris/planner/external/FileQueryScanNode.java @@ -74,6 +74,7 @@ import com.google.common.base.Preconditions; import com.google.common.collect.Lists; import com.google.common.collect.Maps; +import lombok.Getter; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; @@ -95,6 +96,7 @@ public abstract class FileQueryScanNode extends FileScanNode { protected Map destSlotDescByName; protected TFileScanRangeParams params; + @Getter protected TableSample tableSample; /** diff --git a/fe/fe-core/src/main/java/org/apache/doris/planner/external/HiveScanNode.java b/fe/fe-core/src/main/java/org/apache/doris/planner/external/HiveScanNode.java index ba9b3e0abc387e..1ba77fa5f9c09f 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/planner/external/HiveScanNode.java +++ b/fe/fe-core/src/main/java/org/apache/doris/planner/external/HiveScanNode.java @@ -66,6 +66,7 @@ import java.util.Collections; import java.util.List; import java.util.Map; +import java.util.Random; import java.util.stream.Collectors; public class HiveScanNode extends FileQueryScanNode { @@ -263,9 +264,18 @@ private List selectFiles(List totalSize += file.getLength(); } } - long sampleSize = totalSize * tableSample.getSampleValue() / 100; + long sampleSize = 0; + if (tableSample.isPercent()) { + sampleSize = totalSize * tableSample.getSampleValue() / 100; + } else { + long estimatedRowSize = 0; + for (Column column : hmsTable.getFullSchema()) { + estimatedRowSize += column.getDataType().getSlotSize(); + } + sampleSize = estimatedRowSize * tableSample.getSampleValue(); + } long selectedSize = 0; - Collections.shuffle(fileList); + Collections.shuffle(fileList, new Random(tableSample.getSeek())); int index = 0; for (HiveMetaStoreCache.HiveFileStatus file : fileList) { selectedSize += file.getLength(); diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisInfoBuilder.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisInfoBuilder.java index d84cebfd6c9297..b6d9aafe853fdc 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisInfoBuilder.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisInfoBuilder.java @@ -58,9 +58,7 @@ public class AnalysisInfoBuilder { private boolean samplingPartition; private boolean isAllPartition; private long partitionCount; - private CronExpression cronExpression; - private boolean forceFull; public AnalysisInfoBuilder() { diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java index f689ab9d48b5fa..af85b528247816 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java @@ -137,6 +137,7 @@ protected void init(AnalysisInfo info) { info, AnalysisState.FAILED, String.format("Table with name %s not exists", info.tblName), System.currentTimeMillis()); } + tableSample = getTableSample(); // External Table level task doesn't contain a column. Don't need to do the column related analyze. if (info.externalTableLevelTask) { return; @@ -150,8 +151,6 @@ protected void init(AnalysisInfo info) { Preconditions.checkArgument(!StatisticsUtil.isUnsupportedType(col.getType()), String.format("Column with type %s is not supported", col.getType().toString())); } - tableSample = getTableSample(); - } public void execute() { @@ -230,19 +229,18 @@ protected TableSample getTableSample() { if (info.forceFull) { return null; } - long sampleRows = info.sampleRows; - if (info.analysisMethod == AnalysisMethod.FULL) { - if (Config.enable_auto_sample - && tbl.getDataSize(true) > Config.huge_table_lower_bound_size_in_bytes) { - sampleRows = Config.huge_table_default_sample_rows; - } else { - return null; - } - } + // If user specified sample percent or sample rows, use it. if (info.samplePercent > 0) { return new TableSample(true, (long) info.samplePercent); + } else if (info.sampleRows > 0) { + return new TableSample(false, info.sampleRows); + } else if (info.analysisMethod == AnalysisMethod.FULL + && Config.enable_auto_sample + && tbl.getDataSize(true) > Config.huge_table_lower_bound_size_in_bytes) { + // If user doesn't specify sample percent/rows, use auto sample and update sample rows in analysis info. + return new TableSample(false, (long) Config.huge_table_default_sample_rows); } else { - return new TableSample(false, sampleRows); + return null; } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java index 405cead922e2ce..076479aae8f50b 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java @@ -32,10 +32,12 @@ import org.apache.logging.log4j.Logger; import java.util.ArrayList; +import java.util.Collections; import java.util.Date; import java.util.HashMap; import java.util.List; import java.util.Map; +import java.util.Random; import java.util.Set; import java.util.StringJoiner; import java.util.stream.Collectors; @@ -43,10 +45,9 @@ public class HMSAnalysisTask extends BaseAnalysisTask { private static final Logger LOG = LogManager.getLogger(HMSAnalysisTask.class); - public static final String TOTAL_SIZE = "totalSize"; - public static final String NUM_ROWS = "numRows"; - public static final String NUM_FILES = "numFiles"; - public static final String TIMESTAMP = "transient_lastDdlTime"; + // While doing sample analysis, the sampled ndv result will multiply a factor (total size/sample size) + // if ndv(col)/count(col) is greater than this threshold. + private static final String NDV_MULTIPLY_THRESHOLD = "0.3"; private static final String ANALYZE_TABLE_TEMPLATE = "INSERT INTO " + "${internalDB}.${columnStatTbl}" @@ -58,14 +59,17 @@ public class HMSAnalysisTask extends BaseAnalysisTask { + "${idxId} AS idx_id, " + "'${colId}' AS col_id, " + "NULL AS part_id, " - + "${countExpr} AS row_count, " - + "NDV(`${colName}`) AS ndv, " - + "${nullCountExpr} AS null_count, " + + "ROUND(COUNT(1) * ${scaleFactor}) AS row_count, " + + "case when NDV(`${colName}`)/count('${colName}') < " + + NDV_MULTIPLY_THRESHOLD + + " then NDV(`${colName}`) " + + "else NDV(`${colName}`) * ${scaleFactor} end AS ndv, " + + "ROUND(SUM(CASE WHEN `${colName}` IS NULL THEN 1 ELSE 0 END) * ${scaleFactor}) AS null_count, " + "MIN(`${colName}`) AS min, " + "MAX(`${colName}`) AS max, " - + "${dataSizeFunction} AS data_size, " + + "${dataSizeFunction} * ${scaleFactor} AS data_size, " + "NOW() " - + "FROM `${catalogName}`.`${dbName}`.`${tblName}`"; + + "FROM `${catalogName}`.`${dbName}`.`${tblName}` ${sampleExpr}"; private static final String ANALYZE_PARTITION_TEMPLATE = " SELECT " + "CONCAT(${tblId}, '-', ${idxId}, '-', '${colId}', '-', ${partId}) AS id, " @@ -83,7 +87,7 @@ public class HMSAnalysisTask extends BaseAnalysisTask { + "${dataSizeFunction} AS data_size, " + "NOW() FROM `${catalogName}`.`${dbName}`.`${tblName}` where "; - private static final String ANALYZE_TABLE_COUNT_TEMPLATE = "SELECT ${countExpr} as rowCount " + private static final String ANALYZE_TABLE_COUNT_TEMPLATE = "SELECT ROUND(COUNT(1) * ${scaleFactor}) as rowCount " + "FROM `${catalogName}`.`${dbName}`.`${tblName}` ${sampleExpr}"; // cache stats for each partition, it would be inserted into column_statistics in a batch. @@ -160,7 +164,6 @@ private void getTableColumnStats() throws Exception { params.put("colName", col.getName()); params.put("colId", info.colName); params.put("dataSizeFunction", getDataSizeFunction(col)); - params.put("nullCountExpr", getNullCountExpression()); StringSubstitutor stringSubstitutor = new StringSubstitutor(params); String sql = stringSubstitutor.replace(sb.toString()); executeInsertSql(sql); @@ -277,7 +280,8 @@ private Map buildTableStatsParams(String partId) { commonParams.put("catalogName", catalog.getName()); commonParams.put("dbName", db.getFullName()); commonParams.put("tblName", tbl.getName()); - commonParams.put("countExpr", getCountExpression()); + commonParams.put("sampleExpr", getSampleExpression()); + commonParams.put("scaleFactor", getSampleScaleFactor()); if (col != null) { commonParams.put("type", col.getType().toString()); } @@ -285,30 +289,51 @@ private Map buildTableStatsParams(String partId) { return commonParams; } - protected String getCountExpression() { - if (info.samplePercent > 0) { - return String.format("ROUND(COUNT(1) * 100 / %d)", info.samplePercent); - } else { - return "COUNT(1)"; + protected String getSampleExpression() { + if (tableSample == null) { + return ""; } - } - - protected String getNullCountExpression() { - if (info.samplePercent > 0) { - return String.format("ROUND(SUM(CASE WHEN `${colName}` IS NULL THEN 1 ELSE 0 END) * 100 / %d)", - info.samplePercent); + if (tableSample.isPercent()) { + return String.format("TABLESAMPLE(%d PERCENT)", tableSample.getSampleValue()); } else { - return "SUM(CASE WHEN `${colName}` IS NULL THEN 1 ELSE 0 END)"; + return String.format("TABLESAMPLE(%d ROWS)", tableSample.getSampleValue()); } } - protected String getDataSizeFunction(Column column) { - String originFunction = super.getDataSizeFunction(column); - if (info.samplePercent > 0 && !isPartitionOnly) { - return String.format("ROUND((%s) * 100 / %d)", originFunction, info.samplePercent); + // Get the sample scale factor. While analyzing, the result of count, null count and data size need to + // multiply this factor to get more accurate result. + protected String getSampleScaleFactor() { + if (tableSample == null) { + return "1"; + } + long target = 0; + // Get list of all files' size in this HMS table. + List chunkSizes = table.getChunkSizes(); + Collections.shuffle(chunkSizes, new Random(tableSample.getSeek())); + long total = 0; + // Calculate the total size of this HMS table. + for (long size : chunkSizes) { + total += size; + } + // Calculate the sample target size for percent and rows sample. + if (tableSample.isPercent()) { + target = total * tableSample.getSampleValue() / 100; } else { - return originFunction; + int columnSize = 0; + for (Column column : table.getFullSchema()) { + columnSize += column.getDataType().getSlotSize(); + } + target = columnSize * tableSample.getSampleValue(); + } + // Calculate the actual sample size (cumulate). + long cumulate = 0; + for (long size : chunkSizes) { + cumulate += size; + if (cumulate >= target) { + break; + } } + return Double.toString(Math.max(((double) total) / cumulate, 1)); } @Override diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/util/StatisticsUtil.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/util/StatisticsUtil.java index 40ae13a0e0e293..f482c812879975 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/util/StatisticsUtil.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/util/StatisticsUtil.java @@ -539,6 +539,19 @@ public static long getHiveRowCount(HMSExternalTable table, boolean isInit) { return totalSize / estimatedRowSize; } + /** + * Get total size parameter from HMS. + * @param table Hive HMSExternalTable to get HMS total size parameter. + * @return Long value of table total size, return 0 if not found. + */ + public static long getTotalSizeFromHMS(HMSExternalTable table) { + Map parameters = table.getRemoteTable().getParameters(); + if (parameters == null) { + return 0; + } + return parameters.containsKey(TOTAL_SIZE) ? Long.parseLong(parameters.get(TOTAL_SIZE)) : 0; + } + /** * Estimate iceberg table row count. * Get the row count by adding all task file recordCount. @@ -574,13 +587,42 @@ public static long getRowCountFromFileList(HMSExternalTable table) { if (table.isView()) { return 0; } + HiveMetaStoreCache.HivePartitionValues partitionValues = getPartitionValuesForTable(table); + int totalPartitionSize = partitionValues == null ? 1 : partitionValues.getIdToPartitionItem().size(); + + // Get files for all partitions. + int samplePartitionSize = Config.hive_stats_partition_sample_size; + List filesByPartitions + = getFilesForPartitions(table, partitionValues, samplePartitionSize); + long totalSize = 0; + // Calculate the total file size. + for (HiveMetaStoreCache.FileCacheValue files : filesByPartitions) { + for (HiveMetaStoreCache.HiveFileStatus file : files.getFiles()) { + totalSize += file.getLength(); + } + } + // Estimate row count: totalSize/estimatedRowSize + long estimatedRowSize = 0; + for (Column column : table.getFullSchema()) { + estimatedRowSize += column.getDataType().getSlotSize(); + } + if (estimatedRowSize == 0) { + return 1; + } + if (samplePartitionSize < totalPartitionSize) { + totalSize = totalSize * totalPartitionSize / samplePartitionSize; + } + return totalSize / estimatedRowSize; + } + + public static HiveMetaStoreCache.HivePartitionValues getPartitionValuesForTable(HMSExternalTable table) { + if (table.isView()) { + return null; + } HiveMetaStoreCache cache = Env.getCurrentEnv().getExtMetaCacheMgr() .getMetaStoreCache((HMSExternalCatalog) table.getCatalog()); List partitionColumnTypes = table.getPartitionColumnTypes(); HiveMetaStoreCache.HivePartitionValues partitionValues = null; - List hivePartitions = Lists.newArrayList(); - int samplePartitionSize = Config.hive_stats_partition_sample_size; - int totalPartitionSize = 1; // Get table partitions from cache. if (!partitionColumnTypes.isEmpty()) { // It is ok to get partition values from cache, @@ -588,17 +630,28 @@ public static long getRowCountFromFileList(HMSExternalTable table) { // because it has enough space to keep partition info of all tables in cache. partitionValues = cache.getPartitionValues(table.getDbName(), table.getName(), partitionColumnTypes); } + return partitionValues; + } + + public static List getFilesForPartitions( + HMSExternalTable table, HiveMetaStoreCache.HivePartitionValues partitionValues, int sampleSize) { + if (table.isView()) { + return Lists.newArrayList(); + } + HiveMetaStoreCache cache = Env.getCurrentEnv().getExtMetaCacheMgr() + .getMetaStoreCache((HMSExternalCatalog) table.getCatalog()); + List hivePartitions = Lists.newArrayList(); if (partitionValues != null) { Map idToPartitionItem = partitionValues.getIdToPartitionItem(); - totalPartitionSize = idToPartitionItem.size(); + int totalPartitionSize = idToPartitionItem.size(); Collection partitionItems; List> partitionValuesList; // If partition number is too large, randomly choose part of them to estimate the whole table. - if (samplePartitionSize < totalPartitionSize) { + if (sampleSize > 0 && sampleSize < totalPartitionSize) { List items = new ArrayList<>(idToPartitionItem.values()); Collections.shuffle(items); - partitionItems = items.subList(0, samplePartitionSize); - partitionValuesList = Lists.newArrayListWithCapacity(samplePartitionSize); + partitionItems = items.subList(0, sampleSize); + partitionValuesList = Lists.newArrayListWithCapacity(sampleSize); } else { partitionItems = idToPartitionItem.values(); partitionValuesList = Lists.newArrayListWithCapacity(totalPartitionSize); @@ -609,34 +662,14 @@ public static long getRowCountFromFileList(HMSExternalTable table) { // get partitions without cache, so that it will not invalid the cache when executing // non query request such as `show table status` hivePartitions = cache.getAllPartitionsWithoutCache(table.getDbName(), table.getName(), - partitionValuesList); + partitionValuesList); } else { hivePartitions.add(new HivePartition(table.getDbName(), table.getName(), true, table.getRemoteTable().getSd().getInputFormat(), table.getRemoteTable().getSd().getLocation(), null)); } // Get files for all partitions. - List filesByPartitions = cache.getFilesByPartitionsWithoutCache( - hivePartitions, true); - long totalSize = 0; - // Calculate the total file size. - for (HiveMetaStoreCache.FileCacheValue files : filesByPartitions) { - for (HiveMetaStoreCache.HiveFileStatus file : files.getFiles()) { - totalSize += file.getLength(); - } - } - // Estimate row count: totalSize/estimatedRowSize - long estimatedRowSize = 0; - for (Column column : table.getFullSchema()) { - estimatedRowSize += column.getDataType().getSlotSize(); - } - if (estimatedRowSize == 0) { - return 1; - } - if (samplePartitionSize < totalPartitionSize) { - totalSize = totalSize * totalPartitionSize / samplePartitionSize; - } - return totalSize / estimatedRowSize; + return cache.getFilesByPartitionsWithoutCache(hivePartitions, true); } /** diff --git a/regression-test/suites/external_table_p2/hive/test_hive_statistic_sample.groovy b/regression-test/suites/external_table_p2/hive/test_hive_statistic_sample.groovy new file mode 100644 index 00000000000000..960bec31df5c5e --- /dev/null +++ b/regression-test/suites/external_table_p2/hive/test_hive_statistic_sample.groovy @@ -0,0 +1,150 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("test_hive_statistic_sample", "p2,external,hive,external_remote,external_remote_hive") { + String enabled = context.config.otherConfigs.get("enableExternalHiveTest") + if (enabled != null && enabled.equalsIgnoreCase("true")) { + String extHiveHmsHost = context.config.otherConfigs.get("extHiveHmsHost") + String extHiveHmsPort = context.config.otherConfigs.get("extHiveHmsPort") + String catalog_name = "test_hive_statistic_sample" + sql """drop catalog if exists ${catalog_name};""" + sql """ + create catalog if not exists ${catalog_name} properties ( + 'type'='hms', + 'hadoop.username' = 'hadoop', + 'hive.metastore.uris' = 'thrift://${extHiveHmsHost}:${extHiveHmsPort}' + ); + """ + logger.info("catalog " + catalog_name + " created") + + sql """analyze table ${catalog_name}.tpch_1000_parquet.region with sample percent 10 with sync""" + sql """analyze table ${catalog_name}.tpch_1000_parquet.supplier with sample percent 10 with sync""" + sql """use ${catalog_name}.tpch_1000_parquet""" + def result = sql """show column stats region (r_regionkey)""" + assertTrue(result.size() == 1) + assertTrue(result[0][0] == "r_regionkey") + assertTrue(result[0][1] == "5.0") + assertTrue(result[0][2] == "5.0") + assertTrue(result[0][3] == "0.0") + assertTrue(result[0][4] == "20.0") + assertTrue(result[0][5] == "4.0") + assertTrue(result[0][6] == "0") + assertTrue(result[0][7] == "4") + + result = sql """show column stats region (r_name)""" + assertTrue(result.size() == 1) + assertTrue(result[0][0] == "r_name") + assertTrue(result[0][1] == "5.0") + assertTrue(result[0][2] == "5.0") + assertTrue(result[0][3] == "0.0") + assertTrue(result[0][4] == "34.0") + assertTrue(result[0][5] == "6.8") + assertTrue(result[0][6] == "\'AFRICA\'") + assertTrue(result[0][7] == "\'MIDDLE EAST\'") + + result = sql """show column stats region (r_comment)""" + assertTrue(result.size() == 1) + assertTrue(result[0][0] == "r_comment") + assertTrue(result[0][1] == "5.0") + assertTrue(result[0][2] == "5.0") + assertTrue(result[0][3] == "0.0") + assertTrue(result[0][4] == "330.0") + assertTrue(result[0][5] == "66.0") + assertTrue(result[0][6] == "\'ges. thinly even pinto beans ca\'") + assertTrue(result[0][7] == "\'uickly special accounts cajole carefully blithely close requests. carefully final asymptotes haggle furiousl\'") + + result = sql """show column stats supplier (s_suppkey)""" + assertTrue(result.size() == 1) + assertTrue(result[0][0] == "s_suppkey") + assertTrue(result[0][1] == "9998799.0") + assertTrue(result[0][2] == "9970222.0") + assertTrue(result[0][3] == "0.0") + assertTrue(result[0][4] == "3.9995194E7") + assertTrue(result[0][5] == "3.9999997999759773") + assertTrue(result[0][6] == "1885331") + assertTrue(result[0][7] == "9395153") + + result = sql """show column stats supplier (s_name)""" + assertTrue(result.size() == 1) + assertTrue(result[0][0] == "s_name") + assertTrue(result[0][1] == "9998799.0") + assertTrue(result[0][2] == "1.004004E7") + assertTrue(result[0][3] == "0.0") + assertTrue(result[0][4] == "1.79978374E8") + assertTrue(result[0][5] == "17.999999199903908") + assertTrue(result[0][6] == "\'Supplier#001885331\'") + assertTrue(result[0][7] == "\'Supplier#009395153\'") + + result = sql """show column stats supplier (s_address)""" + assertTrue(result.size() == 1) + assertTrue(result[0][0] == "s_address") + assertTrue(result[0][1] == "9998799.0") + assertTrue(result[0][2] == "9998862.0") + assertTrue(result[0][3] == "0.0") + assertTrue(result[0][4] == "2.50070604E8") + assertTrue(result[0][5] == "25.010064108699456") + assertTrue(result[0][6] == "\' E,WAW2ZEx\'") + assertTrue(result[0][7] == "\'zzzw X3bpxu,OCpzgv6BdyMVMKzaB1DbH\'") + + result = sql """show column stats supplier (s_nationkey)""" + assertTrue(result.size() == 1) + assertTrue(result[0][0] == "s_nationkey") + assertTrue(result[0][1] == "9998799.0") + assertTrue(result[0][2] == "25.0") + assertTrue(result[0][3] == "0.0") + assertTrue(result[0][4] == "3.9995194E7") + assertTrue(result[0][5] == "3.9999997999759773") + assertTrue(result[0][6] == "0") + assertTrue(result[0][7] == "24") + + result = sql """show column stats supplier (s_phone)""" + assertTrue(result.size() == 1) + assertTrue(result[0][0] == "s_phone") + assertTrue(result[0][1] == "9998799.0") + assertTrue(result[0][2] == "9928006.0") + assertTrue(result[0][3] == "0.0") + assertTrue(result[0][4] == "1.49981978E8") + assertTrue(result[0][5] == "14.99999929991592") + assertTrue(result[0][6] == "\'10-100-128-4513\'") + assertTrue(result[0][7] == "\'34-999-967-7296\'") + + result = sql """show column stats supplier (s_acctbal)""" + assertTrue(result.size() == 1) + assertTrue(result[0][0] == "s_acctbal") + assertTrue(result[0][1] == "9998799.0") + assertTrue(result[0][2] == "4766937.0") + assertTrue(result[0][3] == "0.0") + assertTrue(result[0][4] == "7.9990388E7") + assertTrue(result[0][5] == "7.999999599951955") + assertTrue(result[0][6] == "-999.99") + assertTrue(result[0][7] == "9999.99") + + result = sql """show column stats supplier (s_comment)""" + assertTrue(result.size() == 1) + assertTrue(result[0][0] == "s_comment") + assertTrue(result[0][1] == "9998799.0") + assertTrue(result[0][2] == "9931298.0") + assertTrue(result[0][3] == "0.0") + assertTrue(result[0][4] == "6.24883849E8") + assertTrue(result[0][5] == "62.49589065646784") + assertTrue(result[0][6] == "\' Customer across the pinto beans. pinRecommends\'") + assertTrue(result[0][7] == "\'zzle? express, regular foxes haggle final ac\'") + + sql """drop catalog ${catalog_name}""" + } +} +