From 2ca2c38643d648f48638eb90b2a17b099047ce70 Mon Sep 17 00:00:00 2001 From: Lianhui Wang Date: Fri, 3 Jun 2016 15:34:02 +0800 Subject: [PATCH 01/45] init commit --- .../spark/sql/execution/ExistingRDD.scala | 6 +- .../datasources/FileSourceStrategy.scala | 194 ++++++++++-------- .../apache/spark/sql/internal/SQLConf.scala | 7 + .../datasources/FileSourceStrategySuite.scala | 15 ++ 4 files changed, 130 insertions(+), 92 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/ExistingRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/ExistingRDD.scala index fef3255c739ab..1998497f8bd3d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/ExistingRDD.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/ExistingRDD.scala @@ -338,7 +338,8 @@ private[sql] object DataSourceScanExec { rdd: RDD[InternalRow], relation: BaseRelation, metadata: Map[String, String] = Map.empty, - metastoreTableIdentifier: Option[TableIdentifier] = None): DataSourceScanExec = { + metastoreTableIdentifier: Option[TableIdentifier] = None, + isSupportBatch: Boolean = true): DataSourceScanExec = { val outputPartitioning = { val bucketSpec = relation match { // TODO: this should be closer to bucket planning. @@ -362,7 +363,8 @@ private[sql] object DataSourceScanExec { relation match { case r: HadoopFsRelation - if r.fileFormat.supportBatch(r.sparkSession, StructType.fromAttributes(output)) => + if isSupportBatch && + r.fileFormat.supportBatch(r.sparkSession, StructType.fromAttributes(output)) => BatchedDataSourceScanExec( output, rdd, relation, outputPartitioning, metadata, metastoreTableIdentifier) case _ => diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategy.scala index 350508c1d9f4c..545821c6c1680 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategy.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategy.scala @@ -109,108 +109,43 @@ private[sql] object FileSourceStrategy extends Strategy with Logging { val pushedDownFilters = dataFilters.flatMap(DataSourceStrategy.translateFilter) logInfo(s"Pushed Filters: ${pushedDownFilters.mkString(",")}") - val readFile = files.fileFormat.buildReaderWithPartitionValues( - sparkSession = files.sparkSession, - dataSchema = files.dataSchema, - partitionSchema = files.partitionSchema, - requiredSchema = prunedDataSchema, - filters = pushedDownFilters, - options = files.options, - hadoopConf = files.sparkSession.sessionState.newHadoopConfWithOptions(files.options)) - - val plannedPartitions = files.bucketSpec match { - case Some(bucketing) if files.sparkSession.sessionState.conf.bucketingEnabled => - logInfo(s"Planning with ${bucketing.numBuckets} buckets") - val bucketed = - selectedPartitions.flatMap { p => - p.files.map { f => - val hosts = getBlockHosts(getBlockLocations(f), 0, f.getLen) - PartitionedFile(p.values, f.getPath.toUri.toString, 0, f.getLen, hosts) - } - }.groupBy { f => - BucketingUtils - .getBucketId(new Path(f.filePath).getName) - .getOrElse(sys.error(s"Invalid bucket file ${f.filePath}")) - } - - (0 until bucketing.numBuckets).map { bucketId => - FilePartition(bucketId, bucketed.getOrElse(bucketId, Nil)) - } - - case _ => - val defaultMaxSplitBytes = files.sparkSession.sessionState.conf.filesMaxPartitionBytes - val openCostInBytes = 
files.sparkSession.sessionState.conf.filesOpenCostInBytes - val defaultParallelism = files.sparkSession.sparkContext.defaultParallelism - val totalBytes = selectedPartitions.flatMap(_.files.map(_.getLen + openCostInBytes)).sum - val bytesPerCore = totalBytes / defaultParallelism - val maxSplitBytes = Math.min(defaultMaxSplitBytes, - Math.max(openCostInBytes, bytesPerCore)) - logInfo(s"Planning scan with bin packing, max size: $maxSplitBytes bytes, " + - s"open cost is considered as scanning $openCostInBytes bytes.") - - val splitFiles = selectedPartitions.flatMap { partition => - partition.files.flatMap { file => - val blockLocations = getBlockLocations(file) - (0L until file.getLen by maxSplitBytes).map { offset => - val remaining = file.getLen - offset - val size = if (remaining > maxSplitBytes) maxSplitBytes else remaining - val hosts = getBlockHosts(blockLocations, offset, size) - PartitionedFile(partition.values, file.getPath.toUri.toString, offset, size, hosts) - } - } - }.toArray.sortBy(_.length)(implicitly[Ordering[Long]].reverse) - - val partitions = new ArrayBuffer[FilePartition] - val currentFiles = new ArrayBuffer[PartitionedFile] - var currentSize = 0L - - /** Add the given file to the current partition. */ - def addFile(file: PartitionedFile): Unit = { - currentSize += file.length + openCostInBytes - currentFiles.append(file) - } - - /** Close the current partition and move to the next. */ - def closePartition(): Unit = { - if (currentFiles.nonEmpty) { - val newPartition = - FilePartition( - partitions.size, - currentFiles.toArray.toSeq) // Copy to a new Array. - partitions.append(newPartition) - } - currentFiles.clear() - currentSize = 0 - } - - // Assign files to partitions using "First Fit Decreasing" (FFD) - // TODO: consider adding a slop factor here? 
- splitFiles.foreach { file => - if (currentSize + file.length > maxSplitBytes) { - closePartition() - } - addFile(file) - } - closePartition() - partitions - } + val optimizerMetadataOnly = + readDataColumns.isEmpty && files.sparkSession.sessionState.conf.optimizerMetadataOnly + val scanRdd = if (optimizerMetadataOnly) { + val partitionValues = selectedPartitions.map(_.values) + files.sqlContext.sparkContext.parallelize(partitionValues, 1) + } else { + val readFile = files.fileFormat.buildReaderWithPartitionValues( + sparkSession = files.sparkSession, + dataSchema = files.dataSchema, + partitionSchema = files.partitionSchema, + requiredSchema = prunedDataSchema, + filters = pushedDownFilters, + options = files.options, + hadoopConf = files.sparkSession.sessionState.newHadoopConfWithOptions(files.options)) + + val plannedPartitions = getFilePartitions(files, selectedPartitions) + new FileScanRDD( + files.sparkSession, + readFile, + plannedPartitions) + } val meta = Map( "Format" -> files.fileFormat.toString, "ReadSchema" -> prunedDataSchema.simpleString, + "metadataOnly" -> optimizerMetadataOnly.toString, PUSHED_FILTERS -> pushedDownFilters.mkString("[", ", ", "]"), INPUT_PATHS -> files.location.paths.mkString(", ")) val scan = DataSourceScanExec.create( readDataColumns ++ partitionColumns, - new FileScanRDD( - files.sparkSession, - readFile, - plannedPartitions), + scanRdd, files, meta, - table) + table, + !optimizerMetadataOnly) val afterScanFilter = afterScanFilters.toSeq.reduceOption(expressions.And) val withFilter = afterScanFilter.map(execution.FilterExec(_, scan)).getOrElse(scan) @@ -225,6 +160,85 @@ private[sql] object FileSourceStrategy extends Strategy with Logging { case _ => Nil } + private def getFilePartitions( + files: HadoopFsRelation, + selectedPartitions: Seq[Partition]): Seq[FilePartition] = files.bucketSpec match { + case Some(bucketing) if files.sparkSession.sessionState.conf.bucketingEnabled => + logInfo(s"Planning with ${bucketing.numBuckets} buckets") + val bucketed = + selectedPartitions.flatMap { p => + p.files.map { f => + val hosts = getBlockHosts(getBlockLocations(f), 0, f.getLen) + PartitionedFile(p.values, f.getPath.toUri.toString, 0, f.getLen, hosts) + } + }.groupBy { f => + BucketingUtils + .getBucketId(new Path(f.filePath).getName) + .getOrElse(sys.error(s"Invalid bucket file ${f.filePath}")) + } + + (0 until bucketing.numBuckets).map { bucketId => + FilePartition(bucketId, bucketed.getOrElse(bucketId, Nil)) + } + + case _ => + val defaultMaxSplitBytes = files.sparkSession.sessionState.conf.filesMaxPartitionBytes + val openCostInBytes = files.sparkSession.sessionState.conf.filesOpenCostInBytes + val defaultParallelism = files.sparkSession.sparkContext.defaultParallelism + val totalBytes = selectedPartitions.flatMap(_.files.map(_.getLen + openCostInBytes)).sum + val bytesPerCore = totalBytes / defaultParallelism + val maxSplitBytes = Math.min(defaultMaxSplitBytes, + Math.max(openCostInBytes, bytesPerCore)) + logInfo(s"Planning scan with bin packing, max size: $maxSplitBytes bytes, " + + s"open cost is considered as scanning $openCostInBytes bytes.") + + val splitFiles = selectedPartitions.flatMap { partition => + partition.files.flatMap { file => + val blockLocations = getBlockLocations(file) + (0L until file.getLen by maxSplitBytes).map { offset => + val remaining = file.getLen - offset + val size = if (remaining > maxSplitBytes) maxSplitBytes else remaining + val hosts = getBlockHosts(blockLocations, offset, size) + PartitionedFile(partition.values, 
file.getPath.toUri.toString, offset, size, hosts) + } + } + }.toArray.sortBy(_.length)(implicitly[Ordering[Long]].reverse) + + val partitions = new ArrayBuffer[FilePartition] + val currentFiles = new ArrayBuffer[PartitionedFile] + var currentSize = 0L + + /** Add the given file to the current partition. */ + def addFile(file: PartitionedFile): Unit = { + currentSize += file.length + openCostInBytes + currentFiles.append(file) + } + + /** Close the current partition and move to the next. */ + def closePartition(): Unit = { + if (currentFiles.nonEmpty) { + val newPartition = + FilePartition( + partitions.size, + currentFiles.toArray.toSeq) // Copy to a new Array. + partitions.append(newPartition) + } + currentFiles.clear() + currentSize = 0 + } + + // Assign files to partitions using "First Fit Decreasing" (FFD) + // TODO: consider adding a slop factor here? + splitFiles.foreach { file => + if (currentSize + file.length > maxSplitBytes) { + closePartition() + } + addFile(file) + } + closePartition() + partitions + } + private def getBlockLocations(file: FileStatus): Array[BlockLocation] = file match { case f: LocatedFileStatus => f.getBlockLocations case f => Array.empty[BlockLocation] diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 437e093825f6e..8789724a2f6e6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -258,6 +258,11 @@ object SQLConf { .booleanConf .createWithDefault(false) + val OPTIMIZER_METADATA_ONLY = SQLConfigBuilder("spark.sql.optimizer.metadataOnly") + .doc("When true, enable the metadata-only query optimization.") + .booleanConf + .createWithDefault(false) + val NATIVE_VIEW = SQLConfigBuilder("spark.sql.nativeView") .internal() .doc("When true, CREATE VIEW will be handled by Spark SQL instead of Hive native commands. " + @@ -599,6 +604,8 @@ private[sql] class SQLConf extends Serializable with CatalystConf with Logging { def metastorePartitionPruning: Boolean = getConf(HIVE_METASTORE_PARTITION_PRUNING) + def optimizerMetadataOnly: Boolean = getConf(OPTIMIZER_METADATA_ONLY) + def nativeView: Boolean = getConf(NATIVE_VIEW) def wholeStageEnabled: Boolean = getConf(WHOLESTAGE_CODEGEN_ENABLED) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategySuite.scala index 52dda8c6ace84..15d14bd9ba383 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategySuite.scala @@ -338,6 +338,21 @@ class FileSourceStrategySuite extends QueryTest with SharedSQLContext with Predi assert(partitions.flatMap(fileScanRDD.preferredLocations).length == 3) } } + + test("optimize metadataOnly") { + withSQLConf("spark.sql.optimizer.metadataOnly" -> "true") { + val table = + createTable( + files = Seq( + "p1=1/file1" -> 10, + "p1=2/file2" -> 10)) + + checkDataset(table.select($"p1"), Row(1), Row(2)) + checkDataset(table.where("p1 = 1").select($"p1"), Row(1)) + val df = table.where("p1 = 1 AND (p1 + c1) = 2 AND c1 = 1") + assert(getPhysicalFilters(df) contains resolve(df, "c1 = 1")) + } + } } // Helpers for checking the arguments passed to the FileFormat. 
From edea710f826477de000913b58568cf9fd4b55814 Mon Sep 17 00:00:00 2001 From: Lianhui Wang Date: Fri, 3 Jun 2016 20:58:48 +0800 Subject: [PATCH 02/45] fix unit test --- .../datasources/FileSourceStrategySuite.scala | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategySuite.scala index 15d14bd9ba383..c2a617080bbe9 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategySuite.scala @@ -338,20 +338,20 @@ class FileSourceStrategySuite extends QueryTest with SharedSQLContext with Predi assert(partitions.flatMap(fileScanRDD.preferredLocations).length == 3) } } + } - test("optimize metadataOnly") { - withSQLConf("spark.sql.optimizer.metadataOnly" -> "true") { - val table = - createTable( - files = Seq( - "p1=1/file1" -> 10, - "p1=2/file2" -> 10)) - - checkDataset(table.select($"p1"), Row(1), Row(2)) - checkDataset(table.where("p1 = 1").select($"p1"), Row(1)) - val df = table.where("p1 = 1 AND (p1 + c1) = 2 AND c1 = 1") - assert(getPhysicalFilters(df) contains resolve(df, "c1 = 1")) - } + test("optimize metadataOnly") { + withSQLConf("spark.sql.optimizer.metadataOnly" -> "true") { + val table = + createTable( + files = Seq( + "p1=1/file1" -> 10, + "p1=2/file2" -> 10)) + + checkDataset(table.select($"p1"), Row(1), Row(2)) + checkDataset(table.where("p1 = 1").select($"p1"), Row(1)) + val df = table.where("p1 = 1 AND (p1 + c1) = 2 AND c1 = 1") + assert(getPhysicalFilters(df) contains resolve(df, "c1 = 1")) } } From 153293ea976f78b0b55d6744a66ecab7e8bedb62 Mon Sep 17 00:00:00 2001 From: Lianhui Wang Date: Sat, 4 Jun 2016 00:57:52 +0800 Subject: [PATCH 03/45] fix unit test --- .../datasources/FileSourceStrategy.scala | 44 ++++++++++--------- 1 file changed, 24 insertions(+), 20 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategy.scala index 545821c6c1680..75a4c31cbc7a2 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategy.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategy.scala @@ -22,9 +22,11 @@ import scala.collection.mutable.ArrayBuffer import org.apache.hadoop.fs.{BlockLocation, FileStatus, LocatedFileStatus, Path} import org.apache.spark.internal.Logging +import org.apache.spark.rdd.RDD import org.apache.spark.sql._ -import org.apache.spark.sql.catalyst.expressions +import org.apache.spark.sql.catalyst.{expressions, InternalRow} import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeProjection import org.apache.spark.sql.catalyst.planning.PhysicalOperation import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.execution.DataSourceScanExec @@ -111,25 +113,27 @@ private[sql] object FileSourceStrategy extends Strategy with Logging { val optimizerMetadataOnly = readDataColumns.isEmpty && files.sparkSession.sessionState.conf.optimizerMetadataOnly - val scanRdd = if (optimizerMetadataOnly) { - val partitionValues = selectedPartitions.map(_.values) - 
files.sqlContext.sparkContext.parallelize(partitionValues, 1) - } else { - val readFile = files.fileFormat.buildReaderWithPartitionValues( - sparkSession = files.sparkSession, - dataSchema = files.dataSchema, - partitionSchema = files.partitionSchema, - requiredSchema = prunedDataSchema, - filters = pushedDownFilters, - options = files.options, - hadoopConf = files.sparkSession.sessionState.newHadoopConfWithOptions(files.options)) - - val plannedPartitions = getFilePartitions(files, selectedPartitions) - new FileScanRDD( - files.sparkSession, - readFile, - plannedPartitions) - } + val scanRdd: RDD[InternalRow] = if (optimizerMetadataOnly) { + val partitionSchema = files.partitionSchema.toAttributes + lazy val converter = GenerateUnsafeProjection.generate(partitionSchema, partitionSchema) + val partitionValues = selectedPartitions.map(_.values) + files.sqlContext.sparkContext.parallelize(partitionValues, 1).map(converter(_)) + } else { + val readFile = files.fileFormat.buildReaderWithPartitionValues( + sparkSession = files.sparkSession, + dataSchema = files.dataSchema, + partitionSchema = files.partitionSchema, + requiredSchema = prunedDataSchema, + filters = pushedDownFilters, + options = files.options, + hadoopConf = files.sparkSession.sessionState.newHadoopConfWithOptions(files.options)) + + val plannedPartitions = getFilePartitions(files, selectedPartitions) + new FileScanRDD( + files.sparkSession, + readFile, + plannedPartitions) + } val meta = Map( "Format" -> files.fileFormat.toString, From 7dfb7437a0918c79a7856b35f06dd6edfe63f08d Mon Sep 17 00:00:00 2001 From: Lianhui Wang Date: Fri, 24 Jun 2016 14:29:32 +0800 Subject: [PATCH 04/45] update --- .../spark/sql/hive/HiveMetastoreCatalog.scala | 115 ++++++++++++++++++ .../spark/sql/hive/HiveSessionCatalog.scala | 1 + .../spark/sql/hive/HiveSessionState.scala | 15 ++- .../sql/hive/execution/SQLQuerySuite.scala | 40 ++++++ 4 files changed, 170 insertions(+), 1 deletion(-) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala index f10afa75f2bfc..6dc028f01a622 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala @@ -27,9 +27,11 @@ import org.apache.spark.sql.{AnalysisException, SaveMode, SparkSession} import org.apache.spark.sql.catalyst.{InternalRow, TableIdentifier} import org.apache.spark.sql.catalyst.catalog._ import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.catalyst.expressions.aggregate._ import org.apache.spark.sql.catalyst.plans.logical import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.rules._ +import org.apache.spark.sql.execution.LogicalRDD import org.apache.spark.sql.execution.command.CreateDataSourceTableUtils._ import org.apache.spark.sql.execution.command.CreateHiveTableAsSelectLogicalPlan import org.apache.spark.sql.execution.datasources.{Partition => _, _} @@ -506,6 +508,119 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log } } + /** + * When scanning only partition columns, get results based on metadata without scanning files. + * It is used for distinct or distinct/Max/Min aggregations, example: max(partition). 
+ */ + object MetadataOnlyOptimizer extends Rule[LogicalPlan] { + + private def canSupportMetadataOnly(a: Aggregate): Boolean = { + val aggregateExpressions = a.aggregateExpressions.flatMap { expr => + expr.collect { + case agg: AggregateExpression => agg + } + }.distinct + aggregateExpressions.forall { agg => + if (agg.isDistinct) { + true + } else { + agg.aggregateFunction match { + case max: Max => true + case min: Min => true + case _ => false + } + } + } + } + + private def findRelation(plan: LogicalPlan): (Option[LogicalPlan], Seq[Expression]) = { + plan match { + case relation @ LogicalRelation(files: HadoopFsRelation, _, table) + if files.partitionSchema.nonEmpty => + (Some(relation), Seq.empty[Expression]) + + case relation: MetastoreRelation if relation.partitionKeys.nonEmpty => + (Some(relation), Seq.empty[Expression]) + + case p @ Project(_, child) => + findRelation(child) + + case f @ Filter(filterCondition, child) => + val (plan, conditions) = findRelation(child) + (plan, conditions ++ Seq(filterCondition)) + + case SubqueryAlias(_, child) => + findRelation(child) + + case _ => (None, Seq.empty[Expression]) + } + } + + private def convertToMetadataOnlyPlan( + parent: LogicalPlan, + filters: Seq[Expression], + relation: LogicalPlan): LogicalPlan = relation match { + case l @ LogicalRelation(files: HadoopFsRelation, _, _) => + val attributeMap = l.output.map(attr => (attr.name, attr)).toMap + val partitionColumns = files.partitionSchema.map { field => + attributeMap.getOrElse(field.name, throw new AnalysisException( + s"Unable to resolve ${field.name} given [${l.output.map(_.name).mkString(", ")}]")) + } + val filterColumns = filters.flatMap(_.references) + val projectSet = parent.references ++ AttributeSet(filterColumns) + if (projectSet.subsetOf(AttributeSet(partitionColumns))) { + val selectedPartitions = files.location.listFiles(filters) + val partitionValues = selectedPartitions.map(_.values) + val valuesRdd = sparkSession.sparkContext.parallelize(partitionValues, 1) + parent.withNewChildren(LogicalRDD(partitionColumns, valuesRdd)(sparkSession) :: Nil) + } else { + parent + } + + case relation: MetastoreRelation => + if (parent.references.subsetOf(AttributeSet(relation.partitionKeys))) { + val partitionColumnDataTypes = relation.partitionKeys.map(_.dataType) + val partitionValues = relation.getHiveQlPartitions(filters).map { p => + InternalRow.fromSeq(p.getValues.asScala.zip(partitionColumnDataTypes).map { + case (rawValue, dataType) => Cast(Literal(rawValue), dataType).eval(null) + }) + } + val valuesRdd = sparkSession.sparkContext.parallelize(partitionValues, 1) + val valuesPlan = LogicalRDD(relation.partitionKeys, valuesRdd)(sparkSession) + val child = filters.reduceLeftOption(And).map(Filter(_, valuesPlan)).getOrElse(valuesPlan) + parent.withNewChildren(child :: Nil) + } else { + parent + } + + case _ => + parent + } + + def apply(plan: LogicalPlan): LogicalPlan = { + if (!sparkSession.sessionState.conf.optimizerMetadataOnly) { + return plan + } + plan.transform { + case a @ Aggregate(_, _, child) if canSupportMetadataOnly(a) => + val (plan, filters) = findRelation(child) + if (plan.isDefined) { + convertToMetadataOnlyPlan(a, filters, plan.get) + } else { + a + } + + case d @ Distinct(p @ Project(_, _)) => + val (plan, filters) = findRelation(p) + if (plan.isDefined) { + d.withNewChildren(convertToMetadataOnlyPlan(p, filters, plan.get) :: Nil) + } else { + d + } + } + } + } + } /** diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala 
b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala index 4f8aac8c2fcdd..8a19645480c18 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala @@ -88,6 +88,7 @@ private[sql] class HiveSessionCatalog( val OrcConversions: Rule[LogicalPlan] = metastoreCatalog.OrcConversions val CreateTables: Rule[LogicalPlan] = metastoreCatalog.CreateTables val PreInsertionCasts: Rule[LogicalPlan] = metastoreCatalog.PreInsertionCasts + val metadataOnlyOptimizer: Rule[LogicalPlan] = metastoreCatalog.MetadataOnlyOptimizer override def refreshTable(name: TableIdentifier): Unit = { metastoreCatalog.refreshTable(name) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionState.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionState.scala index ca8e5f8223968..1150414ffd78c 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionState.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionState.scala @@ -19,7 +19,8 @@ package org.apache.spark.sql.hive import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.analysis.Analyzer -import org.apache.spark.sql.execution.SparkPlanner +import org.apache.spark.sql.catalyst.optimizer.Optimizer +import org.apache.spark.sql.execution.{SparkOptimizer, SparkPlanner} import org.apache.spark.sql.execution.datasources._ import org.apache.spark.sql.hive.client.HiveClient import org.apache.spark.sql.internal.SessionState @@ -74,6 +75,18 @@ private[hive] class HiveSessionState(sparkSession: SparkSession) } } + /** + * Logical query plan optimizer for Hive. + */ + override lazy val optimizer: Optimizer = new SparkOptimizer(catalog, conf, experimentalMethods) { + val extendedHiveOptimizerRules: Seq[Batch] = Seq( + Batch("MetadataOnly Optimization", Once, + catalog.metadataOnlyOptimizer) + ) + + override def batches: Seq[Batch] = extendedHiveOptimizerRules ++ super.batches + } + /** * Planner that takes into account Hive-specific strategies. 
*/ diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala index 499819f32b434..b1aa17de3a251 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala @@ -1640,4 +1640,44 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { ) } } + + test("spark-15752 metadata only optimizer") { + withSQLConf(SQLConf.OPTIMIZER_METADATA_ONLY.key -> "true") { + spark.range(0, 5, 1, 2).selectExpr("id as key", "id as value").registerTempTable("tempTable") + sql( + """ + |CREATE TABLE srcpart_15752 (key INT, value STRING) + |PARTITIONED BY (ds STRING, hr INT) STORED AS parquet + """.stripMargin) + for (ds <- Seq("2008-04-08", "2008-04-09"); hr <- Seq(11, 12)) { + sql( + s""" + |INSERT OVERWRITE TABLE srcpart_15752 PARTITION (ds='$ds',hr='$hr') + |select key, value from tempTable + """.stripMargin) + } + checkAnswer(sql("select max(hr) from srcpart_15752"), Row(12)) + checkAnswer(sql("select max(hr) from srcpart_15752 where hr = 11"), Row(11)) + checkAnswer(sql("select max(hr) from (select hr from srcpart_15752) t"), Row(12)) + checkAnswer(sql("select distinct hr from srcpart_15752 where hr = 11"), Row(11)) + + sql( + """ + |CREATE TABLE srctext_15752 (key INT, value STRING) + |PARTITIONED BY (ds STRING, hr INT) STORED AS textfile + """.stripMargin) + for (ds <- Seq("2008-04-08", "2008-04-09"); hr <- Seq(11, 12)) { + sql( + s""" + |INSERT OVERWRITE TABLE srctext_15752 PARTITION (ds='$ds',hr='$hr') + |select key, value from tempTable + """.stripMargin) + } + + checkAnswer(sql("select max(hr) from srctext_15752"), Row(12)) + checkAnswer(sql("select max(hr) from srctext_15752 where hr = 11"), Row(11)) + checkAnswer(sql("select max(hr) from (select hr from srctext_15752) t"), Row(12)) + checkAnswer(sql("select distinct hr from srctext_15752 where hr = 11"), Row(11)) + } + } } From 68e6d6d32a0b5f9f8503cc54ee491abe98db76dc Mon Sep 17 00:00:00 2001 From: Lianhui Wang Date: Fri, 24 Jun 2016 14:30:50 +0800 Subject: [PATCH 05/45] Revert "fix unit test" This reverts commit 153293ea976f78b0b55d6744a66ecab7e8bedb62. 
--- .../datasources/FileSourceStrategy.scala | 44 +++++++++---------- 1 file changed, 20 insertions(+), 24 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategy.scala index 75a4c31cbc7a2..545821c6c1680 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategy.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategy.scala @@ -22,11 +22,9 @@ import scala.collection.mutable.ArrayBuffer import org.apache.hadoop.fs.{BlockLocation, FileStatus, LocatedFileStatus, Path} import org.apache.spark.internal.Logging -import org.apache.spark.rdd.RDD import org.apache.spark.sql._ -import org.apache.spark.sql.catalyst.{expressions, InternalRow} +import org.apache.spark.sql.catalyst.expressions import org.apache.spark.sql.catalyst.expressions._ -import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeProjection import org.apache.spark.sql.catalyst.planning.PhysicalOperation import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.execution.DataSourceScanExec @@ -113,27 +111,25 @@ private[sql] object FileSourceStrategy extends Strategy with Logging { val optimizerMetadataOnly = readDataColumns.isEmpty && files.sparkSession.sessionState.conf.optimizerMetadataOnly - val scanRdd: RDD[InternalRow] = if (optimizerMetadataOnly) { - val partitionSchema = files.partitionSchema.toAttributes - lazy val converter = GenerateUnsafeProjection.generate(partitionSchema, partitionSchema) - val partitionValues = selectedPartitions.map(_.values) - files.sqlContext.sparkContext.parallelize(partitionValues, 1).map(converter(_)) - } else { - val readFile = files.fileFormat.buildReaderWithPartitionValues( - sparkSession = files.sparkSession, - dataSchema = files.dataSchema, - partitionSchema = files.partitionSchema, - requiredSchema = prunedDataSchema, - filters = pushedDownFilters, - options = files.options, - hadoopConf = files.sparkSession.sessionState.newHadoopConfWithOptions(files.options)) - - val plannedPartitions = getFilePartitions(files, selectedPartitions) - new FileScanRDD( - files.sparkSession, - readFile, - plannedPartitions) - } + val scanRdd = if (optimizerMetadataOnly) { + val partitionValues = selectedPartitions.map(_.values) + files.sqlContext.sparkContext.parallelize(partitionValues, 1) + } else { + val readFile = files.fileFormat.buildReaderWithPartitionValues( + sparkSession = files.sparkSession, + dataSchema = files.dataSchema, + partitionSchema = files.partitionSchema, + requiredSchema = prunedDataSchema, + filters = pushedDownFilters, + options = files.options, + hadoopConf = files.sparkSession.sessionState.newHadoopConfWithOptions(files.options)) + + val plannedPartitions = getFilePartitions(files, selectedPartitions) + new FileScanRDD( + files.sparkSession, + readFile, + plannedPartitions) + } val meta = Map( "Format" -> files.fileFormat.toString, From 595ef36ede338936727e18b0a84a9cc8280a0e29 Mon Sep 17 00:00:00 2001 From: Lianhui Wang Date: Fri, 24 Jun 2016 14:31:14 +0800 Subject: [PATCH 06/45] Revert "fix unit test" This reverts commit edea710f826477de000913b58568cf9fd4b55814. 
--- .../datasources/FileSourceStrategySuite.scala | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategySuite.scala index bab5b90b63591..645ef42ca4a91 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategySuite.scala @@ -338,20 +338,20 @@ class FileSourceStrategySuite extends QueryTest with SharedSQLContext with Predi assert(partitions.flatMap(fileScanRDD.preferredLocations).length == 3) } } - } - - test("optimize metadataOnly") { - withSQLConf("spark.sql.optimizer.metadataOnly" -> "true") { - val table = - createTable( - files = Seq( - "p1=1/file1" -> 10, - "p1=2/file2" -> 10)) - checkDataset(table.select($"p1"), Row(1), Row(2)) - checkDataset(table.where("p1 = 1").select($"p1"), Row(1)) - val df = table.where("p1 = 1 AND (p1 + c1) = 2 AND c1 = 1") - assert(getPhysicalFilters(df) contains resolve(df, "c1 = 1")) + test("optimize metadataOnly") { + withSQLConf("spark.sql.optimizer.metadataOnly" -> "true") { + val table = + createTable( + files = Seq( + "p1=1/file1" -> 10, + "p1=2/file2" -> 10)) + + checkDataset(table.select($"p1"), Row(1), Row(2)) + checkDataset(table.where("p1 = 1").select($"p1"), Row(1)) + val df = table.where("p1 = 1 AND (p1 + c1) = 2 AND c1 = 1") + assert(getPhysicalFilters(df) contains resolve(df, "c1 = 1")) + } } } From 2e55a9df94ef85345f54f2bb3b8c23d06f600f58 Mon Sep 17 00:00:00 2001 From: Lianhui Wang Date: Fri, 24 Jun 2016 15:50:56 +0800 Subject: [PATCH 07/45] Merge branch 'apache-master' into metadata-only --- .../spark/sql/execution/ExistingRDD.scala | 6 +- .../datasources/FileSourceStrategySuite.scala | 15 -- .../spark/sql/hive/HiveMetastoreCatalog.scala | 153 +++++++++++++----- .../spark/sql/hive/HiveSessionCatalog.scala | 1 - .../sql/hive/execution/SQLQuerySuite.scala | 9 +- 5 files changed, 120 insertions(+), 64 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/ExistingRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/ExistingRDD.scala index 75fad506d468b..e2c23a4ba8670 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/ExistingRDD.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/ExistingRDD.scala @@ -347,8 +347,7 @@ private[sql] object DataSourceScanExec { rdd: RDD[InternalRow], relation: BaseRelation, metadata: Map[String, String] = Map.empty, - metastoreTableIdentifier: Option[TableIdentifier] = None, - isSupportBatch: Boolean = true): DataSourceScanExec = { + metastoreTableIdentifier: Option[TableIdentifier] = None): DataSourceScanExec = { val outputPartitioning = { val bucketSpec = relation match { // TODO: this should be closer to bucket planning. 
@@ -372,8 +371,7 @@ private[sql] object DataSourceScanExec { relation match { case r: HadoopFsRelation - if isSupportBatch && - r.fileFormat.supportBatch(r.sparkSession, StructType.fromAttributes(output)) => + if r.fileFormat.supportBatch(r.sparkSession, StructType.fromAttributes(output)) => BatchedDataSourceScanExec( output, rdd, relation, outputPartitioning, metadata, metastoreTableIdentifier) case _ => diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategySuite.scala index 9a4e2859a973d..8d8a18fa9332b 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategySuite.scala @@ -338,21 +338,6 @@ class FileSourceStrategySuite extends QueryTest with SharedSQLContext with Predi assert(partitions.flatMap(fileScanRDD.preferredLocations).length == 3) } } - - test("optimize metadataOnly") { - withSQLConf("spark.sql.optimizer.metadataOnly" -> "true") { - val table = - createTable( - files = Seq( - "p1=1/file1" -> 10, - "p1=2/file2" -> 10)) - - checkDataset(table.select($"p1"), Row(1), Row(2)) - checkDataset(table.where("p1 = 1").select($"p1"), Row(1)) - val df = table.where("p1 = 1 AND (p1 + c1) = 2 AND c1 = 1") - assert(getPhysicalFilters(df) contains resolve(df, "c1 = 1")) - } - } } test("SPARK-15654 do not split non-splittable files") { diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala index f10afa75f2bfc..a1e59d1996fcc 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala @@ -23,21 +23,22 @@ import com.google.common.cache.{CacheBuilder, CacheLoader, LoadingCache} import org.apache.hadoop.fs.Path import org.apache.spark.internal.Logging -import org.apache.spark.sql.{AnalysisException, SaveMode, SparkSession} +import org.apache.spark.sql.{AnalysisException, SparkSession} import org.apache.spark.sql.catalyst.{InternalRow, TableIdentifier} import org.apache.spark.sql.catalyst.catalog._ import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.catalyst.expressions.aggregate._ import org.apache.spark.sql.catalyst.plans.logical import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.rules._ +import org.apache.spark.sql.execution.LogicalRDD import org.apache.spark.sql.execution.command.CreateDataSourceTableUtils._ import org.apache.spark.sql.execution.command.CreateHiveTableAsSelectLogicalPlan import org.apache.spark.sql.execution.datasources.{Partition => _, _} -import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat +import org.apache.spark.sql.execution.datasources.parquet.{ParquetFileFormat, ParquetOptions} import org.apache.spark.sql.hive.orc.OrcFileFormat import org.apache.spark.sql.types._ - /** * Legacy catalog for interacting with the Hive metastore. 
* @@ -355,13 +356,7 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log val fileFormatClass = classOf[ParquetFileFormat] val mergeSchema = sessionState.convertMetastoreParquetWithSchemaMerging - val options = Map( - ParquetFileFormat.MERGE_SCHEMA -> mergeSchema.toString, - ParquetFileFormat.METASTORE_TABLE_NAME -> TableIdentifier( - relation.tableName, - Some(relation.databaseName) - ).unquotedString - ) + val options = Map(ParquetOptions.MERGE_SCHEMA -> mergeSchema.toString) convertToLogicalRelation(relation, options, defaultSource, fileFormatClass, "parquet") } @@ -465,43 +460,119 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log } /** - * Casts input data to correct data types according to table definition before inserting into - * that table. + * When scanning only partition columns, get results based on metadata without scanning files. + * It is used for distinct or distinct/Max/Min aggregations, example: max(partition). */ - object PreInsertionCasts extends Rule[LogicalPlan] { - def apply(plan: LogicalPlan): LogicalPlan = plan.transform { - // Wait until children are resolved. - case p: LogicalPlan if !p.childrenResolved => p + object MetadataOnlyOptimizer extends Rule[LogicalPlan] { - case p @ InsertIntoTable(table: MetastoreRelation, _, child, _, _) => - castChildOutput(p, table, child) + private def canSupportMetadataOnly(a: Aggregate): Boolean = { + val aggregateExpressions = a.aggregateExpressions.flatMap { expr => + expr.collect { + case agg: AggregateExpression => agg + } + }.distinct + aggregateExpressions.forall { agg => + if (agg.isDistinct) { + true + } else { + agg.aggregateFunction match { + case max: Max => true + case min: Min => true + case _ => false + } + } + } } - def castChildOutput(p: InsertIntoTable, table: MetastoreRelation, child: LogicalPlan) - : LogicalPlan = { - val childOutputDataTypes = child.output.map(_.dataType) - val numDynamicPartitions = p.partition.values.count(_.isEmpty) - val tableOutputDataTypes = - (table.attributes ++ table.partitionKeys.takeRight(numDynamicPartitions)) - .take(child.output.length).map(_.dataType) - - if (childOutputDataTypes == tableOutputDataTypes) { - InsertIntoHiveTable(table, p.partition, p.child, p.overwrite, p.ifNotExists) - } else if (childOutputDataTypes.size == tableOutputDataTypes.size && - childOutputDataTypes.zip(tableOutputDataTypes) - .forall { case (left, right) => left.sameType(right) }) { - // If both types ignoring nullability of ArrayType, MapType, StructType are the same, - // use InsertIntoHiveTable instead of InsertIntoTable. - InsertIntoHiveTable(table, p.partition, p.child, p.overwrite, p.ifNotExists) - } else { - // Only do the casting when child output data types differ from table output data types. 
- val castedChildOutput = child.output.zip(table.output).map { - case (input, output) if input.dataType != output.dataType => - Alias(Cast(input, output.dataType), input.name)() - case (input, _) => input + private def findRelation(plan: LogicalPlan): (Option[LogicalPlan], Seq[Expression]) = { + plan match { + case relation @ LogicalRelation(files: HadoopFsRelation, _, table) + if files.partitionSchema.nonEmpty => + (Some(relation), Seq.empty[Expression]) + + case relation: MetastoreRelation if relation.partitionKeys.nonEmpty => + (Some(relation), Seq.empty[Expression]) + + case p @ Project(_, child) => + findRelation(child) + + case f @ Filter(filterCondition, child) => + val (plan, conditions) = findRelation(child) + (plan, conditions ++ Seq(filterCondition)) + + case SubqueryAlias(_, child) => + findRelation(child) + + case _ => (None, Seq.empty[Expression]) + } + } + + private def convertToMetadataOnlyPlan( + parent: LogicalPlan, + project: Option[LogicalPlan], + filters: Seq[Expression], + relation: LogicalPlan): LogicalPlan = relation match { + case l @ LogicalRelation(files: HadoopFsRelation, _, _) => + val attributeMap = l.output.map(attr => (attr.name, attr)).toMap + val partitionColumns = files.partitionSchema.map { field => + attributeMap.getOrElse(field.name, throw new AnalysisException( + s"Unable to resolve ${field.name} given [${l.output.map(_.name).mkString(", ")}]")) + } + val filterColumns = filters.flatMap(_.references) + val projectSet = parent.references ++ AttributeSet(filterColumns) + if (projectSet.subsetOf(AttributeSet(partitionColumns))) { + val selectedPartitions = files.location.listFiles(filters) + val partitionValues = selectedPartitions.map(_.values) + val valuesRdd = sparkSession.sparkContext.parallelize(partitionValues, 1) + val valuesPlan = LogicalRDD(partitionColumns, valuesRdd)(sparkSession) + val scanPlan = project.map(_.withNewChildren(valuesPlan :: Nil)).getOrElse(valuesPlan) + parent.withNewChildren(scanPlan :: Nil) + } else { + parent + } + + case relation: MetastoreRelation => + if (parent.references.subsetOf(AttributeSet(relation.partitionKeys))) { + val partitionColumnDataTypes = relation.partitionKeys.map(_.dataType) + val partitionValues = relation.getHiveQlPartitions(filters).map { p => + InternalRow.fromSeq(p.getValues.asScala.zip(partitionColumnDataTypes).map { + case (rawValue, dataType) => Cast(Literal(rawValue), dataType).eval(null) + }) + } + val valuesRdd = sparkSession.sparkContext.parallelize(partitionValues, 1) + val valuesPlan = LogicalRDD(relation.partitionKeys, valuesRdd)(sparkSession) + val filterPlan = + filters.reduceLeftOption(And).map(Filter(_, valuesPlan)).getOrElse(valuesPlan) + val scanPlan = project.map(_.withNewChildren(filterPlan :: Nil)).getOrElse(filterPlan) + parent.withNewChildren(scanPlan :: Nil) + } else { + parent } - p.copy(child = logical.Project(castedChildOutput, child)) + case _ => + parent + } + + def apply(plan: LogicalPlan): LogicalPlan = { + if (!sparkSession.sessionState.conf.optimizerMetadataOnly) { + return plan + } + plan.transform { + case a @ Aggregate(_, _, child) if canSupportMetadataOnly(a) => + val (plan, filters) = findRelation(child) + if (plan.isDefined) { + convertToMetadataOnlyPlan(a, None, filters, plan.get) + } else { + a + } + + case d @ Distinct(p @ Project(_, _)) => + val (plan, filters) = findRelation(p) + if (plan.isDefined) { + convertToMetadataOnlyPlan(d, Some(p), filters, plan.get) + } else { + d + } } } } diff --git 
a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala index 8a19645480c18..c3b2a8c7d3926 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala @@ -87,7 +87,6 @@ private[sql] class HiveSessionCatalog( val ParquetConversions: Rule[LogicalPlan] = metastoreCatalog.ParquetConversions val OrcConversions: Rule[LogicalPlan] = metastoreCatalog.OrcConversions val CreateTables: Rule[LogicalPlan] = metastoreCatalog.CreateTables - val PreInsertionCasts: Rule[LogicalPlan] = metastoreCatalog.PreInsertionCasts val metadataOnlyOptimizer: Rule[LogicalPlan] = metastoreCatalog.MetadataOnlyOptimizer override def refreshTable(name: TableIdentifier): Unit = { diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala index 8c260bb6560b4..37b312da77630 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala @@ -1687,7 +1687,8 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { test("spark-15752 metadata only optimizer") { withSQLConf(SQLConf.OPTIMIZER_METADATA_ONLY.key -> "true") { - spark.range(0, 5, 1, 2).selectExpr("id as key", "id as value").registerTempTable("tempTable") + val df = Seq((1, 2), (3, 4)).toDF("key", "value") + df.createOrReplaceTempView("data") sql( """ |CREATE TABLE srcpart_15752 (key INT, value STRING) @@ -1697,12 +1698,13 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { sql( s""" |INSERT OVERWRITE TABLE srcpart_15752 PARTITION (ds='$ds',hr='$hr') - |select key, value from tempTable + |select key, value from data """.stripMargin) } checkAnswer(sql("select max(hr) from srcpart_15752"), Row(12)) checkAnswer(sql("select max(hr) from srcpart_15752 where hr = 11"), Row(11)) checkAnswer(sql("select max(hr) from (select hr from srcpart_15752) t"), Row(12)) + checkAnswer(sql("select distinct hr from srcpart_15752"), Row(11) :: Row(12) :: Nil) checkAnswer(sql("select distinct hr from srcpart_15752 where hr = 11"), Row(11)) sql( @@ -1714,13 +1716,14 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { sql( s""" |INSERT OVERWRITE TABLE srctext_15752 PARTITION (ds='$ds',hr='$hr') - |select key, value from tempTable + |select key, value from data """.stripMargin) } checkAnswer(sql("select max(hr) from srctext_15752"), Row(12)) checkAnswer(sql("select max(hr) from srctext_15752 where hr = 11"), Row(11)) checkAnswer(sql("select max(hr) from (select hr from srctext_15752) t"), Row(12)) + checkAnswer(sql("select distinct hr from srctext_15752"), Row(11) :: Row(12) :: Nil) checkAnswer(sql("select distinct hr from srctext_15752 where hr = 11"), Row(11)) } } From b2b6eba2ad21675173227ee19c65f6661534e6b6 Mon Sep 17 00:00:00 2001 From: Lianhui Wang Date: Fri, 24 Jun 2016 15:52:21 +0800 Subject: [PATCH 08/45] update --- .../scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala index a1e59d1996fcc..c4f478ba4c25b 100644 --- 
a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala @@ -621,7 +621,7 @@ private[hive] case class InsertIntoHiveTable( child: LogicalPlan, overwrite: Boolean, ifNotExists: Boolean) - extends LogicalPlan { + extends LogicalPlan with Command { override def children: Seq[LogicalPlan] = child :: Nil override def output: Seq[Attribute] = Seq.empty From 6404c1fa2e8e9796574b69b57054dd4791ff6c52 Mon Sep 17 00:00:00 2001 From: Lianhui Wang Date: Sat, 25 Jun 2016 00:12:32 +0800 Subject: [PATCH 09/45] update opt for core --- .../catalyst/catalog/ExternalCatalog.scala | 14 ++ .../catalyst/catalog/InMemoryCatalog.scala | 8 + .../sql/catalyst/catalog/SessionCatalog.scala | 13 ++ .../sql/catalyst/optimizer/Optimizer.scala | 4 + .../spark/sql/execution/SparkOptimizer.scala | 152 +++++++++++++++++- .../apache/spark/sql/internal/SQLConf.scala | 2 +- .../spark/sql/hive/HiveExternalCatalog.scala | 10 ++ .../spark/sql/hive/HiveMetastoreCatalog.scala | 121 -------------- .../spark/sql/hive/HiveSessionCatalog.scala | 1 - .../spark/sql/hive/HiveSessionState.scala | 12 -- .../spark/sql/hive/client/HiveClient.scala | 11 ++ .../sql/hive/execution/SQLQuerySuite.scala | 8 +- 12 files changed, 214 insertions(+), 142 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalog.scala index 6714846e8cbda..5d478a4ccce9f 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalog.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalog.scala @@ -18,6 +18,7 @@ package org.apache.spark.sql.catalyst.catalog import org.apache.spark.sql.catalyst.analysis.NoSuchDatabaseException +import org.apache.spark.sql.catalyst.expressions.Expression /** @@ -151,6 +152,19 @@ abstract class ExternalCatalog { def getPartition(db: String, table: String, spec: TablePartitionSpec): CatalogTablePartition + /** + * Returns partitions filtered by predicates for the given table, It just work for Hive. + * + * The filters Expressions may optionally be provided to filter the partitions returned. + * For instance, if there exist partitions (a='1', b='2'), (a='1', b='3') and (a='2', b='4'), + * then the filters (a='1') will return the first two only. + * @param catalogTable table + * @param filters The filters used to prune which partitions are returned. + */ + def getPartitionsByFilter( + catalogTable: CatalogTable, + filters: Seq[Expression] = Nil): Seq[CatalogTablePartition] + /** * List the metadata of all partitions that belong to the specified table, assuming it exists. 
* diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala index fb3e1b3637f21..1d9b30d3f8734 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala @@ -28,6 +28,7 @@ import org.apache.spark.SparkException import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.{FunctionIdentifier, TableIdentifier} import org.apache.spark.sql.catalyst.analysis._ +import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.catalyst.util.StringUtils /** @@ -447,6 +448,13 @@ class InMemoryCatalog(hadoopConfig: Configuration = new Configuration) extends E catalog(db).tables(table).partitions(spec) } + override def getPartitionsByFilter( + catalogTable: CatalogTable, + filters: Seq[Expression] = Nil): Seq[CatalogTablePartition] = synchronized { + requireTableExists(catalogTable.database, catalogTable.identifier.table) + catalog(catalogTable.database).tables(catalogTable.identifier.table).partitions.values.toSeq + } + override def listPartitions( db: String, table: String, diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala index 7ab10d1c38237..be984bade6009 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala @@ -585,6 +585,19 @@ class SessionCatalog( externalCatalog.getPartition(db, table, spec) } + /** + * Returns partitions filtered by predicates for the given table, It just work for Hive. + * + * The filters Expressions may optionally be provided to filter the partitions returned. + * For instance, if there exist partitions (a='1', b='2'), (a='1', b='3') and (a='2', b='4'), + * then the filters (a='1') will return the first two only. + */ + def getPartitionsByFilter( + catalogTable: CatalogTable, + filters: Seq[Expression] = Nil): Seq[CatalogTablePartition] = { + externalCatalog.getPartitionsByFilter(catalogTable, filters) + } + /** * List the metadata of all partitions that belong to the specified table, assuming it exists. * diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala index 2bca31d5f1f75..3b567571a07ca 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala @@ -42,6 +42,8 @@ abstract class Optimizer(sessionCatalog: SessionCatalog, conf: CatalystConf) protected val fixedPoint = FixedPoint(conf.optimizerMaxIterations) + val aggregateOptimizationsRules: Seq[Rule[LogicalPlan]] = RewriteDistinctAggregates :: Nil + def batches: Seq[Batch] = { // Technically some of the rules in Finish Analysis are not optimizer rules and belong more // in the analyzer, because they are needed for correctness (e.g. ComputeCurrentTime). @@ -61,6 +63,8 @@ abstract class Optimizer(sessionCatalog: SessionCatalog, conf: CatalystConf) // extra operators between two adjacent Union operators. 
// - Call CombineUnions again in Batch("Operator Optimizations"), // since the other rules might make two separate Unions operators adjacent. + Batch("Aggregate Optimizations", Once, + aggregateOptimizationsRules : _*) :: Batch("Union", Once, CombineUnions) :: Batch("Subquery", Once, diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkOptimizer.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkOptimizer.scala index 12a10cba20fe9..e4ad530c4d3fd 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkOptimizer.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkOptimizer.scala @@ -17,9 +17,16 @@ package org.apache.spark.sql.execution -import org.apache.spark.sql.ExperimentalMethods -import org.apache.spark.sql.catalyst.catalog.SessionCatalog -import org.apache.spark.sql.catalyst.optimizer.Optimizer +import org.apache.spark.sql.{AnalysisException, ExperimentalMethods, SparkSession} +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.analysis._ +import org.apache.spark.sql.catalyst.catalog.{CatalogRelation, SessionCatalog} +import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.catalyst.expressions.aggregate._ +import org.apache.spark.sql.catalyst.optimizer._ +import org.apache.spark.sql.catalyst.plans.logical._ +import org.apache.spark.sql.catalyst.rules.Rule +import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, LogicalRelation} import org.apache.spark.sql.execution.python.ExtractPythonUDFFromAggregate import org.apache.spark.sql.internal.SQLConf @@ -29,7 +36,146 @@ class SparkOptimizer( experimentalMethods: ExperimentalMethods) extends Optimizer(catalog, conf) { + override val aggregateOptimizationsRules: Seq[Rule[LogicalPlan]] = + MetadataOnlyOptimizer(catalog) :: + RewriteDistinctAggregates :: Nil + override def batches: Seq[Batch] = super.batches :+ Batch("Extract Python UDF from Aggregate", Once, ExtractPythonUDFFromAggregate) :+ Batch("User Provided Optimizers", fixedPoint, experimentalMethods.extraOptimizations: _*) } + +/** + * When scanning only partition columns, get results based on metadata without scanning files. + * It is used for distinct, distinct aggregations or distinct-like aggregations(example: Max/Min). + * Example: select Max(partition) from table. + */ +case class MetadataOnlyOptimizer(catalog: SessionCatalog) extends Rule[LogicalPlan] { + + private val sparkSession: SparkSession = SparkSession.getActiveSession.get + private def canSupportMetadataOnly(a: Aggregate): Boolean = { + val aggregateExpressions = a.aggregateExpressions.flatMap { expr => + expr.collect { + case agg: AggregateExpression => agg + } + }.distinct + if (aggregateExpressions.isEmpty) { + // Cannot support for aggregate that has no aggregateFunction. + // example: select col1 from table group by col1. + false + } else { + aggregateExpressions.forall { agg => + if (agg.isDistinct) { + true + } else { + // If function can be evaluated on just the distinct values of a column, it can be used + // by metadata-only optimizer. 
+ agg.aggregateFunction match { + case max: Max => true + case min: Min => true + case hyperLog: HyperLogLogPlusPlus => true + case _ => false + } + } + } + } + } + + private def findRelation(plan: LogicalPlan): (Option[LogicalPlan], Seq[Expression]) = { + plan match { + case relation @ LogicalRelation(files: HadoopFsRelation, _, table) + if files.partitionSchema.nonEmpty => + (Some(relation), Seq.empty[Expression]) + + case relation: CatalogRelation if relation.catalogTable.partitionColumnNames.nonEmpty => + (Some(relation), Seq.empty[Expression]) + + case p @ Project(_, child) => + findRelation(child) + + case f @ Filter(filterCondition, child) => + val (plan, conditions) = findRelation(child) + (plan, conditions ++ Seq(filterCondition)) + + case _ => (None, Seq.empty[Expression]) + } + } + + private def convertToMetadataOnlyPlan( + parent: LogicalPlan, + project: Option[LogicalPlan], + filters: Seq[Expression], + relation: LogicalPlan): LogicalPlan = relation match { + case l @ LogicalRelation(files: HadoopFsRelation, _, _) => + val attributeMap = l.output.map(attr => (attr.name, attr)).toMap + val partitionColumns = files.partitionSchema.map { field => + attributeMap.getOrElse(field.name, throw new AnalysisException( + s"Unable to resolve ${field.name} given [${l.output.map(_.name).mkString(", ")}]")) + } + val filterColumns = filters.flatMap(_.references) + val projectSet = parent.references ++ AttributeSet(filterColumns) + if (projectSet.subsetOf(AttributeSet(partitionColumns))) { + val selectedPartitions = files.location.listFiles(filters) + val partitionValues = selectedPartitions.map(_.values) + val valuesRdd = sparkSession.sparkContext.parallelize(partitionValues, 1) + val valuesPlan = LogicalRDD(partitionColumns, valuesRdd)(sparkSession) + val scanPlan = project.map(_.withNewChildren(valuesPlan :: Nil)).getOrElse(valuesPlan) + parent.withNewChildren(scanPlan :: Nil) + } else { + parent + } + + case relation: CatalogRelation => + val attributeMap = relation.output.map(attr => (attr.name, attr)).toMap + val partitionColumns = relation.catalogTable.partitionColumnNames.map { column => + attributeMap.getOrElse(column, throw new AnalysisException( + s"Unable to resolve ${column} given [${relation.output.map(_.name).mkString(", ")}]")) + } + val filterColumns = filters.flatMap(_.references) + val projectSet = parent.references ++ AttributeSet(filterColumns) + if (projectSet.subsetOf(AttributeSet(partitionColumns))) { + val partitionColumnDataTypes = partitionColumns.map(_.dataType) + val partitionValues = catalog.getPartitionsByFilter(relation.catalogTable, filters) + .map { p => + InternalRow.fromSeq( + partitionColumns.map(a => p.spec(a.name)).zip(partitionColumnDataTypes).map { + case (rawValue, dataType) => Cast(Literal(rawValue), dataType).eval(null) + }) + } + val valuesRdd = sparkSession.sparkContext.parallelize(partitionValues, 1) + val valuesPlan = LogicalRDD(partitionColumns, valuesRdd)(sparkSession) + val filterPlan = + filters.reduceLeftOption(And).map(Filter(_, valuesPlan)).getOrElse(valuesPlan) + val scanPlan = project.map(_.withNewChildren(filterPlan :: Nil)).getOrElse(filterPlan) + parent.withNewChildren(scanPlan :: Nil) + } else { + parent + } + + case _ => + parent + } + + def apply(plan: LogicalPlan): LogicalPlan = { + if (!sparkSession.sessionState.conf.optimizerMetadataOnly) { + return plan + } + plan.transform { + case a @ Aggregate(_, _, child) if canSupportMetadataOnly(a) => + val (plan, filters) = findRelation(child) + if (plan.isDefined) { + 
convertToMetadataOnlyPlan(a, None, filters, plan.get) + } else { + a + } + + case d @ Distinct(p @ Project(_, _)) => + val (plan, filters) = findRelation(p) + if (plan.isDefined) { + convertToMetadataOnlyPlan(d, Some(p), filters, plan.get) + } else { + d + } + } + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index f18ab6463f692..9ed640d6e6a4e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -261,7 +261,7 @@ object SQLConf { val OPTIMIZER_METADATA_ONLY = SQLConfigBuilder("spark.sql.optimizer.metadataOnly") .doc("When true, enable the metadata-only query optimization.") .booleanConf - .createWithDefault(false) + .createWithDefault(true) val NATIVE_VIEW = SQLConfigBuilder("spark.sql.nativeView") .internal() diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala index b8bc9ab900ad1..bfb53b915da83 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala @@ -30,6 +30,7 @@ import org.apache.spark.internal.Logging import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.catalog._ +import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.hive.client.HiveClient @@ -322,6 +323,15 @@ private[spark] class HiveExternalCatalog(client: HiveClient, hadoopConf: Configu client.getPartition(db, table, spec) } + /** + * Returns partitions filtered by predicates for the given table. + */ + override def getPartitionsByFilter( + catalogTable: CatalogTable, + filters: Seq[Expression] = Nil): Seq[CatalogTablePartition] = withClient { + client.getPartitionsByFilter(catalogTable, filters) + } + /** * Returns the partition names from hive metastore for a given table in a database. */ diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala index c4f478ba4c25b..3c7b70a4bd53a 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala @@ -27,11 +27,8 @@ import org.apache.spark.sql.{AnalysisException, SparkSession} import org.apache.spark.sql.catalyst.{InternalRow, TableIdentifier} import org.apache.spark.sql.catalyst.catalog._ import org.apache.spark.sql.catalyst.expressions._ -import org.apache.spark.sql.catalyst.expressions.aggregate._ -import org.apache.spark.sql.catalyst.plans.logical import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.rules._ -import org.apache.spark.sql.execution.LogicalRDD import org.apache.spark.sql.execution.command.CreateDataSourceTableUtils._ import org.apache.spark.sql.execution.command.CreateHiveTableAsSelectLogicalPlan import org.apache.spark.sql.execution.datasources.{Partition => _, _} @@ -459,124 +456,6 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log } } - /** - * When scanning only partition columns, get results based on metadata without scanning files. - * It is used for distinct or distinct/Max/Min aggregations, example: max(partition). 
- */ - object MetadataOnlyOptimizer extends Rule[LogicalPlan] { - - private def canSupportMetadataOnly(a: Aggregate): Boolean = { - val aggregateExpressions = a.aggregateExpressions.flatMap { expr => - expr.collect { - case agg: AggregateExpression => agg - } - }.distinct - aggregateExpressions.forall { agg => - if (agg.isDistinct) { - true - } else { - agg.aggregateFunction match { - case max: Max => true - case min: Min => true - case _ => false - } - } - } - } - - private def findRelation(plan: LogicalPlan): (Option[LogicalPlan], Seq[Expression]) = { - plan match { - case relation @ LogicalRelation(files: HadoopFsRelation, _, table) - if files.partitionSchema.nonEmpty => - (Some(relation), Seq.empty[Expression]) - - case relation: MetastoreRelation if relation.partitionKeys.nonEmpty => - (Some(relation), Seq.empty[Expression]) - - case p @ Project(_, child) => - findRelation(child) - - case f @ Filter(filterCondition, child) => - val (plan, conditions) = findRelation(child) - (plan, conditions ++ Seq(filterCondition)) - - case SubqueryAlias(_, child) => - findRelation(child) - - case _ => (None, Seq.empty[Expression]) - } - } - - private def convertToMetadataOnlyPlan( - parent: LogicalPlan, - project: Option[LogicalPlan], - filters: Seq[Expression], - relation: LogicalPlan): LogicalPlan = relation match { - case l @ LogicalRelation(files: HadoopFsRelation, _, _) => - val attributeMap = l.output.map(attr => (attr.name, attr)).toMap - val partitionColumns = files.partitionSchema.map { field => - attributeMap.getOrElse(field.name, throw new AnalysisException( - s"Unable to resolve ${field.name} given [${l.output.map(_.name).mkString(", ")}]")) - } - val filterColumns = filters.flatMap(_.references) - val projectSet = parent.references ++ AttributeSet(filterColumns) - if (projectSet.subsetOf(AttributeSet(partitionColumns))) { - val selectedPartitions = files.location.listFiles(filters) - val partitionValues = selectedPartitions.map(_.values) - val valuesRdd = sparkSession.sparkContext.parallelize(partitionValues, 1) - val valuesPlan = LogicalRDD(partitionColumns, valuesRdd)(sparkSession) - val scanPlan = project.map(_.withNewChildren(valuesPlan :: Nil)).getOrElse(valuesPlan) - parent.withNewChildren(scanPlan :: Nil) - } else { - parent - } - - case relation: MetastoreRelation => - if (parent.references.subsetOf(AttributeSet(relation.partitionKeys))) { - val partitionColumnDataTypes = relation.partitionKeys.map(_.dataType) - val partitionValues = relation.getHiveQlPartitions(filters).map { p => - InternalRow.fromSeq(p.getValues.asScala.zip(partitionColumnDataTypes).map { - case (rawValue, dataType) => Cast(Literal(rawValue), dataType).eval(null) - }) - } - val valuesRdd = sparkSession.sparkContext.parallelize(partitionValues, 1) - val valuesPlan = LogicalRDD(relation.partitionKeys, valuesRdd)(sparkSession) - val filterPlan = - filters.reduceLeftOption(And).map(Filter(_, valuesPlan)).getOrElse(valuesPlan) - val scanPlan = project.map(_.withNewChildren(filterPlan :: Nil)).getOrElse(filterPlan) - parent.withNewChildren(scanPlan :: Nil) - } else { - parent - } - - case _ => - parent - } - - def apply(plan: LogicalPlan): LogicalPlan = { - if (!sparkSession.sessionState.conf.optimizerMetadataOnly) { - return plan - } - plan.transform { - case a @ Aggregate(_, _, child) if canSupportMetadataOnly(a) => - val (plan, filters) = findRelation(child) - if (plan.isDefined) { - convertToMetadataOnlyPlan(a, None, filters, plan.get) - } else { - a - } - - case d @ Distinct(p @ Project(_, _)) => - val (plan, 
filters) = findRelation(p) - if (plan.isDefined) { - convertToMetadataOnlyPlan(d, Some(p), filters, plan.get) - } else { - d - } - } - } - } - } /** diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala index c3b2a8c7d3926..2f6a2207855ec 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala @@ -87,7 +87,6 @@ private[sql] class HiveSessionCatalog( val ParquetConversions: Rule[LogicalPlan] = metastoreCatalog.ParquetConversions val OrcConversions: Rule[LogicalPlan] = metastoreCatalog.OrcConversions val CreateTables: Rule[LogicalPlan] = metastoreCatalog.CreateTables - val metadataOnlyOptimizer: Rule[LogicalPlan] = metastoreCatalog.MetadataOnlyOptimizer override def refreshTable(name: TableIdentifier): Unit = { metastoreCatalog.refreshTable(name) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionState.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionState.scala index 924b5a0b2a466..46978d5e64a22 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionState.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionState.scala @@ -74,18 +74,6 @@ private[hive] class HiveSessionState(sparkSession: SparkSession) } } - /** - * Logical query plan optimizer for Hive. - */ - override lazy val optimizer: Optimizer = new SparkOptimizer(catalog, conf, experimentalMethods) { - val extendedHiveOptimizerRules: Seq[Batch] = Seq( - Batch("MetadataOnly Optimization", Once, - catalog.metadataOnlyOptimizer) - ) - - override def batches: Seq[Batch] = extendedHiveOptimizerRules ++ super.batches - } - /** * Planner that takes into account Hive-specific strategies. */ diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClient.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClient.scala index 5f896969188d7..4f8b1cc21e090 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClient.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClient.scala @@ -183,6 +183,17 @@ private[hive] trait HiveClient { table: CatalogTable, partialSpec: Option[TablePartitionSpec] = None): Seq[CatalogTablePartition] + /** + * Returns the partitions for the given table that match the supplied partition spec. + * If no partition spec is specified, all partitions are returned. + */ + final def getPartitionsByFilter( + db: String, + table: String, + predicates: Seq[Expression]): Seq[CatalogTablePartition] = { + getPartitionsByFilter(getTable(db, table), predicates) + } + /** Returns partitions filtered by predicates for the given table. 
*/ def getPartitionsByFilter( table: CatalogTable, diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala index 37b312da77630..503a5f1a8ee91 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala @@ -1687,8 +1687,8 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { test("spark-15752 metadata only optimizer") { withSQLConf(SQLConf.OPTIMIZER_METADATA_ONLY.key -> "true") { - val df = Seq((1, 2), (3, 4)).toDF("key", "value") - df.createOrReplaceTempView("data") + val df = Seq((1, "2"), (3, "4")).toDF("key", "value") + df.createOrReplaceTempView("data_15752") sql( """ |CREATE TABLE srcpart_15752 (key INT, value STRING) @@ -1698,7 +1698,7 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { sql( s""" |INSERT OVERWRITE TABLE srcpart_15752 PARTITION (ds='$ds',hr='$hr') - |select key, value from data + |select key, value from data_15752 """.stripMargin) } checkAnswer(sql("select max(hr) from srcpart_15752"), Row(12)) @@ -1716,7 +1716,7 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { sql( s""" |INSERT OVERWRITE TABLE srctext_15752 PARTITION (ds='$ds',hr='$hr') - |select key, value from data + |select key, value from data_15752 """.stripMargin) } From 1bb5812a3af9fffcbc879fba42ccfe66485617bb Mon Sep 17 00:00:00 2001 From: Lianhui Wang Date: Sat, 25 Jun 2016 00:28:59 +0800 Subject: [PATCH 10/45] refactor --- .../catalyst/catalog/ExternalCatalog.scala | 2 +- .../sql/catalyst/catalog/SessionCatalog.scala | 2 +- .../sql/execution/MetadataOnlyOptimizer.scala | 162 ++++++++++++++++++ .../spark/sql/execution/SparkOptimizer.scala | 149 +--------------- .../spark/sql/hive/HiveMetastoreCatalog.scala | 1 - 5 files changed, 169 insertions(+), 147 deletions(-) create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/MetadataOnlyOptimizer.scala diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalog.scala index 5d478a4ccce9f..005457b2bfbde 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalog.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalog.scala @@ -153,7 +153,7 @@ abstract class ExternalCatalog { def getPartition(db: String, table: String, spec: TablePartitionSpec): CatalogTablePartition /** - * Returns partitions filtered by predicates for the given table, It just work for Hive. + * Returns partitions filtered by predicates for the given table, Filters only work for Hive. * * The filters Expressions may optionally be provided to filter the partitions returned. 
* For instance, if there exist partitions (a='1', b='2'), (a='1', b='3') and (a='2', b='4'), diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala index be984bade6009..c55830dfd798f 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala @@ -586,7 +586,7 @@ class SessionCatalog( } /** - * Returns partitions filtered by predicates for the given table, It just work for Hive. + * Returns partitions filtered by predicates for the given table, Filters only work for Hive. * * The filters Expressions may optionally be provided to filter the partitions returned. * For instance, if there exist partitions (a='1', b='2'), (a='1', b='3') and (a='2', b='4'), diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/MetadataOnlyOptimizer.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/MetadataOnlyOptimizer.scala new file mode 100644 index 0000000000000..6f4eb0e9c854b --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/MetadataOnlyOptimizer.scala @@ -0,0 +1,162 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution + +import org.apache.spark.sql.{AnalysisException, SparkSession} +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.catalog.{CatalogRelation, SessionCatalog} +import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.catalyst.expressions.aggregate._ +import org.apache.spark.sql.catalyst.plans.logical._ +import org.apache.spark.sql.catalyst.rules.Rule +import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, LogicalRelation} + +/** + * When scanning only partition columns, get results based on metadata without scanning files. + * It is used for distinct, distinct aggregations or distinct-like aggregations(example: Max/Min). + * Example: select Max(partition) from table. + */ +case class MetadataOnlyOptimizer(catalog: SessionCatalog) extends Rule[LogicalPlan] { + + private val sparkSession: SparkSession = SparkSession.getActiveSession.get + private def canSupportMetadataOnly(a: Aggregate): Boolean = { + val aggregateExpressions = a.aggregateExpressions.flatMap { expr => + expr.collect { + case agg: AggregateExpression => agg + } + }.distinct + if (aggregateExpressions.isEmpty) { + // Cannot support for aggregate that has no aggregateFunction. + // example: select col1 from table group by col1. 
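One detail worth noting in the check above: aggregate expressions are gathered with a recursive expr.collect rather than a top-level match, because an AggregateExpression can sit arbitrarily deep inside the select list. A small illustration of shapes this still catches (names invented; assume tbl is partitioned by an integer column part):

    // The Max is wrapped in an Alias and an Add, so only a recursive collect finds it.
    spark.sql("SELECT max(part) + 1 AS next_part FROM tbl")
    // Every collected aggregate must qualify; mixing in a plain count disqualifies the query.
    spark.sql("SELECT max(part), count(part) FROM tbl")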
+ false + } else { + aggregateExpressions.forall { agg => + if (agg.isDistinct) { + true + } else { + // If function can be evaluated on just the distinct values of a column, it can be used + // by metadata-only optimizer. + agg.aggregateFunction match { + case max: Max => true + case min: Min => true + case hyperLog: HyperLogLogPlusPlus => true + case _ => false + } + } + } + } + } + + private def findRelation(plan: LogicalPlan): (Option[LogicalPlan], Seq[Expression]) = { + plan match { + case relation @ LogicalRelation(files: HadoopFsRelation, _, table) + if files.partitionSchema.nonEmpty => + (Some(relation), Seq.empty[Expression]) + + case relation: CatalogRelation if relation.catalogTable.partitionColumnNames.nonEmpty => + (Some(relation), Seq.empty[Expression]) + + case p @ Project(_, child) => + findRelation(child) + + case f @ Filter(filterCondition, child) => + val (plan, conditions) = findRelation(child) + (plan, conditions ++ Seq(filterCondition)) + + case _ => (None, Seq.empty[Expression]) + } + } + + private def convertToMetadataOnlyPlan( + parent: LogicalPlan, + project: Option[LogicalPlan], + filters: Seq[Expression], + relation: LogicalPlan): LogicalPlan = relation match { + case l @ LogicalRelation(files: HadoopFsRelation, _, _) => + val attributeMap = l.output.map(attr => (attr.name, attr)).toMap + val partitionColumns = files.partitionSchema.map { field => + attributeMap.getOrElse(field.name, throw new AnalysisException( + s"Unable to resolve ${field.name} given [${l.output.map(_.name).mkString(", ")}]")) + } + val filterColumns = filters.flatMap(_.references) + val projectSet = parent.references ++ AttributeSet(filterColumns) + if (projectSet.subsetOf(AttributeSet(partitionColumns))) { + val selectedPartitions = files.location.listFiles(filters) + val partitionValues = selectedPartitions.map(_.values) + val valuesRdd = sparkSession.sparkContext.parallelize(partitionValues, 1) + val valuesPlan = LogicalRDD(partitionColumns, valuesRdd)(sparkSession) + val scanPlan = project.map(_.withNewChildren(valuesPlan :: Nil)).getOrElse(valuesPlan) + parent.withNewChildren(scanPlan :: Nil) + } else { + parent + } + + case relation: CatalogRelation => + val attributeMap = relation.output.map(attr => (attr.name, attr)).toMap + val partitionColumns = relation.catalogTable.partitionColumnNames.map { column => + attributeMap.getOrElse(column, throw new AnalysisException( + s"Unable to resolve ${column} given [${relation.output.map(_.name).mkString(", ")}]")) + } + val filterColumns = filters.flatMap(_.references) + val projectSet = parent.references ++ AttributeSet(filterColumns) + if (projectSet.subsetOf(AttributeSet(partitionColumns))) { + val partitionColumnDataTypes = partitionColumns.map(_.dataType) + val partitionValues = catalog.getPartitionsByFilter(relation.catalogTable, filters) + .map { p => + InternalRow.fromSeq( + partitionColumns.map(a => p.spec(a.name)).zip(partitionColumnDataTypes).map { + case (rawValue, dataType) => Cast(Literal(rawValue), dataType).eval(null) + }) + } + val valuesRdd = sparkSession.sparkContext.parallelize(partitionValues, 1) + val valuesPlan = LogicalRDD(partitionColumns, valuesRdd)(sparkSession) + val filterPlan = + filters.reduceLeftOption(And).map(Filter(_, valuesPlan)).getOrElse(valuesPlan) + val scanPlan = project.map(_.withNewChildren(filterPlan :: Nil)).getOrElse(filterPlan) + parent.withNewChildren(scanPlan :: Nil) + } else { + parent + } + + case _ => + parent + } + + def apply(plan: LogicalPlan): LogicalPlan = { + if 
(!sparkSession.sessionState.conf.optimizerMetadataOnly) { + return plan + } + plan.transform { + case a @ Aggregate(_, _, child) if canSupportMetadataOnly(a) => + val (plan, filters) = findRelation(child) + if (plan.isDefined) { + convertToMetadataOnlyPlan(a, None, filters, plan.get) + } else { + a + } + + case d @ Distinct(p @ Project(_, _)) => + val (plan, filters) = findRelation(p) + if (plan.isDefined) { + convertToMetadataOnlyPlan(d, Some(p), filters, plan.get) + } else { + d + } + } + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkOptimizer.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkOptimizer.scala index e4ad530c4d3fd..0512c7f72958a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkOptimizer.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkOptimizer.scala @@ -17,16 +17,12 @@ package org.apache.spark.sql.execution -import org.apache.spark.sql.{AnalysisException, ExperimentalMethods, SparkSession} -import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.analysis._ -import org.apache.spark.sql.catalyst.catalog.{CatalogRelation, SessionCatalog} -import org.apache.spark.sql.catalyst.expressions._ -import org.apache.spark.sql.catalyst.expressions.aggregate._ -import org.apache.spark.sql.catalyst.optimizer._ -import org.apache.spark.sql.catalyst.plans.logical._ +import org.apache.spark.sql.ExperimentalMethods +import org.apache.spark.sql.catalyst.analysis.RewriteDistinctAggregates +import org.apache.spark.sql.catalyst.catalog.SessionCatalog +import org.apache.spark.sql.catalyst.optimizer.Optimizer +import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.rules.Rule -import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, LogicalRelation} import org.apache.spark.sql.execution.python.ExtractPythonUDFFromAggregate import org.apache.spark.sql.internal.SQLConf @@ -44,138 +40,3 @@ class SparkOptimizer( Batch("Extract Python UDF from Aggregate", Once, ExtractPythonUDFFromAggregate) :+ Batch("User Provided Optimizers", fixedPoint, experimentalMethods.extraOptimizations: _*) } - -/** - * When scanning only partition columns, get results based on metadata without scanning files. - * It is used for distinct, distinct aggregations or distinct-like aggregations(example: Max/Min). - * Example: select Max(partition) from table. - */ -case class MetadataOnlyOptimizer(catalog: SessionCatalog) extends Rule[LogicalPlan] { - - private val sparkSession: SparkSession = SparkSession.getActiveSession.get - private def canSupportMetadataOnly(a: Aggregate): Boolean = { - val aggregateExpressions = a.aggregateExpressions.flatMap { expr => - expr.collect { - case agg: AggregateExpression => agg - } - }.distinct - if (aggregateExpressions.isEmpty) { - // Cannot support for aggregate that has no aggregateFunction. - // example: select col1 from table group by col1. - false - } else { - aggregateExpressions.forall { agg => - if (agg.isDistinct) { - true - } else { - // If function can be evaluated on just the distinct values of a column, it can be used - // by metadata-only optimizer. 
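Everything this rule does is gated by the check at the top of apply(): it only fires when spark.sql.optimizer.metadataOnly is enabled, and this series flips that flag's default to true. A minimal sketch of turning the rewrite off for one session, assuming an ordinary SparkSession named spark:

    // Disable the metadata-only rewrite for this session, e.g. while comparing plans,
    // either through the session conf ...
    spark.conf.set("spark.sql.optimizer.metadataOnly", "false")
    // ... or directly in SQL.
    spark.sql("SET spark.sql.optimizer.metadataOnly=false")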
- agg.aggregateFunction match { - case max: Max => true - case min: Min => true - case hyperLog: HyperLogLogPlusPlus => true - case _ => false - } - } - } - } - } - - private def findRelation(plan: LogicalPlan): (Option[LogicalPlan], Seq[Expression]) = { - plan match { - case relation @ LogicalRelation(files: HadoopFsRelation, _, table) - if files.partitionSchema.nonEmpty => - (Some(relation), Seq.empty[Expression]) - - case relation: CatalogRelation if relation.catalogTable.partitionColumnNames.nonEmpty => - (Some(relation), Seq.empty[Expression]) - - case p @ Project(_, child) => - findRelation(child) - - case f @ Filter(filterCondition, child) => - val (plan, conditions) = findRelation(child) - (plan, conditions ++ Seq(filterCondition)) - - case _ => (None, Seq.empty[Expression]) - } - } - - private def convertToMetadataOnlyPlan( - parent: LogicalPlan, - project: Option[LogicalPlan], - filters: Seq[Expression], - relation: LogicalPlan): LogicalPlan = relation match { - case l @ LogicalRelation(files: HadoopFsRelation, _, _) => - val attributeMap = l.output.map(attr => (attr.name, attr)).toMap - val partitionColumns = files.partitionSchema.map { field => - attributeMap.getOrElse(field.name, throw new AnalysisException( - s"Unable to resolve ${field.name} given [${l.output.map(_.name).mkString(", ")}]")) - } - val filterColumns = filters.flatMap(_.references) - val projectSet = parent.references ++ AttributeSet(filterColumns) - if (projectSet.subsetOf(AttributeSet(partitionColumns))) { - val selectedPartitions = files.location.listFiles(filters) - val partitionValues = selectedPartitions.map(_.values) - val valuesRdd = sparkSession.sparkContext.parallelize(partitionValues, 1) - val valuesPlan = LogicalRDD(partitionColumns, valuesRdd)(sparkSession) - val scanPlan = project.map(_.withNewChildren(valuesPlan :: Nil)).getOrElse(valuesPlan) - parent.withNewChildren(scanPlan :: Nil) - } else { - parent - } - - case relation: CatalogRelation => - val attributeMap = relation.output.map(attr => (attr.name, attr)).toMap - val partitionColumns = relation.catalogTable.partitionColumnNames.map { column => - attributeMap.getOrElse(column, throw new AnalysisException( - s"Unable to resolve ${column} given [${relation.output.map(_.name).mkString(", ")}]")) - } - val filterColumns = filters.flatMap(_.references) - val projectSet = parent.references ++ AttributeSet(filterColumns) - if (projectSet.subsetOf(AttributeSet(partitionColumns))) { - val partitionColumnDataTypes = partitionColumns.map(_.dataType) - val partitionValues = catalog.getPartitionsByFilter(relation.catalogTable, filters) - .map { p => - InternalRow.fromSeq( - partitionColumns.map(a => p.spec(a.name)).zip(partitionColumnDataTypes).map { - case (rawValue, dataType) => Cast(Literal(rawValue), dataType).eval(null) - }) - } - val valuesRdd = sparkSession.sparkContext.parallelize(partitionValues, 1) - val valuesPlan = LogicalRDD(partitionColumns, valuesRdd)(sparkSession) - val filterPlan = - filters.reduceLeftOption(And).map(Filter(_, valuesPlan)).getOrElse(valuesPlan) - val scanPlan = project.map(_.withNewChildren(filterPlan :: Nil)).getOrElse(filterPlan) - parent.withNewChildren(scanPlan :: Nil) - } else { - parent - } - - case _ => - parent - } - - def apply(plan: LogicalPlan): LogicalPlan = { - if (!sparkSession.sessionState.conf.optimizerMetadataOnly) { - return plan - } - plan.transform { - case a @ Aggregate(_, _, child) if canSupportMetadataOnly(a) => - val (plan, filters) = findRelation(child) - if (plan.isDefined) { - 
convertToMetadataOnlyPlan(a, None, filters, plan.get) - } else { - a - } - - case d @ Distinct(p @ Project(_, _)) => - val (plan, filters) = findRelation(p) - if (plan.isDefined) { - convertToMetadataOnlyPlan(d, Some(p), filters, plan.get) - } else { - d - } - } - } -} diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala index 3c7b70a4bd53a..552eacfcbceb6 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala @@ -455,7 +455,6 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log allowExisting) } } - } /** From 7e3729e407417b58bd5a406e43ab2b1901761699 Mon Sep 17 00:00:00 2001 From: Lianhui Wang Date: Sat, 25 Jun 2016 01:50:05 +0800 Subject: [PATCH 11/45] add ut --- .../scala/org/apache/spark/sql/SQLQuerySuite.scala | 11 +++++++++++ .../spark/sql/hive/execution/SQLQuerySuite.scala | 2 +- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index bbe821b76805c..68d922b454db0 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -2870,4 +2870,15 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { sql(s"SELECT '$literal' AS DUMMY"), Row(s"$expected") :: Nil) } + + test("spark-15752 metadata only optimizer for datasource table") { + val data = (1 to 10).map(i => (i, s"data-$i", i % 2, if ((i % 2) == 0) "even" else "odd")) + .toDF("id", "data", "partId", "part") + data.write.partitionBy("partId", "part").mode("append").saveAsTable("srcpart_15752") + checkAnswer(sql("select max(part) from srcpart_15752"), Row("odd")) + checkAnswer(sql("select max(part) from srcpart_15752 where partId = 0"), Row("even")) + checkAnswer(sql("select max(part) from (select part from srcpart_15752) t"), Row("odd")) + checkAnswer(sql("select distinct part from srcpart_15752"), Row("even") :: Row("odd") :: Nil) + checkAnswer(sql("select distinct part from srcpart_15752 where partId = 0"), Row("even")) + } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala index 503a5f1a8ee91..257acd0673341 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala @@ -1685,7 +1685,7 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { } } - test("spark-15752 metadata only optimizer") { + test("spark-15752 metadata only optimizer for hive table") { withSQLConf(SQLConf.OPTIMIZER_METADATA_ONLY.key -> "true") { val df = Seq((1, "2"), (3, "4")).toDF("key", "value") df.createOrReplaceTempView("data_15752") From fbf5d615713caab8a6d5aa8093cdb2ef59020966 Mon Sep 17 00:00:00 2001 From: Lianhui Wang Date: Sat, 25 Jun 2016 02:36:01 +0800 Subject: [PATCH 12/45] fix ut --- .../apache/spark/sql/execution/MetadataOnlyOptimizer.scala | 5 +++-- .../org/apache/spark/sql/execution/SparkOptimizer.scala | 5 +++-- .../scala/org/apache/spark/sql/internal/SessionState.scala | 3 ++- .../scala/org/apache/spark/sql/hive/HiveSessionState.scala | 3 +-- 4 files changed, 9 insertions(+), 7 
deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/MetadataOnlyOptimizer.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/MetadataOnlyOptimizer.scala index 6f4eb0e9c854b..f1dedec2051c5 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/MetadataOnlyOptimizer.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/MetadataOnlyOptimizer.scala @@ -31,9 +31,10 @@ import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, LogicalRela * It is used for distinct, distinct aggregations or distinct-like aggregations(example: Max/Min). * Example: select Max(partition) from table. */ -case class MetadataOnlyOptimizer(catalog: SessionCatalog) extends Rule[LogicalPlan] { +case class MetadataOnlyOptimizer( + sparkSession: SparkSession, + catalog: SessionCatalog) extends Rule[LogicalPlan] { - private val sparkSession: SparkSession = SparkSession.getActiveSession.get private def canSupportMetadataOnly(a: Aggregate): Boolean = { val aggregateExpressions = a.aggregateExpressions.flatMap { expr => expr.collect { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkOptimizer.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkOptimizer.scala index 0512c7f72958a..49613e8b53d53 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkOptimizer.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkOptimizer.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql.execution -import org.apache.spark.sql.ExperimentalMethods +import org.apache.spark.sql.{ExperimentalMethods, SparkSession} import org.apache.spark.sql.catalyst.analysis.RewriteDistinctAggregates import org.apache.spark.sql.catalyst.catalog.SessionCatalog import org.apache.spark.sql.catalyst.optimizer.Optimizer @@ -27,13 +27,14 @@ import org.apache.spark.sql.execution.python.ExtractPythonUDFFromAggregate import org.apache.spark.sql.internal.SQLConf class SparkOptimizer( + sparkSession: SparkSession, catalog: SessionCatalog, conf: SQLConf, experimentalMethods: ExperimentalMethods) extends Optimizer(catalog, conf) { override val aggregateOptimizationsRules: Seq[Rule[LogicalPlan]] = - MetadataOnlyOptimizer(catalog) :: + MetadataOnlyOptimizer(sparkSession, catalog) :: RewriteDistinctAggregates :: Nil override def batches: Seq[Batch] = super.batches :+ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/SessionState.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/SessionState.scala index 5f5cf5c6d30c4..f107236183808 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/internal/SessionState.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/SessionState.scala @@ -123,7 +123,8 @@ private[sql] class SessionState(sparkSession: SparkSession) { /** * Logical query plan optimizer. */ - lazy val optimizer: Optimizer = new SparkOptimizer(catalog, conf, experimentalMethods) + lazy val optimizer: Optimizer = + new SparkOptimizer(sparkSession, catalog, conf, experimentalMethods) /** * Parser that extracts expressions, plans, table identifiers etc. from SQL texts. 
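The common thread in this "fix ut" commit is that MetadataOnlyOptimizer no longer reaches for SparkSession.getActiveSession.get; the owning session is now passed in by whoever constructs the optimizer. getActiveSession is only a thread-local Option, so reading it while the rule is being built can return nothing, or a different session, in test harnesses, which is presumably what the affected tests tripped over. A sketch of the lookup the earlier version relied on, written as it would appear inside Spark's own org.apache.spark.sql.execution package:

    import org.apache.spark.sql.SparkSession

    // The Option may be empty (or point at another session) on the current thread,
    // which is why the refactor threads the session through the constructor instead.
    val active: Option[SparkSession] = SparkSession.getActiveSession
    val session = active.getOrElse(sys.error("no active SparkSession set on this thread"))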
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionState.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionState.scala index 46978d5e64a22..8773993d362c4 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionState.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionState.scala @@ -19,8 +19,7 @@ package org.apache.spark.sql.hive import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.analysis.Analyzer -import org.apache.spark.sql.catalyst.optimizer.Optimizer -import org.apache.spark.sql.execution.{SparkOptimizer, SparkPlanner} +import org.apache.spark.sql.execution.SparkPlanner import org.apache.spark.sql.execution.datasources._ import org.apache.spark.sql.hive.client.HiveClient import org.apache.spark.sql.internal.SessionState From 3411fd6e51758d8f556b9cbf3abbc1a7fda952e1 Mon Sep 17 00:00:00 2001 From: Lianhui Wang Date: Mon, 27 Jun 2016 01:29:19 +0800 Subject: [PATCH 13/45] fix project --- .../sql/catalyst/optimizer/Optimizer.scala | 3 +- .../sql/execution/MetadataOnlyOptimizer.scala | 63 +++++++++++++------ .../sql/hive/execution/SQLQuerySuite.scala | 6 ++ 3 files changed, 52 insertions(+), 20 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala index 3b567571a07ca..1b70aa426691c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala @@ -53,8 +53,7 @@ abstract class Optimizer(sessionCatalog: SessionCatalog, conf: CatalystConf) EliminateSubqueryAliases, ReplaceExpressions, ComputeCurrentTime, - GetCurrentDatabase(sessionCatalog), - RewriteDistinctAggregates) :: + GetCurrentDatabase(sessionCatalog)) :: ////////////////////////////////////////////////////////////////////////////////////////// // Optimizer rules start here ////////////////////////////////////////////////////////////////////////////////////////// diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/MetadataOnlyOptimizer.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/MetadataOnlyOptimizer.scala index f1dedec2051c5..1ffb4e135da42 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/MetadataOnlyOptimizer.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/MetadataOnlyOptimizer.scala @@ -63,29 +63,50 @@ case class MetadataOnlyOptimizer( } } - private def findRelation(plan: LogicalPlan): (Option[LogicalPlan], Seq[Expression]) = { + private def collectAliases(fields: Seq[Expression]): Map[ExprId, Expression] = fields.collect { + case a @ Alias(child, _) => a.toAttribute.exprId -> child + }.toMap + + private def substitute(aliases: Map[ExprId, Expression])(expr: Expression): Expression = { + expr.transform { + case a @ Alias(ref: AttributeReference, name) => + aliases.get(ref.exprId) + .map(Alias(_, name)(a.exprId, a.qualifier, isGenerated = a.isGenerated)) + .getOrElse(a) + + case a: AttributeReference => + aliases.get(a.exprId) + .map(Alias(_, a.name)(a.exprId, a.qualifier, isGenerated = a.isGenerated)).getOrElse(a) + } + } + + private def findRelation(plan: LogicalPlan) + : (Option[LogicalPlan], Seq[NamedExpression], Seq[Expression], Map[ExprId, Expression]) = { plan match { case relation @ LogicalRelation(files: HadoopFsRelation, _, table) if files.partitionSchema.nonEmpty => - (Some(relation), 
Seq.empty[Expression]) + (Some(relation), Seq.empty[NamedExpression], Seq.empty[Expression], Map.empty) case relation: CatalogRelation if relation.catalogTable.partitionColumnNames.nonEmpty => - (Some(relation), Seq.empty[Expression]) + (Some(relation), Seq.empty[NamedExpression], Seq.empty[Expression], Map.empty) - case p @ Project(_, child) => - findRelation(child) + case p @ Project(fields, child) if fields.forall(_.deterministic) => + val (plan, _, filters, aliases) = findRelation(child) + val substitutedFields = fields.map(substitute(aliases)).asInstanceOf[Seq[NamedExpression]] + (plan, substitutedFields, filters, collectAliases(substitutedFields)) - case f @ Filter(filterCondition, child) => - val (plan, conditions) = findRelation(child) - (plan, conditions ++ Seq(filterCondition)) + case f @ Filter(condition, child) if condition.deterministic => + val (plan, fields, filters, aliases) = findRelation(child) + val substitutedCondition = substitute(aliases)(condition) + (plan, fields, filters ++ Seq(substitutedCondition), aliases) - case _ => (None, Seq.empty[Expression]) + case _ => (None, Seq.empty[NamedExpression], Seq.empty[Expression], Map.empty) } } private def convertToMetadataOnlyPlan( parent: LogicalPlan, - project: Option[LogicalPlan], + projectList: Seq[NamedExpression], filters: Seq[Expression], relation: LogicalPlan): LogicalPlan = relation match { case l @ LogicalRelation(files: HadoopFsRelation, _, _) => @@ -101,8 +122,11 @@ case class MetadataOnlyOptimizer( val partitionValues = selectedPartitions.map(_.values) val valuesRdd = sparkSession.sparkContext.parallelize(partitionValues, 1) val valuesPlan = LogicalRDD(partitionColumns, valuesRdd)(sparkSession) - val scanPlan = project.map(_.withNewChildren(valuesPlan :: Nil)).getOrElse(valuesPlan) - parent.withNewChildren(scanPlan :: Nil) + if (projectList.nonEmpty) { + parent.withNewChildren(Project(projectList, valuesPlan) :: Nil) + } else { + parent.withNewChildren(valuesPlan :: Nil) + } } else { parent } @@ -128,8 +152,11 @@ case class MetadataOnlyOptimizer( val valuesPlan = LogicalRDD(partitionColumns, valuesRdd)(sparkSession) val filterPlan = filters.reduceLeftOption(And).map(Filter(_, valuesPlan)).getOrElse(valuesPlan) - val scanPlan = project.map(_.withNewChildren(filterPlan :: Nil)).getOrElse(filterPlan) - parent.withNewChildren(scanPlan :: Nil) + if (projectList.nonEmpty) { + parent.withNewChildren(Project(projectList, filterPlan) :: Nil) + } else { + parent.withNewChildren(filterPlan :: Nil) + } } else { parent } @@ -144,17 +171,17 @@ case class MetadataOnlyOptimizer( } plan.transform { case a @ Aggregate(_, _, child) if canSupportMetadataOnly(a) => - val (plan, filters) = findRelation(child) + val (plan, projectList, filters, _) = findRelation(child) if (plan.isDefined) { - convertToMetadataOnlyPlan(a, None, filters, plan.get) + convertToMetadataOnlyPlan(a, projectList, filters, plan.get) } else { a } case d @ Distinct(p @ Project(_, _)) => - val (plan, filters) = findRelation(p) + val (plan, projectList, filters, _) = findRelation(p) if (plan.isDefined) { - convertToMetadataOnlyPlan(d, Some(p), filters, plan.get) + convertToMetadataOnlyPlan(d, projectList, filters, plan.get) } else { d } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala index 257acd0673341..e4608d4bb7401 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala +++ 
b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala @@ -1723,8 +1723,14 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { checkAnswer(sql("select max(hr) from srctext_15752"), Row(12)) checkAnswer(sql("select max(hr) from srctext_15752 where hr = 11"), Row(11)) checkAnswer(sql("select max(hr) from (select hr from srctext_15752) t"), Row(12)) + checkAnswer( + sql("select max(x) from (select hr + 1 as x from srctext_15752 where hr = 12) t"), + Row(13)) checkAnswer(sql("select distinct hr from srctext_15752"), Row(11) :: Row(12) :: Nil) checkAnswer(sql("select distinct hr from srctext_15752 where hr = 11"), Row(11)) + checkAnswer( + sql("select distinct x from (select hr + 1 as x from srctext_15752 where hr = 12) t"), + Row(13)) } } } From aefab7fa2b2c99f861faf0693fc1e13c2cf5311c Mon Sep 17 00:00:00 2001 From: Lianhui Wang Date: Mon, 27 Jun 2016 21:47:29 +0800 Subject: [PATCH 14/45] address comments --- .../sql/execution/MetadataOnlyOptimizer.scala | 25 ++++++++----------- .../sql/hive/execution/SQLQuerySuite.scala | 2 ++ 2 files changed, 12 insertions(+), 15 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/MetadataOnlyOptimizer.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/MetadataOnlyOptimizer.scala index 1ffb4e135da42..7d854d7bc4768 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/MetadataOnlyOptimizer.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/MetadataOnlyOptimizer.scala @@ -42,9 +42,9 @@ case class MetadataOnlyOptimizer( } }.distinct if (aggregateExpressions.isEmpty) { - // Cannot support for aggregate that has no aggregateFunction. - // example: select col1 from table group by col1. - false + // Support for aggregate that has no aggregateFunction when expressions are partition columns + // example: select partitionCol from table group by partitionCol. 
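This "address comments" revision flips the empty-aggregate branch that follows: a grouping whose expressions are nothing but partition columns is now accepted even without an aggregate function, since its result is exactly the set of distinct partition values. Roughly, reusing the partitioned test table from this suite (partition columns ds and hr):

    // Equivalent shapes once only partition columns are involved; with the rewrite in
    // place neither should need to read any data files.
    spark.sql("SELECT hr FROM srcpart_15752 WHERE hr = 11 GROUP BY hr")
    spark.sql("SELECT DISTINCT hr FROM srcpart_15752 WHERE hr = 11")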
+ true } else { aggregateExpressions.forall { agg => if (agg.isDistinct) { @@ -119,13 +119,11 @@ case class MetadataOnlyOptimizer( val projectSet = parent.references ++ AttributeSet(filterColumns) if (projectSet.subsetOf(AttributeSet(partitionColumns))) { val selectedPartitions = files.location.listFiles(filters) - val partitionValues = selectedPartitions.map(_.values) - val valuesRdd = sparkSession.sparkContext.parallelize(partitionValues, 1) + val valuesRdd = sparkSession.sparkContext.parallelize(selectedPartitions.map(_.values), 1) val valuesPlan = LogicalRDD(partitionColumns, valuesRdd)(sparkSession) - if (projectList.nonEmpty) { - parent.withNewChildren(Project(projectList, valuesPlan) :: Nil) - } else { - parent.withNewChildren(valuesPlan :: Nil) + parent.transform { + case l @ LogicalRelation(files: HadoopFsRelation, _, _) => + valuesPlan } } else { parent @@ -150,12 +148,9 @@ case class MetadataOnlyOptimizer( } val valuesRdd = sparkSession.sparkContext.parallelize(partitionValues, 1) val valuesPlan = LogicalRDD(partitionColumns, valuesRdd)(sparkSession) - val filterPlan = - filters.reduceLeftOption(And).map(Filter(_, valuesPlan)).getOrElse(valuesPlan) - if (projectList.nonEmpty) { - parent.withNewChildren(Project(projectList, filterPlan) :: Nil) - } else { - parent.withNewChildren(filterPlan :: Nil) + parent.transform { + case relation: CatalogRelation => + valuesPlan } } else { parent diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala index e4608d4bb7401..b408d42fafd22 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala @@ -1701,6 +1701,7 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { |select key, value from data_15752 """.stripMargin) } + checkAnswer(sql("select hr from srcpart_15752 where hr = 11 group by hr"), Row(11)) checkAnswer(sql("select max(hr) from srcpart_15752"), Row(12)) checkAnswer(sql("select max(hr) from srcpart_15752 where hr = 11"), Row(11)) checkAnswer(sql("select max(hr) from (select hr from srcpart_15752) t"), Row(12)) @@ -1720,6 +1721,7 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { """.stripMargin) } + checkAnswer(sql("select hr from srctext_15752 where hr = 11 group by hr"), Row(11)) checkAnswer(sql("select max(hr) from srctext_15752"), Row(12)) checkAnswer(sql("select max(hr) from srctext_15752 where hr = 11"), Row(11)) checkAnswer(sql("select max(hr) from (select hr from srctext_15752) t"), Row(12)) From c5ccdea435789ec5cdea1438ace09d91e7726d22 Mon Sep 17 00:00:00 2001 From: Lianhui Wang Date: Tue, 28 Jun 2016 00:23:20 +0800 Subject: [PATCH 15/45] fix cube/rollup --- .../catalyst/catalog/ExternalCatalog.scala | 13 ----------- .../catalyst/catalog/InMemoryCatalog.scala | 7 ------ .../sql/catalyst/catalog/SessionCatalog.scala | 13 ----------- .../sql/catalyst/optimizer/Optimizer.scala | 7 ++---- .../sql/execution/MetadataOnlyOptimizer.scala | 23 +++++++++++-------- .../spark/sql/execution/SparkOptimizer.scala | 5 +--- .../spark/sql/hive/HiveExternalCatalog.scala | 9 -------- .../spark/sql/hive/client/HiveClient.scala | 11 --------- .../sql/hive/execution/SQLQuerySuite.scala | 6 +++++ 9 files changed, 22 insertions(+), 72 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalog.scala 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalog.scala index 005457b2bfbde..2745d5b0dd3b8 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalog.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalog.scala @@ -152,19 +152,6 @@ abstract class ExternalCatalog { def getPartition(db: String, table: String, spec: TablePartitionSpec): CatalogTablePartition - /** - * Returns partitions filtered by predicates for the given table, Filters only work for Hive. - * - * The filters Expressions may optionally be provided to filter the partitions returned. - * For instance, if there exist partitions (a='1', b='2'), (a='1', b='3') and (a='2', b='4'), - * then the filters (a='1') will return the first two only. - * @param catalogTable table - * @param filters The filters used to prune which partitions are returned. - */ - def getPartitionsByFilter( - catalogTable: CatalogTable, - filters: Seq[Expression] = Nil): Seq[CatalogTablePartition] - /** * List the metadata of all partitions that belong to the specified table, assuming it exists. * diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala index 1d9b30d3f8734..097ab99c64524 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala @@ -448,13 +448,6 @@ class InMemoryCatalog(hadoopConfig: Configuration = new Configuration) extends E catalog(db).tables(table).partitions(spec) } - override def getPartitionsByFilter( - catalogTable: CatalogTable, - filters: Seq[Expression] = Nil): Seq[CatalogTablePartition] = synchronized { - requireTableExists(catalogTable.database, catalogTable.identifier.table) - catalog(catalogTable.database).tables(catalogTable.identifier.table).partitions.values.toSeq - } - override def listPartitions( db: String, table: String, diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala index c55830dfd798f..7ab10d1c38237 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala @@ -585,19 +585,6 @@ class SessionCatalog( externalCatalog.getPartition(db, table, spec) } - /** - * Returns partitions filtered by predicates for the given table, Filters only work for Hive. - * - * The filters Expressions may optionally be provided to filter the partitions returned. - * For instance, if there exist partitions (a='1', b='2'), (a='1', b='3') and (a='2', b='4'), - * then the filters (a='1') will return the first two only. - */ - def getPartitionsByFilter( - catalogTable: CatalogTable, - filters: Seq[Expression] = Nil): Seq[CatalogTablePartition] = { - externalCatalog.getPartitionsByFilter(catalogTable, filters) - } - /** * List the metadata of all partitions that belong to the specified table, assuming it exists. 
* diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala index 1b70aa426691c..2bca31d5f1f75 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala @@ -42,8 +42,6 @@ abstract class Optimizer(sessionCatalog: SessionCatalog, conf: CatalystConf) protected val fixedPoint = FixedPoint(conf.optimizerMaxIterations) - val aggregateOptimizationsRules: Seq[Rule[LogicalPlan]] = RewriteDistinctAggregates :: Nil - def batches: Seq[Batch] = { // Technically some of the rules in Finish Analysis are not optimizer rules and belong more // in the analyzer, because they are needed for correctness (e.g. ComputeCurrentTime). @@ -53,7 +51,8 @@ abstract class Optimizer(sessionCatalog: SessionCatalog, conf: CatalystConf) EliminateSubqueryAliases, ReplaceExpressions, ComputeCurrentTime, - GetCurrentDatabase(sessionCatalog)) :: + GetCurrentDatabase(sessionCatalog), + RewriteDistinctAggregates) :: ////////////////////////////////////////////////////////////////////////////////////////// // Optimizer rules start here ////////////////////////////////////////////////////////////////////////////////////////// @@ -62,8 +61,6 @@ abstract class Optimizer(sessionCatalog: SessionCatalog, conf: CatalystConf) // extra operators between two adjacent Union operators. // - Call CombineUnions again in Batch("Operator Optimizations"), // since the other rules might make two separate Unions operators adjacent. - Batch("Aggregate Optimizations", Once, - aggregateOptimizationsRules : _*) :: Batch("Union", Once, CombineUnions) :: Batch("Subquery", Once, diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/MetadataOnlyOptimizer.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/MetadataOnlyOptimizer.scala index 7d854d7bc4768..4b2e09dd3c901 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/MetadataOnlyOptimizer.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/MetadataOnlyOptimizer.scala @@ -29,7 +29,14 @@ import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, LogicalRela /** * When scanning only partition columns, get results based on metadata without scanning files. * It is used for distinct, distinct aggregations or distinct-like aggregations(example: Max/Min). - * Example: select Max(partition) from table. + * First of all, scanning only partition columns are required, then the rule does the following + * things here: + * 1. aggregate expression is partition columns, + * e.g. SELECT col FROM tbl GROUP BY col or SELECT col FROM tbl GROUP BY cube(col). + * 2. aggregate function on partition columns with DISTINCT, + * e.g. SELECT count(DISTINCT col) FROM tbl GROUP BY col. + * 3. aggregate function on partition columns which have same result with DISTINCT keyword. + * e.g. SELECT Max(col2) FROM tbl GROUP BY col1. */ case class MetadataOnlyOptimizer( sparkSession: SparkSession, @@ -44,6 +51,7 @@ case class MetadataOnlyOptimizer( if (aggregateExpressions.isEmpty) { // Support for aggregate that has no aggregateFunction when expressions are partition columns // example: select partitionCol from table group by partitionCol. + // Moreover, multiple-distinct has been rewritted into it by RewriteDistinctAggregates. 
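The comment above, together with moving RewriteDistinctAggregates back into the Finish Analysis batch earlier in this commit, is why this commit also teaches findRelation to walk through Expand (see the case added below): both grouping sets and the rewritten multi-distinct form leave an Expand between the Aggregate and the relation. Two shapes that reach the rule in that form, again borrowing the suite's partitioned table (ds, hr):

    // Rollup and cube are analyzed into an Aggregate over an Expand over the relation.
    spark.sql("SELECT hr FROM srcpart_15752 GROUP BY rollup(hr)")
    // Multiple DISTINCT aggregates are rewritten by RewriteDistinctAggregates into a
    // grouping over an Expand before this rule ever sees them.
    spark.sql("SELECT count(DISTINCT ds), count(DISTINCT hr) FROM srcpart_15752")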
true } else { aggregateExpressions.forall { agg => @@ -100,6 +108,9 @@ case class MetadataOnlyOptimizer( val substitutedCondition = substitute(aliases)(condition) (plan, fields, filters ++ Seq(substitutedCondition), aliases) + case e @ Expand(_, _, child) => + findRelation(child) + case _ => (None, Seq.empty[NamedExpression], Seq.empty[Expression], Map.empty) } } @@ -139,7 +150,7 @@ case class MetadataOnlyOptimizer( val projectSet = parent.references ++ AttributeSet(filterColumns) if (projectSet.subsetOf(AttributeSet(partitionColumns))) { val partitionColumnDataTypes = partitionColumns.map(_.dataType) - val partitionValues = catalog.getPartitionsByFilter(relation.catalogTable, filters) + val partitionValues = catalog.listPartitions(relation.catalogTable.identifier) .map { p => InternalRow.fromSeq( partitionColumns.map(a => p.spec(a.name)).zip(partitionColumnDataTypes).map { @@ -172,14 +183,6 @@ case class MetadataOnlyOptimizer( } else { a } - - case d @ Distinct(p @ Project(_, _)) => - val (plan, projectList, filters, _) = findRelation(p) - if (plan.isDefined) { - convertToMetadataOnlyPlan(d, projectList, filters, plan.get) - } else { - d - } } } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkOptimizer.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkOptimizer.scala index 49613e8b53d53..dd49bf989e770 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkOptimizer.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkOptimizer.scala @@ -33,11 +33,8 @@ class SparkOptimizer( experimentalMethods: ExperimentalMethods) extends Optimizer(catalog, conf) { - override val aggregateOptimizationsRules: Seq[Rule[LogicalPlan]] = - MetadataOnlyOptimizer(sparkSession, catalog) :: - RewriteDistinctAggregates :: Nil - override def batches: Seq[Batch] = super.batches :+ + Batch("Metadata Only Optimization", Once, MetadataOnlyOptimizer(sparkSession, catalog)) :+ Batch("Extract Python UDF from Aggregate", Once, ExtractPythonUDFFromAggregate) :+ Batch("User Provided Optimizers", fixedPoint, experimentalMethods.extraOptimizations: _*) } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala index bfb53b915da83..001dd07c87638 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala @@ -323,15 +323,6 @@ private[spark] class HiveExternalCatalog(client: HiveClient, hadoopConf: Configu client.getPartition(db, table, spec) } - /** - * Returns partitions filtered by predicates for the given table. - */ - override def getPartitionsByFilter( - catalogTable: CatalogTable, - filters: Seq[Expression] = Nil): Seq[CatalogTablePartition] = withClient { - client.getPartitionsByFilter(catalogTable, filters) - } - /** * Returns the partition names from hive metastore for a given table in a database. 
*/ diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClient.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClient.scala index 4f8b1cc21e090..5f896969188d7 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClient.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClient.scala @@ -183,17 +183,6 @@ private[hive] trait HiveClient { table: CatalogTable, partialSpec: Option[TablePartitionSpec] = None): Seq[CatalogTablePartition] - /** - * Returns the partitions for the given table that match the supplied partition spec. - * If no partition spec is specified, all partitions are returned. - */ - final def getPartitionsByFilter( - db: String, - table: String, - predicates: Seq[Expression]): Seq[CatalogTablePartition] = { - getPartitionsByFilter(getTable(db, table), predicates) - } - /** Returns partitions filtered by predicates for the given table. */ def getPartitionsByFilter( table: CatalogTable, diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala index b408d42fafd22..939c53ec22239 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala @@ -1702,6 +1702,9 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { """.stripMargin) } checkAnswer(sql("select hr from srcpart_15752 where hr = 11 group by hr"), Row(11)) + checkAnswer( + sql("select hr from srcpart_15752 where hr = 12 group by rollup(hr)"), + Row(null) :: Row(12) :: Nil) checkAnswer(sql("select max(hr) from srcpart_15752"), Row(12)) checkAnswer(sql("select max(hr) from srcpart_15752 where hr = 11"), Row(11)) checkAnswer(sql("select max(hr) from (select hr from srcpart_15752) t"), Row(12)) @@ -1722,6 +1725,9 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { } checkAnswer(sql("select hr from srctext_15752 where hr = 11 group by hr"), Row(11)) + checkAnswer( + sql("select hr from srctext_15752 where hr = 12 group by rollup(hr)"), + Row(null) :: Row(12) :: Nil) checkAnswer(sql("select max(hr) from srctext_15752"), Row(12)) checkAnswer(sql("select max(hr) from srctext_15752 where hr = 11"), Row(11)) checkAnswer(sql("select max(hr) from (select hr from srctext_15752) t"), Row(12)) From ae6cf9ff909e4c375d4d0f688603f415edff6dca Mon Sep 17 00:00:00 2001 From: Lianhui Wang Date: Tue, 28 Jun 2016 00:44:48 +0800 Subject: [PATCH 16/45] fix style --- .../apache/spark/sql/catalyst/catalog/ExternalCatalog.scala | 1 - .../apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala | 1 - .../scala/org/apache/spark/sql/execution/SparkOptimizer.scala | 3 --- .../scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala | 1 - 4 files changed, 6 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalog.scala index 2745d5b0dd3b8..6714846e8cbda 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalog.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalog.scala @@ -18,7 +18,6 @@ package org.apache.spark.sql.catalyst.catalog import org.apache.spark.sql.catalyst.analysis.NoSuchDatabaseException -import org.apache.spark.sql.catalyst.expressions.Expression /** diff 
--git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala index 097ab99c64524..fb3e1b3637f21 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala @@ -28,7 +28,6 @@ import org.apache.spark.SparkException import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.{FunctionIdentifier, TableIdentifier} import org.apache.spark.sql.catalyst.analysis._ -import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.catalyst.util.StringUtils /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkOptimizer.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkOptimizer.scala index dd49bf989e770..9a680bdac962e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkOptimizer.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkOptimizer.scala @@ -18,11 +18,8 @@ package org.apache.spark.sql.execution import org.apache.spark.sql.{ExperimentalMethods, SparkSession} -import org.apache.spark.sql.catalyst.analysis.RewriteDistinctAggregates import org.apache.spark.sql.catalyst.catalog.SessionCatalog import org.apache.spark.sql.catalyst.optimizer.Optimizer -import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan -import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.execution.python.ExtractPythonUDFFromAggregate import org.apache.spark.sql.internal.SQLConf diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala index 001dd07c87638..b8bc9ab900ad1 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala @@ -30,7 +30,6 @@ import org.apache.spark.internal.Logging import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.catalog._ -import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.hive.client.HiveClient From 159331b4cc0583b6414152512d84696c16b895fc Mon Sep 17 00:00:00 2001 From: Lianhui Wang Date: Tue, 28 Jun 2016 02:32:13 +0800 Subject: [PATCH 17/45] refactor --- .../sql/execution/MetadataOnlyOptimizer.scala | 105 ++++++++++-------- 1 file changed, 56 insertions(+), 49 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/MetadataOnlyOptimizer.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/MetadataOnlyOptimizer.scala index 4b2e09dd3c901..3162d5f592082 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/MetadataOnlyOptimizer.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/MetadataOnlyOptimizer.scala @@ -115,60 +115,60 @@ case class MetadataOnlyOptimizer( } } - private def convertToMetadataOnlyPlan( + private def convertLogicalToMetadataOnly( parent: LogicalPlan, - projectList: Seq[NamedExpression], filters: Seq[Expression], - relation: LogicalPlan): LogicalPlan = relation match { - case l @ LogicalRelation(files: HadoopFsRelation, _, _) => - val attributeMap = l.output.map(attr => (attr.name, attr)).toMap - val partitionColumns = files.partitionSchema.map { field => - 
attributeMap.getOrElse(field.name, throw new AnalysisException( - s"Unable to resolve ${field.name} given [${l.output.map(_.name).mkString(", ")}]")) - } - val filterColumns = filters.flatMap(_.references) - val projectSet = parent.references ++ AttributeSet(filterColumns) - if (projectSet.subsetOf(AttributeSet(partitionColumns))) { - val selectedPartitions = files.location.listFiles(filters) - val valuesRdd = sparkSession.sparkContext.parallelize(selectedPartitions.map(_.values), 1) - val valuesPlan = LogicalRDD(partitionColumns, valuesRdd)(sparkSession) - parent.transform { - case l @ LogicalRelation(files: HadoopFsRelation, _, _) => - valuesPlan - } - } else { - parent + logical: LogicalPlan, + files: HadoopFsRelation): LogicalPlan = { + val attributeMap = logical.output.map(attr => (attr.name, attr)).toMap + val partitionColumns = files.partitionSchema.map { field => + attributeMap.getOrElse(field.name, throw new AnalysisException( + s"Unable to resolve ${field.name} given [${logical.output.map(_.name).mkString(", ")}]")) + } + val filterColumns = filters.flatMap(_.references) + val projectSet = parent.references ++ AttributeSet(filterColumns) + if (projectSet.subsetOf(AttributeSet(partitionColumns))) { + val selectedPartitions = files.location.listFiles(filters) + val valuesRdd = sparkSession.sparkContext.parallelize(selectedPartitions.map(_.values), 1) + val valuesPlan = LogicalRDD(partitionColumns, valuesRdd)(sparkSession) + parent.transform { + case l @ LogicalRelation(files: HadoopFsRelation, _, _) => + valuesPlan } + } else { + parent + } + } - case relation: CatalogRelation => - val attributeMap = relation.output.map(attr => (attr.name, attr)).toMap - val partitionColumns = relation.catalogTable.partitionColumnNames.map { column => - attributeMap.getOrElse(column, throw new AnalysisException( - s"Unable to resolve ${column} given [${relation.output.map(_.name).mkString(", ")}]")) - } - val filterColumns = filters.flatMap(_.references) - val projectSet = parent.references ++ AttributeSet(filterColumns) - if (projectSet.subsetOf(AttributeSet(partitionColumns))) { - val partitionColumnDataTypes = partitionColumns.map(_.dataType) - val partitionValues = catalog.listPartitions(relation.catalogTable.identifier) - .map { p => - InternalRow.fromSeq( - partitionColumns.map(a => p.spec(a.name)).zip(partitionColumnDataTypes).map { - case (rawValue, dataType) => Cast(Literal(rawValue), dataType).eval(null) - }) - } - val valuesRdd = sparkSession.sparkContext.parallelize(partitionValues, 1) - val valuesPlan = LogicalRDD(partitionColumns, valuesRdd)(sparkSession) - parent.transform { - case relation: CatalogRelation => - valuesPlan + private def convertCatalogToMetadataOnly( + parent: LogicalPlan, + filters: Seq[Expression], + relation: CatalogRelation): LogicalPlan = { + val attributeMap = relation.output.map(attr => (attr.name, attr)).toMap + val partitionColumns = relation.catalogTable.partitionColumnNames.map { column => + attributeMap.getOrElse(column, throw new AnalysisException( + s"Unable to resolve ${column} given [${relation.output.map(_.name).mkString(", ")}]")) + } + val filterColumns = filters.flatMap(_.references) + val projectSet = parent.references ++ AttributeSet(filterColumns) + if (projectSet.subsetOf(AttributeSet(partitionColumns))) { + val partitionColumnDataTypes = partitionColumns.map(_.dataType) + val partitionValues = catalog.listPartitions(relation.catalogTable.identifier) + .map { p => + InternalRow.fromSeq( + partitionColumns.map(a => 
p.spec(a.name)).zip(partitionColumnDataTypes).map { + case (rawValue, dataType) => Cast(Literal(rawValue), dataType).eval(null) + }) } - } else { - parent + val valuesRdd = sparkSession.sparkContext.parallelize(partitionValues, 1) + val valuesPlan = LogicalRDD(partitionColumns, valuesRdd)(sparkSession) + parent.transform { + case relation: CatalogRelation => + valuesPlan } - - case _ => + } else { parent + } } def apply(plan: LogicalPlan): LogicalPlan = { @@ -177,9 +177,16 @@ case class MetadataOnlyOptimizer( } plan.transform { case a @ Aggregate(_, _, child) if canSupportMetadataOnly(a) => - val (plan, projectList, filters, _) = findRelation(child) + val (plan, _, filters, _) = findRelation(child) if (plan.isDefined) { - convertToMetadataOnlyPlan(a, projectList, filters, plan.get) + plan.get match { + case l @ LogicalRelation(files: HadoopFsRelation, _, _) => + convertLogicalToMetadataOnly(a, filters, l, files) + case relation: CatalogRelation => + convertCatalogToMetadataOnly(a, filters, relation) + case _ => + a + } } else { a } From 3a1438ba41c94ed44dbc0dc43c2457509f5e4fcc Mon Sep 17 00:00:00 2001 From: Lianhui Wang Date: Tue, 28 Jun 2016 17:41:36 +0800 Subject: [PATCH 18/45] refactor --- .../sql/catalyst/catalog/interface.scala | 2 +- .../sql/execution/MetadataOnlyOptimizer.scala | 130 +++++++----------- .../sql/hive/execution/SQLQuerySuite.scala | 9 ++ 3 files changed, 63 insertions(+), 78 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala index 6197acab33786..43c5e7b2992b7 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala @@ -231,7 +231,7 @@ object CatalogTypes { * If we can in the future consolidate SimpleCatalogRelation and MetastoreRelation, we should * probably remove this interface. 
*/ -trait CatalogRelation { +trait CatalogRelation extends LogicalPlan { def catalogTable: CatalogTable def output: Seq[Attribute] } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/MetadataOnlyOptimizer.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/MetadataOnlyOptimizer.scala index 3162d5f592082..ba1cfcdc32055 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/MetadataOnlyOptimizer.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/MetadataOnlyOptimizer.scala @@ -71,86 +71,37 @@ case class MetadataOnlyOptimizer( } } - private def collectAliases(fields: Seq[Expression]): Map[ExprId, Expression] = fields.collect { - case a @ Alias(child, _) => a.toAttribute.exprId -> child - }.toMap - - private def substitute(aliases: Map[ExprId, Expression])(expr: Expression): Expression = { - expr.transform { - case a @ Alias(ref: AttributeReference, name) => - aliases.get(ref.exprId) - .map(Alias(_, name)(a.exprId, a.qualifier, isGenerated = a.isGenerated)) - .getOrElse(a) - - case a: AttributeReference => - aliases.get(a.exprId) - .map(Alias(_, a.name)(a.exprId, a.qualifier, isGenerated = a.isGenerated)).getOrElse(a) - } - } - - private def findRelation(plan: LogicalPlan) - : (Option[LogicalPlan], Seq[NamedExpression], Seq[Expression], Map[ExprId, Expression]) = { - plan match { - case relation @ LogicalRelation(files: HadoopFsRelation, _, table) - if files.partitionSchema.nonEmpty => - (Some(relation), Seq.empty[NamedExpression], Seq.empty[Expression], Map.empty) - - case relation: CatalogRelation if relation.catalogTable.partitionColumnNames.nonEmpty => - (Some(relation), Seq.empty[NamedExpression], Seq.empty[Expression], Map.empty) - - case p @ Project(fields, child) if fields.forall(_.deterministic) => - val (plan, _, filters, aliases) = findRelation(child) - val substitutedFields = fields.map(substitute(aliases)).asInstanceOf[Seq[NamedExpression]] - (plan, substitutedFields, filters, collectAliases(substitutedFields)) - - case f @ Filter(condition, child) if condition.deterministic => - val (plan, fields, filters, aliases) = findRelation(child) - val substitutedCondition = substitute(aliases)(condition) - (plan, fields, filters ++ Seq(substitutedCondition), aliases) - - case e @ Expand(_, _, child) => - findRelation(child) - - case _ => (None, Seq.empty[NamedExpression], Seq.empty[Expression], Map.empty) - } - } - private def convertLogicalToMetadataOnly( - parent: LogicalPlan, - filters: Seq[Expression], - logical: LogicalPlan, + project: LogicalPlan, + filter: Option[Expression], + logical: LogicalRelation, files: HadoopFsRelation): LogicalPlan = { val attributeMap = logical.output.map(attr => (attr.name, attr)).toMap val partitionColumns = files.partitionSchema.map { field => attributeMap.getOrElse(field.name, throw new AnalysisException( s"Unable to resolve ${field.name} given [${logical.output.map(_.name).mkString(", ")}]")) } - val filterColumns = filters.flatMap(_.references) - val projectSet = parent.references ++ AttributeSet(filterColumns) + val projectSet = filter.map(project.references ++ _.references).getOrElse(project.references) if (projectSet.subsetOf(AttributeSet(partitionColumns))) { - val selectedPartitions = files.location.listFiles(filters) + val selectedPartitions = files.location.listFiles(filter.map(Seq(_)).getOrElse(Seq.empty)) val valuesRdd = sparkSession.sparkContext.parallelize(selectedPartitions.map(_.values), 1) val valuesPlan = LogicalRDD(partitionColumns, valuesRdd)(sparkSession) - parent.transform 
{ - case l @ LogicalRelation(files: HadoopFsRelation, _, _) => - valuesPlan - } + valuesPlan } else { - parent + logical } } private def convertCatalogToMetadataOnly( - parent: LogicalPlan, - filters: Seq[Expression], + project: LogicalPlan, + filter: Option[Expression], relation: CatalogRelation): LogicalPlan = { val attributeMap = relation.output.map(attr => (attr.name, attr)).toMap val partitionColumns = relation.catalogTable.partitionColumnNames.map { column => attributeMap.getOrElse(column, throw new AnalysisException( s"Unable to resolve ${column} given [${relation.output.map(_.name).mkString(", ")}]")) } - val filterColumns = filters.flatMap(_.references) - val projectSet = parent.references ++ AttributeSet(filterColumns) + val projectSet = filter.map(project.references ++ _.references).getOrElse(project.references) if (projectSet.subsetOf(AttributeSet(partitionColumns))) { val partitionColumnDataTypes = partitionColumns.map(_.dataType) val partitionValues = catalog.listPartitions(relation.catalogTable.identifier) @@ -162,34 +113,59 @@ case class MetadataOnlyOptimizer( } val valuesRdd = sparkSession.sparkContext.parallelize(partitionValues, 1) val valuesPlan = LogicalRDD(partitionColumns, valuesRdd)(sparkSession) - parent.transform { - case relation: CatalogRelation => - valuesPlan - } + valuesPlan } else { - parent + relation } } + private def convertToMetadataOnly(plan: LogicalPlan): LogicalPlan = plan match { + case p @ Project(fields, child) => + child match { + case f @ Filter(condition, l @ LogicalRelation(files: HadoopFsRelation, _, _)) + if files.partitionSchema.nonEmpty => + val plan = convertLogicalToMetadataOnly(p, Some(condition), l, files) + p.withNewChildren(f.withNewChildren(plan :: Nil) :: Nil) + + case l @ LogicalRelation(files: HadoopFsRelation, _, _) + if files.partitionSchema.nonEmpty => + val plan = convertLogicalToMetadataOnly(p, None, l, files) + p.withNewChildren(plan :: Nil) + + case f @ Filter(condition, relation: CatalogRelation) + if relation.catalogTable.partitionColumnNames.nonEmpty => + val plan = convertCatalogToMetadataOnly(p, Some(condition), relation) + p.withNewChildren(f.withNewChildren(plan :: Nil) :: Nil) + + case relation: CatalogRelation + if relation.catalogTable.partitionColumnNames.nonEmpty => + val plan = convertCatalogToMetadataOnly(p, None, relation) + p.withNewChildren(plan :: Nil) + + case other => + p.withNewChildren(p.children.map(convertToMetadataOnly(_))) + } + + case f : Filter => + f.withNewChildren(f.children.map(convertToMetadataOnly(_))) + + case e: Expand => + e.withNewChildren(e.children.map(convertToMetadataOnly(_))) + + case u: Union => + u.withNewChildren(u.children.map(convertToMetadataOnly(_))) + + case other: LogicalPlan => + other + } + def apply(plan: LogicalPlan): LogicalPlan = { if (!sparkSession.sessionState.conf.optimizerMetadataOnly) { return plan } plan.transform { case a @ Aggregate(_, _, child) if canSupportMetadataOnly(a) => - val (plan, _, filters, _) = findRelation(child) - if (plan.isDefined) { - plan.get match { - case l @ LogicalRelation(files: HadoopFsRelation, _, _) => - convertLogicalToMetadataOnly(a, filters, l, files) - case relation: CatalogRelation => - convertCatalogToMetadataOnly(a, filters, relation) - case _ => - a - } - } else { - a - } + a.withNewChildren(convertToMetadataOnly(child) :: Nil) } } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala index 
939c53ec22239..f17d0ce55d805 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala @@ -1701,10 +1701,15 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { |select key, value from data_15752 """.stripMargin) } + checkAnswer(sql("select hr from srcpart_15752 where hr = 11 group by hr"), Row(11)) checkAnswer( sql("select hr from srcpart_15752 where hr = 12 group by rollup(hr)"), Row(null) :: Row(12) :: Nil) + checkAnswer( + sql("select hr from (select hr from srcpart_15752 where hr = 11 union all " + + "select hr from srcpart_15752 where hr= 12)t group by hr"), + Row(11) :: Row(12) :: Nil) checkAnswer(sql("select max(hr) from srcpart_15752"), Row(12)) checkAnswer(sql("select max(hr) from srcpart_15752 where hr = 11"), Row(11)) checkAnswer(sql("select max(hr) from (select hr from srcpart_15752) t"), Row(12)) @@ -1728,6 +1733,10 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { checkAnswer( sql("select hr from srctext_15752 where hr = 12 group by rollup(hr)"), Row(null) :: Row(12) :: Nil) + checkAnswer( + sql("select hr from (select hr from srctext_15752 where hr = 11 union all " + + "select hr from srctext_15752 where hr= 12)t group by hr"), + Row(11) :: Row(12) :: Nil) checkAnswer(sql("select max(hr) from srctext_15752"), Row(12)) checkAnswer(sql("select max(hr) from srctext_15752 where hr = 11"), Row(11)) checkAnswer(sql("select max(hr) from (select hr from srctext_15752) t"), Row(12)) From c0a7d59b014f55b348aa5ed5b089ee87936d2413 Mon Sep 17 00:00:00 2001 From: Lianhui Wang Date: Wed, 29 Jun 2016 19:54:45 +0800 Subject: [PATCH 19/45] update --- .../sql/catalyst/analysis/Analyzer.scala | 5 +- .../expressions/namedExpressions.scala | 39 ++++-- .../sql/execution/MetadataOnlyOptimizer.scala | 123 ++++++++---------- .../datasources/LogicalRelation.scala | 5 +- .../datasources/fileSourceInterfaces.scala | 2 + .../apache/spark/sql/sources/interfaces.scala | 1 + .../spark/sql/hive/MetastoreRelation.scala | 8 +- .../sql/hive/execution/SQLQuerySuite.scala | 28 ++-- 8 files changed, 108 insertions(+), 103 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index 96f2e38946f1c..044121ea45a5b 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -292,14 +292,15 @@ class Analyzer( // Ensure all the expressions have been resolved. case x: GroupingSets if x.expressions.forall(_.resolved) => - val gid = AttributeReference(VirtualColumn.groupingIdName, IntegerType, false)() + val gid = AttributeReference(VirtualColumn.groupingIdName, IntegerType, false)( + isPartitionColumn = true) // Expand works by setting grouping expressions to null as determined by the bitmasks. To // prevent these null values from being used in an aggregate instead of the original value // we need to create new aliases for all group by expressions that will only be used for // the intended purpose. 
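          // Editor's sketch, not lines from the patch: the point of threading isPartitionColumn
          // through the grouping id and the aliases below is that a grouping-sets query such as
          //   SELECT hr FROM srcpart_15752 GROUP BY rollup(hr)
          // is rewritten into an Expand whose output consists of Aliases over `hr`; unless each
          // alias keeps the flag, the Aggregate above no longer looks like a scan of partition
          // columns only, and the metadata-only rewrite would be skipped.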
val groupByAliases: Seq[Alias] = x.groupByExprs.map { - case e: NamedExpression => Alias(e, e.name)() + case e: NamedExpression => Alias(e, e.name)(isPartitionColumn = e.isPartitionColumn) case other => Alias(other, other.toString)() } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala index 306a99d5a37bf..ba7237dd18250 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala @@ -83,6 +83,9 @@ trait NamedExpression extends Expression { /** Returns true if the expression is generated by Catalyst */ def isGenerated: java.lang.Boolean = false + /** Returns true if the expression is a partition column */ + def isPartitionColumn: java.lang.Boolean = false + /** Returns a copy of this expression with a new `exprId`. */ def newInstance(): NamedExpression @@ -127,12 +130,14 @@ abstract class Attribute extends LeafExpression with NamedExpression with NullIn * tableName and subQueryAlias are possible qualifiers. * @param explicitMetadata Explicit metadata associated with this alias that overwrites child's. * @param isGenerated A flag to indicate if this alias is generated by Catalyst + * @param isPartitionColumn A flag to indicate if this alias is a partition column */ case class Alias(child: Expression, name: String)( val exprId: ExprId = NamedExpression.newExprId, val qualifier: Option[String] = None, val explicitMetadata: Option[Metadata] = None, - override val isGenerated: java.lang.Boolean = false) + override val isGenerated: java.lang.Boolean = false, + override val isPartitionColumn: java.lang.Boolean = false) extends UnaryExpression with NamedExpression { // Alias(Generator, xx) need to be transformed into Generate(generator, ...) @@ -158,12 +163,13 @@ case class Alias(child: Expression, name: String)( def newInstance(): NamedExpression = Alias(child, name)( - qualifier = qualifier, explicitMetadata = explicitMetadata, isGenerated = isGenerated) + qualifier = qualifier, explicitMetadata = explicitMetadata, isGenerated = isGenerated, + isPartitionColumn = isPartitionColumn) override def toAttribute: Attribute = { if (resolved) { AttributeReference(name, child.dataType, child.nullable, metadata)( - exprId, qualifier, isGenerated) + exprId, qualifier, isGenerated, isPartitionColumn) } else { UnresolvedAttribute(name) } @@ -172,7 +178,7 @@ case class Alias(child: Expression, name: String)( override def toString: String = s"$child AS $name#${exprId.id}$typeSuffix" override protected final def otherCopyArgs: Seq[AnyRef] = { - exprId :: qualifier :: explicitMetadata :: isGenerated :: Nil + exprId :: qualifier :: explicitMetadata :: isGenerated :: isPartitionColumn :: Nil } override def hashCode(): Int = { @@ -206,6 +212,7 @@ case class Alias(child: Expression, name: String)( * qualified way. Consider the examples tableName.name, subQueryAlias.name. * tableName and subQueryAlias are possible qualifiers. 
* @param isGenerated A flag to indicate if this reference is generated by Catalyst + * @param isPartitionColumn A flag to indicate if this reference is a partition column */ case class AttributeReference( name: String, @@ -214,7 +221,8 @@ case class AttributeReference( override val metadata: Metadata = Metadata.empty)( val exprId: ExprId = NamedExpression.newExprId, val qualifier: Option[String] = None, - override val isGenerated: java.lang.Boolean = false) + override val isGenerated: java.lang.Boolean = false, + override val isPartitionColumn: java.lang.Boolean = false) extends Attribute with Unevaluable { /** @@ -252,7 +260,7 @@ case class AttributeReference( override def newInstance(): AttributeReference = AttributeReference(name, dataType, nullable, metadata)( - qualifier = qualifier, isGenerated = isGenerated) + qualifier = qualifier, isGenerated = isGenerated, isPartitionColumn = isPartitionColumn) /** * Returns a copy of this [[AttributeReference]] with changed nullability. @@ -261,7 +269,8 @@ case class AttributeReference( if (nullable == newNullability) { this } else { - AttributeReference(name, dataType, newNullability, metadata)(exprId, qualifier, isGenerated) + AttributeReference(name, dataType, newNullability, metadata)( + exprId, qualifier, isGenerated, isPartitionColumn) } } @@ -269,7 +278,8 @@ case class AttributeReference( if (name == newName) { this } else { - AttributeReference(newName, dataType, nullable, metadata)(exprId, qualifier, isGenerated) + AttributeReference(newName, dataType, nullable, metadata)( + exprId, qualifier, isGenerated, isPartitionColumn) } } @@ -280,7 +290,8 @@ case class AttributeReference( if (newQualifier == qualifier) { this } else { - AttributeReference(name, dataType, nullable, metadata)(exprId, newQualifier, isGenerated) + AttributeReference(name, dataType, nullable, metadata)( + exprId, newQualifier, isGenerated, isPartitionColumn) } } @@ -288,12 +299,18 @@ case class AttributeReference( if (exprId == newExprId) { this } else { - AttributeReference(name, dataType, nullable, metadata)(newExprId, qualifier, isGenerated) + AttributeReference(name, dataType, nullable, metadata)( + newExprId, qualifier, isGenerated, isPartitionColumn) } } + def setPartitionColumn(isPartitionColumn: Boolean): AttributeReference = { + AttributeReference(name, dataType, nullable, metadata)( + exprId, qualifier, isGenerated, isPartitionColumn) + } + override protected final def otherCopyArgs: Seq[AnyRef] = { - exprId :: qualifier :: isGenerated :: Nil + exprId :: qualifier :: isGenerated :: isPartitionColumn :: Nil } override def toString: String = s"$name#${exprId.id}$typeSuffix" diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/MetadataOnlyOptimizer.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/MetadataOnlyOptimizer.scala index ba1cfcdc32055..9709a9871d514 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/MetadataOnlyOptimizer.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/MetadataOnlyOptimizer.scala @@ -28,11 +28,10 @@ import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, LogicalRela /** * When scanning only partition columns, get results based on metadata without scanning files. - * It is used for distinct, distinct aggregations or distinct-like aggregations(example: Max/Min). - * First of all, scanning only partition columns are required, then the rule does the following - * things here: + * It's used for operators that only need distinct values. 
Currently only [[Aggregate]] operator + * which satisfy the following conditions are supported: * 1. aggregate expression is partition columns, - * e.g. SELECT col FROM tbl GROUP BY col or SELECT col FROM tbl GROUP BY cube(col). + * e.g. SELECT col FROM tbl GROUP BY col, SELECT col FROM tbl GROUP BY cube(col). * 2. aggregate function on partition columns with DISTINCT, * e.g. SELECT count(DISTINCT col) FROM tbl GROUP BY col. * 3. aggregate function on partition columns which have same result with DISTINCT keyword. @@ -43,28 +42,33 @@ case class MetadataOnlyOptimizer( catalog: SessionCatalog) extends Rule[LogicalPlan] { private def canSupportMetadataOnly(a: Aggregate): Boolean = { - val aggregateExpressions = a.aggregateExpressions.flatMap { expr => - expr.collect { - case agg: AggregateExpression => agg - } - }.distinct - if (aggregateExpressions.isEmpty) { - // Support for aggregate that has no aggregateFunction when expressions are partition columns - // example: select partitionCol from table group by partitionCol. - // Moreover, multiple-distinct has been rewritted into it by RewriteDistinctAggregates. - true + if (!a.references.forall(_.isPartitionColumn)) { + // Support for scanning only partition columns + false } else { - aggregateExpressions.forall { agg => - if (agg.isDistinct) { - true - } else { - // If function can be evaluated on just the distinct values of a column, it can be used - // by metadata-only optimizer. - agg.aggregateFunction match { - case max: Max => true - case min: Min => true - case hyperLog: HyperLogLogPlusPlus => true - case _ => false + val aggregateExpressions = a.aggregateExpressions.flatMap { expr => + expr.collect { + case agg: AggregateExpression => agg + } + }.distinct + if (aggregateExpressions.isEmpty) { + // Support for aggregate that has no aggregateFunction when expressions are partition + // columns. example: select partitionCol from table group by partitionCol. + // Moreover, multiple-distinct has been rewritted into it by RewriteDistinctAggregates. + true + } else { + aggregateExpressions.forall { agg => + if (agg.isDistinct) { + true + } else { + // If function can be evaluated on just the distinct values of a column, it can be used + // by metadata-only optimizer. 
+ agg.aggregateFunction match { + case max: Max => true + case min: Min => true + case hyperLog: HyperLogLogPlusPlus => true + case _ => false + } } } } @@ -72,7 +76,6 @@ case class MetadataOnlyOptimizer( } private def convertLogicalToMetadataOnly( - project: LogicalPlan, filter: Option[Expression], logical: LogicalRelation, files: HadoopFsRelation): LogicalPlan = { @@ -81,80 +84,62 @@ case class MetadataOnlyOptimizer( attributeMap.getOrElse(field.name, throw new AnalysisException( s"Unable to resolve ${field.name} given [${logical.output.map(_.name).mkString(", ")}]")) } - val projectSet = filter.map(project.references ++ _.references).getOrElse(project.references) - if (projectSet.subsetOf(AttributeSet(partitionColumns))) { - val selectedPartitions = files.location.listFiles(filter.map(Seq(_)).getOrElse(Seq.empty)) - val valuesRdd = sparkSession.sparkContext.parallelize(selectedPartitions.map(_.values), 1) - val valuesPlan = LogicalRDD(partitionColumns, valuesRdd)(sparkSession) - valuesPlan - } else { - logical - } + val selectedPartitions = files.location.listFiles(filter.map(Seq(_)).getOrElse(Seq.empty)) + val valuesRdd = sparkSession.sparkContext.parallelize(selectedPartitions.map(_.values), 1) + val valuesPlan = LogicalRDD(partitionColumns, valuesRdd)(sparkSession) + valuesPlan } - private def convertCatalogToMetadataOnly( - project: LogicalPlan, - filter: Option[Expression], - relation: CatalogRelation): LogicalPlan = { + private def convertCatalogToMetadataOnly(relation: CatalogRelation): LogicalPlan = { val attributeMap = relation.output.map(attr => (attr.name, attr)).toMap val partitionColumns = relation.catalogTable.partitionColumnNames.map { column => attributeMap.getOrElse(column, throw new AnalysisException( s"Unable to resolve ${column} given [${relation.output.map(_.name).mkString(", ")}]")) } - val projectSet = filter.map(project.references ++ _.references).getOrElse(project.references) - if (projectSet.subsetOf(AttributeSet(partitionColumns))) { - val partitionColumnDataTypes = partitionColumns.map(_.dataType) - val partitionValues = catalog.listPartitions(relation.catalogTable.identifier) - .map { p => - InternalRow.fromSeq( - partitionColumns.map(a => p.spec(a.name)).zip(partitionColumnDataTypes).map { - case (rawValue, dataType) => Cast(Literal(rawValue), dataType).eval(null) - }) - } - val valuesRdd = sparkSession.sparkContext.parallelize(partitionValues, 1) - val valuesPlan = LogicalRDD(partitionColumns, valuesRdd)(sparkSession) - valuesPlan - } else { - relation - } + val partitionColumnDataTypes = partitionColumns.map(_.dataType) + val partitionValues = catalog.listPartitions(relation.catalogTable.identifier) + .map { p => + InternalRow.fromSeq( + partitionColumns.map(a => p.spec(a.name)).zip(partitionColumnDataTypes).map { + case (rawValue, dataType) => Cast(Literal(rawValue), dataType).eval(null) + }) + } + val valuesRdd = sparkSession.sparkContext.parallelize(partitionValues, 1) + val valuesPlan = LogicalRDD(partitionColumns, valuesRdd)(sparkSession) + valuesPlan } private def convertToMetadataOnly(plan: LogicalPlan): LogicalPlan = plan match { - case p @ Project(fields, child) => + case p @ Project(fields, child) if p.references.forall(_.isPartitionColumn) => child match { case f @ Filter(condition, l @ LogicalRelation(files: HadoopFsRelation, _, _)) - if files.partitionSchema.nonEmpty => - val plan = convertLogicalToMetadataOnly(p, Some(condition), l, files) + if files.partitionSchema.nonEmpty && f.references.forall(_.isPartitionColumn) => + val plan = 
convertLogicalToMetadataOnly(Some(condition), l, files) p.withNewChildren(f.withNewChildren(plan :: Nil) :: Nil) case l @ LogicalRelation(files: HadoopFsRelation, _, _) if files.partitionSchema.nonEmpty => - val plan = convertLogicalToMetadataOnly(p, None, l, files) + val plan = convertLogicalToMetadataOnly(None, l, files) p.withNewChildren(plan :: Nil) case f @ Filter(condition, relation: CatalogRelation) - if relation.catalogTable.partitionColumnNames.nonEmpty => - val plan = convertCatalogToMetadataOnly(p, Some(condition), relation) + if relation.catalogTable.partitionColumnNames.nonEmpty && + f.references.forall(_.isPartitionColumn) => + val plan = convertCatalogToMetadataOnly(relation) p.withNewChildren(f.withNewChildren(plan :: Nil) :: Nil) case relation: CatalogRelation if relation.catalogTable.partitionColumnNames.nonEmpty => - val plan = convertCatalogToMetadataOnly(p, None, relation) + val plan = convertCatalogToMetadataOnly(relation) p.withNewChildren(plan :: Nil) case other => - p.withNewChildren(p.children.map(convertToMetadataOnly(_))) + other } - case f : Filter => - f.withNewChildren(f.children.map(convertToMetadataOnly(_))) - case e: Expand => e.withNewChildren(e.children.map(convertToMetadataOnly(_))) - case u: Union => - u.withNewChildren(u.children.map(convertToMetadataOnly(_))) - case other: LogicalPlan => other } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/LogicalRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/LogicalRelation.scala index 39c8606fd14b5..0fa3e440a140e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/LogicalRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/LogicalRelation.scala @@ -37,7 +37,10 @@ case class LogicalRelation( extends LeafNode with MultiInstanceRelation { override val output: Seq[AttributeReference] = { - val attrs = relation.schema.toAttributes + val attrs = relation.schema.toAttributes.map { attr => + attr.setPartitionColumn( + relation.partitionColumnNames.contains(attr.name.toLowerCase)) + } expectedOutputAttributes.map { expectedAttrs => assert(expectedAttrs.length == attrs.length) attrs.zip(expectedAttrs).map { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/fileSourceInterfaces.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/fileSourceInterfaces.scala index 06adaf71128aa..339512aa2b9b4 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/fileSourceInterfaces.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/fileSourceInterfaces.scala @@ -151,6 +151,8 @@ case class HadoopFsRelation( }) } + override def partitionColumnNames: Set[String] = partitionSchema.map(_.name.toLowerCase).toSet + def partitionSchemaOption: Option[StructType] = if (partitionSchema.isEmpty) None else Some(partitionSchema) def partitionSpec: PartitionSpec = location.partitionSpec() diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala index d2077a07f440a..53d24051eb271 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala @@ -185,6 +185,7 @@ trait CreatableRelationProvider { abstract class BaseRelation { def sqlContext: SQLContext def schema: StructType + def partitionColumnNames: Set[String] = Set.empty[String] 
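  // Editor's note, not part of the patch: HadoopFsRelation overrides this (see the hunk above)
  // to expose its partition schema in lower case, so a table partitioned by `ds` and `hr`
  // (illustrative names) reports partitionColumnNames == Set("ds", "hr"). Sources that keep
  // this empty default never have their attributes flagged as partition columns and are
  // therefore never rewritten into a metadata-only scan.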
/** * Returns an estimated size of this relation in bytes. This information is used by the planner diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/MetastoreRelation.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/MetastoreRelation.scala index 58bca2059cacc..4ceb0e1933e96 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/MetastoreRelation.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/MetastoreRelation.scala @@ -203,22 +203,22 @@ private[hive] case class MetastoreRelation( ) implicit class SchemaAttribute(f: CatalogColumn) { - def toAttribute: AttributeReference = AttributeReference( + def toAttribute(isPartitionColumn: Boolean): AttributeReference = AttributeReference( f.name, CatalystSqlParser.parseDataType(f.dataType), // Since data can be dumped in randomly with no validation, everything is nullable. nullable = true - )(qualifier = Some(alias.getOrElse(tableName))) + )(qualifier = Some(alias.getOrElse(tableName)), isPartitionColumn = isPartitionColumn) } /** PartitionKey attributes */ - val partitionKeys = catalogTable.partitionColumns.map(_.toAttribute) + val partitionKeys = catalogTable.partitionColumns.map(_.toAttribute(true)) /** Non-partitionKey attributes */ // TODO: just make this hold the schema itself, not just non-partition columns val attributes = catalogTable.schema .filter { c => !catalogTable.partitionColumnNames.contains(c.name) } - .map(_.toAttribute) + .map(_.toAttribute(false)) val output = attributes ++ partitionKeys diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala index f17d0ce55d805..dc7a6867c3cd0 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala @@ -1701,21 +1701,28 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { |select key, value from data_15752 """.stripMargin) } - checkAnswer(sql("select hr from srcpart_15752 where hr = 11 group by hr"), Row(11)) checkAnswer( sql("select hr from srcpart_15752 where hr = 12 group by rollup(hr)"), Row(null) :: Row(12) :: Nil) - checkAnswer( - sql("select hr from (select hr from srcpart_15752 where hr = 11 union all " + - "select hr from srcpart_15752 where hr= 12)t group by hr"), - Row(11) :: Row(12) :: Nil) checkAnswer(sql("select max(hr) from srcpart_15752"), Row(12)) checkAnswer(sql("select max(hr) from srcpart_15752 where hr = 11"), Row(11)) checkAnswer(sql("select max(hr) from (select hr from srcpart_15752) t"), Row(12)) checkAnswer(sql("select distinct hr from srcpart_15752"), Row(11) :: Row(12) :: Nil) checkAnswer(sql("select distinct hr from srcpart_15752 where hr = 11"), Row(11)) + // Now donot support metadata only optimizer + checkAnswer( + sql("select hr from (select hr from srcpart_15752 where hr = 11 union all " + + "select hr from srcpart_15752 where hr= 12)t group by hr"), + Row(11) :: Row(12) :: Nil) + checkAnswer( + sql("select max(x) from (select hr + 1 as x from srcpart_15752 where hr = 12) t"), + Row(13)) + checkAnswer( + sql("select distinct x from (select hr + 1 as x from srcpart_15752 where hr = 12) t"), + Row(13)) + sql( """ |CREATE TABLE srctext_15752 (key INT, value STRING) @@ -1728,26 +1735,15 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { |select key, value from data_15752 """.stripMargin) } - checkAnswer(sql("select hr from 
srctext_15752 where hr = 11 group by hr"), Row(11)) checkAnswer( sql("select hr from srctext_15752 where hr = 12 group by rollup(hr)"), Row(null) :: Row(12) :: Nil) - checkAnswer( - sql("select hr from (select hr from srctext_15752 where hr = 11 union all " + - "select hr from srctext_15752 where hr= 12)t group by hr"), - Row(11) :: Row(12) :: Nil) checkAnswer(sql("select max(hr) from srctext_15752"), Row(12)) checkAnswer(sql("select max(hr) from srctext_15752 where hr = 11"), Row(11)) checkAnswer(sql("select max(hr) from (select hr from srctext_15752) t"), Row(12)) - checkAnswer( - sql("select max(x) from (select hr + 1 as x from srctext_15752 where hr = 12) t"), - Row(13)) checkAnswer(sql("select distinct hr from srctext_15752"), Row(11) :: Row(12) :: Nil) checkAnswer(sql("select distinct hr from srctext_15752 where hr = 11"), Row(11)) - checkAnswer( - sql("select distinct x from (select hr + 1 as x from srctext_15752 where hr = 12) t"), - Row(13)) } } } From a4045ca3e492d76ae6f7d919cd1130bc9d1a52c5 Mon Sep 17 00:00:00 2001 From: Lianhui Wang Date: Wed, 29 Jun 2016 20:05:47 +0800 Subject: [PATCH 20/45] add comments --- .../apache/spark/sql/execution/MetadataOnlyOptimizer.scala | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/MetadataOnlyOptimizer.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/MetadataOnlyOptimizer.scala index 9709a9871d514..9fcabccfa463d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/MetadataOnlyOptimizer.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/MetadataOnlyOptimizer.scala @@ -109,6 +109,11 @@ case class MetadataOnlyOptimizer( valuesPlan } + /** + * When scanning only partition columns, convert LogicalRelation or CatalogRelation to LogicalRDD. + * Now support logical plan: + * Aggregate [Expand] Project [Filter] (LogicalRelation | CatalogRelation) + */ private def convertToMetadataOnly(plan: LogicalPlan): LogicalPlan = plan match { case p @ Project(fields, child) if p.references.forall(_.isPartitionColumn) => child match { From 0a023e7ce7a20af0b25d320e7fd4455f9e1b3029 Mon Sep 17 00:00:00 2001 From: Lianhui Wang Date: Wed, 29 Jun 2016 20:17:31 +0800 Subject: [PATCH 21/45] fix minor --- .../scala/org/apache/spark/sql/catalyst/catalog/interface.scala | 2 +- .../org/apache/spark/sql/execution/MetadataOnlyOptimizer.scala | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala index 43c5e7b2992b7..6197acab33786 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala @@ -231,7 +231,7 @@ object CatalogTypes { * If we can in the future consolidate SimpleCatalogRelation and MetastoreRelation, we should * probably remove this interface. 
*/ -trait CatalogRelation extends LogicalPlan { +trait CatalogRelation { def catalogTable: CatalogTable def output: Seq[Attribute] } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/MetadataOnlyOptimizer.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/MetadataOnlyOptimizer.scala index 9fcabccfa463d..d37bfa923c4cf 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/MetadataOnlyOptimizer.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/MetadataOnlyOptimizer.scala @@ -145,7 +145,7 @@ case class MetadataOnlyOptimizer( case e: Expand => e.withNewChildren(e.children.map(convertToMetadataOnly(_))) - case other: LogicalPlan => + case other => other } From a9b38abf0972bd956142278379e2e2e546c2b1a1 Mon Sep 17 00:00:00 2001 From: Lianhui Wang Date: Wed, 29 Jun 2016 22:47:05 +0800 Subject: [PATCH 22/45] rename --- .../sql/catalyst/analysis/Analyzer.scala | 4 +-- .../expressions/namedExpressions.scala | 34 +++++++++---------- .../sql/execution/MetadataOnlyOptimizer.scala | 8 ++--- .../datasources/LogicalRelation.scala | 2 +- .../spark/sql/hive/MetastoreRelation.scala | 4 +-- 5 files changed, 26 insertions(+), 26 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index 044121ea45a5b..65457a4fa58d4 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -293,14 +293,14 @@ class Analyzer( // Ensure all the expressions have been resolved. case x: GroupingSets if x.expressions.forall(_.resolved) => val gid = AttributeReference(VirtualColumn.groupingIdName, IntegerType, false)( - isPartitionColumn = true) + isMetadataColumn = true) // Expand works by setting grouping expressions to null as determined by the bitmasks. To // prevent these null values from being used in an aggregate instead of the original value // we need to create new aliases for all group by expressions that will only be used for // the intended purpose. val groupByAliases: Seq[Alias] = x.groupByExprs.map { - case e: NamedExpression => Alias(e, e.name)(isPartitionColumn = e.isPartitionColumn) + case e: NamedExpression => Alias(e, e.name)(isMetadataColumn = e.isMetadataColumn) case other => Alias(other, other.toString)() } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala index ba7237dd18250..2816383e9a42d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala @@ -83,8 +83,8 @@ trait NamedExpression extends Expression { /** Returns true if the expression is generated by Catalyst */ def isGenerated: java.lang.Boolean = false - /** Returns true if the expression is a partition column */ - def isPartitionColumn: java.lang.Boolean = false + /** Returns true if the expression is a metadata column */ + def isMetadataColumn: java.lang.Boolean = false /** Returns a copy of this expression with a new `exprId`. */ def newInstance(): NamedExpression @@ -130,14 +130,14 @@ abstract class Attribute extends LeafExpression with NamedExpression with NullIn * tableName and subQueryAlias are possible qualifiers. 
* @param explicitMetadata Explicit metadata associated with this alias that overwrites child's. * @param isGenerated A flag to indicate if this alias is generated by Catalyst - * @param isPartitionColumn A flag to indicate if this alias is a partition column + * @param isMetadataColumn A flag to indicate if this alias is a metadata column */ case class Alias(child: Expression, name: String)( val exprId: ExprId = NamedExpression.newExprId, val qualifier: Option[String] = None, val explicitMetadata: Option[Metadata] = None, override val isGenerated: java.lang.Boolean = false, - override val isPartitionColumn: java.lang.Boolean = false) + override val isMetadataColumn: java.lang.Boolean = false) extends UnaryExpression with NamedExpression { // Alias(Generator, xx) need to be transformed into Generate(generator, ...) @@ -164,12 +164,12 @@ case class Alias(child: Expression, name: String)( def newInstance(): NamedExpression = Alias(child, name)( qualifier = qualifier, explicitMetadata = explicitMetadata, isGenerated = isGenerated, - isPartitionColumn = isPartitionColumn) + isMetadataColumn = isMetadataColumn) override def toAttribute: Attribute = { if (resolved) { AttributeReference(name, child.dataType, child.nullable, metadata)( - exprId, qualifier, isGenerated, isPartitionColumn) + exprId, qualifier, isGenerated, isMetadataColumn) } else { UnresolvedAttribute(name) } @@ -178,7 +178,7 @@ case class Alias(child: Expression, name: String)( override def toString: String = s"$child AS $name#${exprId.id}$typeSuffix" override protected final def otherCopyArgs: Seq[AnyRef] = { - exprId :: qualifier :: explicitMetadata :: isGenerated :: isPartitionColumn :: Nil + exprId :: qualifier :: explicitMetadata :: isGenerated :: isMetadataColumn :: Nil } override def hashCode(): Int = { @@ -212,7 +212,7 @@ case class Alias(child: Expression, name: String)( * qualified way. Consider the examples tableName.name, subQueryAlias.name. * tableName and subQueryAlias are possible qualifiers. * @param isGenerated A flag to indicate if this reference is generated by Catalyst - * @param isPartitionColumn A flag to indicate if this reference is a partition column + * @param isMetadataColumn A flag to indicate if this reference is a metadata column */ case class AttributeReference( name: String, @@ -222,7 +222,7 @@ case class AttributeReference( val exprId: ExprId = NamedExpression.newExprId, val qualifier: Option[String] = None, override val isGenerated: java.lang.Boolean = false, - override val isPartitionColumn: java.lang.Boolean = false) + override val isMetadataColumn: java.lang.Boolean = false) extends Attribute with Unevaluable { /** @@ -260,7 +260,7 @@ case class AttributeReference( override def newInstance(): AttributeReference = AttributeReference(name, dataType, nullable, metadata)( - qualifier = qualifier, isGenerated = isGenerated, isPartitionColumn = isPartitionColumn) + qualifier = qualifier, isGenerated = isGenerated, isMetadataColumn = isMetadataColumn) /** * Returns a copy of this [[AttributeReference]] with changed nullability. 
@@ -270,7 +270,7 @@ case class AttributeReference( this } else { AttributeReference(name, dataType, newNullability, metadata)( - exprId, qualifier, isGenerated, isPartitionColumn) + exprId, qualifier, isGenerated, isMetadataColumn) } } @@ -279,7 +279,7 @@ case class AttributeReference( this } else { AttributeReference(newName, dataType, nullable, metadata)( - exprId, qualifier, isGenerated, isPartitionColumn) + exprId, qualifier, isGenerated, isMetadataColumn) } } @@ -291,7 +291,7 @@ case class AttributeReference( this } else { AttributeReference(name, dataType, nullable, metadata)( - exprId, newQualifier, isGenerated, isPartitionColumn) + exprId, newQualifier, isGenerated, isMetadataColumn) } } @@ -300,17 +300,17 @@ case class AttributeReference( this } else { AttributeReference(name, dataType, nullable, metadata)( - newExprId, qualifier, isGenerated, isPartitionColumn) + newExprId, qualifier, isGenerated, isMetadataColumn) } } - def setPartitionColumn(isPartitionColumn: Boolean): AttributeReference = { + def withMetadataColumn(isMetadataColumn: Boolean): AttributeReference = { AttributeReference(name, dataType, nullable, metadata)( - exprId, qualifier, isGenerated, isPartitionColumn) + exprId, qualifier, isGenerated, isMetadataColumn) } override protected final def otherCopyArgs: Seq[AnyRef] = { - exprId :: qualifier :: isGenerated :: isPartitionColumn :: Nil + exprId :: qualifier :: isGenerated :: isMetadataColumn :: Nil } override def toString: String = s"$name#${exprId.id}$typeSuffix" diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/MetadataOnlyOptimizer.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/MetadataOnlyOptimizer.scala index d37bfa923c4cf..0a151e1f33a2d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/MetadataOnlyOptimizer.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/MetadataOnlyOptimizer.scala @@ -42,7 +42,7 @@ case class MetadataOnlyOptimizer( catalog: SessionCatalog) extends Rule[LogicalPlan] { private def canSupportMetadataOnly(a: Aggregate): Boolean = { - if (!a.references.forall(_.isPartitionColumn)) { + if (!a.references.forall(_.isMetadataColumn)) { // Support for scanning only partition columns false } else { @@ -115,10 +115,10 @@ case class MetadataOnlyOptimizer( * Aggregate [Expand] Project [Filter] (LogicalRelation | CatalogRelation) */ private def convertToMetadataOnly(plan: LogicalPlan): LogicalPlan = plan match { - case p @ Project(fields, child) if p.references.forall(_.isPartitionColumn) => + case p @ Project(fields, child) if p.references.forall(_.isMetadataColumn) => child match { case f @ Filter(condition, l @ LogicalRelation(files: HadoopFsRelation, _, _)) - if files.partitionSchema.nonEmpty && f.references.forall(_.isPartitionColumn) => + if files.partitionSchema.nonEmpty && f.references.forall(_.isMetadataColumn) => val plan = convertLogicalToMetadataOnly(Some(condition), l, files) p.withNewChildren(f.withNewChildren(plan :: Nil) :: Nil) @@ -129,7 +129,7 @@ case class MetadataOnlyOptimizer( case f @ Filter(condition, relation: CatalogRelation) if relation.catalogTable.partitionColumnNames.nonEmpty && - f.references.forall(_.isPartitionColumn) => + f.references.forall(_.isMetadataColumn) => val plan = convertCatalogToMetadataOnly(relation) p.withNewChildren(f.withNewChildren(plan :: Nil) :: Nil) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/LogicalRelation.scala 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/LogicalRelation.scala index 0fa3e440a140e..2d5f6d90f3672 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/LogicalRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/LogicalRelation.scala @@ -38,7 +38,7 @@ case class LogicalRelation( override val output: Seq[AttributeReference] = { val attrs = relation.schema.toAttributes.map { attr => - attr.setPartitionColumn( + attr.withMetadataColumn( relation.partitionColumnNames.contains(attr.name.toLowerCase)) } expectedOutputAttributes.map { expectedAttrs => diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/MetastoreRelation.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/MetastoreRelation.scala index 4ceb0e1933e96..68c0c985b56a4 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/MetastoreRelation.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/MetastoreRelation.scala @@ -203,12 +203,12 @@ private[hive] case class MetastoreRelation( ) implicit class SchemaAttribute(f: CatalogColumn) { - def toAttribute(isPartitionColumn: Boolean): AttributeReference = AttributeReference( + def toAttribute(isMetadataColumn: Boolean): AttributeReference = AttributeReference( f.name, CatalystSqlParser.parseDataType(f.dataType), // Since data can be dumped in randomly with no validation, everything is nullable. nullable = true - )(qualifier = Some(alias.getOrElse(tableName)), isPartitionColumn = isPartitionColumn) + )(qualifier = Some(alias.getOrElse(tableName)), isMetadataColumn = isMetadataColumn) } /** PartitionKey attributes */ From a5ea995929358ab999f0636d053901a9af84a548 Mon Sep 17 00:00:00 2001 From: Lianhui Wang Date: Thu, 30 Jun 2016 02:24:59 +0800 Subject: [PATCH 23/45] update --- .../spark/sql/catalyst/CatalystConf.scala | 2 ++ .../sql/execution/MetadataOnlyOptimizer.scala | 22 ++++++++----------- .../spark/sql/execution/SparkOptimizer.scala | 3 +-- .../spark/sql/internal/SessionState.scala | 3 +-- 4 files changed, 13 insertions(+), 17 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystConf.scala index 4df100c2a8304..cabef8d297825 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystConf.scala @@ -28,6 +28,7 @@ trait CatalystConf { def orderByOrdinal: Boolean def groupByOrdinal: Boolean + def optimizerMetadataOnly: Boolean def optimizerMaxIterations: Int def optimizerInSetConversionThreshold: Int def maxCaseBranchesForCodegen: Int @@ -51,6 +52,7 @@ case class SimpleCatalystConf( caseSensitiveAnalysis: Boolean, orderByOrdinal: Boolean = true, groupByOrdinal: Boolean = true, + optimizerMetadataOnly: Boolean = true, optimizerMaxIterations: Int = 100, optimizerInSetConversionThreshold: Int = 10, maxCaseBranchesForCodegen: Int = 20, diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/MetadataOnlyOptimizer.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/MetadataOnlyOptimizer.scala index 0a151e1f33a2d..e3299fbc2ba8c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/MetadataOnlyOptimizer.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/MetadataOnlyOptimizer.scala @@ -17,8 +17,8 @@ package org.apache.spark.sql.execution -import org.apache.spark.sql.{AnalysisException, SparkSession} -import 
org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.AnalysisException +import org.apache.spark.sql.catalyst.{CatalystConf, InternalRow} import org.apache.spark.sql.catalyst.catalog.{CatalogRelation, SessionCatalog} import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.aggregate._ @@ -38,8 +38,8 @@ import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, LogicalRela * e.g. SELECT Max(col2) FROM tbl GROUP BY col1. */ case class MetadataOnlyOptimizer( - sparkSession: SparkSession, - catalog: SessionCatalog) extends Rule[LogicalPlan] { + catalog: SessionCatalog, + conf: CatalystConf) extends Rule[LogicalPlan] { private def canSupportMetadataOnly(a: Aggregate): Boolean = { if (!a.references.forall(_.isMetadataColumn)) { @@ -85,9 +85,7 @@ case class MetadataOnlyOptimizer( s"Unable to resolve ${field.name} given [${logical.output.map(_.name).mkString(", ")}]")) } val selectedPartitions = files.location.listFiles(filter.map(Seq(_)).getOrElse(Seq.empty)) - val valuesRdd = sparkSession.sparkContext.parallelize(selectedPartitions.map(_.values), 1) - val valuesPlan = LogicalRDD(partitionColumns, valuesRdd)(sparkSession) - valuesPlan + LocalRelation(partitionColumns, selectedPartitions.map(_.values)) } private def convertCatalogToMetadataOnly(relation: CatalogRelation): LogicalPlan = { @@ -104,14 +102,12 @@ case class MetadataOnlyOptimizer( case (rawValue, dataType) => Cast(Literal(rawValue), dataType).eval(null) }) } - val valuesRdd = sparkSession.sparkContext.parallelize(partitionValues, 1) - val valuesPlan = LogicalRDD(partitionColumns, valuesRdd)(sparkSession) - valuesPlan + LocalRelation(partitionColumns, partitionValues) } /** - * When scanning only partition columns, convert LogicalRelation or CatalogRelation to LogicalRDD. - * Now support logical plan: + * When scanning only partition columns, convert LogicalRelation or CatalogRelation + * to LocalRelation. 
Now support logical plan: * Aggregate [Expand] Project [Filter] (LogicalRelation | CatalogRelation) */ private def convertToMetadataOnly(plan: LogicalPlan): LogicalPlan = plan match { @@ -150,7 +146,7 @@ case class MetadataOnlyOptimizer( } def apply(plan: LogicalPlan): LogicalPlan = { - if (!sparkSession.sessionState.conf.optimizerMetadataOnly) { + if (!conf.optimizerMetadataOnly) { return plan } plan.transform { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkOptimizer.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkOptimizer.scala index 9a680bdac962e..d61f48e359a41 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkOptimizer.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkOptimizer.scala @@ -24,14 +24,13 @@ import org.apache.spark.sql.execution.python.ExtractPythonUDFFromAggregate import org.apache.spark.sql.internal.SQLConf class SparkOptimizer( - sparkSession: SparkSession, catalog: SessionCatalog, conf: SQLConf, experimentalMethods: ExperimentalMethods) extends Optimizer(catalog, conf) { override def batches: Seq[Batch] = super.batches :+ - Batch("Metadata Only Optimization", Once, MetadataOnlyOptimizer(sparkSession, catalog)) :+ + Batch("Metadata Only Optimization", Once, MetadataOnlyOptimizer(catalog, conf)) :+ Batch("Extract Python UDF from Aggregate", Once, ExtractPythonUDFFromAggregate) :+ Batch("User Provided Optimizers", fixedPoint, experimentalMethods.extraOptimizations: _*) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/SessionState.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/SessionState.scala index f107236183808..5f5cf5c6d30c4 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/internal/SessionState.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/SessionState.scala @@ -123,8 +123,7 @@ private[sql] class SessionState(sparkSession: SparkSession) { /** * Logical query plan optimizer. */ - lazy val optimizer: Optimizer = - new SparkOptimizer(sparkSession, catalog, conf, experimentalMethods) + lazy val optimizer: Optimizer = new SparkOptimizer(catalog, conf, experimentalMethods) /** * Parser that extracts expressions, plans, table identifiers etc. from SQL texts. 
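As a quick illustration of the query shape this rule now rewrites, here is a sketch for the spark-shell (assuming a `spark` session; the table and column names are made up for the example, not taken from the patch):

// Build a small partitioned datasource table whose aggregate over the partition
// column can be answered from partition metadata alone.
spark.range(100)
  .selectExpr("id AS col1", "id % 3 AS partcol")
  .write.partitionBy("partcol").saveAsTable("tbl_meta_only")

spark.conf.set("spark.sql.optimizer.metadataOnly", "true")

// With the optimization enabled, the optimized plan for this query reads the
// partition values (a LocalRelation after this patch) instead of scanning the
// table's data files.
spark.sql("SELECT max(partcol) FROM tbl_meta_only").explain(true)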
From 1bed08deb2eba2426b83bd5f02dcb0a2ce18afd6 Mon Sep 17 00:00:00 2001 From: Lianhui Wang Date: Thu, 30 Jun 2016 02:27:12 +0800 Subject: [PATCH 24/45] fix monir --- .../scala/org/apache/spark/sql/execution/SparkOptimizer.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkOptimizer.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkOptimizer.scala index d61f48e359a41..b78390fb5d863 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkOptimizer.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkOptimizer.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql.execution -import org.apache.spark.sql.{ExperimentalMethods, SparkSession} +import org.apache.spark.sql.ExperimentalMethods import org.apache.spark.sql.catalyst.catalog.SessionCatalog import org.apache.spark.sql.catalyst.optimizer.Optimizer import org.apache.spark.sql.execution.python.ExtractPythonUDFFromAggregate From a22e9626e6294671e0915822def6eb283a72a643 Mon Sep 17 00:00:00 2001 From: Lianhui Wang Date: Fri, 1 Jul 2016 16:36:04 +0800 Subject: [PATCH 25/45] refactor --- .../sql/catalyst/analysis/Analyzer.scala | 5 +- .../expressions/namedExpressions.scala | 37 ++-- .../sql/execution/MetadataOnlyOptimizer.scala | 186 ++++++++---------- .../datasources/LogicalRelation.scala | 5 +- .../datasources/fileSourceInterfaces.scala | 2 - .../apache/spark/sql/sources/interfaces.scala | 1 - .../spark/sql/hive/MetastoreRelation.scala | 8 +- .../sql/hive/execution/SQLQuerySuite.scala | 27 +-- 8 files changed, 115 insertions(+), 156 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index 65457a4fa58d4..96f2e38946f1c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -292,15 +292,14 @@ class Analyzer( // Ensure all the expressions have been resolved. case x: GroupingSets if x.expressions.forall(_.resolved) => - val gid = AttributeReference(VirtualColumn.groupingIdName, IntegerType, false)( - isMetadataColumn = true) + val gid = AttributeReference(VirtualColumn.groupingIdName, IntegerType, false)() // Expand works by setting grouping expressions to null as determined by the bitmasks. To // prevent these null values from being used in an aggregate instead of the original value // we need to create new aliases for all group by expressions that will only be used for // the intended purpose. 
val groupByAliases: Seq[Alias] = x.groupByExprs.map { - case e: NamedExpression => Alias(e, e.name)(isMetadataColumn = e.isMetadataColumn) + case e: NamedExpression => Alias(e, e.name)() case other => Alias(other, other.toString)() } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala index 2816383e9a42d..9987c950b88a2 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala @@ -83,9 +83,6 @@ trait NamedExpression extends Expression { /** Returns true if the expression is generated by Catalyst */ def isGenerated: java.lang.Boolean = false - /** Returns true if the expression is a metadata column */ - def isMetadataColumn: java.lang.Boolean = false - /** Returns a copy of this expression with a new `exprId`. */ def newInstance(): NamedExpression @@ -130,14 +127,12 @@ abstract class Attribute extends LeafExpression with NamedExpression with NullIn * tableName and subQueryAlias are possible qualifiers. * @param explicitMetadata Explicit metadata associated with this alias that overwrites child's. * @param isGenerated A flag to indicate if this alias is generated by Catalyst - * @param isMetadataColumn A flag to indicate if this alias is a metadata column */ case class Alias(child: Expression, name: String)( val exprId: ExprId = NamedExpression.newExprId, val qualifier: Option[String] = None, val explicitMetadata: Option[Metadata] = None, - override val isGenerated: java.lang.Boolean = false, - override val isMetadataColumn: java.lang.Boolean = false) + override val isGenerated: java.lang.Boolean = false) extends UnaryExpression with NamedExpression { // Alias(Generator, xx) need to be transformed into Generate(generator, ...) @@ -163,13 +158,12 @@ case class Alias(child: Expression, name: String)( def newInstance(): NamedExpression = Alias(child, name)( - qualifier = qualifier, explicitMetadata = explicitMetadata, isGenerated = isGenerated, - isMetadataColumn = isMetadataColumn) + qualifier = qualifier, explicitMetadata = explicitMetadata, isGenerated = isGenerated) override def toAttribute: Attribute = { if (resolved) { AttributeReference(name, child.dataType, child.nullable, metadata)( - exprId, qualifier, isGenerated, isMetadataColumn) + exprId, qualifier, isGenerated) } else { UnresolvedAttribute(name) } @@ -178,7 +172,7 @@ case class Alias(child: Expression, name: String)( override def toString: String = s"$child AS $name#${exprId.id}$typeSuffix" override protected final def otherCopyArgs: Seq[AnyRef] = { - exprId :: qualifier :: explicitMetadata :: isGenerated :: isMetadataColumn :: Nil + exprId :: qualifier :: explicitMetadata :: isGenerated :: Nil } override def hashCode(): Int = { @@ -212,7 +206,6 @@ case class Alias(child: Expression, name: String)( * qualified way. Consider the examples tableName.name, subQueryAlias.name. * tableName and subQueryAlias are possible qualifiers. 
* @param isGenerated A flag to indicate if this reference is generated by Catalyst - * @param isMetadataColumn A flag to indicate if this reference is a metadata column */ case class AttributeReference( name: String, @@ -221,8 +214,7 @@ case class AttributeReference( override val metadata: Metadata = Metadata.empty)( val exprId: ExprId = NamedExpression.newExprId, val qualifier: Option[String] = None, - override val isGenerated: java.lang.Boolean = false, - override val isMetadataColumn: java.lang.Boolean = false) + override val isGenerated: java.lang.Boolean = false) extends Attribute with Unevaluable { /** @@ -260,7 +252,7 @@ case class AttributeReference( override def newInstance(): AttributeReference = AttributeReference(name, dataType, nullable, metadata)( - qualifier = qualifier, isGenerated = isGenerated, isMetadataColumn = isMetadataColumn) + qualifier = qualifier, isGenerated = isGenerated) /** * Returns a copy of this [[AttributeReference]] with changed nullability. @@ -269,8 +261,7 @@ case class AttributeReference( if (nullable == newNullability) { this } else { - AttributeReference(name, dataType, newNullability, metadata)( - exprId, qualifier, isGenerated, isMetadataColumn) + AttributeReference(name, dataType, newNullability, metadata)(exprId, qualifier, isGenerated) } } @@ -278,8 +269,7 @@ case class AttributeReference( if (name == newName) { this } else { - AttributeReference(newName, dataType, nullable, metadata)( - exprId, qualifier, isGenerated, isMetadataColumn) + AttributeReference(newName, dataType, nullable, metadata)(exprId, qualifier, isGenerated) } } @@ -290,8 +280,7 @@ case class AttributeReference( if (newQualifier == qualifier) { this } else { - AttributeReference(name, dataType, nullable, metadata)( - exprId, newQualifier, isGenerated, isMetadataColumn) + AttributeReference(name, dataType, nullable, metadata)(exprId, newQualifier, isGenerated) } } @@ -299,18 +288,16 @@ case class AttributeReference( if (exprId == newExprId) { this } else { - AttributeReference(name, dataType, nullable, metadata)( - newExprId, qualifier, isGenerated, isMetadataColumn) + AttributeReference(name, dataType, nullable, metadata)(newExprId, qualifier, isGenerated) } } def withMetadataColumn(isMetadataColumn: Boolean): AttributeReference = { - AttributeReference(name, dataType, nullable, metadata)( - exprId, qualifier, isGenerated, isMetadataColumn) + AttributeReference(name, dataType, nullable, metadata)(exprId, qualifier, isGenerated) } override protected final def otherCopyArgs: Seq[AnyRef] = { - exprId :: qualifier :: isGenerated :: isMetadataColumn :: Nil + exprId :: qualifier :: isGenerated :: Nil } override def toString: String = s"$name#${exprId.id}$typeSuffix" diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/MetadataOnlyOptimizer.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/MetadataOnlyOptimizer.scala index e3299fbc2ba8c..ff500add2d9d7 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/MetadataOnlyOptimizer.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/MetadataOnlyOptimizer.scala @@ -27,131 +27,107 @@ import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, LogicalRelation} /** - * When scanning only partition columns, get results based on metadata without scanning files. + * When scanning only partition columns, get results based on partition data without scanning files. * It's used for operators that only need distinct values. 
Currently only [[Aggregate]] operator * which satisfy the following conditions are supported: - * 1. aggregate expression is partition columns, - * e.g. SELECT col FROM tbl GROUP BY col, SELECT col FROM tbl GROUP BY cube(col). - * 2. aggregate function on partition columns with DISTINCT, + * 1. aggregate expression is partition columns. + * e.g. SELECT col FROM tbl GROUP BY col. + * 2. aggregate function on partition columns with DISTINCT. * e.g. SELECT count(DISTINCT col) FROM tbl GROUP BY col. - * 3. aggregate function on partition columns which have same result with DISTINCT keyword. + * 3. aggregate function on partition columns which have same result w or w/o DISTINCT keyword. * e.g. SELECT Max(col2) FROM tbl GROUP BY col1. */ case class MetadataOnlyOptimizer( - catalog: SessionCatalog, - conf: CatalystConf) extends Rule[LogicalPlan] { + catalog: SessionCatalog, + conf: CatalystConf) extends Rule[LogicalPlan] { - private def canSupportMetadataOnly(a: Aggregate): Boolean = { - if (!a.references.forall(_.isMetadataColumn)) { - // Support for scanning only partition columns - false - } else { - val aggregateExpressions = a.aggregateExpressions.flatMap { expr => - expr.collect { - case agg: AggregateExpression => agg - } - }.distinct - if (aggregateExpressions.isEmpty) { - // Support for aggregate that has no aggregateFunction when expressions are partition - // columns. example: select partitionCol from table group by partitionCol. - // Moreover, multiple-distinct has been rewritted into it by RewriteDistinctAggregates. - true - } else { - aggregateExpressions.forall { agg => - if (agg.isDistinct) { - true - } else { - // If function can be evaluated on just the distinct values of a column, it can be used - // by metadata-only optimizer. - agg.aggregateFunction match { - case max: Max => true - case min: Min => true - case hyperLog: HyperLogLogPlusPlus => true + def apply(plan: LogicalPlan): LogicalPlan = { + if (!conf.optimizerMetadataOnly) { + return plan + } + + plan.transform { + case a @ Aggregate(_, aggExprs, child @ PartitionedRelation(partAttrs, relation)) => + if (a.references.subsetOf(partAttrs)) { + val aggFunctions = aggExprs.flatMap(_.collect { + case agg: AggregateExpression => agg + }) + val isPartitionDataOnly = aggFunctions.isEmpty || aggFunctions.forall { agg => + agg.isDistinct || (agg.aggregateFunction match { + case _: Max => true + case _: Min => true case _ => false - } + }) + } + if (isPartitionDataOnly) { + a.withNewChildren(Seq(usePartitionData(child, relation))) + } else { + a } + } else { + a } - } } } - private def convertLogicalToMetadataOnly( - filter: Option[Expression], - logical: LogicalRelation, - files: HadoopFsRelation): LogicalPlan = { - val attributeMap = logical.output.map(attr => (attr.name, attr)).toMap - val partitionColumns = files.partitionSchema.map { field => - attributeMap.getOrElse(field.name, throw new AnalysisException( - s"Unable to resolve ${field.name} given [${logical.output.map(_.name).mkString(", ")}]")) - } - val selectedPartitions = files.location.listFiles(filter.map(Seq(_)).getOrElse(Seq.empty)) - LocalRelation(partitionColumns, selectedPartitions.map(_.values)) - } + private def usePartitionData(child: LogicalPlan, relation: LogicalPlan): LogicalPlan = { + child transform { + case plan if plan eq relation => + relation match { + case l @ LogicalRelation(fsRelation: HadoopFsRelation, _, _) => + val partColumns = fsRelation.partitionSchema.map(_.name.toLowerCase).toSet + val partAttrs = l.output.filter(a => 
partColumns.contains(a.name.toLowerCase)) + val partitionData = fsRelation.location.listFiles(Nil) + LocalRelation(partAttrs, partitionData.map(_.values)) + + case relation: CatalogRelation => + val partColumns = relation.catalogTable.partitionColumnNames.map(_.toLowerCase).toSet + val partAttrs = relation.output.filter(a => partColumns.contains(a.name.toLowerCase)) + val partitionData = catalog.listPartitions(relation.catalogTable.identifier).map { p => + InternalRow.fromSeq(partAttrs.map { attr => + Cast(Literal(p.spec(attr.name)), attr.dataType).eval() + }) + } + LocalRelation(partAttrs, partitionData) - private def convertCatalogToMetadataOnly(relation: CatalogRelation): LogicalPlan = { - val attributeMap = relation.output.map(attr => (attr.name, attr)).toMap - val partitionColumns = relation.catalogTable.partitionColumnNames.map { column => - attributeMap.getOrElse(column, throw new AnalysisException( - s"Unable to resolve ${column} given [${relation.output.map(_.name).mkString(", ")}]")) + case _ => throw new IllegalStateException() + } } - val partitionColumnDataTypes = partitionColumns.map(_.dataType) - val partitionValues = catalog.listPartitions(relation.catalogTable.identifier) - .map { p => - InternalRow.fromSeq( - partitionColumns.map(a => p.spec(a.name)).zip(partitionColumnDataTypes).map { - case (rawValue, dataType) => Cast(Literal(rawValue), dataType).eval(null) - }) - } - LocalRelation(partitionColumns, partitionValues) } - /** - * When scanning only partition columns, convert LogicalRelation or CatalogRelation - * to LocalRelation. Now support logical plan: - * Aggregate [Expand] Project [Filter] (LogicalRelation | CatalogRelation) - */ - private def convertToMetadataOnly(plan: LogicalPlan): LogicalPlan = plan match { - case p @ Project(fields, child) if p.references.forall(_.isMetadataColumn) => - child match { - case f @ Filter(condition, l @ LogicalRelation(files: HadoopFsRelation, _, _)) - if files.partitionSchema.nonEmpty && f.references.forall(_.isMetadataColumn) => - val plan = convertLogicalToMetadataOnly(Some(condition), l, files) - p.withNewChildren(f.withNewChildren(plan :: Nil) :: Nil) - - case l @ LogicalRelation(files: HadoopFsRelation, _, _) - if files.partitionSchema.nonEmpty => - val plan = convertLogicalToMetadataOnly(None, l, files) - p.withNewChildren(plan :: Nil) - - case f @ Filter(condition, relation: CatalogRelation) - if relation.catalogTable.partitionColumnNames.nonEmpty && - f.references.forall(_.isMetadataColumn) => - val plan = convertCatalogToMetadataOnly(relation) - p.withNewChildren(f.withNewChildren(plan :: Nil) :: Nil) + object PartitionedRelation { + def unapply(plan: LogicalPlan): Option[(AttributeSet, LogicalPlan)] = plan match { + case l @ LogicalRelation(fsRelation: HadoopFsRelation, _, _) + if fsRelation.partitionSchema.nonEmpty => + val partColumns = fsRelation.partitionSchema.map(_.name.toLowerCase).toSet + val partAttrs = l.output.filter(a => partColumns.contains(a.name.toLowerCase)) + Some(AttributeSet(partAttrs), l) - case relation: CatalogRelation - if relation.catalogTable.partitionColumnNames.nonEmpty => - val plan = convertCatalogToMetadataOnly(relation) - p.withNewChildren(plan :: Nil) + case relation: CatalogRelation if relation.catalogTable.partitionColumnNames.nonEmpty => + val partColumns = relation.catalogTable.partitionColumnNames.map(_.toLowerCase).toSet + val partAttrs = relation.output.filter(a => partColumns.contains(a.name.toLowerCase)) + Some(AttributeSet(partAttrs), relation) - case other => - other - } - - 
case e: Expand => - e.withNewChildren(e.children.map(convertToMetadataOnly(_))) - - case other => - other - } + case p @ Project(projectList, child) if projectList.forall(_.deterministic) => + unapply(child).flatMap { + case (partAttrs, relation) => + if (p.references.subsetOf(partAttrs)) { + Some(p.outputSet, relation) + } else { + None + } + } - def apply(plan: LogicalPlan): LogicalPlan = { - if (!conf.optimizerMetadataOnly) { - return plan - } - plan.transform { - case a @ Aggregate(_, _, child) if canSupportMetadataOnly(a) => - a.withNewChildren(convertToMetadataOnly(child) :: Nil) + case f @ Filter(condition, child) if condition.deterministic => + unapply(child).flatMap { + case (partAttrs, relation) => + if (f.references.subsetOf(partAttrs)) { + Some(f.outputSet, relation) + } else { + None + } + } + case _ => None } } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/LogicalRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/LogicalRelation.scala index 2d5f6d90f3672..39c8606fd14b5 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/LogicalRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/LogicalRelation.scala @@ -37,10 +37,7 @@ case class LogicalRelation( extends LeafNode with MultiInstanceRelation { override val output: Seq[AttributeReference] = { - val attrs = relation.schema.toAttributes.map { attr => - attr.withMetadataColumn( - relation.partitionColumnNames.contains(attr.name.toLowerCase)) - } + val attrs = relation.schema.toAttributes expectedOutputAttributes.map { expectedAttrs => assert(expectedAttrs.length == attrs.length) attrs.zip(expectedAttrs).map { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/fileSourceInterfaces.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/fileSourceInterfaces.scala index 339512aa2b9b4..06adaf71128aa 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/fileSourceInterfaces.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/fileSourceInterfaces.scala @@ -151,8 +151,6 @@ case class HadoopFsRelation( }) } - override def partitionColumnNames: Set[String] = partitionSchema.map(_.name.toLowerCase).toSet - def partitionSchemaOption: Option[StructType] = if (partitionSchema.isEmpty) None else Some(partitionSchema) def partitionSpec: PartitionSpec = location.partitionSpec() diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala index 53d24051eb271..d2077a07f440a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala @@ -185,7 +185,6 @@ trait CreatableRelationProvider { abstract class BaseRelation { def sqlContext: SQLContext def schema: StructType - def partitionColumnNames: Set[String] = Set.empty[String] /** * Returns an estimated size of this relation in bytes. 
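A rough way to check whether the rewrite applied, mirroring the check used by the test suite added later in this series (the helper below is illustrative only, assumes a spark-shell session, and is not part of the patch):

import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.catalyst.plans.logical.LocalRelation

// The metadata-only rewrite replaces the table scan with a LocalRelation holding
// the partition values, so its presence in the optimized plan shows the rule fired.
def usedMetadataOnly(df: DataFrame): Boolean =
  df.queryExecution.optimizedPlan.collect { case l: LocalRelation => l }.nonEmpty

// e.g. usedMetadataOnly(spark.sql("SELECT DISTINCT partcol FROM tbl_meta_only"))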
This information is used by the planner diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/MetastoreRelation.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/MetastoreRelation.scala index 68c0c985b56a4..26d880fdc45f4 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/MetastoreRelation.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/MetastoreRelation.scala @@ -203,22 +203,22 @@ private[hive] case class MetastoreRelation( ) implicit class SchemaAttribute(f: CatalogColumn) { - def toAttribute(isMetadataColumn: Boolean): AttributeReference = AttributeReference( + def toAttribute(): AttributeReference = AttributeReference( f.name, CatalystSqlParser.parseDataType(f.dataType), // Since data can be dumped in randomly with no validation, everything is nullable. nullable = true - )(qualifier = Some(alias.getOrElse(tableName)), isMetadataColumn = isMetadataColumn) + )(qualifier = Some(alias.getOrElse(tableName))) } /** PartitionKey attributes */ - val partitionKeys = catalogTable.partitionColumns.map(_.toAttribute(true)) + val partitionKeys = catalogTable.partitionColumns.map(_.toAttribute()) /** Non-partitionKey attributes */ // TODO: just make this hold the schema itself, not just non-partition columns val attributes = catalogTable.schema .filter { c => !catalogTable.partitionColumnNames.contains(c.name) } - .map(_.toAttribute(false)) + .map(_.toAttribute()) val output = attributes ++ partitionKeys diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala index dc7a6867c3cd0..00a9339218081 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala @@ -1702,26 +1702,26 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { """.stripMargin) } checkAnswer(sql("select hr from srcpart_15752 where hr = 11 group by hr"), Row(11)) - checkAnswer( - sql("select hr from srcpart_15752 where hr = 12 group by rollup(hr)"), - Row(null) :: Row(12) :: Nil) checkAnswer(sql("select max(hr) from srcpart_15752"), Row(12)) checkAnswer(sql("select max(hr) from srcpart_15752 where hr = 11"), Row(11)) checkAnswer(sql("select max(hr) from (select hr from srcpart_15752) t"), Row(12)) + checkAnswer( + sql("select max(x) from (select hr + 1 as x from srcpart_15752 where hr = 12) t"), + Row(13)) checkAnswer(sql("select distinct hr from srcpart_15752"), Row(11) :: Row(12) :: Nil) checkAnswer(sql("select distinct hr from srcpart_15752 where hr = 11"), Row(11)) + checkAnswer( + sql("select distinct x from (select hr + 1 as x from srcpart_15752 where hr = 12) t"), + Row(13)) // Now donot support metadata only optimizer + checkAnswer( + sql("select hr from srcpart_15752 where hr = 12 group by rollup(hr)"), + Row(null) :: Row(12) :: Nil) checkAnswer( sql("select hr from (select hr from srcpart_15752 where hr = 11 union all " + "select hr from srcpart_15752 where hr= 12)t group by hr"), Row(11) :: Row(12) :: Nil) - checkAnswer( - sql("select max(x) from (select hr + 1 as x from srcpart_15752 where hr = 12) t"), - Row(13)) - checkAnswer( - sql("select distinct x from (select hr + 1 as x from srcpart_15752 where hr = 12) t"), - Row(13)) sql( """ @@ -1736,14 +1736,17 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { """.stripMargin) } checkAnswer(sql("select hr from srctext_15752 where hr = 11 
group by hr"), Row(11)) - checkAnswer( - sql("select hr from srctext_15752 where hr = 12 group by rollup(hr)"), - Row(null) :: Row(12) :: Nil) checkAnswer(sql("select max(hr) from srctext_15752"), Row(12)) checkAnswer(sql("select max(hr) from srctext_15752 where hr = 11"), Row(11)) checkAnswer(sql("select max(hr) from (select hr from srctext_15752) t"), Row(12)) + checkAnswer( + sql("select max(x) from (select hr + 1 as x from srctext_15752 where hr = 12) t"), + Row(13)) checkAnswer(sql("select distinct hr from srctext_15752"), Row(11) :: Row(12) :: Nil) checkAnswer(sql("select distinct hr from srctext_15752 where hr = 11"), Row(11)) + checkAnswer( + sql("select distinct x from (select hr + 1 as x from srctext_15752 where hr = 12) t"), + Row(13)) } } } From 41fef2c40f4929fd26476ecdfa3ee8160394a7d3 Mon Sep 17 00:00:00 2001 From: Lianhui Wang Date: Fri, 1 Jul 2016 16:39:36 +0800 Subject: [PATCH 26/45] update --- .../spark/sql/catalyst/expressions/namedExpressions.scala | 4 ---- .../scala/org/apache/spark/sql/hive/MetastoreRelation.scala | 6 +++--- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala index 9987c950b88a2..306a99d5a37bf 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala @@ -292,10 +292,6 @@ case class AttributeReference( } } - def withMetadataColumn(isMetadataColumn: Boolean): AttributeReference = { - AttributeReference(name, dataType, nullable, metadata)(exprId, qualifier, isGenerated) - } - override protected final def otherCopyArgs: Seq[AnyRef] = { exprId :: qualifier :: isGenerated :: Nil } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/MetastoreRelation.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/MetastoreRelation.scala index 26d880fdc45f4..58bca2059cacc 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/MetastoreRelation.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/MetastoreRelation.scala @@ -203,7 +203,7 @@ private[hive] case class MetastoreRelation( ) implicit class SchemaAttribute(f: CatalogColumn) { - def toAttribute(): AttributeReference = AttributeReference( + def toAttribute: AttributeReference = AttributeReference( f.name, CatalystSqlParser.parseDataType(f.dataType), // Since data can be dumped in randomly with no validation, everything is nullable. 
@@ -212,13 +212,13 @@ private[hive] case class MetastoreRelation( } /** PartitionKey attributes */ - val partitionKeys = catalogTable.partitionColumns.map(_.toAttribute()) + val partitionKeys = catalogTable.partitionColumns.map(_.toAttribute) /** Non-partitionKey attributes */ // TODO: just make this hold the schema itself, not just non-partition columns val attributes = catalogTable.schema .filter { c => !catalogTable.partitionColumnNames.contains(c.name) } - .map(_.toAttribute()) + .map(_.toAttribute) val output = attributes ++ partitionKeys From 88f7308173829ca2473690a0c409c438d3cd5cf4 Mon Sep 17 00:00:00 2001 From: Lianhui Wang Date: Fri, 1 Jul 2016 16:56:11 +0800 Subject: [PATCH 27/45] update --- .../scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala index 552eacfcbceb6..2e0b5d59b5783 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala @@ -27,6 +27,7 @@ import org.apache.spark.sql.{AnalysisException, SparkSession} import org.apache.spark.sql.catalyst.{InternalRow, TableIdentifier} import org.apache.spark.sql.catalyst.catalog._ import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.catalyst.plans.logical import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.rules._ import org.apache.spark.sql.execution.command.CreateDataSourceTableUtils._ @@ -36,6 +37,7 @@ import org.apache.spark.sql.execution.datasources.parquet.{ParquetFileFormat, Pa import org.apache.spark.sql.hive.orc.OrcFileFormat import org.apache.spark.sql.types._ + /** * Legacy catalog for interacting with the Hive metastore. * From 2568193f91b9ae129c19a67bfd514065215840ac Mon Sep 17 00:00:00 2001 From: Lianhui Wang Date: Fri, 1 Jul 2016 18:34:30 +0800 Subject: [PATCH 28/45] add ut --- .../MetadataOnlyOptimizerSuite.scala | 87 +++++++++++++++++++ 1 file changed, 87 insertions(+) create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/execution/MetadataOnlyOptimizerSuite.scala diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/MetadataOnlyOptimizerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/MetadataOnlyOptimizerSuite.scala new file mode 100644 index 0000000000000..045a536e9cb05 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/MetadataOnlyOptimizerSuite.scala @@ -0,0 +1,87 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.execution + +import org.apache.spark.sql._ +import org.apache.spark.sql.catalyst.plans.logical.LocalRelation +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.test.SharedSQLContext + +class MetadataOnlyOptimizerSuite extends QueryTest with SharedSQLContext { + import testImplicits._ + + override def beforeAll(): Unit = { + super.beforeAll() + val data = (1 to 10).map(i => (i, s"data-$i", i % 2, if ((i % 2) == 0) "even" else "odd")) + .toDF("id", "data", "partId", "part") + data.write.partitionBy("partId", "part").mode("append").saveAsTable("srcpart_15752") + } + + private def checkWithMetadataOnly(df: DataFrame): Unit = { + val localRelations = df.queryExecution.optimizedPlan.collect { + case l @ LocalRelation(_, _) => l + } + assert(localRelations.size == 1) + } + + private def checkWithoutMetadataOnly(df: DataFrame): Unit = { + val localRelations = df.queryExecution.optimizedPlan.collect{ + case l @ LocalRelation(_, _) => l + } + assert(localRelations.size == 0) + } + + test("spark-15752 metadata only optimizer for partition table") { + withSQLConf(SQLConf.OPTIMIZER_METADATA_ONLY.key -> "true") { + checkWithMetadataOnly(sql("select part from srcpart_15752 where part = 0 group by part")) + checkWithMetadataOnly(sql("select max(part) from srcpart_15752")) + checkWithMetadataOnly(sql("select max(part) from srcpart_15752 where part = 0")) + checkWithMetadataOnly( + sql("select part, min(partId) from srcpart_15752 where part = 0 group by part")) + checkWithMetadataOnly( + sql("select max(x) from (select part + 1 as x from srcpart_15752 where part = 1) t")) + checkWithMetadataOnly(sql("select distinct part from srcpart_15752")) + checkWithMetadataOnly(sql("select distinct part, partId from srcpart_15752")) + checkWithMetadataOnly( + sql("select distinct x from (select part + 1 as x from srcpart_15752 where part = 0) t")) + + // Now donot support metadata only optimizer + checkWithoutMetadataOnly(sql("select part, max(id) from srcpart_15752 group by part")) + checkWithoutMetadataOnly(sql("select distinct part, id from srcpart_15752")) + checkWithoutMetadataOnly(sql("select part, sum(partId) from srcpart_15752 group by part")) + checkWithoutMetadataOnly( + sql("select part from srcpart_15752 where part = 1 group by rollup(part)")) + checkWithoutMetadataOnly( + sql("select part from (select part from srcpart_15752 where part = 0 union all " + + "select part from srcpart_15752 where part= 1)t group by part")) + } + } + + test("spark-15752 without metadata only optimizer for partition table") { + withSQLConf(SQLConf.OPTIMIZER_METADATA_ONLY.key -> "false") { + checkWithoutMetadataOnly(sql("select part from srcpart_15752 where part = 0 group by part")) + checkWithoutMetadataOnly(sql("select max(part) from srcpart_15752")) + checkWithoutMetadataOnly(sql("select max(part) from srcpart_15752 where part = 0")) + checkWithoutMetadataOnly( + sql("select max(x) from (select part + 1 as x from srcpart_15752 where part = 1) t")) + checkWithoutMetadataOnly(sql("select distinct part from srcpart_15752")) + checkWithoutMetadataOnly( + sql("select distinct x from (select part + 1 as x from srcpart_15752 where part = 1) t")) + } + } +} From 26a97f4df483fe324b76c4ee3c1ddec2f3830566 Mon Sep 17 00:00:00 2001 From: Lianhui Wang Date: Mon, 4 Jul 2016 19:20:42 +0800 Subject: [PATCH 29/45] address comments --- .../sql/execution/MetadataOnlyOptimizer.scala | 2 +- .../org/apache/spark/sql/SQLQuerySuite.scala | 22 ++-- .../MetadataOnlyOptimizerSuite.scala | 8 ++ 
.../sql/hive/execution/SQLQuerySuite.scala | 112 +++++++++--------- 4 files changed, 80 insertions(+), 64 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/MetadataOnlyOptimizer.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/MetadataOnlyOptimizer.scala index ff500add2d9d7..73b8efe8ce177 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/MetadataOnlyOptimizer.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/MetadataOnlyOptimizer.scala @@ -33,7 +33,7 @@ import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, LogicalRela * 1. aggregate expression is partition columns. * e.g. SELECT col FROM tbl GROUP BY col. * 2. aggregate function on partition columns with DISTINCT. - * e.g. SELECT count(DISTINCT col) FROM tbl GROUP BY col. + * e.g. SELECT count(DISTINCT col1) FROM tbl GROUP BY col2. * 3. aggregate function on partition columns which have same result w or w/o DISTINCT keyword. * e.g. SELECT Max(col2) FROM tbl GROUP BY col1. */ diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index 49ac49c5b2c16..bdd7241545f61 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -2867,13 +2867,19 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { } test("spark-15752 metadata only optimizer for datasource table") { - val data = (1 to 10).map(i => (i, s"data-$i", i % 2, if ((i % 2) == 0) "even" else "odd")) - .toDF("id", "data", "partId", "part") - data.write.partitionBy("partId", "part").mode("append").saveAsTable("srcpart_15752") - checkAnswer(sql("select max(part) from srcpart_15752"), Row("odd")) - checkAnswer(sql("select max(part) from srcpart_15752 where partId = 0"), Row("even")) - checkAnswer(sql("select max(part) from (select part from srcpart_15752) t"), Row("odd")) - checkAnswer(sql("select distinct part from srcpart_15752"), Row("even") :: Row("odd") :: Nil) - checkAnswer(sql("select distinct part from srcpart_15752 where partId = 0"), Row("even")) + withSQLConf(SQLConf.OPTIMIZER_METADATA_ONLY.key -> "true") { + withTable("srcpart_15752") { + val data = (1 to 10).map(i => (i, s"data-$i", i % 2, if ((i % 2) == 0) "even" else "odd")) + .toDF("id", "data", "partId", "part") + data.write.partitionBy("partId", "part").mode("append").saveAsTable("srcpart_15752") + checkAnswer(sql("select max(part) from srcpart_15752"), Row("odd")) + checkAnswer(sql("select max(part) from srcpart_15752 where partId = 0"), Row("even")) + checkAnswer(sql("select max(part) from (select part from srcpart_15752) t"), Row("odd")) + checkAnswer( + sql("select distinct part from srcpart_15752"), + Row("even") :: Row("odd") :: Nil) + checkAnswer(sql("select distinct part from srcpart_15752 where partId = 0"), Row("even")) + } + } } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/MetadataOnlyOptimizerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/MetadataOnlyOptimizerSuite.scala index 045a536e9cb05..55dc4ad5da5fb 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/MetadataOnlyOptimizerSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/MetadataOnlyOptimizerSuite.scala @@ -32,6 +32,14 @@ class MetadataOnlyOptimizerSuite extends QueryTest with SharedSQLContext { data.write.partitionBy("partId", "part").mode("append").saveAsTable("srcpart_15752") } + 
override protected def afterAll(): Unit = { + try { + sql("DROP TABLE IF EXISTS srcpart_15752") + } finally { + super.afterAll() + } + } + private def checkWithMetadataOnly(df: DataFrame): Unit = { val localRelations = df.queryExecution.optimizedPlan.collect { case l @ LocalRelation(_, _) => l diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala index a7215faaca219..151a001562d87 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala @@ -1692,66 +1692,68 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { test("spark-15752 metadata only optimizer for hive table") { withSQLConf(SQLConf.OPTIMIZER_METADATA_ONLY.key -> "true") { - val df = Seq((1, "2"), (3, "4")).toDF("key", "value") - df.createOrReplaceTempView("data_15752") - sql( - """ - |CREATE TABLE srcpart_15752 (key INT, value STRING) - |PARTITIONED BY (ds STRING, hr INT) STORED AS parquet - """.stripMargin) - for (ds <- Seq("2008-04-08", "2008-04-09"); hr <- Seq(11, 12)) { + withTable("data_15752", "srcpart_15752", "srctext_15752") { + val df = Seq((1, "2"), (3, "4")).toDF("key", "value") + df.createOrReplaceTempView("data_15752") sql( - s""" - |INSERT OVERWRITE TABLE srcpart_15752 PARTITION (ds='$ds',hr='$hr') - |select key, value from data_15752 - """.stripMargin) - } - checkAnswer(sql("select hr from srcpart_15752 where hr = 11 group by hr"), Row(11)) - checkAnswer(sql("select max(hr) from srcpart_15752"), Row(12)) - checkAnswer(sql("select max(hr) from srcpart_15752 where hr = 11"), Row(11)) - checkAnswer(sql("select max(hr) from (select hr from srcpart_15752) t"), Row(12)) - checkAnswer( - sql("select max(x) from (select hr + 1 as x from srcpart_15752 where hr = 12) t"), - Row(13)) - checkAnswer(sql("select distinct hr from srcpart_15752"), Row(11) :: Row(12) :: Nil) - checkAnswer(sql("select distinct hr from srcpart_15752 where hr = 11"), Row(11)) - checkAnswer( - sql("select distinct x from (select hr + 1 as x from srcpart_15752 where hr = 12) t"), - Row(13)) + """ + |CREATE TABLE srcpart_15752 (key INT, value STRING) + |PARTITIONED BY (ds STRING, hr INT) STORED AS parquet + """.stripMargin) + for (ds <- Seq("2008-04-08", "2008-04-09"); hr <- Seq(11, 12)) { + sql( + s""" + |INSERT OVERWRITE TABLE srcpart_15752 PARTITION (ds='$ds',hr='$hr') + |select key, value from data_15752 + """.stripMargin) + } + checkAnswer(sql("select hr from srcpart_15752 where hr = 11 group by hr"), Row(11)) + checkAnswer(sql("select max(hr) from srcpart_15752"), Row(12)) + checkAnswer(sql("select max(hr) from srcpart_15752 where hr = 11"), Row(11)) + checkAnswer(sql("select max(hr) from (select hr from srcpart_15752) t"), Row(12)) + checkAnswer( + sql("select max(x) from (select hr + 1 as x from srcpart_15752 where hr = 12) t"), + Row(13)) + checkAnswer(sql("select distinct hr from srcpart_15752"), Row(11) :: Row(12) :: Nil) + checkAnswer(sql("select distinct hr from srcpart_15752 where hr = 11"), Row(11)) + checkAnswer( + sql("select distinct x from (select hr + 1 as x from srcpart_15752 where hr = 12) t"), + Row(13)) - // Now donot support metadata only optimizer - checkAnswer( - sql("select hr from srcpart_15752 where hr = 12 group by rollup(hr)"), - Row(null) :: Row(12) :: Nil) - checkAnswer( - sql("select hr from (select hr from srcpart_15752 where hr = 11 union all " + - 
"select hr from srcpart_15752 where hr= 12)t group by hr"), - Row(11) :: Row(12) :: Nil) + // Now donot support metadata only optimizer + checkAnswer( + sql("select hr from srcpart_15752 where hr = 12 group by rollup(hr)"), + Row(null) :: Row(12) :: Nil) + checkAnswer( + sql("select hr from (select hr from srcpart_15752 where hr = 11 union all " + + "select hr from srcpart_15752 where hr= 12)t group by hr"), + Row(11) :: Row(12) :: Nil) - sql( - """ - |CREATE TABLE srctext_15752 (key INT, value STRING) - |PARTITIONED BY (ds STRING, hr INT) STORED AS textfile - """.stripMargin) - for (ds <- Seq("2008-04-08", "2008-04-09"); hr <- Seq(11, 12)) { sql( - s""" - |INSERT OVERWRITE TABLE srctext_15752 PARTITION (ds='$ds',hr='$hr') - |select key, value from data_15752 - """.stripMargin) + """ + |CREATE TABLE srctext_15752 (key INT, value STRING) + |PARTITIONED BY (ds STRING, hr INT) STORED AS textfile + """.stripMargin) + for (ds <- Seq("2008-04-08", "2008-04-09"); hr <- Seq(11, 12)) { + sql( + s""" + |INSERT OVERWRITE TABLE srctext_15752 PARTITION (ds='$ds',hr='$hr') + |select key, value from data_15752 + """.stripMargin) + } + checkAnswer(sql("select hr from srctext_15752 where hr = 11 group by hr"), Row(11)) + checkAnswer(sql("select max(hr) from srctext_15752"), Row(12)) + checkAnswer(sql("select max(hr) from srctext_15752 where hr = 11"), Row(11)) + checkAnswer(sql("select max(hr) from (select hr from srctext_15752) t"), Row(12)) + checkAnswer( + sql("select max(x) from (select hr + 1 as x from srctext_15752 where hr = 12) t"), + Row(13)) + checkAnswer(sql("select distinct hr from srctext_15752"), Row(11) :: Row(12) :: Nil) + checkAnswer(sql("select distinct hr from srctext_15752 where hr = 11"), Row(11)) + checkAnswer( + sql("select distinct x from (select hr + 1 as x from srctext_15752 where hr = 12) t"), + Row(13)) } - checkAnswer(sql("select hr from srctext_15752 where hr = 11 group by hr"), Row(11)) - checkAnswer(sql("select max(hr) from srctext_15752"), Row(12)) - checkAnswer(sql("select max(hr) from srctext_15752 where hr = 11"), Row(11)) - checkAnswer(sql("select max(hr) from (select hr from srctext_15752) t"), Row(12)) - checkAnswer( - sql("select max(x) from (select hr + 1 as x from srctext_15752 where hr = 12) t"), - Row(13)) - checkAnswer(sql("select distinct hr from srctext_15752"), Row(11) :: Row(12) :: Nil) - checkAnswer(sql("select distinct hr from srctext_15752 where hr = 11"), Row(11)) - checkAnswer( - sql("select distinct x from (select hr + 1 as x from srctext_15752 where hr = 12) t"), - Row(13)) } } } From 4297f9f7f32fc2ea59bba43569b17da94ed84fce Mon Sep 17 00:00:00 2001 From: Lianhui Wang Date: Wed, 6 Jul 2016 11:14:09 +0800 Subject: [PATCH 30/45] update name --- .../{MetadataOnlyOptimizer.scala => OptimizeMetadataOnly.scala} | 2 +- .../scala/org/apache/spark/sql/execution/SparkOptimizer.scala | 2 +- ...OnlyOptimizerSuite.scala => OptimizeMetadataOnlySuite.scala} | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) rename sql/core/src/main/scala/org/apache/spark/sql/execution/{MetadataOnlyOptimizer.scala => OptimizeMetadataOnly.scala} (99%) rename sql/core/src/test/scala/org/apache/spark/sql/execution/{MetadataOnlyOptimizerSuite.scala => OptimizeMetadataOnlySuite.scala} (98%) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/MetadataOnlyOptimizer.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/OptimizeMetadataOnly.scala similarity index 99% rename from sql/core/src/main/scala/org/apache/spark/sql/execution/MetadataOnlyOptimizer.scala 
rename to sql/core/src/main/scala/org/apache/spark/sql/execution/OptimizeMetadataOnly.scala index 73b8efe8ce177..d17b5f4ba2db8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/MetadataOnlyOptimizer.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/OptimizeMetadataOnly.scala @@ -37,7 +37,7 @@ import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, LogicalRela * 3. aggregate function on partition columns which have same result w or w/o DISTINCT keyword. * e.g. SELECT Max(col2) FROM tbl GROUP BY col1. */ -case class MetadataOnlyOptimizer( +case class OptimizeMetadataOnly( catalog: SessionCatalog, conf: CatalystConf) extends Rule[LogicalPlan] { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkOptimizer.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkOptimizer.scala index b78390fb5d863..26393c5f1839d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkOptimizer.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkOptimizer.scala @@ -30,7 +30,7 @@ class SparkOptimizer( extends Optimizer(catalog, conf) { override def batches: Seq[Batch] = super.batches :+ - Batch("Metadata Only Optimization", Once, MetadataOnlyOptimizer(catalog, conf)) :+ + Batch("Optimize Metadata Only Query", Once, OptimizeMetadataOnly(catalog, conf)) :+ Batch("Extract Python UDF from Aggregate", Once, ExtractPythonUDFFromAggregate) :+ Batch("User Provided Optimizers", fixedPoint, experimentalMethods.extraOptimizations: _*) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/MetadataOnlyOptimizerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlySuite.scala similarity index 98% rename from sql/core/src/test/scala/org/apache/spark/sql/execution/MetadataOnlyOptimizerSuite.scala rename to sql/core/src/test/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlySuite.scala index 55dc4ad5da5fb..d3978dd498d77 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/MetadataOnlyOptimizerSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlySuite.scala @@ -22,7 +22,7 @@ import org.apache.spark.sql.catalyst.plans.logical.LocalRelation import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSQLContext -class MetadataOnlyOptimizerSuite extends QueryTest with SharedSQLContext { +class OptimizeMetadataOnlySuite extends QueryTest with SharedSQLContext { import testImplicits._ override def beforeAll(): Unit = { From 1a65aa7c57e3be349a8823fedc51365e7332981e Mon Sep 17 00:00:00 2001 From: Lianhui Wang Date: Wed, 6 Jul 2016 20:14:36 +0800 Subject: [PATCH 31/45] address comments --- docs/sql-programming-guide.md | 11 ++ .../spark/sql/catalyst/CatalystConf.scala | 2 - ....scala => OptimizeMetadataOnlyQuery.scala} | 54 ++++--- .../spark/sql/execution/SparkOptimizer.scala | 2 +- .../apache/spark/sql/internal/SQLConf.scala | 5 +- .../org/apache/spark/sql/SQLQuerySuite.scala | 22 +-- .../OptimizeMetadataOnlyQuerySuite.scala | 153 ++++++++++++++++++ .../execution/OptimizeMetadataOnlySuite.scala | 95 ----------- .../sql/hive/execution/SQLQuerySuite.scala | 75 +++++---- 9 files changed, 255 insertions(+), 164 deletions(-) rename sql/core/src/main/scala/org/apache/spark/sql/execution/{OptimizeMetadataOnly.scala => OptimizeMetadataOnlyQuery.scala} (71%) create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuerySuite.scala delete mode 100644 
sql/core/src/test/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlySuite.scala diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index 68419e1331594..addf25787219e 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -1476,6 +1476,17 @@ Configuration of Parquet can be done using the `setConf` method on `SparkSession

+<tr>
+  <td><code>spark.sql.optimizer.metadataOnly</code></td>
+  <td>true</td>
+  <td>
+    When true, enable the metadata-only query optimization that use the table's metadata to
+    produce the partition columns instead of table scans. It applies when all the columns scanned
+    are partition columns and the query has an aggregate operator have distinct semantics.
+  </td>

+ + ## JSON Datasets diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystConf.scala index cabef8d297825..4df100c2a8304 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystConf.scala @@ -28,7 +28,6 @@ trait CatalystConf { def orderByOrdinal: Boolean def groupByOrdinal: Boolean - def optimizerMetadataOnly: Boolean def optimizerMaxIterations: Int def optimizerInSetConversionThreshold: Int def maxCaseBranchesForCodegen: Int @@ -52,7 +51,6 @@ case class SimpleCatalystConf( caseSensitiveAnalysis: Boolean, orderByOrdinal: Boolean = true, groupByOrdinal: Boolean = true, - optimizerMetadataOnly: Boolean = true, optimizerMaxIterations: Int = 100, optimizerInSetConversionThreshold: Int = 10, maxCaseBranchesForCodegen: Int = 20, diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/OptimizeMetadataOnly.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuery.scala similarity index 71% rename from sql/core/src/main/scala/org/apache/spark/sql/execution/OptimizeMetadataOnly.scala rename to sql/core/src/main/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuery.scala index d17b5f4ba2db8..1bef83aead1b2 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/OptimizeMetadataOnly.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuery.scala @@ -25,21 +25,22 @@ import org.apache.spark.sql.catalyst.expressions.aggregate._ import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, LogicalRelation} +import org.apache.spark.sql.internal.SQLConf /** - * When scanning only partition columns, get results based on partition data without scanning files. - * It's used for operators that only need distinct values. Currently only [[Aggregate]] operator - * which satisfy the following conditions are supported: + * This rule optimizes the execution of queries that can be answered by looking only at + * partition-level metadata. This applies when all the columns scanned are partition columns, and + * the query has an aggregate operator that satisfies the following conditions: * 1. aggregate expression is partition columns. * e.g. SELECT col FROM tbl GROUP BY col. * 2. aggregate function on partition columns with DISTINCT. - * e.g. SELECT count(DISTINCT col1) FROM tbl GROUP BY col2. + * e.g. SELECT col1, count(DISTINCT col2) FROM tbl GROUP BY col1. * 3. aggregate function on partition columns which have same result w or w/o DISTINCT keyword. - * e.g. SELECT Max(col2) FROM tbl GROUP BY col1. + * e.g. SELECT col1, Max(col2) FROM tbl GROUP BY col1. 
*/ -case class OptimizeMetadataOnly( +case class OptimizeMetadataOnlyQuery( catalog: SessionCatalog, - conf: CatalystConf) extends Rule[LogicalPlan] { + conf: SQLConf) extends Rule[LogicalPlan] { def apply(plan: LogicalPlan): LogicalPlan = { if (!conf.optimizerMetadataOnly) { @@ -52,7 +53,7 @@ case class OptimizeMetadataOnly( val aggFunctions = aggExprs.flatMap(_.collect { case agg: AggregateExpression => agg }) - val isPartitionDataOnly = aggFunctions.isEmpty || aggFunctions.forall { agg => + val isPartitionDataOnly = aggFunctions.forall { agg => agg.isDistinct || (agg.aggregateFunction match { case _: Max => true case _: Min => true @@ -60,7 +61,7 @@ case class OptimizeMetadataOnly( }) } if (isPartitionDataOnly) { - a.withNewChildren(Seq(usePartitionData(child, relation))) + a.withNewChildren(Seq(replaceTableScanWithPartitionMetadata(child, relation))) } else { a } @@ -70,14 +71,16 @@ case class OptimizeMetadataOnly( } } - private def usePartitionData(child: LogicalPlan, relation: LogicalPlan): LogicalPlan = { + private def replaceTableScanWithPartitionMetadata( + child: LogicalPlan, + relation: LogicalPlan): LogicalPlan = { child transform { case plan if plan eq relation => relation match { case l @ LogicalRelation(fsRelation: HadoopFsRelation, _, _) => val partColumns = fsRelation.partitionSchema.map(_.name.toLowerCase).toSet val partAttrs = l.output.filter(a => partColumns.contains(a.name.toLowerCase)) - val partitionData = fsRelation.location.listFiles(Nil) + val partitionData = fsRelation.location.listFiles(filters = Nil) LocalRelation(partAttrs, partitionData.map(_.values)) case relation: CatalogRelation => @@ -90,11 +93,19 @@ case class OptimizeMetadataOnly( } LocalRelation(partAttrs, partitionData) - case _ => throw new IllegalStateException() + case _ => + throw new IllegalStateException(s"The plan: $relation Cannot be replaced by " + + s"LocalRelation in the OptimizeMetadataOnlyQuery.") } } } + /** + * A pattern that finds scanned partition attributes and table relation all of whose columns + * scanned are partition columns. This applies when project or filter operators with + * deterministic expressions scan only partition columns. + * It returns scanned partition attributes and table relation plan, otherwise None. 
+ */ object PartitionedRelation { def unapply(plan: LogicalPlan): Option[(AttributeSet, LogicalPlan)] = plan match { case l @ LogicalRelation(fsRelation: HadoopFsRelation, _, _) @@ -109,24 +120,15 @@ case class OptimizeMetadataOnly( Some(AttributeSet(partAttrs), relation) case p @ Project(projectList, child) if projectList.forall(_.deterministic) => - unapply(child).flatMap { - case (partAttrs, relation) => - if (p.references.subsetOf(partAttrs)) { - Some(p.outputSet, relation) - } else { - None - } + unapply(child).flatMap { case (partAttrs, relation) => + if (p.references.subsetOf(partAttrs)) Some(p.outputSet, relation) else None } case f @ Filter(condition, child) if condition.deterministic => - unapply(child).flatMap { - case (partAttrs, relation) => - if (f.references.subsetOf(partAttrs)) { - Some(f.outputSet, relation) - } else { - None - } + unapply(child).flatMap { case (partAttrs, relation) => + if (f.references.subsetOf(partAttrs)) Some(f.outputSet, relation) else None } + case _ => None } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkOptimizer.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkOptimizer.scala index 26393c5f1839d..8b762b5d6c5f2 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkOptimizer.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkOptimizer.scala @@ -30,7 +30,7 @@ class SparkOptimizer( extends Optimizer(catalog, conf) { override def batches: Seq[Batch] = super.batches :+ - Batch("Optimize Metadata Only Query", Once, OptimizeMetadataOnly(catalog, conf)) :+ + Batch("Optimize Metadata Only Query", Once, OptimizeMetadataOnlyQuery(catalog, conf)) :+ Batch("Extract Python UDF from Aggregate", Once, ExtractPythonUDFFromAggregate) :+ Batch("User Provided Optimizers", fixedPoint, experimentalMethods.extraOptimizations: _*) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 9ed640d6e6a4e..7190b135a2f26 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -259,7 +259,10 @@ object SQLConf { .createWithDefault(false) val OPTIMIZER_METADATA_ONLY = SQLConfigBuilder("spark.sql.optimizer.metadataOnly") - .doc("When true, enable the metadata-only query optimization.") + .doc("When true, enable the metadata-only query optimization that use the table's metadata " + + "to produce the partition columns instead of table scans. 
It applies when all the columns " + + "scanned are partition columns and the query has an aggregate operator have distinct " + + "semantics.") .booleanConf .createWithDefault(true) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index bdd7241545f61..4471a197a9a87 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -2866,19 +2866,23 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { Row(s"$expected") :: Nil) } - test("spark-15752 metadata only optimizer for datasource table") { + test("spark-15752 optimize metadata only query for datasource table") { withSQLConf(SQLConf.OPTIMIZER_METADATA_ONLY.key -> "true") { withTable("srcpart_15752") { val data = (1 to 10).map(i => (i, s"data-$i", i % 2, if ((i % 2) == 0) "even" else "odd")) - .toDF("id", "data", "partId", "part") - data.write.partitionBy("partId", "part").mode("append").saveAsTable("srcpart_15752") - checkAnswer(sql("select max(part) from srcpart_15752"), Row("odd")) - checkAnswer(sql("select max(part) from srcpart_15752 where partId = 0"), Row("even")) - checkAnswer(sql("select max(part) from (select part from srcpart_15752) t"), Row("odd")) + .toDF("col1", "col2", "partcol1", "partcol2") + data.write.partitionBy("partcol1", "partcol2").mode("append").saveAsTable("srcpart_15752") + checkAnswer(sql("select max(partcol1) from srcpart_15752"), Row(1)) + checkAnswer(sql("select max(partcol1) from srcpart_15752 where partcol1 = 0"), Row(0)) checkAnswer( - sql("select distinct part from srcpart_15752"), - Row("even") :: Row("odd") :: Nil) - checkAnswer(sql("select distinct part from srcpart_15752 where partId = 0"), Row("even")) + sql("select max(partcol1) from (select partcol1 from srcpart_15752) t"), + Row(1)) + checkAnswer( + sql("select distinct partcol1 from srcpart_15752"), + Row(0) :: Row(1) :: Nil) + checkAnswer( + sql("select distinct partcol1 from srcpart_15752 where partcol1 = 0"), + Row(0)) } } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuerySuite.scala new file mode 100644 index 0000000000000..566c494420749 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuerySuite.scala @@ -0,0 +1,153 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.execution + +import org.apache.spark.sql._ +import org.apache.spark.sql.catalyst.plans.logical.LocalRelation +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.test.SharedSQLContext + +class OptimizeMetadataOnlyQuerySuite extends QueryTest with SharedSQLContext { + import testImplicits._ + + override def beforeAll(): Unit = { + super.beforeAll() + val data = (1 to 10).map(i => (i, s"data-$i", i % 2, if ((i % 2) == 0) "even" else "odd")) + .toDF("col1", "col2", "partcol1", "partcol2") + data.write.partitionBy("partcol1", "partcol2").mode("append").saveAsTable("srcpart") + } + + override protected def afterAll(): Unit = { + try { + sql("DROP TABLE IF EXISTS srcpart") + } finally { + super.afterAll() + } + } + + private def assertMetadataOnlyQuery(df: DataFrame): Unit = { + val localRelations = df.queryExecution.optimizedPlan.collect { + case l @ LocalRelation(_, _) => l + } + assert(localRelations.size == 1) + } + + private def assertNotMetadataOnlyQuery(df: DataFrame): Unit = { + val localRelations = df.queryExecution.optimizedPlan.collect { + case l @ LocalRelation(_, _) => l + } + assert(localRelations.size == 0) + } + + test("OptimizeMetadataOnlyQuery test: aggregate expression is partition columns") { + withSQLConf(SQLConf.OPTIMIZER_METADATA_ONLY.key -> "true") { + assertMetadataOnlyQuery(sql("select partcol1 from srcpart group by partcol1")) + assertMetadataOnlyQuery( + sql("select partcol2 from srcpart where partcol1 = 0 group by partcol2")) + } + withSQLConf(SQLConf.OPTIMIZER_METADATA_ONLY.key -> "false") { + assertNotMetadataOnlyQuery(sql("select partcol1 from srcpart group by partcol1")) + assertNotMetadataOnlyQuery( + sql("select partcol2 from srcpart where partcol1 = 0 group by partcol2")) + } + } + + test("OptimizeMetadataOnlyQuery test: distinct aggregate function on partition columns") { + withSQLConf(SQLConf.OPTIMIZER_METADATA_ONLY.key -> "true") { + assertMetadataOnlyQuery( + sql("SELECT partcol1, count(distinct partcol2) FROM srcpart group by partcol1")) + assertMetadataOnlyQuery( + sql("SELECT partcol1, count(distinct partcol2) FROM srcpart where partcol1 = 0 " + + "group by partcol1")) + } + withSQLConf(SQLConf.OPTIMIZER_METADATA_ONLY.key -> "false") { + assertNotMetadataOnlyQuery( + sql("SELECT partcol1, count(distinct partcol2) FROM srcpart group by partcol1")) + assertNotMetadataOnlyQuery( + sql("SELECT partcol1, count(distinct partcol2) FROM srcpart where partcol1 = 0 " + + "group by partcol1")) + } + } + + test("OptimizeMetadataOnlyQuery test: distinct on partition columns") { + withSQLConf(SQLConf.OPTIMIZER_METADATA_ONLY.key -> "true") { + assertMetadataOnlyQuery(sql("select distinct partcol1, partcol2 from srcpart")) + assertMetadataOnlyQuery( + sql("select distinct c1 from (select partcol1 + 1 as c1 from srcpart " + + "where partcol1 = 0) t")) + } + withSQLConf(SQLConf.OPTIMIZER_METADATA_ONLY.key -> "false") { + assertNotMetadataOnlyQuery(sql("select distinct partcol1, partcol2 from srcpart")) + assertNotMetadataOnlyQuery( + sql("select distinct c1 from (select partcol1 + 1 as c1 from srcpart " + + "where partcol1 = 0) t")) + } + } + + test("OptimizeMetadataOnlyQuery test: aggregate function on partition columns " + + "which have same result w or w/o DISTINCT keyword.") { + withSQLConf(SQLConf.OPTIMIZER_METADATA_ONLY.key -> "true") { + assertMetadataOnlyQuery(sql("select max(partcol1) from srcpart")) + assertMetadataOnlyQuery(sql("select max(partcol1) from srcpart where partcol1 = 0")) + 
assertMetadataOnlyQuery( + sql("select partcol2, min(partcol1) from srcpart where partcol1 = 0 group by partcol2")) + assertMetadataOnlyQuery( + sql("select max(c1) from (select partcol1 + 1 as c1 from srcpart where partcol1 = 0) t")) + } + withSQLConf(SQLConf.OPTIMIZER_METADATA_ONLY.key -> "false") { + assertNotMetadataOnlyQuery(sql("select max(partcol1) from srcpart")) + assertNotMetadataOnlyQuery(sql("select max(partcol1) from srcpart where partcol1 = 0")) + assertNotMetadataOnlyQuery( + sql("select partcol2, min(partcol1) from srcpart where partcol1 = 0 group by partcol2")) + assertNotMetadataOnlyQuery( + sql("select max(c1) from (select partcol1 + 1 as c1 from srcpart where partcol1 = 0) t")) + } + } + + test("OptimizeMetadataOnlyQuery test: unsupported for non-partition columns") { + withSQLConf(SQLConf.OPTIMIZER_METADATA_ONLY.key -> "true") { + assertNotMetadataOnlyQuery(sql("select col1 from srcpart group by col1")) + assertNotMetadataOnlyQuery( + sql("select partcol1, max(col1) from srcpart group by partcol1")) + assertNotMetadataOnlyQuery( + sql("select partcol1, count(distinct col1) from srcpart group by partcol1")) + assertNotMetadataOnlyQuery(sql("select distinct partcol1, col1 from srcpart")) + } + } + + test("OptimizeMetadataOnlyQuery test: unsupported for non-distinct aggregate function on " + + "partition columns") { + withSQLConf(SQLConf.OPTIMIZER_METADATA_ONLY.key -> "true") { + assertNotMetadataOnlyQuery( + sql("select partcol1, sum(partcol2) from srcpart group by partcol1")) + assertNotMetadataOnlyQuery( + sql("select partcol1, count(partcol2) from srcpart group by partcol1")) + } + } + + test("OptimizeMetadataOnlyQuery test: unsupported for GroupingSet/Union operator") { + withSQLConf(SQLConf.OPTIMIZER_METADATA_ONLY.key -> "true") { + assertNotMetadataOnlyQuery( + sql("select partcol1, max(partcol2) from srcpart where partcol1 = 0 " + + "group by rollup (partcol1)")) + assertNotMetadataOnlyQuery( + sql("select partcol2 from (select partcol2 from srcpart where partcol1 = 0 union all " + + "select partcol2 from srcpart where partcol1 = 1) t group by partcol2")) + } + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlySuite.scala deleted file mode 100644 index d3978dd498d77..0000000000000 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlySuite.scala +++ /dev/null @@ -1,95 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.sql.execution - -import org.apache.spark.sql._ -import org.apache.spark.sql.catalyst.plans.logical.LocalRelation -import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.test.SharedSQLContext - -class OptimizeMetadataOnlySuite extends QueryTest with SharedSQLContext { - import testImplicits._ - - override def beforeAll(): Unit = { - super.beforeAll() - val data = (1 to 10).map(i => (i, s"data-$i", i % 2, if ((i % 2) == 0) "even" else "odd")) - .toDF("id", "data", "partId", "part") - data.write.partitionBy("partId", "part").mode("append").saveAsTable("srcpart_15752") - } - - override protected def afterAll(): Unit = { - try { - sql("DROP TABLE IF EXISTS srcpart_15752") - } finally { - super.afterAll() - } - } - - private def checkWithMetadataOnly(df: DataFrame): Unit = { - val localRelations = df.queryExecution.optimizedPlan.collect { - case l @ LocalRelation(_, _) => l - } - assert(localRelations.size == 1) - } - - private def checkWithoutMetadataOnly(df: DataFrame): Unit = { - val localRelations = df.queryExecution.optimizedPlan.collect{ - case l @ LocalRelation(_, _) => l - } - assert(localRelations.size == 0) - } - - test("spark-15752 metadata only optimizer for partition table") { - withSQLConf(SQLConf.OPTIMIZER_METADATA_ONLY.key -> "true") { - checkWithMetadataOnly(sql("select part from srcpart_15752 where part = 0 group by part")) - checkWithMetadataOnly(sql("select max(part) from srcpart_15752")) - checkWithMetadataOnly(sql("select max(part) from srcpart_15752 where part = 0")) - checkWithMetadataOnly( - sql("select part, min(partId) from srcpart_15752 where part = 0 group by part")) - checkWithMetadataOnly( - sql("select max(x) from (select part + 1 as x from srcpart_15752 where part = 1) t")) - checkWithMetadataOnly(sql("select distinct part from srcpart_15752")) - checkWithMetadataOnly(sql("select distinct part, partId from srcpart_15752")) - checkWithMetadataOnly( - sql("select distinct x from (select part + 1 as x from srcpart_15752 where part = 0) t")) - - // Now donot support metadata only optimizer - checkWithoutMetadataOnly(sql("select part, max(id) from srcpart_15752 group by part")) - checkWithoutMetadataOnly(sql("select distinct part, id from srcpart_15752")) - checkWithoutMetadataOnly(sql("select part, sum(partId) from srcpart_15752 group by part")) - checkWithoutMetadataOnly( - sql("select part from srcpart_15752 where part = 1 group by rollup(part)")) - checkWithoutMetadataOnly( - sql("select part from (select part from srcpart_15752 where part = 0 union all " + - "select part from srcpart_15752 where part= 1)t group by part")) - } - } - - test("spark-15752 without metadata only optimizer for partition table") { - withSQLConf(SQLConf.OPTIMIZER_METADATA_ONLY.key -> "false") { - checkWithoutMetadataOnly(sql("select part from srcpart_15752 where part = 0 group by part")) - checkWithoutMetadataOnly(sql("select max(part) from srcpart_15752")) - checkWithoutMetadataOnly(sql("select max(part) from srcpart_15752 where part = 0")) - checkWithoutMetadataOnly( - sql("select max(x) from (select part + 1 as x from srcpart_15752 where part = 1) t")) - checkWithoutMetadataOnly(sql("select distinct part from srcpart_15752")) - checkWithoutMetadataOnly( - sql("select distinct x from (select part + 1 as x from srcpart_15752 where part = 1) t")) - } - } -} diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala index 
151a001562d87..3f62afad73829 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala @@ -1690,68 +1690,83 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { } } - test("spark-15752 metadata only optimizer for hive table") { + test("spark-15752 optimize metadata only query for hive table") { withSQLConf(SQLConf.OPTIMIZER_METADATA_ONLY.key -> "true") { withTable("data_15752", "srcpart_15752", "srctext_15752") { val df = Seq((1, "2"), (3, "4")).toDF("key", "value") df.createOrReplaceTempView("data_15752") sql( """ - |CREATE TABLE srcpart_15752 (key INT, value STRING) - |PARTITIONED BY (ds STRING, hr INT) STORED AS parquet + |CREATE TABLE srcpart_15752 (col1 INT, col2 STRING) + |PARTITIONED BY (partcol1 INT, partcol2 STRING) STORED AS parquet """.stripMargin) - for (ds <- Seq("2008-04-08", "2008-04-09"); hr <- Seq(11, 12)) { + for (partcol1 <- Seq(11, 12); partcol2 <- Seq("a", "b")) { sql( s""" - |INSERT OVERWRITE TABLE srcpart_15752 PARTITION (ds='$ds',hr='$hr') + |INSERT OVERWRITE TABLE srcpart_15752 + |PARTITION (partcol1='$partcol1', partcol2='$partcol2') |select key, value from data_15752 """.stripMargin) } - checkAnswer(sql("select hr from srcpart_15752 where hr = 11 group by hr"), Row(11)) - checkAnswer(sql("select max(hr) from srcpart_15752"), Row(12)) - checkAnswer(sql("select max(hr) from srcpart_15752 where hr = 11"), Row(11)) - checkAnswer(sql("select max(hr) from (select hr from srcpart_15752) t"), Row(12)) checkAnswer( - sql("select max(x) from (select hr + 1 as x from srcpart_15752 where hr = 12) t"), + sql("select partcol1 from srcpart_15752 where partcol1 = 11 group by partcol1"), + Row(11)) + checkAnswer(sql("select max(partcol1) from srcpart_15752"), Row(12)) + checkAnswer(sql("select max(partcol1) from srcpart_15752 where partcol1 = 11"), Row(11)) + checkAnswer( + sql("select max(partcol1) from (select partcol1 from srcpart_15752) t"), + Row(12)) + checkAnswer( + sql("select max(col) from (select partcol1 + 1 as col from srcpart_15752 " + + "where partcol1 = 12) t"), Row(13)) - checkAnswer(sql("select distinct hr from srcpart_15752"), Row(11) :: Row(12) :: Nil) - checkAnswer(sql("select distinct hr from srcpart_15752 where hr = 11"), Row(11)) + checkAnswer(sql("select distinct partcol1 from srcpart_15752"), Row(11) :: Row(12) :: Nil) + checkAnswer(sql("select distinct partcol1 from srcpart_15752 where partcol1 = 11"), Row(11)) checkAnswer( - sql("select distinct x from (select hr + 1 as x from srcpart_15752 where hr = 12) t"), + sql("select distinct col from (select partcol1 + 1 as col from srcpart_15752 " + + "where partcol1 = 12) t"), Row(13)) // Now donot support metadata only optimizer checkAnswer( - sql("select hr from srcpart_15752 where hr = 12 group by rollup(hr)"), - Row(null) :: Row(12) :: Nil) + sql("select partcol1, max(partcol2) from srcpart_15752 where partcol1 = 12 " + + "group by rollup(partcol1)"), + Row(null, "b") :: Row(12, "b") :: Nil) checkAnswer( - sql("select hr from (select hr from srcpart_15752 where hr = 11 union all " + - "select hr from srcpart_15752 where hr= 12)t group by hr"), - Row(11) :: Row(12) :: Nil) + sql("select partcol2 from (select partcol2 from srcpart_15752 where partcol1 = 11 " + + "union all select partcol2 from srcpart_15752 where partcol1= 12)t group by partcol2"), + Row("a") :: Row("b") :: Nil) sql( """ - |CREATE TABLE srctext_15752 (key INT, value STRING) - |PARTITIONED BY 
(ds STRING, hr INT) STORED AS textfile + |CREATE TABLE srctext_15752 (col1 INT, col2 STRING) + |PARTITIONED BY (partcol1 INT, partcol2 STRING) STORED AS textfile """.stripMargin) - for (ds <- Seq("2008-04-08", "2008-04-09"); hr <- Seq(11, 12)) { + for (partcol1 <- Seq(11, 12); partcol2 <- Seq("a", "b")) { sql( s""" - |INSERT OVERWRITE TABLE srctext_15752 PARTITION (ds='$ds',hr='$hr') + |INSERT OVERWRITE TABLE srctext_15752 + |PARTITION (partcol1='$partcol1', partcol2='$partcol2') |select key, value from data_15752 """.stripMargin) } - checkAnswer(sql("select hr from srctext_15752 where hr = 11 group by hr"), Row(11)) - checkAnswer(sql("select max(hr) from srctext_15752"), Row(12)) - checkAnswer(sql("select max(hr) from srctext_15752 where hr = 11"), Row(11)) - checkAnswer(sql("select max(hr) from (select hr from srctext_15752) t"), Row(12)) checkAnswer( - sql("select max(x) from (select hr + 1 as x from srctext_15752 where hr = 12) t"), + sql("select partcol1 from srctext_15752 where partcol1 = 11 group by partcol1"), + Row(11)) + checkAnswer(sql("select max(partcol1) from srctext_15752"), Row(12)) + checkAnswer(sql("select max(partcol1) from srctext_15752 where partcol1 = 11"), Row(11)) + checkAnswer( + sql("select max(partcol1) from (select partcol1 from srctext_15752) t"), + Row(12)) + checkAnswer( + sql("select max(col) from (select partcol1 + 1 as col from srctext_15752 " + + "where partcol1 = 12) t"), Row(13)) - checkAnswer(sql("select distinct hr from srctext_15752"), Row(11) :: Row(12) :: Nil) - checkAnswer(sql("select distinct hr from srctext_15752 where hr = 11"), Row(11)) + checkAnswer(sql("select distinct partcol1 from srctext_15752"), Row(11) :: Row(12) :: Nil) + checkAnswer(sql("select distinct partcol1 from srctext_15752 where partcol1 = 11"), Row(11)) checkAnswer( - sql("select distinct x from (select hr + 1 as x from srctext_15752 where hr = 12) t"), + sql("select distinct col from (select partcol1 + 1 as col from srctext_15752 " + + "where partcol1 = 12) t"), Row(13)) } } From d5e0df4acf3ce6a285b030c4cba42426e146024b Mon Sep 17 00:00:00 2001 From: Lianhui Wang Date: Wed, 6 Jul 2016 20:21:13 +0800 Subject: [PATCH 32/45] update --- docs/sql-programming-guide.md | 2 +- .../src/main/scala/org/apache/spark/sql/internal/SQLConf.scala | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index addf25787219e..b4b6e100f051f 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -1483,7 +1483,7 @@ Configuration of Parquet can be done using the `setConf` method on `SparkSession

When true, enable the metadata-only query optimization that use the table's metadata to produce the partition columns instead of table scans. It applies when all the columns scanned - are partition columns and the query has an aggregate operator have distinct semantics. + are partition columns and the query has an aggregate operator that has distinct semantics.

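The effect this property describes can be checked directly from the optimized plan: when the rewrite fires, the relation under the aggregate is replaced by a LocalRelation carrying only the partition values, which is what the new OptimizeMetadataOnlyQuerySuite asserts. The following spark-shell style sketch is illustrative only and not part of the patch; it assumes a partitioned table shaped like the srcpart table built in that suite and reuses its table and column names.

    import org.apache.spark.sql.SparkSession
    import org.apache.spark.sql.catalyst.plans.logical.LocalRelation

    val spark = SparkSession.builder().appName("metadata-only-sketch").getOrCreate()
    import spark.implicits._

    // A small partitioned table shaped like the one in OptimizeMetadataOnlyQuerySuite.
    val data = (1 to 10).map(i => (i, s"data-$i", i % 2, if (i % 2 == 0) "even" else "odd"))
      .toDF("col1", "col2", "partcol1", "partcol2")
    data.write.partitionBy("partcol1", "partcol2").mode("append").saveAsTable("srcpart")

    // With the optimization enabled, a query that touches only partition columns is
    // answered from partition metadata: the optimized plan contains a LocalRelation
    // instead of a relation that scans data files.
    spark.conf.set("spark.sql.optimizer.metadataOnly", "true")
    val df = spark.sql("select max(partcol1) from srcpart")
    val usedMetadataOnly = df.queryExecution.optimizedPlan.collect {
      case l: LocalRelation => l
    }.nonEmpty
    println(s"answered from partition metadata: $usedMetadataOnly")
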
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 7190b135a2f26..91fa1c60dac14 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -261,7 +261,7 @@ object SQLConf { val OPTIMIZER_METADATA_ONLY = SQLConfigBuilder("spark.sql.optimizer.metadataOnly") .doc("When true, enable the metadata-only query optimization that use the table's metadata " + "to produce the partition columns instead of table scans. It applies when all the columns " + - "scanned are partition columns and the query has an aggregate operator have distinct " + + "scanned are partition columns and the query has an aggregate operator that has distinct " + "semantics.") .booleanConf .createWithDefault(true) From 9d6dd76944eb42411850f8b721a05e263a7db30e Mon Sep 17 00:00:00 2001 From: Lianhui Wang Date: Wed, 6 Jul 2016 20:24:09 +0800 Subject: [PATCH 33/45] update2 --- docs/sql-programming-guide.md | 3 ++- .../main/scala/org/apache/spark/sql/internal/SQLConf.scala | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index b4b6e100f051f..7096fd31a3fea 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -1483,7 +1483,8 @@ Configuration of Parquet can be done using the `setConf` method on `SparkSession

When true, enable the metadata-only query optimization that use the table's metadata to produce the partition columns instead of table scans. It applies when all the columns scanned - are partition columns and the query has an aggregate operator that has distinct semantics. + are partition columns and the query has an aggregate operator that satisfies distinct + semantics.

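The wording the doc settles on, "satisfies distinct semantics", means that every aggregate function in the query is either explicitly DISTINCT or one whose result does not change when duplicate values are dropped; in this series that is Max and Min, with First and Last added in a later commit. The partition listing yields each partition value only once, so duplicate-sensitive aggregates such as sum or count over a partition column cannot be answered from metadata alone. A hedged illustration, reusing the SparkSession and srcpart table from the sketch above, with queries taken from the new test suite:

    // Eligible: the result is identical with or without duplicates, so the rule
    // may replace the scan with a LocalRelation of partition values.
    spark.sql("select max(partcol1) from srcpart")
    spark.sql("select partcol1, count(distinct partcol2) from srcpart group by partcol1")

    // Not eligible: the partition metadata does not know how many rows carry each
    // partition value, so these keep the normal table scan.
    spark.sql("select partcol1, count(partcol2) from srcpart group by partcol1")
    spark.sql("select partcol1, sum(partcol2) from srcpart group by partcol1")
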
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 91fa1c60dac14..9aa694f8e8fde 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -261,8 +261,8 @@ object SQLConf { val OPTIMIZER_METADATA_ONLY = SQLConfigBuilder("spark.sql.optimizer.metadataOnly") .doc("When true, enable the metadata-only query optimization that use the table's metadata " + "to produce the partition columns instead of table scans. It applies when all the columns " + - "scanned are partition columns and the query has an aggregate operator that has distinct " + - "semantics.") + "scanned are partition columns and the query has an aggregate operator that satisfies " + + "distinct semantics.") .booleanConf .createWithDefault(true) From 9cb01d810bc993cae3c98962317026c1e5295ed2 Mon Sep 17 00:00:00 2001 From: Lianhui Wang Date: Wed, 6 Jul 2016 21:06:36 +0800 Subject: [PATCH 34/45] update --- .../spark/sql/execution/OptimizeMetadataOnlyQuery.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuery.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuery.scala index 1bef83aead1b2..9ee0a96d7a3c7 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuery.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuery.scala @@ -53,14 +53,14 @@ case class OptimizeMetadataOnlyQuery( val aggFunctions = aggExprs.flatMap(_.collect { case agg: AggregateExpression => agg }) - val isPartitionDataOnly = aggFunctions.forall { agg => + val hasAllDistinctAgg = aggFunctions.forall { agg => agg.isDistinct || (agg.aggregateFunction match { case _: Max => true case _: Min => true case _ => false }) } - if (isPartitionDataOnly) { + if (hasAllDistinctAgg) { a.withNewChildren(Seq(replaceTableScanWithPartitionMetadata(child, relation))) } else { a From 3e2687ded84965cecc5c0a251d0bc4f086812272 Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Thu, 7 Jul 2016 01:56:22 +0800 Subject: [PATCH 35/45] doc improve --- .../execution/OptimizeMetadataOnlyQuery.scala | 30 +++++++++++-------- 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuery.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuery.scala index 9ee0a96d7a3c7..bf16c3643da36 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuery.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuery.scala @@ -17,8 +17,7 @@ package org.apache.spark.sql.execution -import org.apache.spark.sql.AnalysisException -import org.apache.spark.sql.catalyst.{CatalystConf, InternalRow} +import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.catalog.{CatalogRelation, SessionCatalog} import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.aggregate._ @@ -49,18 +48,21 @@ case class OptimizeMetadataOnlyQuery( plan.transform { case a @ Aggregate(_, aggExprs, child @ PartitionedRelation(partAttrs, relation)) => + // We only apply this optimization when only partitioned attributes are scanned. 
if (a.references.subsetOf(partAttrs)) { val aggFunctions = aggExprs.flatMap(_.collect { case agg: AggregateExpression => agg }) - val hasAllDistinctAgg = aggFunctions.forall { agg => + val isAllDistinctAgg = aggFunctions.forall { agg => agg.isDistinct || (agg.aggregateFunction match { + // `Max` and `Min` are always distinct aggregate functions no matter they have + // DISTINCT keyword or not, as the result will be same. case _: Max => true case _: Min => true case _ => false }) } - if (hasAllDistinctAgg) { + if (isAllDistinctAgg) { a.withNewChildren(Seq(replaceTableScanWithPartitionMetadata(child, relation))) } else { a @@ -71,6 +73,10 @@ case class OptimizeMetadataOnlyQuery( } } + /** + * Transform the given plan, find its table scan nodes that matches the given relation, and then + * replace the table scan node with its corresponding partition values. + */ private def replaceTableScanWithPartitionMetadata( child: LogicalPlan, relation: LogicalPlan): LogicalPlan = { @@ -93,18 +99,18 @@ case class OptimizeMetadataOnlyQuery( } LocalRelation(partAttrs, partitionData) - case _ => - throw new IllegalStateException(s"The plan: $relation Cannot be replaced by " + - s"LocalRelation in the OptimizeMetadataOnlyQuery.") + case _ => throw new IllegalStateException(s"unrecognized table scan node: $relation") } } } /** - * A pattern that finds scanned partition attributes and table relation all of whose columns - * scanned are partition columns. This applies when project or filter operators with - * deterministic expressions scan only partition columns. - * It returns scanned partition attributes and table relation plan, otherwise None. + * A pattern that finds the partitioned table relation node inside the given plan, and returns a + * pair of the partition attributes and the table relation node. + * + * It keeps traversing down the given plan tree if there is a [[Project]] or [[Filter]] with + * deterministic expressions, and returns result after reaching the partitioned table relation + * node. 
*/ object PartitionedRelation { def unapply(plan: LogicalPlan): Option[(AttributeSet, LogicalPlan)] = plan match { @@ -126,7 +132,7 @@ case class OptimizeMetadataOnlyQuery( case f @ Filter(condition, child) if condition.deterministic => unapply(child).flatMap { case (partAttrs, relation) => - if (f.references.subsetOf(partAttrs)) Some(f.outputSet, relation) else None + if (f.references.subsetOf(partAttrs)) Some(partAttrs, relation) else None } case _ => None From 2b4faf321302adcb60f366dede0b75f8bbe368b9 Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Thu, 7 Jul 2016 08:27:20 +0800 Subject: [PATCH 36/45] update --- .../spark/sql/execution/OptimizeMetadataOnlyQuery.scala | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuery.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuery.scala index bf16c3643da36..b5c32d022c317 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuery.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuery.scala @@ -99,7 +99,9 @@ case class OptimizeMetadataOnlyQuery( } LocalRelation(partAttrs, partitionData) - case _ => throw new IllegalStateException(s"unrecognized table scan node: $relation") + case _ => + throw new IllegalStateException(s"unrecognized table scan node: $relation, " + + s"please turn off ${SQLConf.OPTIMIZER_METADATA_ONLY.key} and try again.") } } } From a894bb7063ab8d37a99b6466e8fcda8e734867fe Mon Sep 17 00:00:00 2001 From: Lianhui Wang Date: Thu, 7 Jul 2016 10:35:12 +0800 Subject: [PATCH 37/45] delete cases --- .../spark/sql/hive/execution/SQLQuerySuite.scala | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala index 3f62afad73829..7eac19e9f45a3 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala @@ -1727,16 +1727,6 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { "where partcol1 = 12) t"), Row(13)) - // Now donot support metadata only optimizer - checkAnswer( - sql("select partcol1, max(partcol2) from srcpart_15752 where partcol1 = 12 " + - "group by rollup(partcol1)"), - Row(null, "b") :: Row(12, "b") :: Nil) - checkAnswer( - sql("select partcol2 from (select partcol2 from srcpart_15752 where partcol1 = 11 " + - "union all select partcol2 from srcpart_15752 where partcol1= 12)t group by partcol2"), - Row("a") :: Row("b") :: Nil) - sql( """ |CREATE TABLE srctext_15752 (col1 INT, col2 STRING) From 85b695bd2f259b9b42f9405f3fd109fe75a7a93a Mon Sep 17 00:00:00 2001 From: Lianhui Wang Date: Thu, 7 Jul 2016 16:09:42 +0800 Subject: [PATCH 38/45] update ut --- .../org/apache/spark/sql/SQLQuerySuite.scala | 33 +++++--- .../sql/hive/execution/SQLQuerySuite.scala | 75 ++++++++++++------- 2 files changed, 70 insertions(+), 38 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index 4471a197a9a87..4e6d4b7805daa 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -2866,23 +2866,38 @@ class SQLQuerySuite extends QueryTest with 
SharedSQLContext { Row(s"$expected") :: Nil) } - test("spark-15752 optimize metadata only query for datasource table") { + test("SPARK-15752 optimize metadata only query for datasource table") { withSQLConf(SQLConf.OPTIMIZER_METADATA_ONLY.key -> "true") { withTable("srcpart_15752") { - val data = (1 to 10).map(i => (i, s"data-$i", i % 2, if ((i % 2) == 0) "even" else "odd")) + val data = (1 to 10).map(i => (i, s"data-$i", i % 2, if ((i % 2) == 0) "a" else "b")) .toDF("col1", "col2", "partcol1", "partcol2") data.write.partitionBy("partcol1", "partcol2").mode("append").saveAsTable("srcpart_15752") - checkAnswer(sql("select max(partcol1) from srcpart_15752"), Row(1)) - checkAnswer(sql("select max(partcol1) from srcpart_15752 where partcol1 = 0"), Row(0)) checkAnswer( - sql("select max(partcol1) from (select partcol1 from srcpart_15752) t"), + sql("select partcol1 from srcpart_15752 group by partcol1"), + Row(0) :: Row(1) :: Nil) + checkAnswer( + sql("select partcol1 from srcpart_15752 where partcol1 = 1 group by partcol1"), Row(1)) checkAnswer( - sql("select distinct partcol1 from srcpart_15752"), - Row(0) :: Row(1) :: Nil) + sql("select partcol1, count(distinct partcol2) from srcpart_15752 group by partcol1"), + Row(0, 1) :: Row(1, 1) :: Nil) + checkAnswer( + sql("select partcol1, count(distinct partcol2) from srcpart_15752 where partcol1 = 1 " + + "group by partcol1"), + Row(1, 1) :: Nil) + checkAnswer(sql("select distinct partcol1 from srcpart_15752"), Row(0) :: Row(1) :: Nil) + checkAnswer(sql("select distinct partcol1 from srcpart_15752 where partcol1 = 1"), Row(1)) + checkAnswer( + sql("select distinct col from (select partcol1 + 1 as col from srcpart_15752 " + + "where partcol1 = 1) t"), + Row(2)) + checkAnswer(sql("select max(partcol1) from srcpart_15752"), Row(1)) + checkAnswer(sql("select max(partcol1) from srcpart_15752 where partcol1 = 1"), Row(1)) + checkAnswer(sql("select max(partcol1) from (select partcol1 from srcpart_15752) t"), Row(1)) checkAnswer( - sql("select distinct partcol1 from srcpart_15752 where partcol1 = 0"), - Row(0)) + sql("select max(col) from (select partcol1 + 1 as col from srcpart_15752 " + + "where partcol1 = 1) t"), + Row(2)) } } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala index 7eac19e9f45a3..a43f0d0d7e97c 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala @@ -1690,7 +1690,7 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { } } - test("spark-15752 optimize metadata only query for hive table") { + test("SPARK-15752 optimize metadata only query for hive table") { withSQLConf(SQLConf.OPTIMIZER_METADATA_ONLY.key -> "true") { withTable("data_15752", "srcpart_15752", "srctext_15752") { val df = Seq((1, "2"), (3, "4")).toDF("key", "value") @@ -1700,7 +1700,7 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { |CREATE TABLE srcpart_15752 (col1 INT, col2 STRING) |PARTITIONED BY (partcol1 INT, partcol2 STRING) STORED AS parquet """.stripMargin) - for (partcol1 <- Seq(11, 12); partcol2 <- Seq("a", "b")) { + for (partcol1 <- Seq(0, 1); partcol2 <- Seq("a", "b")) { sql( s""" |INSERT OVERWRITE TABLE srcpart_15752 @@ -1709,30 +1709,39 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { """.stripMargin) } checkAnswer( 
- sql("select partcol1 from srcpart_15752 where partcol1 = 11 group by partcol1"), - Row(11)) - checkAnswer(sql("select max(partcol1) from srcpart_15752"), Row(12)) - checkAnswer(sql("select max(partcol1) from srcpart_15752 where partcol1 = 11"), Row(11)) + sql("select partcol1 from srcpart_15752 group by partcol1"), + Row(0) :: Row(1) :: Nil) checkAnswer( - sql("select max(partcol1) from (select partcol1 from srcpart_15752) t"), - Row(12)) + sql("select partcol1 from srcpart_15752 where partcol1 = 1 group by partcol1"), + Row(1)) checkAnswer( - sql("select max(col) from (select partcol1 + 1 as col from srcpart_15752 " + - "where partcol1 = 12) t"), - Row(13)) - checkAnswer(sql("select distinct partcol1 from srcpart_15752"), Row(11) :: Row(12) :: Nil) - checkAnswer(sql("select distinct partcol1 from srcpart_15752 where partcol1 = 11"), Row(11)) + sql("select partcol1, count(distinct partcol2) from srcpart_15752 group by partcol1"), + Row(0, 2) :: Row(1, 2) :: Nil) + checkAnswer( + sql("select partcol1, count(distinct partcol2) from srcpart_15752 where partcol1 = 1 " + + "group by partcol1"), + Row(1, 2) :: Nil) + checkAnswer(sql("select distinct partcol1 from srcpart_15752"), Row(0) :: Row(1) :: Nil) + checkAnswer(sql("select distinct partcol1 from srcpart_15752 where partcol1 = 1"), Row(1)) checkAnswer( sql("select distinct col from (select partcol1 + 1 as col from srcpart_15752 " + - "where partcol1 = 12) t"), - Row(13)) + "where partcol1 = 1) t"), + Row(2)) + checkAnswer(sql("select distinct partcol1 from srcpart_15752 where partcol1 = 1"), Row(1)) + checkAnswer(sql("select max(partcol1) from srcpart_15752"), Row(1)) + checkAnswer(sql("select max(partcol1) from srcpart_15752 where partcol1 = 1"), Row(1)) + checkAnswer(sql("select max(partcol1) from (select partcol1 from srcpart_15752) t"), Row(1)) + checkAnswer( + sql("select max(col) from (select partcol1 + 1 as col from srcpart_15752 " + + "where partcol1 = 1) t"), + Row(2)) sql( """ |CREATE TABLE srctext_15752 (col1 INT, col2 STRING) |PARTITIONED BY (partcol1 INT, partcol2 STRING) STORED AS textfile """.stripMargin) - for (partcol1 <- Seq(11, 12); partcol2 <- Seq("a", "b")) { + for (partcol1 <- Seq(0, 1); partcol2 <- Seq("a", "b")) { sql( s""" |INSERT OVERWRITE TABLE srctext_15752 @@ -1741,23 +1750,31 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { """.stripMargin) } checkAnswer( - sql("select partcol1 from srctext_15752 where partcol1 = 11 group by partcol1"), - Row(11)) - checkAnswer(sql("select max(partcol1) from srctext_15752"), Row(12)) - checkAnswer(sql("select max(partcol1) from srctext_15752 where partcol1 = 11"), Row(11)) + sql("select partcol1 from srctext_15752 group by partcol1"), + Row(0) :: Row(1) :: Nil) checkAnswer( - sql("select max(partcol1) from (select partcol1 from srctext_15752) t"), - Row(12)) + sql("select partcol1 from srctext_15752 where partcol1 = 1 group by partcol1"), + Row(1)) checkAnswer( - sql("select max(col) from (select partcol1 + 1 as col from srctext_15752 " + - "where partcol1 = 12) t"), - Row(13)) - checkAnswer(sql("select distinct partcol1 from srctext_15752"), Row(11) :: Row(12) :: Nil) - checkAnswer(sql("select distinct partcol1 from srctext_15752 where partcol1 = 11"), Row(11)) + sql("select partcol1, count(distinct partcol2) from srctext_15752 group by partcol1"), + Row(0, 2) :: Row(1, 2) :: Nil) + checkAnswer( + sql("select partcol1, count(distinct partcol2) from srctext_15752 where partcol1 = 1 " + + "group by partcol1"), + Row(1, 2) :: Nil) + 
checkAnswer(sql("select distinct partcol1 from srctext_15752"), Row(0) :: Row(1) :: Nil) + checkAnswer(sql("select distinct partcol1 from srctext_15752 where partcol1 = 1"), Row(1)) checkAnswer( sql("select distinct col from (select partcol1 + 1 as col from srctext_15752 " + - "where partcol1 = 12) t"), - Row(13)) + "where partcol1 = 1) t"), + Row(2)) + checkAnswer(sql("select max(partcol1) from srctext_15752"), Row(1)) + checkAnswer(sql("select max(partcol1) from srctext_15752 where partcol1 = 1"), Row(1)) + checkAnswer(sql("select max(partcol1) from (select partcol1 from srctext_15752) t"), Row(1)) + checkAnswer( + sql("select max(col) from (select partcol1 + 1 as col from srctext_15752 " + + "where partcol1 = 1) t"), + Row(2)) } } } From 67211beb80c4d84fb70c6037cc53044f86f094d5 Mon Sep 17 00:00:00 2001 From: Lianhui Wang Date: Thu, 7 Jul 2016 16:17:23 +0800 Subject: [PATCH 39/45] Merge branch 'master' of https://github.com/apache/spark into metadata-only --- .../apache/spark/sql/internal/SQLConf.scala | 23 ------------------- 1 file changed, 23 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 9aa694f8e8fde..14a1680fafa3a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -266,25 +266,6 @@ object SQLConf { .booleanConf .createWithDefault(true) - val NATIVE_VIEW = SQLConfigBuilder("spark.sql.nativeView") - .internal() - .doc("When true, CREATE VIEW will be handled by Spark SQL instead of Hive native commands. " + - "Note that this function is experimental and should ony be used when you are using " + - "non-hive-compatible tables written by Spark SQL. The SQL string used to create " + - "view should be fully qualified, i.e. use `tbl1`.`col1` instead of `*` whenever " + - "possible, or you may get wrong result.") - .booleanConf - .createWithDefault(true) - - val CANONICAL_NATIVE_VIEW = SQLConfigBuilder("spark.sql.nativeView.canonical") - .internal() - .doc("When this option and spark.sql.nativeView are both true, Spark SQL tries to handle " + - "CREATE VIEW statement using SQL query string generated from view definition logical " + - "plan. 
If the logical plan doesn't have a SQL representation, we fallback to the " + - "original native view implementation.") - .booleanConf - .createWithDefault(true) - val COLUMN_NAME_OF_CORRUPT_RECORD = SQLConfigBuilder("spark.sql.columnNameOfCorruptRecord") .doc("The name of internal column for storing raw/un-parsed JSON records that fail to parse.") .stringConf @@ -623,8 +604,6 @@ private[sql] class SQLConf extends Serializable with CatalystConf with Logging { def optimizerMetadataOnly: Boolean = getConf(OPTIMIZER_METADATA_ONLY) - def nativeView: Boolean = getConf(NATIVE_VIEW) - def wholeStageEnabled: Boolean = getConf(WHOLESTAGE_CODEGEN_ENABLED) def wholeStageMaxNumFields: Int = getConf(WHOLESTAGE_MAX_NUM_FIELDS) @@ -635,8 +614,6 @@ private[sql] class SQLConf extends Serializable with CatalystConf with Logging { def exchangeReuseEnabled: Boolean = getConf(EXCHANGE_REUSE_ENABLED) - def canonicalView: Boolean = getConf(CANONICAL_NATIVE_VIEW) - def caseSensitiveAnalysis: Boolean = getConf(SQLConf.CASE_SENSITIVE) def subexpressionEliminationEnabled: Boolean = From 501f93bde2c68b1c72ef9718a74edd2525551bc0 Mon Sep 17 00:00:00 2001 From: Lianhui Wang Date: Mon, 11 Jul 2016 20:54:54 +0800 Subject: [PATCH 40/45] address commetns --- .../execution/OptimizeMetadataOnlyQuery.scala | 6 +- .../OptimizeMetadataOnlyQuerySuite.scala | 123 ++++++------------ 2 files changed, 47 insertions(+), 82 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuery.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuery.scala index b5c32d022c317..912d274ce02f2 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuery.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuery.scala @@ -55,10 +55,12 @@ case class OptimizeMetadataOnlyQuery( }) val isAllDistinctAgg = aggFunctions.forall { agg => agg.isDistinct || (agg.aggregateFunction match { - // `Max` and `Min` are always distinct aggregate functions no matter they have - // DISTINCT keyword or not, as the result will be same. + // `Max`, `Min`, `First` and `Last` are always distinct aggregate functions no matter + // they have DISTINCT keyword or not, as the result will be same. 
case _: Max => true case _: Min => true + case _: First => true + case _: Last => true case _ => false }) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuerySuite.scala index 566c494420749..7b690366f7920 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuerySuite.scala @@ -41,113 +41,76 @@ class OptimizeMetadataOnlyQuerySuite extends QueryTest with SharedSQLContext { } private def assertMetadataOnlyQuery(df: DataFrame): Unit = { - val localRelations = df.queryExecution.optimizedPlan.collect { - case l @ LocalRelation(_, _) => l + withSQLConf(SQLConf.OPTIMIZER_METADATA_ONLY.key -> "true") { + val localRelations = df.queryExecution.optimizedPlan.collect { + case l@LocalRelation(_, _) => l + } + assert(localRelations.size == 1) } - assert(localRelations.size == 1) } private def assertNotMetadataOnlyQuery(df: DataFrame): Unit = { - val localRelations = df.queryExecution.optimizedPlan.collect { - case l @ LocalRelation(_, _) => l + withSQLConf(SQLConf.OPTIMIZER_METADATA_ONLY.key -> "true") { + val localRelations = df.queryExecution.optimizedPlan.collect { + case l@LocalRelation(_, _) => l + } + assert(localRelations.size == 0) } - assert(localRelations.size == 0) } test("OptimizeMetadataOnlyQuery test: aggregate expression is partition columns") { - withSQLConf(SQLConf.OPTIMIZER_METADATA_ONLY.key -> "true") { - assertMetadataOnlyQuery(sql("select partcol1 from srcpart group by partcol1")) - assertMetadataOnlyQuery( - sql("select partcol2 from srcpart where partcol1 = 0 group by partcol2")) - } - withSQLConf(SQLConf.OPTIMIZER_METADATA_ONLY.key -> "false") { - assertNotMetadataOnlyQuery(sql("select partcol1 from srcpart group by partcol1")) - assertNotMetadataOnlyQuery( - sql("select partcol2 from srcpart where partcol1 = 0 group by partcol2")) - } + assertMetadataOnlyQuery(sql("select partcol1 from srcpart group by partcol1")) + assertMetadataOnlyQuery( + sql("select partcol2 from srcpart where partcol1 = 0 group by partcol2")) } test("OptimizeMetadataOnlyQuery test: distinct aggregate function on partition columns") { - withSQLConf(SQLConf.OPTIMIZER_METADATA_ONLY.key -> "true") { - assertMetadataOnlyQuery( - sql("SELECT partcol1, count(distinct partcol2) FROM srcpart group by partcol1")) - assertMetadataOnlyQuery( - sql("SELECT partcol1, count(distinct partcol2) FROM srcpart where partcol1 = 0 " + - "group by partcol1")) - } - withSQLConf(SQLConf.OPTIMIZER_METADATA_ONLY.key -> "false") { - assertNotMetadataOnlyQuery( - sql("SELECT partcol1, count(distinct partcol2) FROM srcpart group by partcol1")) - assertNotMetadataOnlyQuery( - sql("SELECT partcol1, count(distinct partcol2) FROM srcpart where partcol1 = 0 " + - "group by partcol1")) - } + assertMetadataOnlyQuery( + sql("SELECT partcol1, count(distinct partcol2) FROM srcpart group by partcol1")) + assertMetadataOnlyQuery( + sql("SELECT partcol1, count(distinct partcol2) FROM srcpart where partcol1 = 0 " + + "group by partcol1")) } test("OptimizeMetadataOnlyQuery test: distinct on partition columns") { - withSQLConf(SQLConf.OPTIMIZER_METADATA_ONLY.key -> "true") { - assertMetadataOnlyQuery(sql("select distinct partcol1, partcol2 from srcpart")) - assertMetadataOnlyQuery( - sql("select distinct c1 from (select partcol1 + 1 as c1 from srcpart " + - "where partcol1 = 0) t")) - 
} - withSQLConf(SQLConf.OPTIMIZER_METADATA_ONLY.key -> "false") { - assertNotMetadataOnlyQuery(sql("select distinct partcol1, partcol2 from srcpart")) - assertNotMetadataOnlyQuery( - sql("select distinct c1 from (select partcol1 + 1 as c1 from srcpart " + - "where partcol1 = 0) t")) - } + assertMetadataOnlyQuery(sql("select distinct partcol1, partcol2 from srcpart")) + assertMetadataOnlyQuery( + sql("select distinct c1 from (select partcol1 + 1 as c1 from srcpart where partcol1 = 0) t")) } test("OptimizeMetadataOnlyQuery test: aggregate function on partition columns " + "which have same result w or w/o DISTINCT keyword.") { - withSQLConf(SQLConf.OPTIMIZER_METADATA_ONLY.key -> "true") { - assertMetadataOnlyQuery(sql("select max(partcol1) from srcpart")) - assertMetadataOnlyQuery(sql("select max(partcol1) from srcpart where partcol1 = 0")) - assertMetadataOnlyQuery( - sql("select partcol2, min(partcol1) from srcpart where partcol1 = 0 group by partcol2")) - assertMetadataOnlyQuery( - sql("select max(c1) from (select partcol1 + 1 as c1 from srcpart where partcol1 = 0) t")) - } - withSQLConf(SQLConf.OPTIMIZER_METADATA_ONLY.key -> "false") { - assertNotMetadataOnlyQuery(sql("select max(partcol1) from srcpart")) - assertNotMetadataOnlyQuery(sql("select max(partcol1) from srcpart where partcol1 = 0")) - assertNotMetadataOnlyQuery( - sql("select partcol2, min(partcol1) from srcpart where partcol1 = 0 group by partcol2")) - assertNotMetadataOnlyQuery( - sql("select max(c1) from (select partcol1 + 1 as c1 from srcpart where partcol1 = 0) t")) - } + assertMetadataOnlyQuery(sql("select max(partcol1) from srcpart")) + assertMetadataOnlyQuery(sql("select min(partcol1) from srcpart where partcol1 = 0")) + assertMetadataOnlyQuery(sql("select first(partcol1) from srcpart")) + assertMetadataOnlyQuery(sql("select last(partcol1) from srcpart where partcol1 = 0")) + assertMetadataOnlyQuery( + sql("select partcol2, min(partcol1) from srcpart where partcol1 = 0 group by partcol2")) + assertMetadataOnlyQuery( + sql("select max(c1) from (select partcol1 + 1 as c1 from srcpart where partcol1 = 0) t")) } test("OptimizeMetadataOnlyQuery test: unsupported for non-partition columns") { - withSQLConf(SQLConf.OPTIMIZER_METADATA_ONLY.key -> "true") { - assertNotMetadataOnlyQuery(sql("select col1 from srcpart group by col1")) - assertNotMetadataOnlyQuery( - sql("select partcol1, max(col1) from srcpart group by partcol1")) - assertNotMetadataOnlyQuery( - sql("select partcol1, count(distinct col1) from srcpart group by partcol1")) - assertNotMetadataOnlyQuery(sql("select distinct partcol1, col1 from srcpart")) - } + assertNotMetadataOnlyQuery(sql("select col1 from srcpart group by col1")) + assertNotMetadataOnlyQuery(sql("select partcol1, max(col1) from srcpart group by partcol1")) + assertNotMetadataOnlyQuery( + sql("select partcol1, count(distinct col1) from srcpart group by partcol1")) + assertNotMetadataOnlyQuery(sql("select distinct partcol1, col1 from srcpart")) } test("OptimizeMetadataOnlyQuery test: unsupported for non-distinct aggregate function on " + "partition columns") { - withSQLConf(SQLConf.OPTIMIZER_METADATA_ONLY.key -> "true") { - assertNotMetadataOnlyQuery( - sql("select partcol1, sum(partcol2) from srcpart group by partcol1")) - assertNotMetadataOnlyQuery( - sql("select partcol1, count(partcol2) from srcpart group by partcol1")) - } + assertNotMetadataOnlyQuery(sql("select partcol1, sum(partcol2) from srcpart group by partcol1")) + assertNotMetadataOnlyQuery( + sql("select partcol1, count(partcol2) from 
srcpart group by partcol1")) } test("OptimizeMetadataOnlyQuery test: unsupported for GroupingSet/Union operator") { - withSQLConf(SQLConf.OPTIMIZER_METADATA_ONLY.key -> "true") { - assertNotMetadataOnlyQuery( - sql("select partcol1, max(partcol2) from srcpart where partcol1 = 0 " + - "group by rollup (partcol1)")) - assertNotMetadataOnlyQuery( - sql("select partcol2 from (select partcol2 from srcpart where partcol1 = 0 union all " + - "select partcol2 from srcpart where partcol1 = 1) t group by partcol2")) - } + assertNotMetadataOnlyQuery( + sql("select partcol1, max(partcol2) from srcpart where partcol1 = 0 " + + "group by rollup (partcol1)")) + assertNotMetadataOnlyQuery( + sql("select partcol2 from (select partcol2 from srcpart where partcol1 = 0 union all " + + "select partcol2 from srcpart where partcol1 = 1) t group by partcol2")) } } From 8ee2a8c82a80d53647a6f3c391cb41c22a07a17f Mon Sep 17 00:00:00 2001 From: Lianhui Wang Date: Tue, 12 Jul 2016 02:17:56 +0800 Subject: [PATCH 41/45] refactor --- .../execution/OptimizeMetadataOnlyQuery.scala | 26 ++-- .../OptimizeMetadataOnlyQuerySuite.scala | 120 +++++++++--------- 2 files changed, 80 insertions(+), 66 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuery.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuery.scala index 912d274ce02f2..320dd6d69f544 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuery.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuery.scala @@ -86,14 +86,14 @@ case class OptimizeMetadataOnlyQuery( case plan if plan eq relation => relation match { case l @ LogicalRelation(fsRelation: HadoopFsRelation, _, _) => - val partColumns = fsRelation.partitionSchema.map(_.name.toLowerCase).toSet - val partAttrs = l.output.filter(a => partColumns.contains(a.name.toLowerCase)) + val partAttrs = PartitionedRelation.getPartitionAttrs( + fsRelation.partitionSchema.map(_.name), l) val partitionData = fsRelation.location.listFiles(filters = Nil) LocalRelation(partAttrs, partitionData.map(_.values)) case relation: CatalogRelation => - val partColumns = relation.catalogTable.partitionColumnNames.map(_.toLowerCase).toSet - val partAttrs = relation.output.filter(a => partColumns.contains(a.name.toLowerCase)) + val partAttrs = PartitionedRelation.getPartitionAttrs( + relation.catalogTable.partitionColumnNames, relation) val partitionData = catalog.listPartitions(relation.catalogTable.identifier).map { p => InternalRow.fromSeq(partAttrs.map { attr => Cast(Literal(p.spec(attr.name)), attr.dataType).eval() @@ -116,17 +116,16 @@ case class OptimizeMetadataOnlyQuery( * deterministic expressions, and returns result after reaching the partitioned table relation * node. 
*/ - object PartitionedRelation { + private[execution] object PartitionedRelation { + def unapply(plan: LogicalPlan): Option[(AttributeSet, LogicalPlan)] = plan match { case l @ LogicalRelation(fsRelation: HadoopFsRelation, _, _) if fsRelation.partitionSchema.nonEmpty => - val partColumns = fsRelation.partitionSchema.map(_.name.toLowerCase).toSet - val partAttrs = l.output.filter(a => partColumns.contains(a.name.toLowerCase)) + val partAttrs = getPartitionAttrs(fsRelation.partitionSchema.map(_.name), l) Some(AttributeSet(partAttrs), l) case relation: CatalogRelation if relation.catalogTable.partitionColumnNames.nonEmpty => - val partColumns = relation.catalogTable.partitionColumnNames.map(_.toLowerCase).toSet - val partAttrs = relation.output.filter(a => partColumns.contains(a.name.toLowerCase)) + val partAttrs = getPartitionAttrs(relation.catalogTable.partitionColumnNames, relation) Some(AttributeSet(partAttrs), relation) case p @ Project(projectList, child) if projectList.forall(_.deterministic) => @@ -141,5 +140,14 @@ case class OptimizeMetadataOnlyQuery( case _ => None } + + /** + * Returns the partition attributes of the table relation plan. + */ + def getPartitionAttrs(partitionColumnNames: Seq[String], relation: LogicalPlan) + : Seq[Attribute] = { + val partColumns = partitionColumnNames.map(_.toLowerCase).toSet + relation.output.filter(a => partColumns.contains(a.name.toLowerCase)) + } } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuerySuite.scala index 7b690366f7920..928588b90111e 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuerySuite.scala @@ -41,76 +41,82 @@ class OptimizeMetadataOnlyQuerySuite extends QueryTest with SharedSQLContext { } private def assertMetadataOnlyQuery(df: DataFrame): Unit = { - withSQLConf(SQLConf.OPTIMIZER_METADATA_ONLY.key -> "true") { - val localRelations = df.queryExecution.optimizedPlan.collect { - case l@LocalRelation(_, _) => l - } - assert(localRelations.size == 1) + val localRelations = df.queryExecution.optimizedPlan.collect { + case l @ LocalRelation(_, _) => l } + assert(localRelations.size == 1) } private def assertNotMetadataOnlyQuery(df: DataFrame): Unit = { - withSQLConf(SQLConf.OPTIMIZER_METADATA_ONLY.key -> "true") { - val localRelations = df.queryExecution.optimizedPlan.collect { - case l@LocalRelation(_, _) => l - } - assert(localRelations.size == 0) + val localRelations = df.queryExecution.optimizedPlan.collect { + case l @ LocalRelation(_, _) => l } + assert(localRelations.size == 0) } - test("OptimizeMetadataOnlyQuery test: aggregate expression is partition columns") { - assertMetadataOnlyQuery(sql("select partcol1 from srcpart group by partcol1")) - assertMetadataOnlyQuery( - sql("select partcol2 from srcpart where partcol1 = 0 group by partcol2")) + private def testMetadataOnly(name: String, sqls: String*): Unit = { + test(name) { + withSQLConf(SQLConf.OPTIMIZER_METADATA_ONLY.key -> "true") { + sqls.foreach { case q => assertMetadataOnlyQuery(sql(q)) } + } + withSQLConf(SQLConf.OPTIMIZER_METADATA_ONLY.key -> "false") { + sqls.foreach { case q => assertNotMetadataOnlyQuery(sql(q)) } + } + } } - test("OptimizeMetadataOnlyQuery test: distinct aggregate function on partition columns") { - assertMetadataOnlyQuery( - sql("SELECT partcol1, count(distinct 
partcol2) FROM srcpart group by partcol1")) - assertMetadataOnlyQuery( - sql("SELECT partcol1, count(distinct partcol2) FROM srcpart where partcol1 = 0 " + - "group by partcol1")) + private def testUnspportedMetadataOnly(name: String, sqls: String*): Unit = { + test(name) { + withSQLConf(SQLConf.OPTIMIZER_METADATA_ONLY.key -> "true") { + sqls.foreach { case q => assertNotMetadataOnlyQuery(sql(q)) } + } + withSQLConf(SQLConf.OPTIMIZER_METADATA_ONLY.key -> "false") { + sqls.foreach { case q => assertNotMetadataOnlyQuery(sql(q)) } + } + } } - test("OptimizeMetadataOnlyQuery test: distinct on partition columns") { - assertMetadataOnlyQuery(sql("select distinct partcol1, partcol2 from srcpart")) - assertMetadataOnlyQuery( - sql("select distinct c1 from (select partcol1 + 1 as c1 from srcpart where partcol1 = 0) t")) - } + testMetadataOnly( + "OptimizeMetadataOnlyQuery test: aggregate expression is partition columns", + "select partcol1 from srcpart group by partcol1", + "select partcol2 from srcpart where partcol1 = 0 group by partcol2") - test("OptimizeMetadataOnlyQuery test: aggregate function on partition columns " + - "which have same result w or w/o DISTINCT keyword.") { - assertMetadataOnlyQuery(sql("select max(partcol1) from srcpart")) - assertMetadataOnlyQuery(sql("select min(partcol1) from srcpart where partcol1 = 0")) - assertMetadataOnlyQuery(sql("select first(partcol1) from srcpart")) - assertMetadataOnlyQuery(sql("select last(partcol1) from srcpart where partcol1 = 0")) - assertMetadataOnlyQuery( - sql("select partcol2, min(partcol1) from srcpart where partcol1 = 0 group by partcol2")) - assertMetadataOnlyQuery( - sql("select max(c1) from (select partcol1 + 1 as c1 from srcpart where partcol1 = 0) t")) - } + testMetadataOnly( + "OptimizeMetadataOnlyQuery test: distinct aggregate function on partition columns", + "SELECT partcol1, count(distinct partcol2) FROM srcpart group by partcol1", + "SELECT partcol1, count(distinct partcol2) FROM srcpart where partcol1 = 0 group by partcol1") - test("OptimizeMetadataOnlyQuery test: unsupported for non-partition columns") { - assertNotMetadataOnlyQuery(sql("select col1 from srcpart group by col1")) - assertNotMetadataOnlyQuery(sql("select partcol1, max(col1) from srcpart group by partcol1")) - assertNotMetadataOnlyQuery( - sql("select partcol1, count(distinct col1) from srcpart group by partcol1")) - assertNotMetadataOnlyQuery(sql("select distinct partcol1, col1 from srcpart")) - } + testMetadataOnly( + "OptimizeMetadataOnlyQuery test: distinct on partition columns", + "select distinct partcol1, partcol2 from srcpart", + "select distinct c1 from (select partcol1 + 1 as c1 from srcpart where partcol1 = 0) t") - test("OptimizeMetadataOnlyQuery test: unsupported for non-distinct aggregate function on " + - "partition columns") { - assertNotMetadataOnlyQuery(sql("select partcol1, sum(partcol2) from srcpart group by partcol1")) - assertNotMetadataOnlyQuery( - sql("select partcol1, count(partcol2) from srcpart group by partcol1")) - } + testMetadataOnly( + "OptimizeMetadataOnlyQuery test: aggregate function on partition columns which have same " + + "result w or w/o DISTINCT keyword.", + "select max(partcol1) from srcpart", + "select min(partcol1) from srcpart where partcol1 = 0", + "select first(partcol1) from srcpart", + "select last(partcol1) from srcpart where partcol1 = 0", + "select partcol2, min(partcol1) from srcpart where partcol1 = 0 group by partcol2", + "select max(c1) from (select partcol1 + 1 as c1 from srcpart where partcol1 = 0) t") 
- test("OptimizeMetadataOnlyQuery test: unsupported for GroupingSet/Union operator") { - assertNotMetadataOnlyQuery( - sql("select partcol1, max(partcol2) from srcpart where partcol1 = 0 " + - "group by rollup (partcol1)")) - assertNotMetadataOnlyQuery( - sql("select partcol2 from (select partcol2 from srcpart where partcol1 = 0 union all " + - "select partcol2 from srcpart where partcol1 = 1) t group by partcol2")) - } + testUnspportedMetadataOnly( + "OptimizeMetadataOnlyQuery test: unsupported for non-partition columns", + "select col1 from srcpart group by col1", + "select partcol1, max(col1) from srcpart group by partcol1", + "select partcol1, count(distinct col1) from srcpart group by partcol1", + "select distinct partcol1, col1 from srcpart") + + testUnspportedMetadataOnly( + "OptimizeMetadataOnlyQuery test: unsupported for non-distinct aggregate function on " + + "partition columns", + "select partcol1, sum(partcol2) from srcpart group by partcol1", + "select partcol1, count(partcol2) from srcpart group by partcol1") + + testUnspportedMetadataOnly( + "OptimizeMetadataOnlyQuery test: unsupported for GroupingSet/Union operator", + "select partcol1, max(partcol2) from srcpart where partcol1 = 0 group by rollup (partcol1)", + "select partcol2 from (select partcol2 from srcpart where partcol1 = 0 union all " + + "select partcol2 from srcpart where partcol1 = 1) t group by partcol2") } From d888c85a1b95904408d436271a40d27095e2eb4d Mon Sep 17 00:00:00 2001 From: Lianhui Wang Date: Tue, 12 Jul 2016 02:20:22 +0800 Subject: [PATCH 42/45] fix minor --- .../sql/execution/OptimizeMetadataOnlyQuerySuite.scala | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuerySuite.scala index 928588b90111e..433afc3a39936 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuerySuite.scala @@ -65,7 +65,7 @@ class OptimizeMetadataOnlyQuerySuite extends QueryTest with SharedSQLContext { } } - private def testUnspportedMetadataOnly(name: String, sqls: String*): Unit = { + private def testNotMetadataOnly(name: String, sqls: String*): Unit = { test(name) { withSQLConf(SQLConf.OPTIMIZER_METADATA_ONLY.key -> "true") { sqls.foreach { case q => assertNotMetadataOnlyQuery(sql(q)) } @@ -101,20 +101,20 @@ class OptimizeMetadataOnlyQuerySuite extends QueryTest with SharedSQLContext { "select partcol2, min(partcol1) from srcpart where partcol1 = 0 group by partcol2", "select max(c1) from (select partcol1 + 1 as c1 from srcpart where partcol1 = 0) t") - testUnspportedMetadataOnly( + testNotMetadataOnly( "OptimizeMetadataOnlyQuery test: unsupported for non-partition columns", "select col1 from srcpart group by col1", "select partcol1, max(col1) from srcpart group by partcol1", "select partcol1, count(distinct col1) from srcpart group by partcol1", "select distinct partcol1, col1 from srcpart") - testUnspportedMetadataOnly( + testNotMetadataOnly( "OptimizeMetadataOnlyQuery test: unsupported for non-distinct aggregate function on " + "partition columns", "select partcol1, sum(partcol2) from srcpart group by partcol1", "select partcol1, count(partcol2) from srcpart group by partcol1") - testUnspportedMetadataOnly( + testNotMetadataOnly( "OptimizeMetadataOnlyQuery test: unsupported for GroupingSet/Union 
operator", "select partcol1, max(partcol2) from srcpart where partcol1 = 0 group by rollup (partcol1)", "select partcol2 from (select partcol2 from srcpart where partcol1 = 0 union all " + From ff1650987b901825d3828de972fa4434466c951f Mon Sep 17 00:00:00 2001 From: Lianhui Wang Date: Tue, 12 Jul 2016 10:51:00 +0800 Subject: [PATCH 43/45] update --- .../execution/OptimizeMetadataOnlyQuery.scala | 29 ++++++++++++------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuery.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuery.scala index 320dd6d69f544..ee34a8a60b80d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuery.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuery.scala @@ -75,6 +75,16 @@ case class OptimizeMetadataOnlyQuery( } } + /** + * Returns the partition attributes of the table relation plan. + */ + private def getPartitionAttrs( + partitionColumnNames: Seq[String], + relation: LogicalPlan): Seq[Attribute] = { + val partColumns = partitionColumnNames.map(_.toLowerCase).toSet + relation.output.filter(a => partColumns.contains(a.name.toLowerCase)) + } + /** * Transform the given plan, find its table scan nodes that matches the given relation, and then * replace the table scan node with its corresponding partition values. @@ -86,14 +96,12 @@ case class OptimizeMetadataOnlyQuery( case plan if plan eq relation => relation match { case l @ LogicalRelation(fsRelation: HadoopFsRelation, _, _) => - val partAttrs = PartitionedRelation.getPartitionAttrs( - fsRelation.partitionSchema.map(_.name), l) + val partAttrs = getPartitionAttrs(fsRelation.partitionSchema.map(_.name), l) val partitionData = fsRelation.location.listFiles(filters = Nil) LocalRelation(partAttrs, partitionData.map(_.values)) case relation: CatalogRelation => - val partAttrs = PartitionedRelation.getPartitionAttrs( - relation.catalogTable.partitionColumnNames, relation) + val partAttrs = getPartitionAttrs(relation.catalogTable.partitionColumnNames, relation) val partitionData = catalog.listPartitions(relation.catalogTable.identifier).map { p => InternalRow.fromSeq(partAttrs.map { attr => Cast(Literal(p.spec(attr.name)), attr.dataType).eval() @@ -116,17 +124,17 @@ case class OptimizeMetadataOnlyQuery( * deterministic expressions, and returns result after reaching the partitioned table relation * node. */ - private[execution] object PartitionedRelation { + object PartitionedRelation { def unapply(plan: LogicalPlan): Option[(AttributeSet, LogicalPlan)] = plan match { case l @ LogicalRelation(fsRelation: HadoopFsRelation, _, _) if fsRelation.partitionSchema.nonEmpty => val partAttrs = getPartitionAttrs(fsRelation.partitionSchema.map(_.name), l) - Some(AttributeSet(partAttrs), l) + Some(partAttrs, l) case relation: CatalogRelation if relation.catalogTable.partitionColumnNames.nonEmpty => val partAttrs = getPartitionAttrs(relation.catalogTable.partitionColumnNames, relation) - Some(AttributeSet(partAttrs), relation) + Some(partAttrs, relation) case p @ Project(projectList, child) if projectList.forall(_.deterministic) => unapply(child).flatMap { case (partAttrs, relation) => @@ -144,10 +152,11 @@ case class OptimizeMetadataOnlyQuery( /** * Returns the partition attributes of the table relation plan. 
*/ - def getPartitionAttrs(partitionColumnNames: Seq[String], relation: LogicalPlan) - : Seq[Attribute] = { + private def getPartitionAttrs( + partitionColumnNames: Seq[String], + relation: LogicalPlan): AttributeSet = { val partColumns = partitionColumnNames.map(_.toLowerCase).toSet - relation.output.filter(a => partColumns.contains(a.name.toLowerCase)) + AttributeSet(relation.output.filter(a => partColumns.contains(a.name.toLowerCase))) } } } From 358ad13f58562716d2f376818d9b45898616ba4a Mon Sep 17 00:00:00 2001 From: Lianhui Wang Date: Tue, 12 Jul 2016 17:28:48 +0800 Subject: [PATCH 44/45] remove duplicate code --- .../sql/execution/OptimizeMetadataOnlyQuery.scala | 14 ++------------ .../OptimizeMetadataOnlyQuerySuite.scala | 15 +++++++-------- 2 files changed, 9 insertions(+), 20 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuery.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuery.scala index ee34a8a60b80d..1b7fedca8484c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuery.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuery.scala @@ -130,11 +130,11 @@ case class OptimizeMetadataOnlyQuery( case l @ LogicalRelation(fsRelation: HadoopFsRelation, _, _) if fsRelation.partitionSchema.nonEmpty => val partAttrs = getPartitionAttrs(fsRelation.partitionSchema.map(_.name), l) - Some(partAttrs, l) + Some(AttributeSet(partAttrs), l) case relation: CatalogRelation if relation.catalogTable.partitionColumnNames.nonEmpty => val partAttrs = getPartitionAttrs(relation.catalogTable.partitionColumnNames, relation) - Some(partAttrs, relation) + Some(AttributeSet(partAttrs), relation) case p @ Project(projectList, child) if projectList.forall(_.deterministic) => unapply(child).flatMap { case (partAttrs, relation) => @@ -148,15 +148,5 @@ case class OptimizeMetadataOnlyQuery( case _ => None } - - /** - * Returns the partition attributes of the table relation plan. 
- */ - private def getPartitionAttrs( - partitionColumnNames: Seq[String], - relation: LogicalPlan): AttributeSet = { - val partColumns = partitionColumnNames.map(_.toLowerCase).toSet - AttributeSet(relation.output.filter(a => partColumns.contains(a.name.toLowerCase))) - } } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuerySuite.scala index 433afc3a39936..ba02f15503eb8 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuerySuite.scala @@ -77,23 +77,22 @@ class OptimizeMetadataOnlyQuerySuite extends QueryTest with SharedSQLContext { } testMetadataOnly( - "OptimizeMetadataOnlyQuery test: aggregate expression is partition columns", + "aggregate expression is partition columns", "select partcol1 from srcpart group by partcol1", "select partcol2 from srcpart where partcol1 = 0 group by partcol2") testMetadataOnly( - "OptimizeMetadataOnlyQuery test: distinct aggregate function on partition columns", + "distinct aggregate function on partition columns", "SELECT partcol1, count(distinct partcol2) FROM srcpart group by partcol1", "SELECT partcol1, count(distinct partcol2) FROM srcpart where partcol1 = 0 group by partcol1") testMetadataOnly( - "OptimizeMetadataOnlyQuery test: distinct on partition columns", + "distinct on partition columns", "select distinct partcol1, partcol2 from srcpart", "select distinct c1 from (select partcol1 + 1 as c1 from srcpart where partcol1 = 0) t") testMetadataOnly( - "OptimizeMetadataOnlyQuery test: aggregate function on partition columns which have same " + - "result w or w/o DISTINCT keyword.", + "aggregate function on partition columns which have same result w or w/o DISTINCT keyword.", "select max(partcol1) from srcpart", "select min(partcol1) from srcpart where partcol1 = 0", "select first(partcol1) from srcpart", @@ -102,20 +101,20 @@ class OptimizeMetadataOnlyQuerySuite extends QueryTest with SharedSQLContext { "select max(c1) from (select partcol1 + 1 as c1 from srcpart where partcol1 = 0) t") testNotMetadataOnly( - "OptimizeMetadataOnlyQuery test: unsupported for non-partition columns", + "unsupported for non-partition columns", "select col1 from srcpart group by col1", "select partcol1, max(col1) from srcpart group by partcol1", "select partcol1, count(distinct col1) from srcpart group by partcol1", "select distinct partcol1, col1 from srcpart") testNotMetadataOnly( - "OptimizeMetadataOnlyQuery test: unsupported for non-distinct aggregate function on " + + "unsupported for non-distinct aggregate function on " + "partition columns", "select partcol1, sum(partcol2) from srcpart group by partcol1", "select partcol1, count(partcol2) from srcpart group by partcol1") testNotMetadataOnly( - "OptimizeMetadataOnlyQuery test: unsupported for GroupingSet/Union operator", + "unsupported for GroupingSet/Union operator", "select partcol1, max(partcol2) from srcpart where partcol1 = 0 group by rollup (partcol1)", "select partcol2 from (select partcol2 from srcpart where partcol1 = 0 union all " + "select partcol2 from srcpart where partcol1 = 1) t group by partcol2") From 030776ae49484c4e5db7f775344e5e40dff27e9a Mon Sep 17 00:00:00 2001 From: Lianhui Wang Date: Tue, 12 Jul 2016 17:47:04 +0800 Subject: [PATCH 45/45] fix minor --- .../OptimizeMetadataOnlyQuerySuite.scala | 15 +++++++-------- 1 file 
changed, 7 insertions(+), 8 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuerySuite.scala index ba02f15503eb8..58c310596ca6d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuerySuite.scala @@ -77,22 +77,22 @@ class OptimizeMetadataOnlyQuerySuite extends QueryTest with SharedSQLContext { } testMetadataOnly( - "aggregate expression is partition columns", + "Aggregate expression is partition columns", "select partcol1 from srcpart group by partcol1", "select partcol2 from srcpart where partcol1 = 0 group by partcol2") testMetadataOnly( - "distinct aggregate function on partition columns", + "Distinct aggregate function on partition columns", "SELECT partcol1, count(distinct partcol2) FROM srcpart group by partcol1", "SELECT partcol1, count(distinct partcol2) FROM srcpart where partcol1 = 0 group by partcol1") testMetadataOnly( - "distinct on partition columns", + "Distinct on partition columns", "select distinct partcol1, partcol2 from srcpart", "select distinct c1 from (select partcol1 + 1 as c1 from srcpart where partcol1 = 0) t") testMetadataOnly( - "aggregate function on partition columns which have same result w or w/o DISTINCT keyword.", + "Aggregate function on partition columns which have same result w or w/o DISTINCT keyword", "select max(partcol1) from srcpart", "select min(partcol1) from srcpart where partcol1 = 0", "select first(partcol1) from srcpart", @@ -101,20 +101,19 @@ class OptimizeMetadataOnlyQuerySuite extends QueryTest with SharedSQLContext { "select max(c1) from (select partcol1 + 1 as c1 from srcpart where partcol1 = 0) t") testNotMetadataOnly( - "unsupported for non-partition columns", + "Don't optimize metadata only query for non-partition columns", "select col1 from srcpart group by col1", "select partcol1, max(col1) from srcpart group by partcol1", "select partcol1, count(distinct col1) from srcpart group by partcol1", "select distinct partcol1, col1 from srcpart") testNotMetadataOnly( - "unsupported for non-distinct aggregate function on " + - "partition columns", + "Don't optimize metadata only query for non-distinct aggregate function on partition columns", "select partcol1, sum(partcol2) from srcpart group by partcol1", "select partcol1, count(partcol2) from srcpart group by partcol1") testNotMetadataOnly( - "unsupported for GroupingSet/Union operator", + "Don't optimize metadata only query for GroupingSet/Union operator", "select partcol1, max(partcol2) from srcpart where partcol1 = 0 group by rollup (partcol1)", "select partcol2 from (select partcol2 from srcpart where partcol1 = 0 union all " + "select partcol2 from srcpart where partcol1 = 1) t group by partcol2")
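Taken together, the series lets the optimizer answer queries that reference only partition columns from partition metadata (the catalog listing or the file index) instead of scanning data files. A rough usage sketch, assuming the conf key registered in SQLConf is spark.sql.optimizer.metadataOnly and that a partitioned table like the suite's srcpart fixture exists:

  import org.apache.spark.sql.SparkSession

  object MetadataOnlyExample {
    def main(args: Array[String]): Unit = {
      val spark = SparkSession.builder()
        .appName("metadata-only-sketch")
        .enableHiveSupport()
        .getOrCreate()

      // Enable the optimization (assumed key; the canonical name is defined in SQLConf).
      spark.conf.set("spark.sql.optimizer.metadataOnly", "true")

      // Answered from partition metadata: only partition columns are referenced
      // and the aggregates are distinct-insensitive (max/min/first/last/distinct).
      spark.sql("select max(partcol1) from srcpart").show()
      spark.sql("select distinct partcol1, partcol2 from srcpart").show()

      // Still requires a file scan: col1 is not a partition column.
      spark.sql("select partcol1, max(col1) from srcpart group by partcol1").show()

      spark.stop()
    }
  }

The rewrite itself happens in OptimizeMetadataOnlyQuery shown above: for the HadoopFsRelation and CatalogRelation cases, the table scan node is replaced by a LocalRelation built from the partition values, so no data files are touched.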