Support Delta Lake file-based source provider #265
@@ -28,6 +28,7 @@ import org.apache.spark.sql.execution.datasources._
 import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat
 import org.apache.spark.sql.types.{LongType, StructType}

+import com.microsoft.hyperspace.Hyperspace
 import com.microsoft.hyperspace.index._
 import com.microsoft.hyperspace.index.plans.logical.BucketUnion
 import com.microsoft.hyperspace.util.HyperspaceConf

@@ -105,24 +106,23 @@ object RuleUtils {
     // TODO: Duplicate listing files for the given relation as in
     // [[transformPlanToUseHybridScan]]
     // See https://github.com/microsoft/hyperspace/issues/160
-    val filesByRelations = plan
-      .collect {
-        case LogicalRelation(
-              HadoopFsRelation(location: PartitioningAwareFileIndex, _, _, _, _, _),
-              _,
-              _,
-              _) =>
-          location.allFiles.map(
-            f =>
-              // For a given file, file id is only meaningful in the context of a given
-              // index. At this point, we do not know which index, if any, would be picked.
-              // Therefore, we simply set the file id to UNKNOWN_FILE_ID.
-              FileInfo(
-                f.getPath.toString,
-                f.getLen,
-                f.getModificationTime,
-                IndexConstants.UNKNOWN_FILE_ID))
-      }
+    val filesByRelations = plan.collect {
+      case rel: LogicalRelation =>
+        Hyperspace
+          .getContext(spark)
+          .sourceProviderManager
+          .allFiles(rel)
+          .map { f =>
+            // For a given file, file id is only meaningful in the context of a given
+            // index. At this point, we do not know which index, if any, would be picked.
+            // Therefore, we simply set the file id to UNKNOWN_FILE_ID.
+            FileInfo(
+              f.getPath.toString,
+              f.getLen,
+              f.getModificationTime,
+              IndexConstants.UNKNOWN_FILE_ID)
+          }
+    }
     assert(filesByRelations.length == 1)
     indexes.filter(index =>
       index.created && isHybridScanCandidate(index, filesByRelations.flatten))

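The change above stops assuming a `PartitioningAwareFileIndex` and instead asks the source provider manager for the relation's files. As a rough sketch of the kind of dispatch this enables (the trait and class names below are illustrative only, not Hyperspace's actual API), a default file-based provider might look like this:

```scala
import org.apache.hadoop.fs.FileStatus
import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, LogicalRelation}

// Illustrative provider interface: each provider says whether it supports a
// relation and, if so, lists the relation's source files.
trait SourceProviderSketch {
  def supports(relation: LogicalRelation): Boolean
  def allFiles(relation: LogicalRelation): Seq[FileStatus]
}

// Default behavior for plain file-based relations: delegate to the FileIndex.
class DefaultFileBasedProviderSketch extends SourceProviderSketch {
  override def supports(relation: LogicalRelation): Boolean =
    relation.relation.isInstanceOf[HadoopFsRelation]

  override def allFiles(relation: LogicalRelation): Seq[FileStatus] = {
    val location = relation.relation.asInstanceOf[HadoopFsRelation].location
    // listFiles with no filters returns every file tracked by the index,
    // regardless of the concrete FileIndex implementation.
    location.listFiles(Nil, Nil).flatMap(_.files)
  }
}

// The manager picks the first provider that supports the relation.
class SourceProviderManagerSketch(providers: Seq[SourceProviderSketch]) {
  def allFiles(relation: LogicalRelation): Seq[FileStatus] =
    providers
      .find(_.supports(relation))
      .map(_.allFiles(relation))
      .getOrElse(throw new IllegalArgumentException(s"No provider supports $relation"))
}
```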
@@ -165,13 +165,14 @@ object RuleUtils {
       plan: LogicalPlan,
       useBucketSpec: Boolean): LogicalPlan = {
     // Check pre-requisite.
-    assert(getLogicalRelation(plan).isDefined)
+    val logicalRelation = getLogicalRelation(plan)
+    assert(logicalRelation.isDefined)

     // If there is no change in source data files, the index can be applied by
     // transformPlanToUseIndexOnlyScan regardless of Hybrid Scan config.
     // This tag should always exist if Hybrid Scan is enabled.
     val hybridScanRequired = HyperspaceConf.hybridScanEnabled(spark) &&
-      index.getTagValue(getLogicalRelation(plan).get, IndexLogEntryTags.HYBRIDSCAN_REQUIRED).get
+      index.getTagValue(logicalRelation.get, IndexLogEntryTags.HYBRIDSCAN_REQUIRED).get

     // If the index has appended files and/or deleted files, which means the current index data
     // is outdated, Hybrid Scan should be used to handle the newly updated source files.

@@ -261,7 +262,8 @@ object RuleUtils {
       index: IndexLogEntry,
       plan: LogicalPlan,
       useBucketSpec: Boolean): LogicalPlan = {
-    val isParquetSourceFormat = index.relations.head.fileFormat.equals("parquet")
+    val fileFormat = index.relations.head.fileFormat
+    val isParquetSourceFormat = fileFormat.equals("parquet") || fileFormat.equals("delta")

Contributor

Hmm, does this mean adding a new source provider is not enough? Can we introduce

Contributor

You can do this as a separate PR if that is preferred. Please create an issue in that case.

Collaborator (Author)

I think this case is too specific to justify a new source provider API; also, it refers to the file format string in the index metadata, not the relation.

Contributor

You should be able to plug in a source provider defined externally without changing the Hyperspace codebase. For example, say I have a source format "blah" that uses Parquet internally; how can I plug it in without modifying Hyperspace? One easy way to think about it is whether you could implement the Delta source provider outside Hyperspace.

You can do this in the create path and record it in the metadata.

Collaborator (Author)

Got your point - it should be possible to add a new source provider externally.

     var unhandledAppendedFiles: Seq[Path] = Nil

     // Get transformed plan with index data and appended files if applicable.

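The thread above suggests that the hard-coded "delta" check should eventually live behind the source provider API so that providers defined outside Hyperspace can participate. A possible shape for such a hook, purely as a sketch (the method and object names are hypothetical and not part of this PR):

```scala
// Hypothetical hook: a provider reports whether the relations it owns store
// their data as Parquet files, so RuleUtils need not hard-code format names.
trait ParquetBasedSourceCheckSketch {
  def isParquetBasedSource(fileFormatName: String): Boolean
}

object DeltaLakeSourceCheckSketch extends ParquetBasedSourceCheckSketch {
  // Delta Lake tables keep their data in Parquet files, so indexes built on a
  // Delta relation can still use the Parquet-specific read path.
  override def isParquetBasedSource(fileFormatName: String): Boolean =
    fileFormatName.equalsIgnoreCase("delta")
}

// The call site above could then delegate instead of listing formats:
//   val isParquetSourceFormat =
//     Hyperspace.getContext(spark).sourceProviderManager
//       .isParquetBasedSource(index.relations.head.fileFormat)
```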
@@ -271,7 +273,7 @@ object RuleUtils {
     // can be transformed to 'Project -> Filter -> Logical Relation'. Thus, with transformDown,
     // it will be matched again and transformed recursively which causes stack overflow exception.
     case baseRelation @ LogicalRelation(
-          _ @HadoopFsRelation(location: PartitioningAwareFileIndex, _, _, _, _, _),
+          _ @HadoopFsRelation(location: FileIndex, _, _, _, _, _),
           baseOutput,
           _,
           _) =>

@@ -281,7 +283,10 @@ object RuleUtils {
         // appendedFiles and deletedFiles in IndexLogEntry.
         (index.deletedFiles, index.appendedFiles.map(f => new Path(f.name)).toSeq)
       } else {
-        val curFiles = location.allFiles
+        val curFiles = Hyperspace
+          .getContext(spark)
+          .sourceProviderManager
+          .allFiles(baseRelation)
           .map(f => FileInfo(f, index.fileIdTracker.addFile(f), asFullPath = true))
         if (HyperspaceConf.hybridScanDeleteEnabled(spark) && index.hasLineageColumn) {
           val (exist, nonExist) = curFiles.partition(index.sourceFileInfoSet.contains)

@@ -410,11 +415,11 @@ object RuleUtils {
     // Transform the location of LogicalRelation with appended files.
     val planForAppended = originalPlan transformDown {
       case baseRelation @ LogicalRelation(
-            fsRelation @ HadoopFsRelation(location: PartitioningAwareFileIndex, _, _, _, _, _),
+            fsRelation @ HadoopFsRelation(location: FileIndex, _, _, _, _, _),
             baseOutput,
             _,
             _) =>
-        val options = extractBasePath(location.partitionSpec)
+        val options = extractBasePath(spark, location)
           .map { basePath =>
             // Set "basePath" so that partitioned columns are also included in the output schema.
             Map("basePath" -> basePath)

@@ -440,19 +445,11 @@ object RuleUtils {
     planForAppended
   }

-  private def extractBasePath(partitionSpec: PartitionSpec): Option[String] = {
-    if (partitionSpec == PartitionSpec.emptySpec) {
+  private def extractBasePath(spark: SparkSession, location: FileIndex): Option[String] = {
+    if (location.partitionSchema.isEmpty) {
       None
     } else {
-      // For example, we could have the following in PartitionSpec:
-      // - partition columns = "col1", "col2"
-      // - partitions: "/path/col1=1/col2=1", "/path/col1=1/col2=2", etc.
-      // , and going up the same number of directory levels as the number of partition columns
-      // will compute the base path. Note that PartitionSpec.partitions will always contain
-      // all the partitions in the path, so "partitions.head" is taken as an initial value.
-      val basePath = partitionSpec.partitionColumns
-        .foldLeft(partitionSpec.partitions.head.path)((path, _) => path.getParent)
-      Some(basePath.toString)
+      Some(Hyperspace.getContext(spark).sourceProviderManager.partitionBasePath(location))
     }
   }

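The removed branch above computed the base path inline by walking up one directory level per partition column; the replacement delegates that to the source provider manager. A tiny self-contained illustration of the original idea (paths and column names made up):

```scala
import org.apache.hadoop.fs.Path

// Two partition columns and one concrete partition directory under the table root.
val partitionColumns = Seq("col1", "col2")
val somePartitionDir = new Path("/data/table/col1=1/col2=2")

// Going up one level per partition column recovers the base path of the table.
val basePath = partitionColumns.foldLeft(somePartitionDir)((path, _) => path.getParent)
assert(basePath.toString == "/data/table")
```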
Delta Lake only allows one path in load().

Yea, we can do something like:

Btw, what happens if the Delta Lake implementation adds the "path" option in latestRelation.options?

https://github.com/microsoft/hyperspace/pull/265/files#diff-f7ecc68d1799e9c2c916973786081d5f35f312785537ccd2eed61bce41a10786R72

The "path" key will be removed in createRelation.
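For context on this thread: the "basePath" option computed by extractBasePath is what lets the appended files be read directly while keeping partition columns in the schema, and reading them as plain files sidesteps Delta Lake's one-path restriction on load(). A minimal standalone illustration of that read pattern (paths are made up; this is not the PR's exact code path):

```scala
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[*]").appName("basePathExample").getOrCreate()

// Only the appended partition directories are listed explicitly, but the
// partition column ("col1") stays in the output schema because "basePath"
// points at the table root.
val appendedPaths = Seq("/data/table/col1=3", "/data/table/col1=4")
val appendedDf = spark.read
  .option("basePath", "/data/table")
  .parquet(appendedPaths: _*)

// Note: spark.read.format("delta").load(...) accepts only a single path, which
// is one reason the appended files are read as raw Parquet here rather than
// through the Delta reader.
```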