-
Notifications
You must be signed in to change notification settings - Fork 29.2k
[SPARK-32432][SQL] Add support for reading ORC/Parquet files of SymlinkTextInputFormat table And Fix Analyze for SymlinkTextInputFormat table #35734
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -122,7 +122,16 @@ abstract class PartitioningAwareFileIndex( | |
| // Directory does not exist, or has no children files | ||
| Nil | ||
| } | ||
| PartitionDirectory(values, files) | ||
| // Check leaf files since they might be symlink targets | ||
| if (files.isEmpty) { | ||
| val status: Seq[FileStatus] = leafFiles.get(path) match { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Are symlink targets among the leaf files? I think leaf files are listed from the table root.
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Yea, but
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. For a partitioned table we have the same behavior: there is also a symlink file in the root directory of each partition. |
||
| case Some(existingFile) if isNonEmptyFile(existingFile) => Seq(existingFile) | ||
| case _ => Nil | ||
| } | ||
| PartitionDirectory(values, status) | ||
| } else { | ||
| PartitionDirectory(values, files) | ||
| } | ||
| } | ||
| } | ||
| logTrace("Selected files after partition pruning:\n\t" + selectedPartitions.mkString("\n\t")) | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,97 @@ | ||
| /* | ||
| * Licensed to the Apache Software Foundation (ASF) under one or more | ||
| * contributor license agreements. See the NOTICE file distributed with | ||
| * this work for additional information regarding copyright ownership. | ||
| * The ASF licenses this file to You under the Apache License, Version 2.0 | ||
| * (the "License"); you may not use this file except in compliance with | ||
| * the License. You may obtain a copy of the License at | ||
| * | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, software | ||
| * distributed under the License is distributed on an "AS IS" BASIS, | ||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| * See the License for the specific language governing permissions and | ||
| * limitations under the License. | ||
| */ | ||
|
|
||
| package org.apache.spark.sql.execution.datasources | ||
|
|
||
| import java.io.{BufferedReader, InputStreamReader} | ||
| import java.net.URI | ||
| import java.nio.charset.StandardCharsets.UTF_8 | ||
|
|
||
| import com.google.common.io.CharStreams | ||
| import org.apache.hadoop.fs.{FileSystem, Path, PathFilter} | ||
| import scala.collection.JavaConverters._ | ||
|
|
||
| import org.apache.spark.sql.catalyst.catalog.CatalogTable | ||
|
|
||
| /** | ||
|  * Helpers for tables whose Hive input format is SymlinkTextInputFormat, i.e. tables whose | ||
|  * location holds text files that list the paths of the real data files, one path per line. | ||
|  */ | ||
| object SymlinkTextInputFormatUtil { | ||
|
|
||
|   /** Fully qualified class name of Hive's symlink input format. */ | ||
|   private val SymlinkTextInputFormatClass = | ||
|     "org.apache.hadoop.hive.ql.io.SymlinkTextInputFormat" | ||
|
|
||
|   /** | ||
|    * Determine whether an input format class name denotes a symlink table. | ||
|    * | ||
|    * @param inputFormat table InputFormat class name; a null value yields false | ||
|    * @return true if `inputFormat` is exactly Hive's SymlinkTextInputFormat class name | ||
|    */ | ||
|   def isSymlinkTextFormat(inputFormat: String): Boolean = { | ||
|     // Comparing from the constant side keeps the check safe for a null argument. | ||
|     SymlinkTextInputFormatClass == inputFormat | ||
|   } | ||
|
|
||
|   /** | ||
|    * Determine whether a CatalogTable is a symlink table. | ||
|    * | ||
|    * @param catalogTable table metadata whose storage input format is inspected | ||
|    * @return true if the table declares an input format and it is SymlinkTextInputFormat | ||
|    */ | ||
|   def isSymlinkTextFormat(catalogTable: CatalogTable): Boolean = { | ||
|     catalogTable.storage.inputFormat.exists(isSymlinkTextFormat) | ||
|   } | ||
|
|
||
|   /** | ||
|    * Get the target paths listed in the symlink files of a directory. | ||
|    * Mostly copied from BackgroundHiveSplitLoader#getTargetPathsFromSymlink of trino(prestosql) | ||
|    * compatible with hive SymlinkTextInputFormat#getTargetPathsFromSymlinksDirs | ||
|    * | ||
|    * Only direct children of `symlinkDir` that are regular files and are accepted by | ||
|    * `DataSourceUtils.isDataPath` are read; sub-directories are ignored. | ||
|    * | ||
|    * @param fileSystem filesystem used to list and open the symlink files | ||
|    * @param symlinkDir symlink table (or partition) location | ||
|    * @return the target paths, in listing order and then in line order within each file | ||
|    */ | ||
|   def getTargetPathsFromSymlink( | ||
|       fileSystem: FileSystem, | ||
|       symlinkDir: Path): Seq[Path] = { | ||
|     val dataPathFilter = new PathFilter { | ||
|       override def accept(p: Path): Boolean = DataSourceUtils.isDataPath(p) | ||
|     } | ||
|     fileSystem.listStatus(symlinkDir, dataPathFilter).toSeq | ||
|       .filter(_.isFile) | ||
|       .flatMap(status => readTargetPaths(fileSystem, status.getPath)) | ||
|   } | ||
|
|
||
|   /** | ||
|    * Read one symlink file and turn every non-blank line into a Path. | ||
|    * | ||
|    * @param fileSystem filesystem used to open the file | ||
|    * @param symlinkFile symlink file to read, decoded as UTF-8 | ||
|    * @return one Path per non-blank line, in file order | ||
|    */ | ||
|   private def readTargetPaths(fileSystem: FileSystem, symlinkFile: Path): Seq[Path] = { | ||
|     val reader = new BufferedReader( | ||
|       new InputStreamReader(fileSystem.open(symlinkFile), UTF_8)) | ||
|     try { | ||
|       CharStreams.readLines(reader).asScala | ||
|         .map(_.trim) | ||
|         // A blank line (e.g. a trailing newline) would make `new Path("")` throw. | ||
|         .filter(_.nonEmpty) | ||
|         .map(line => new Path(line)) | ||
|         // Materialize an immutable result before the reader is closed. | ||
|         .toList | ||
|     } finally { | ||
|       // The value of a finally block is discarded, so it only releases the stream. | ||
|       reader.close() | ||
|     } | ||
|   } | ||
|
|
||
|   /** | ||
|    * Get the target locations of a symlink table as URIs. | ||
|    * | ||
|    * @param fileSystem filesystem used to list and open the symlink files | ||
|    * @param location symlink table location | ||
|    * @return one `Option`-wrapped URI per target path found under `location` | ||
|    */ | ||
|   def getSymlinkTableLocationPaths(fileSystem: FileSystem, location: URI): Seq[Option[URI]] = { | ||
|     getTargetPathsFromSymlink(fileSystem, new Path(location)) | ||
|       .map(path => Option(path.toUri)) | ||
|   } | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -204,10 +204,23 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log | |
| (options, None) | ||
| } | ||
|
|
||
| lazy val fs = tablePath.getFileSystem(sparkSession.sparkContext.hadoopConfiguration) | ||
| val isSymlinkTextFormat = SymlinkTextInputFormatUtil.isSymlinkTextFormat(relation.tableMeta) | ||
|
|
||
| val symlinkTargets = if (isSymlinkTextFormat) { | ||
| SymlinkTextInputFormatUtil.getTargetPathsFromSymlink(fs, tablePath) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. When table is partitioned and |
||
| } else { | ||
| Nil | ||
| } | ||
|
|
||
| val result = if (relation.isPartitioned) { | ||
| val partitionSchema = relation.tableMeta.partitionSchema | ||
| val rootPaths: Seq[Path] = if (lazyPruningEnabled) { | ||
| Seq(tablePath) | ||
| if (isSymlinkTextFormat) { | ||
| symlinkTargets | ||
| } else { | ||
| Seq(tablePath) | ||
| } | ||
| } else { | ||
| // By convention (for example, see CatalogFileIndex), the definition of a | ||
| // partitioned table's paths depends on whether that table has any actual partitions. | ||
|
|
@@ -220,6 +233,8 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log | |
|
|
||
| if (paths.isEmpty) { | ||
| Seq(tablePath) | ||
| } else if (isSymlinkTextFormat) { | ||
|
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @AngersZhuuuu also used in here when |
||
| paths.flatMap(path => SymlinkTextInputFormatUtil.getTargetPathsFromSymlink(fs, path)) | ||
| } else { | ||
| paths | ||
| } | ||
|
|
@@ -264,11 +279,15 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log | |
| logicalRelation | ||
| }) | ||
| } else { | ||
| val rootPath = tablePath | ||
| val rootPaths = if (isSymlinkTextFormat) { | ||
| symlinkTargets | ||
| } else { | ||
| Seq(tablePath) | ||
| } | ||
| withTableCreationLock(tableIdentifier, { | ||
| val cached = getCached( | ||
| tableIdentifier, | ||
| Seq(rootPath), | ||
| rootPaths, | ||
| metastoreSchema, | ||
| fileFormatClass, | ||
| None) | ||
|
|
@@ -278,7 +297,7 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log | |
| LogicalRelation( | ||
| DataSource( | ||
| sparkSession = sparkSession, | ||
| paths = rootPath.toString :: Nil, | ||
| paths = rootPaths.map(_.toString), | ||
| userSpecifiedSchema = Option(updatedTable.dataSchema), | ||
| bucketSpec = hiveBucketSpec, | ||
| // Do not interpret the 'path' option at all when tables are read using the Hive | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1 @@ | ||
| Spark,3.2,1 |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,2 @@ | ||
| Trino,371.0,2 | ||
| Hive,2.3,3 |
Uh oh!
There was an error while loading. Please reload this page.