-
Notifications
You must be signed in to change notification settings - Fork 29.2k
[SPARK-32432][SQL] Added support for reading ORC/Parquet files with SymlinkTextInputFormat #29330
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,58 @@ | ||
| /* | ||
| * Licensed to the Apache Software Foundation (ASF) under one or more | ||
| * contributor license agreements. See the NOTICE file distributed with | ||
| * this work for additional information regarding copyright ownership. | ||
| * The ASF licenses this file to You under the Apache License, Version 2.0 | ||
| * (the "License"); you may not use this file except in compliance with | ||
| * the License. You may obtain a copy of the License at | ||
| * | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, software | ||
| * distributed under the License is distributed on an "AS IS" BASIS, | ||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| * See the License for the specific language governing permissions and | ||
| * limitations under the License. | ||
| */ | ||
|
|
||
| package org.apache.spark.sql.execution.datasources | ||
|
|
||
| import java.io.{BufferedReader, InputStreamReader, IOException} | ||
| import java.nio.charset.StandardCharsets.UTF_8 | ||
|
|
||
| import scala.collection.JavaConverters._ | ||
|
|
||
| import com.google.common.io.CharStreams | ||
| import org.apache.hadoop.fs.{FileSystem, Path} | ||
|
|
||
| object SymlinkTextInputFormatUtil { | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please add Javadoc to the class and the methods. |
||
|
|
||
| def isSymlinkTextFormat(inputFormat: String): Boolean = { | ||
| inputFormat.equals("org.apache.hadoop.hive.ql.io.SymlinkTextInputFormat") | ||
| } | ||
|
|
||
| // Mostly copied from SymlinkTextInputFormat#getTargetPathsFromSymlinksDirs of Hive 3.1 | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is it compatible with other versions of Hive? |
||
| def getTargetPathsFromSymlink( | ||
| fileSystem: FileSystem, | ||
| symlinkDir: Path): Seq[Path] = { | ||
|
|
||
| val symlinkIterator = fileSystem.listFiles(symlinkDir, true) | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hi, you should ignore the name start with "_" or ".", just like this hive implement, cause hadoop will generate temp file start with "_" or ".", we should ignore these files.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks for the comments, I will reflect your comments on this. |
||
| var targetPaths = Seq[Path]() | ||
|
|
||
| while (symlinkIterator.hasNext) { | ||
| val fileStatus = symlinkIterator.next() | ||
| if (fileStatus.isFile) { | ||
| val reader = new BufferedReader( | ||
| new InputStreamReader(fileSystem.open(fileStatus.getPath), UTF_8)) | ||
| try { | ||
| val targets: Seq[Path] = CharStreams.readLines(reader).asScala. | ||
| map(symlinkStr => new Path(symlinkStr)) | ||
| targetPaths = targetPaths ++ targets | ||
| } finally { | ||
| reader.close | ||
| } | ||
| } | ||
| } | ||
| targetPaths | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -166,11 +166,23 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log | |
| val lazyPruningEnabled = sparkSession.sqlContext.conf.manageFilesourcePartitions | ||
| val tablePath = new Path(relation.tableMeta.location) | ||
| val fileFormat = fileFormatClass.getConstructor().newInstance() | ||
| val inputFormat = relation.tableMeta.storage.inputFormat.getOrElse("") | ||
| val fs = tablePath.getFileSystem(sparkSession.sparkContext.hadoopConfiguration) | ||
|
|
||
| val symlinkTargets = if (SymlinkTextInputFormatUtil.isSymlinkTextFormat(inputFormat)) { | ||
| SymlinkTextInputFormatUtil.getTargetPathsFromSymlink(fs, tablePath) | ||
| } else { | ||
| Nil | ||
| } | ||
|
|
||
| val result = if (relation.isPartitioned) { | ||
| val partitionSchema = relation.tableMeta.partitionSchema | ||
| val rootPaths: Seq[Path] = if (lazyPruningEnabled) { | ||
| Seq(tablePath) | ||
| if (SymlinkTextInputFormatUtil.isSymlinkTextFormat(inputFormat)) { | ||
| symlinkTargets | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should be replaced with |
||
| } else { | ||
| Seq(tablePath) | ||
| } | ||
| } else { | ||
| // By convention (for example, see CatalogFileIndex), the definition of a | ||
| // partitioned table's paths depends on whether that table has any actual partitions. | ||
|
|
@@ -183,6 +195,8 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log | |
|
|
||
| if (paths.isEmpty) { | ||
| Seq(tablePath) | ||
| } else if (SymlinkTextInputFormatUtil.isSymlinkTextFormat(inputFormat)) { | ||
| paths.flatMap(path => SymlinkTextInputFormatUtil.getTargetPathsFromSymlink(fs, path)) | ||
| } else { | ||
| paths | ||
| } | ||
|
|
@@ -227,11 +241,16 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log | |
| logicalRelation | ||
| }) | ||
| } else { | ||
| val rootPath = tablePath | ||
| val rootPaths = if (SymlinkTextInputFormatUtil.isSymlinkTextFormat(inputFormat)) { | ||
| symlinkTargets | ||
| } else { | ||
| Seq(tablePath) | ||
| } | ||
|
|
||
| withTableCreationLock(tableIdentifier, { | ||
| val cached = getCached( | ||
| tableIdentifier, | ||
| Seq(rootPath), | ||
| rootPaths, | ||
| metastoreSchema, | ||
| fileFormatClass, | ||
| None) | ||
|
|
@@ -241,7 +260,7 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log | |
| LogicalRelation( | ||
| DataSource( | ||
| sparkSession = sparkSession, | ||
| paths = rootPath.toString :: Nil, | ||
| paths = rootPaths.map(rootPath => rootPath.toString), | ||
| userSpecifiedSchema = Option(updatedTable.dataSchema), | ||
| bucketSpec = None, | ||
| options = options, | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1 @@ | ||
| 1,2,3 | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There should be other examples of a CSV file in the resources, can you use those ones? The same applies to Parquet and ORC. |
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,2 @@ | ||
| 4,5,6 | ||
| 7,8,9 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: isEmpty?