From 4a7c3275a8a61d7fddb26bc1a32d7b4814701c8d Mon Sep 17 00:00:00 2001 From: Ryan Blue Date: Mon, 12 Sep 2016 09:33:47 -0700 Subject: [PATCH 1/2] PARQUET-674: Add DataSource abstraction for openable files. --- .../apache/parquet/io/ParquetDataSource.java | 48 +++++++++++ .../parquet/hadoop/ParquetFileReader.java | 32 +++++--- .../parquet/hadoop/util/HadoopDataSource.java | 79 +++++++++++++++++++ 3 files changed, 148 insertions(+), 11 deletions(-) create mode 100644 parquet-common/src/main/java/org/apache/parquet/io/ParquetDataSource.java create mode 100644 parquet-hadoop/src/main/java/org/apache/parquet/hadoop/util/HadoopDataSource.java diff --git a/parquet-common/src/main/java/org/apache/parquet/io/ParquetDataSource.java b/parquet-common/src/main/java/org/apache/parquet/io/ParquetDataSource.java new file mode 100644 index 0000000000..5ea03443a0 --- /dev/null +++ b/parquet-common/src/main/java/org/apache/parquet/io/ParquetDataSource.java @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.parquet.io; + +import java.io.IOException; + +/** + * {@code ParquetDataSource} is an interface with the methods needed by Parquet + * to read data files using {@link SeekableInputStream} instances. + */ +public interface ParquetDataSource { + + /** + * Returns the file location. + */ + String getLocation(); + + /** + * Returns the total length of the file, in bytes. + * @throws IOException if the length cannot be determined + */ + long getLength() throws IOException; + + /** + * Opens a new {@link SeekableInputStream} for the underlying + * data file. + * @throws IOException if the stream cannot be opened. + */ + SeekableInputStream newStream() throws IOException; + +} diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileReader.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileReader.java index 59a7e46cf5..30ce4bdb02 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileReader.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileReader.java @@ -88,11 +88,13 @@ import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; import org.apache.parquet.hadoop.metadata.FileMetaData; import org.apache.parquet.hadoop.metadata.ParquetMetadata; +import org.apache.parquet.hadoop.util.HadoopDataSource; import org.apache.parquet.hadoop.util.HiddenFileFilter; import org.apache.parquet.hadoop.util.HadoopStreams; import org.apache.parquet.io.SeekableInputStream; import org.apache.parquet.hadoop.util.counters.BenchmarkCounter; import org.apache.parquet.io.ParquetDecodingException; +import org.apache.parquet.io.ParquetDataSource; /** * Internal implementation of the Parquet file reader as a block container @@ -410,8 +412,7 @@ public static final ParquetMetadata readFooter(Configuration configuration, Path * @throws IOException if an error occurs while reading the file */ public static ParquetMetadata readFooter(Configuration configuration, Path file, MetadataFilter filter) throws IOException { - FileSystem fileSystem = file.getFileSystem(configuration); - return readFooter(configuration, fileSystem.getFileStatus(file), filter); + return readFooter(HadoopDataSource.fromPath(file, configuration), filter); } /** @@ -431,12 +432,21 @@ public static final ParquetMetadata readFooter(Configuration configuration, File * @throws IOException if an error occurs while reading the file */ public static final ParquetMetadata readFooter(Configuration configuration, FileStatus file, MetadataFilter filter) throws IOException { - FileSystem fileSystem = file.getPath().getFileSystem(configuration); - SeekableInputStream in = HadoopStreams.wrap(fileSystem.open(file.getPath())); - try { - return readFooter(file.getLen(), file.getPath().toString(), in, filter); - } finally { - in.close(); + return readFooter(HadoopDataSource.fromStatus(file, configuration), filter); + } + + /** + * Reads the meta data block in the footer of the file using provided input stream + * @param file a {@link ParquetDataSource} to read + * @param filter the filter to apply to row groups + * @return the metadata blocks in the footer + * @throws IOException if an error occurs while reading the file + */ + public static final ParquetMetadata readFooter( + ParquetDataSource file, MetadataFilter filter) throws IOException { + try (SeekableInputStream in = file.newStream()) { + return readFooter(converter, file.getLength(), file.getLocation(), + in, filter); } } @@ -449,7 +459,7 @@ public static final ParquetMetadata readFooter(Configuration configuration, File * @return the metadata blocks in the footer * @throws IOException if an error occurs while reading the file */ - public static final ParquetMetadata readFooter(long fileLen, String filePath, SeekableInputStream f, MetadataFilter filter) throws IOException { + private static final ParquetMetadata readFooter(ParquetMetadataConverter converter, long fileLen, String filePath, SeekableInputStream f, MetadataFilter filter) throws IOException { if (Log.DEBUG) { LOG.debug("File length " + fileLen); } @@ -563,7 +573,7 @@ public ParquetFileReader(Configuration conf, Path file, MetadataFilter filter) t FileSystem fs = file.getFileSystem(conf); this.fileStatus = fs.getFileStatus(file); this.f = HadoopStreams.wrap(fs.open(file)); - this.footer = readFooter(fileStatus.getLen(), fileStatus.getPath().toString(), f, filter); + this.footer = readFooter(converter, fileStatus.getLen(), fileStatus.getPath().toString(), f, filter); this.fileMetaData = footer.getFileMetaData(); this.blocks = footer.getBlocks(); for (ColumnDescriptor col : footer.getFileMetaData().getSchema().getColumns()) { @@ -602,7 +612,7 @@ public ParquetMetadata getFooter() { if (footer == null) { try { // don't read the row groups because this.blocks is always set - this.footer = readFooter(fileStatus.getLen(), fileStatus.getPath().toString(), f, SKIP_ROW_GROUPS); + this.footer = readFooter(converter, fileStatus.getLen(), fileStatus.getPath().toString(), f, SKIP_ROW_GROUPS); } catch (IOException e) { throw new ParquetDecodingException("Unable to read file footer", e); } diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/util/HadoopDataSource.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/util/HadoopDataSource.java new file mode 100644 index 0000000000..6007d1aeb4 --- /dev/null +++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/util/HadoopDataSource.java @@ -0,0 +1,79 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.parquet.hadoop.util; + +import org.apache.hadoop.conf.Configurable; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.parquet.io.SeekableInputStream; +import org.apache.parquet.io.ParquetDataSource; +import java.io.IOException; + +public class HadoopDataSource implements ParquetDataSource, Configurable { + + private final FileSystem fs; + private final FileStatus stat; + private Configuration conf; + + public static HadoopDataSource fromPath(Path path, Configuration conf) + throws IOException { + FileSystem fs = path.getFileSystem(conf); + return new HadoopDataSource(fs, fs.getFileStatus(path), conf); + } + + public static HadoopDataSource fromStatus(FileStatus stat, Configuration conf) + throws IOException { + FileSystem fs = stat.getPath().getFileSystem(conf); + return new HadoopDataSource(fs, stat, conf); + } + + private HadoopDataSource(FileSystem fs, FileStatus stat, Configuration conf) { + this.conf = conf; + this.fs = fs; + this.stat = stat; + } + + @Override + public String getLocation() { + return stat.getPath().toString(); + } + + @Override + public long getLength() { + return stat.getLen(); + } + + @Override + public SeekableInputStream newStream() throws IOException { + return HadoopStreams.wrap(fs.open(stat.getPath())); + } + + @Override + public void setConf(Configuration conf) { + this.conf = conf; + } + + @Override + public Configuration getConf() { + return conf; + } +} From 8c689e96e96198adcda75a49649dd47df7cbbdc3 Mon Sep 17 00:00:00 2001 From: Ryan Blue Date: Mon, 3 Oct 2016 11:49:25 -0700 Subject: [PATCH 2/2] PARQUET-674: Implement review comments. --- ...{ParquetDataSource.java => InputFile.java} | 11 ++----- .../parquet/hadoop/ParquetFileReader.java | 14 ++++----- ...opDataSource.java => HadoopInputFile.java} | 31 ++++++------------- 3 files changed, 19 insertions(+), 37 deletions(-) rename parquet-common/src/main/java/org/apache/parquet/io/{ParquetDataSource.java => InputFile.java} (83%) rename parquet-hadoop/src/main/java/org/apache/parquet/hadoop/util/{HadoopDataSource.java => HadoopInputFile.java} (68%) diff --git a/parquet-common/src/main/java/org/apache/parquet/io/ParquetDataSource.java b/parquet-common/src/main/java/org/apache/parquet/io/InputFile.java similarity index 83% rename from parquet-common/src/main/java/org/apache/parquet/io/ParquetDataSource.java rename to parquet-common/src/main/java/org/apache/parquet/io/InputFile.java index 5ea03443a0..e2c7cc0ac3 100644 --- a/parquet-common/src/main/java/org/apache/parquet/io/ParquetDataSource.java +++ b/parquet-common/src/main/java/org/apache/parquet/io/InputFile.java @@ -22,15 +22,10 @@ import java.io.IOException; /** - * {@code ParquetDataSource} is an interface with the methods needed by Parquet - * to read data files using {@link SeekableInputStream} instances. + * {@code InputFile} is an interface with the methods needed by Parquet to read + * data files using {@link SeekableInputStream} instances. */ -public interface ParquetDataSource { - - /** - * Returns the file location. - */ - String getLocation(); +public interface InputFile { /** * Returns the total length of the file, in bytes. diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileReader.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileReader.java index 30ce4bdb02..57cdb7dca8 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileReader.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileReader.java @@ -88,13 +88,13 @@ import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; import org.apache.parquet.hadoop.metadata.FileMetaData; import org.apache.parquet.hadoop.metadata.ParquetMetadata; -import org.apache.parquet.hadoop.util.HadoopDataSource; +import org.apache.parquet.hadoop.util.HadoopInputFile; import org.apache.parquet.hadoop.util.HiddenFileFilter; import org.apache.parquet.hadoop.util.HadoopStreams; import org.apache.parquet.io.SeekableInputStream; import org.apache.parquet.hadoop.util.counters.BenchmarkCounter; import org.apache.parquet.io.ParquetDecodingException; -import org.apache.parquet.io.ParquetDataSource; +import org.apache.parquet.io.InputFile; /** * Internal implementation of the Parquet file reader as a block container @@ -412,7 +412,7 @@ public static final ParquetMetadata readFooter(Configuration configuration, Path * @throws IOException if an error occurs while reading the file */ public static ParquetMetadata readFooter(Configuration configuration, Path file, MetadataFilter filter) throws IOException { - return readFooter(HadoopDataSource.fromPath(file, configuration), filter); + return readFooter(HadoopInputFile.fromPath(file, configuration), filter); } /** @@ -432,20 +432,20 @@ public static final ParquetMetadata readFooter(Configuration configuration, File * @throws IOException if an error occurs while reading the file */ public static final ParquetMetadata readFooter(Configuration configuration, FileStatus file, MetadataFilter filter) throws IOException { - return readFooter(HadoopDataSource.fromStatus(file, configuration), filter); + return readFooter(HadoopInputFile.fromStatus(file, configuration), filter); } /** * Reads the meta data block in the footer of the file using provided input stream - * @param file a {@link ParquetDataSource} to read + * @param file a {@link InputFile} to read * @param filter the filter to apply to row groups * @return the metadata blocks in the footer * @throws IOException if an error occurs while reading the file */ public static final ParquetMetadata readFooter( - ParquetDataSource file, MetadataFilter filter) throws IOException { + InputFile file, MetadataFilter filter) throws IOException { try (SeekableInputStream in = file.newStream()) { - return readFooter(converter, file.getLength(), file.getLocation(), + return readFooter(converter, file.getLength(), file.toString(), in, filter); } } diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/util/HadoopDataSource.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/util/HadoopInputFile.java similarity index 68% rename from parquet-hadoop/src/main/java/org/apache/parquet/hadoop/util/HadoopDataSource.java rename to parquet-hadoop/src/main/java/org/apache/parquet/hadoop/util/HadoopInputFile.java index 6007d1aeb4..d5868d3011 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/util/HadoopDataSource.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/util/HadoopInputFile.java @@ -19,44 +19,36 @@ package org.apache.parquet.hadoop.util; -import org.apache.hadoop.conf.Configurable; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.parquet.io.SeekableInputStream; -import org.apache.parquet.io.ParquetDataSource; +import org.apache.parquet.io.InputFile; import java.io.IOException; -public class HadoopDataSource implements ParquetDataSource, Configurable { +public class HadoopInputFile implements InputFile { private final FileSystem fs; private final FileStatus stat; - private Configuration conf; - public static HadoopDataSource fromPath(Path path, Configuration conf) + public static HadoopInputFile fromPath(Path path, Configuration conf) throws IOException { FileSystem fs = path.getFileSystem(conf); - return new HadoopDataSource(fs, fs.getFileStatus(path), conf); + return new HadoopInputFile(fs, fs.getFileStatus(path)); } - public static HadoopDataSource fromStatus(FileStatus stat, Configuration conf) + public static HadoopInputFile fromStatus(FileStatus stat, Configuration conf) throws IOException { FileSystem fs = stat.getPath().getFileSystem(conf); - return new HadoopDataSource(fs, stat, conf); + return new HadoopInputFile(fs, stat); } - private HadoopDataSource(FileSystem fs, FileStatus stat, Configuration conf) { - this.conf = conf; + private HadoopInputFile(FileSystem fs, FileStatus stat) { this.fs = fs; this.stat = stat; } - @Override - public String getLocation() { - return stat.getPath().toString(); - } - @Override public long getLength() { return stat.getLen(); @@ -68,12 +60,7 @@ public SeekableInputStream newStream() throws IOException { } @Override - public void setConf(Configuration conf) { - this.conf = conf; - } - - @Override - public Configuration getConf() { - return conf; + public String toString() { + return stat.getPath().toString(); } }