From 0f8e465e5a86e29af213c434e9d7a360ecd31bae Mon Sep 17 00:00:00 2001 From: huaxingao Date: Sun, 22 Jun 2025 23:48:16 -0700 Subject: [PATCH 1/6] Encapsulate Parquet objects --- .../apache/comet/parquet/ColumnReader.java | 7 + .../org/apache/comet/parquet/FileReader.java | 138 +++++++++++++++--- .../comet/parquet/ParquetColumnSpec.java | 69 +++++++++ .../apache/comet/parquet/RowGroupReader.java | 10 +- .../java/org/apache/comet/parquet/Utils.java | 41 ++++++ 5 files changed, 245 insertions(+), 20 deletions(-) create mode 100644 common/src/main/java/org/apache/comet/parquet/ParquetColumnSpec.java diff --git a/common/src/main/java/org/apache/comet/parquet/ColumnReader.java b/common/src/main/java/org/apache/comet/parquet/ColumnReader.java index 9502aa265d..b2fe965e2e 100644 --- a/common/src/main/java/org/apache/comet/parquet/ColumnReader.java +++ b/common/src/main/java/org/apache/comet/parquet/ColumnReader.java @@ -126,6 +126,13 @@ public void setPageReader(PageReader pageReader) throws IOException { } } + /** This method is called from Apache Iceberg. */ + public void setRowGroupReader(RowGroupReader rowGroupReader, ParquetColumnSpec columnSpec) + throws IOException { + ColumnDescriptor descriptor = Utils.buildColumnDescriptor(columnSpec); + setPageReader(rowGroupReader.getPageReader(descriptor)); + } + @Override public void readBatch(int total) { LOG.debug("Start to batch of size = " + total); diff --git a/common/src/main/java/org/apache/comet/parquet/FileReader.java b/common/src/main/java/org/apache/comet/parquet/FileReader.java index a85e0ebe76..7e54565ba8 100644 --- a/common/src/main/java/org/apache/comet/parquet/FileReader.java +++ b/common/src/main/java/org/apache/comet/parquet/FileReader.java @@ -27,6 +27,7 @@ import java.nio.ByteBuffer; import java.util.ArrayList; import java.util.Arrays; +import java.util.Collection; import java.util.HashMap; import java.util.List; import java.util.Map; @@ -40,6 +41,10 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.apache.commons.compress.utils.Sets; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.parquet.HadoopReadOptions; import org.apache.parquet.ParquetReadOptions; import org.apache.parquet.Preconditions; import org.apache.parquet.bytes.ByteBufferInputStream; @@ -72,6 +77,7 @@ import org.apache.parquet.hadoop.metadata.ColumnPath; import org.apache.parquet.hadoop.metadata.FileMetaData; import org.apache.parquet.hadoop.metadata.ParquetMetadata; +import org.apache.parquet.hadoop.util.HadoopInputFile; import org.apache.parquet.hadoop.util.counters.BenchmarkCounter; import org.apache.parquet.internal.column.columnindex.OffsetIndex; import org.apache.parquet.internal.filter2.columnindex.ColumnIndexFilter; @@ -102,7 +108,7 @@ public class FileReader implements Closeable { private final SeekableInputStream f; private final InputFile file; private final Map metrics; - private final Map paths = new HashMap<>(); + private final Map paths; private final FileMetaData fileMetaData; // may be null private final List blocks; private final List blockIndexStores; @@ -128,6 +134,36 @@ public FileReader(InputFile file, ParquetReadOptions options, ReadOptions cometO this(file, null, options, cometOptions, null); } + /** This constructor is called from Apache Iceberg. */ + public FileReader( + Path path, + Configuration conf, + ReadOptions cometOptions, + Map properties, + Long start, + Long length, + byte[] fileEncryptionKey, + byte[] fileAADPrefix) + throws IOException { + ParquetReadOptions options = + buildParquetReadOptions(conf, properties, start, length, fileEncryptionKey, fileAADPrefix); + this.converter = new ParquetMetadataConverter(options); + this.file = HadoopInputFile.fromPath(path, conf); + this.f = file.newStream(); + this.options = options; + this.cometOptions = cometOptions; + this.metrics = null; + footer = readFooter(file, f, options, converter); + this.fileMetaData = footer.getFileMetaData(); + this.fileDecryptor = initDecryptor(fileMetaData); + + this.blocks = footer.getBlocks(); + this.blockIndexStores = listWithNulls(this.blocks.size()); + this.blockRowRanges = listWithNulls(this.blocks.size()); + this.paths = buildPaths(fileMetaData); + this.crc = options.usePageChecksumVerification() ? new CRC32() : null; + } + public FileReader( InputFile file, ParquetReadOptions options, @@ -151,28 +187,16 @@ public FileReader( this.cometOptions = cometOptions; this.metrics = metrics; if (footer == null) { - try { - footer = readFooter(file, options, f, converter); - } catch (Exception e) { - // In case that reading footer throws an exception in the constructor, the new stream - // should be closed. Otherwise, there's no way to close this outside. - f.close(); - throw e; - } + footer = readFooter(file, f, options, converter); } this.footer = footer; this.fileMetaData = footer.getFileMetaData(); - this.fileDecryptor = fileMetaData.getFileDecryptor(); // must be called before filterRowGroups! - if (null != fileDecryptor && fileDecryptor.plaintextFile()) { - this.fileDecryptor = null; // Plaintext file. No need in decryptor - } + this.fileDecryptor = initDecryptor(fileMetaData); this.blocks = filterRowGroups(footer.getBlocks()); this.blockIndexStores = listWithNulls(this.blocks.size()); this.blockRowRanges = listWithNulls(this.blocks.size()); - for (ColumnDescriptor col : footer.getFileMetaData().getSchema().getColumns()) { - paths.put(ColumnPath.get(col.getPath()), col); - } + this.paths = buildPaths(fileMetaData); this.crc = options.usePageChecksumVerification() ? new CRC32() : null; } @@ -209,6 +233,82 @@ public void setRequestedSchema(List projection) { } } + /** This method is called from Apache Iceberg. */ + public void setRequestedSchemaFromSpecs(List specList) { + paths.clear(); + for (ParquetColumnSpec colSpec : specList) { + ColumnDescriptor descriptor = Utils.buildColumnDescriptor(colSpec); + paths.put(ColumnPath.get(colSpec.getPath()), descriptor); + } + } + + private static InternalFileDecryptor initDecryptor(FileMetaData meta) { + InternalFileDecryptor decryptor = meta.getFileDecryptor(); + return (decryptor != null && decryptor.plaintextFile()) ? null : decryptor; + } + + private static Map buildPaths(FileMetaData meta) { + Map paths = new HashMap<>(); + for (ColumnDescriptor col : meta.getSchema().getColumns()) { + paths.put(ColumnPath.get(col.getPath()), col); + } + return paths; + } + + private static ParquetMetadata readFooter( + InputFile file, + SeekableInputStream f, + ParquetReadOptions options, + ParquetMetadataConverter converter) + throws IOException { + try { + return readFooter(file, options, f, converter); + } catch (IOException e) { + f.close(); + throw e; + } + } + + private static ParquetReadOptions buildParquetReadOptions( + Configuration conf, + Map properties, + Long start, + Long length, + byte[] fileEncryptionKey, + byte[] fileAADPrefix) { + + Collection readPropertiesToRemove = + Sets.newHashSet( + "parquet.read.filter", + "parquet.private.read.filter.predicate", + "parquet.read.support.class", + "parquet.crypto.factory.class"); + + for (String property : readPropertiesToRemove) { + conf.unset(property); + } + + ParquetReadOptions.Builder optionsBuilder = HadoopReadOptions.builder(conf); + for (Map.Entry entry : properties.entrySet()) { + optionsBuilder.set(entry.getKey(), entry.getValue()); + } + + if (start != null && length != null) { + optionsBuilder.withRange(start, start + length); + } + + if (fileEncryptionKey != null) { + FileDecryptionProperties fileDecryptionProperties = + FileDecryptionProperties.builder() + .withFooterKey(fileEncryptionKey) + .withAADPrefix(fileAADPrefix) + .build(); + optionsBuilder.withDecryption(fileDecryptionProperties); + } + + return optionsBuilder.build(); + } + /** * Gets the total number of records across all row groups (after applying row group filtering). */ @@ -245,7 +345,7 @@ public boolean skipNextRowGroup() { * Returns the next row group to read (after applying row group filtering), or null if there's no * more row group. */ - public PageReadStore readNextRowGroup() throws IOException { + public RowGroupReader readNextRowGroup() throws IOException { if (currentBlock == blocks.size()) { return null; } @@ -253,7 +353,7 @@ public PageReadStore readNextRowGroup() throws IOException { if (block.getRowCount() == 0) { throw new RuntimeException("Illegal row group of 0 rows"); } - this.currentRowGroup = new RowGroupReader(block.getRowCount()); + this.currentRowGroup = new RowGroupReader(block.getRowCount(), block.getRowIndexOffset()); // prepare the list of consecutive parts to read them in one scan List allParts = new ArrayList<>(); ConsecutivePartList currentParts = null; @@ -362,7 +462,7 @@ ColumnIndexReader getColumnIndexReader(int blockIndex) { return ciStore; } - private PageReadStore readChunks( + private RowGroupReader readChunks( BlockMetaData block, List allParts, ChunkListBuilder builder) throws IOException { if (shouldReadParallel()) { diff --git a/common/src/main/java/org/apache/comet/parquet/ParquetColumnSpec.java b/common/src/main/java/org/apache/comet/parquet/ParquetColumnSpec.java new file mode 100644 index 0000000000..7faa6e62b3 --- /dev/null +++ b/common/src/main/java/org/apache/comet/parquet/ParquetColumnSpec.java @@ -0,0 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.comet.parquet; + +public class ParquetColumnSpec { + + private final String[] path; + private final String physicalType; + private final int typeLength; + private final boolean isRepeated; + private final int maxDefinitionLevel; + private final int maxRepetitionLevel; + + public ParquetColumnSpec( + String[] path, + String physicalType, + int typeLength, + boolean isRepeated, + int maxDefinitionLevel, + int maxRepetitionLevel) { + this.path = path; + this.physicalType = physicalType; + this.typeLength = typeLength; + this.isRepeated = isRepeated; + this.maxDefinitionLevel = maxDefinitionLevel; + this.maxRepetitionLevel = maxRepetitionLevel; + } + + public String[] getPath() { + return path; + } + + public String getPhysicalType() { + return physicalType; + } + + public int getTypeLength() { + return typeLength; + } + + public boolean isRepeated() { + return isRepeated; + } + + public int getMaxRepetitionLevel() { + return maxRepetitionLevel; + } + + public int getMaxDefinitionLevel() { + return maxDefinitionLevel; + } +} diff --git a/common/src/main/java/org/apache/comet/parquet/RowGroupReader.java b/common/src/main/java/org/apache/comet/parquet/RowGroupReader.java index d5d73b0783..1c7de2fe90 100644 --- a/common/src/main/java/org/apache/comet/parquet/RowGroupReader.java +++ b/common/src/main/java/org/apache/comet/parquet/RowGroupReader.java @@ -33,15 +33,18 @@ public class RowGroupReader implements PageReadStore { private final Map readers = new HashMap<>(); private final long rowCount; private final RowRanges rowRanges; + private final long rowIndexOffset; - public RowGroupReader(long rowCount) { + public RowGroupReader(long rowCount, long rowIndexOffset) { this.rowCount = rowCount; this.rowRanges = null; + this.rowIndexOffset = rowIndexOffset; } RowGroupReader(RowRanges rowRanges) { this.rowRanges = rowRanges; this.rowCount = rowRanges.rowCount(); + this.rowIndexOffset = -1; } @Override @@ -64,6 +67,11 @@ public Optional getRowIndexes() { return rowRanges == null ? Optional.empty() : Optional.of(rowRanges.iterator()); } + @Override + public Optional getRowIndexOffset() { + return this.rowIndexOffset < 0L ? Optional.empty() : Optional.of(this.rowIndexOffset); + } + void addColumn(ColumnDescriptor path, ColumnPageReader reader) { if (readers.put(path, reader) != null) { throw new IllegalStateException(path + " was already added"); diff --git a/common/src/main/java/org/apache/comet/parquet/Utils.java b/common/src/main/java/org/apache/comet/parquet/Utils.java index 2f9c507366..d64ab371a3 100644 --- a/common/src/main/java/org/apache/comet/parquet/Utils.java +++ b/common/src/main/java/org/apache/comet/parquet/Utils.java @@ -21,7 +21,9 @@ import org.apache.parquet.column.ColumnDescriptor; import org.apache.parquet.schema.LogicalTypeAnnotation; +import org.apache.parquet.schema.MessageType; import org.apache.parquet.schema.PrimitiveType; +import org.apache.parquet.schema.Type; import org.apache.spark.sql.types.*; import org.apache.comet.CometSchemaImporter; @@ -29,6 +31,19 @@ public class Utils { /** This method is called from Apache Iceberg. */ + public static ColumnReader getColumnReader( + DataType type, + ParquetColumnSpec columnSpec, + CometSchemaImporter importer, + int batchSize, + boolean useDecimal128, + boolean useLazyMaterialization) { + + ColumnDescriptor descriptor = buildColumnDescriptor(columnSpec); + return getColumnReader( + type, descriptor, importer, batchSize, useDecimal128, useLazyMaterialization, true); + } + public static ColumnReader getColumnReader( DataType type, ColumnDescriptor descriptor, @@ -260,4 +275,30 @@ static int getTimeUnitId(LogicalTypeAnnotation.TimeUnit tu) { throw new UnsupportedOperationException("Unsupported TimeUnit " + tu); } } + + public static ColumnDescriptor buildColumnDescriptor(ParquetColumnSpec columnSpec) { + PrimitiveType.PrimitiveTypeName primType = + PrimitiveType.PrimitiveTypeName.valueOf(columnSpec.getPhysicalType()); + + Type.Repetition repetition; + if (columnSpec.getMaxRepetitionLevel() > 0) { + repetition = Type.Repetition.REPEATED; + } else if (columnSpec.getMaxDefinitionLevel() > 0) { + repetition = Type.Repetition.OPTIONAL; + } else { + repetition = Type.Repetition.REQUIRED; + } + + String name = columnSpec.getPath()[columnSpec.getPath().length - 1]; + + PrimitiveType primitiveType; + if (primType == PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY) { + primitiveType = new PrimitiveType(repetition, primType, columnSpec.getTypeLength(), name); + } else { + primitiveType = new PrimitiveType(repetition, primType, name); + } + + MessageType schema = new MessageType("root", primitiveType); + return schema.getColumnDescription(columnSpec.getPath()); + } } From ab784001a8a83299781f1110804d4f35c19d058c Mon Sep 17 00:00:00 2001 From: huaxingao Date: Tue, 24 Jun 2025 07:22:02 -0700 Subject: [PATCH 2/6] fix --- .../org/apache/comet/parquet/FileReader.java | 76 ++++++++----------- 1 file changed, 32 insertions(+), 44 deletions(-) diff --git a/common/src/main/java/org/apache/comet/parquet/FileReader.java b/common/src/main/java/org/apache/comet/parquet/FileReader.java index 7e54565ba8..51870fcc93 100644 --- a/common/src/main/java/org/apache/comet/parquet/FileReader.java +++ b/common/src/main/java/org/apache/comet/parquet/FileReader.java @@ -108,7 +108,7 @@ public class FileReader implements Closeable { private final SeekableInputStream f; private final InputFile file; private final Map metrics; - private final Map paths; + private final Map paths = new HashMap<>(); private final FileMetaData fileMetaData; // may be null private final List blocks; private final List blockIndexStores; @@ -153,14 +153,26 @@ public FileReader( this.options = options; this.cometOptions = cometOptions; this.metrics = null; - footer = readFooter(file, f, options, converter); + try { + this.footer = readFooter(file, options, f, converter); + } catch (Exception e) { + // In case that reading footer throws an exception in the constructor, the new stream + // should be closed. Otherwise, there's no way to close this outside. + f.close(); + throw e; + } this.fileMetaData = footer.getFileMetaData(); - this.fileDecryptor = initDecryptor(fileMetaData); + this.fileDecryptor = fileMetaData.getFileDecryptor(); // must be called before filterRowGroups! + if (null != fileDecryptor && fileDecryptor.plaintextFile()) { + this.fileDecryptor = null; // Plaintext file. No need in decryptor + } - this.blocks = footer.getBlocks(); + this.blocks = footer.getBlocks(); // filter row group in iceberg this.blockIndexStores = listWithNulls(this.blocks.size()); this.blockRowRanges = listWithNulls(this.blocks.size()); - this.paths = buildPaths(fileMetaData); + for (ColumnDescriptor col : footer.getFileMetaData().getSchema().getColumns()) { + paths.put(ColumnPath.get(col.getPath()), col); + } this.crc = options.usePageChecksumVerification() ? new CRC32() : null; } @@ -187,16 +199,28 @@ public FileReader( this.cometOptions = cometOptions; this.metrics = metrics; if (footer == null) { - footer = readFooter(file, f, options, converter); + try { + footer = readFooter(file, options, f, converter); + } catch (Exception e) { + // In case that reading footer throws an exception in the constructor, the new stream + // should be closed. Otherwise, there's no way to close this outside. + f.close(); + throw e; + } } this.footer = footer; this.fileMetaData = footer.getFileMetaData(); - this.fileDecryptor = initDecryptor(fileMetaData); + this.fileDecryptor = fileMetaData.getFileDecryptor(); // must be called before filterRowGroups! + if (null != fileDecryptor && fileDecryptor.plaintextFile()) { + this.fileDecryptor = null; // Plaintext file. No need in decryptor + } this.blocks = filterRowGroups(footer.getBlocks()); this.blockIndexStores = listWithNulls(this.blocks.size()); this.blockRowRanges = listWithNulls(this.blocks.size()); - this.paths = buildPaths(fileMetaData); + for (ColumnDescriptor col : footer.getFileMetaData().getSchema().getColumns()) { + paths.put(ColumnPath.get(col.getPath()), col); + } this.crc = options.usePageChecksumVerification() ? new CRC32() : null; } @@ -233,42 +257,6 @@ public void setRequestedSchema(List projection) { } } - /** This method is called from Apache Iceberg. */ - public void setRequestedSchemaFromSpecs(List specList) { - paths.clear(); - for (ParquetColumnSpec colSpec : specList) { - ColumnDescriptor descriptor = Utils.buildColumnDescriptor(colSpec); - paths.put(ColumnPath.get(colSpec.getPath()), descriptor); - } - } - - private static InternalFileDecryptor initDecryptor(FileMetaData meta) { - InternalFileDecryptor decryptor = meta.getFileDecryptor(); - return (decryptor != null && decryptor.plaintextFile()) ? null : decryptor; - } - - private static Map buildPaths(FileMetaData meta) { - Map paths = new HashMap<>(); - for (ColumnDescriptor col : meta.getSchema().getColumns()) { - paths.put(ColumnPath.get(col.getPath()), col); - } - return paths; - } - - private static ParquetMetadata readFooter( - InputFile file, - SeekableInputStream f, - ParquetReadOptions options, - ParquetMetadataConverter converter) - throws IOException { - try { - return readFooter(file, options, f, converter); - } catch (IOException e) { - f.close(); - throw e; - } - } - private static ParquetReadOptions buildParquetReadOptions( Configuration conf, Map properties, From 8f5d7b928a992694f62818c04c579a87e7af8458 Mon Sep 17 00:00:00 2001 From: huaxingao Date: Wed, 25 Jun 2025 13:19:56 -0700 Subject: [PATCH 3/6] add setRequestedSchemaFromSpecs --- common/pom.xml | 2 +- common/pom.xml.versionsBackup | 220 ++++ .../org/apache/comet/parquet/FileReader.java | 9 + fuzz-testing/pom.xml | 2 +- fuzz-testing/pom.xml.versionsBackup | 117 ++ pom.xml | 2 +- pom.xml.versionsBackup | 1135 +++++++++++++++++ spark-integration/pom.xml | 2 +- spark-integration/pom.xml.versionsBackup | 113 ++ spark/pom.xml | 2 +- spark/pom.xml.versionsBackup | 336 +++++ 11 files changed, 1935 insertions(+), 5 deletions(-) create mode 100644 common/pom.xml.versionsBackup create mode 100644 fuzz-testing/pom.xml.versionsBackup create mode 100644 pom.xml.versionsBackup create mode 100644 spark-integration/pom.xml.versionsBackup create mode 100644 spark/pom.xml.versionsBackup diff --git a/common/pom.xml b/common/pom.xml index 36bc706053..68a238432b 100644 --- a/common/pom.xml +++ b/common/pom.xml @@ -26,7 +26,7 @@ under the License. org.apache.datafusion comet-parent-spark${spark.version.short}_${scala.binary.version} - 0.9.0-SNAPSHOT + 0.9.0.1-SNAPSHOT ../pom.xml diff --git a/common/pom.xml.versionsBackup b/common/pom.xml.versionsBackup new file mode 100644 index 0000000000..36bc706053 --- /dev/null +++ b/common/pom.xml.versionsBackup @@ -0,0 +1,220 @@ + + + + + + + 4.0.0 + + org.apache.datafusion + comet-parent-spark${spark.version.short}_${scala.binary.version} + 0.9.0-SNAPSHOT + ../pom.xml + + + comet-common-spark${spark.version.short}_${scala.binary.version} + comet-common + + + + false + + + + + org.apache.spark + spark-sql_${scala.binary.version} + + + org.apache.parquet + parquet-column + + + org.apache.parquet + parquet-hadoop + + + org.apache.arrow + arrow-vector + + + org.apache.arrow + arrow-memory-unsafe + + + org.apache.arrow + arrow-c-data + + + junit + junit + test + + + org.assertj + assertj-core + test + + + + + + + io.github.git-commit-id + git-commit-id-maven-plugin + ${git-commit-id-maven-plugin.version} + + + get-the-git-infos + + revision + + initialize + + + + true + ${project.build.outputDirectory}/comet-git-info.properties + full + + ^git.branch$ + ^git.build.*$ + ^git.commit.id.(abbrev|full)$ + ^git.remote.*$ + + + + + org.apache.maven.plugins + maven-shade-plugin + + + package + + shade + + + true + true + false + true + + + + org.apache.arrow:* + + + + + *:* + + **/*.thrift + git.properties + log4j.properties + log4j2.properties + arrow-git.properties + + + + org.apache.arrow:arrow-vector + + + codegen/** + + + + + + org.apache.arrow + ${comet.shade.packageName}.arrow + + + org/apache/arrow/c/jni/JniWrapper + org/apache/arrow/c/jni/PrivateData + org/apache/arrow/c/jni/CDataJniException + + org/apache/arrow/c/ArrayStreamExporter$ExportedArrayStreamPrivateData + + + + + + + + + + net.alchim31.maven + scala-maven-plugin + + + org.codehaus.mojo + build-helper-maven-plugin + + + add-shim-source + generate-sources + + add-source + + + + src/main/${shims.majorVerSrc} + src/main/${shims.minorVerSrc} + + + + + + + + + ${project.basedir}/src/main/resources + + + ${project.basedir}/../native/target/x86_64-apple-darwin/release + + libcomet.dylib + + org/apache/comet/darwin/x86_64 + + + ${project.basedir}/../native/target/aarch64-apple-darwin/release + + libcomet.dylib + + org/apache/comet/darwin/aarch64 + + + ${jni.dir} + + libcomet.dylib + libcomet.so + comet.dll + + org/apache/comet/${platform}/${arch} + + + + + diff --git a/common/src/main/java/org/apache/comet/parquet/FileReader.java b/common/src/main/java/org/apache/comet/parquet/FileReader.java index 51870fcc93..59218ec10f 100644 --- a/common/src/main/java/org/apache/comet/parquet/FileReader.java +++ b/common/src/main/java/org/apache/comet/parquet/FileReader.java @@ -257,6 +257,15 @@ public void setRequestedSchema(List projection) { } } + /** This method is called from Apache Iceberg. */ + public void setRequestedSchemaFromSpecs(List specList) { + paths.clear(); + for (ParquetColumnSpec colSpec : specList) { + ColumnDescriptor descriptor = Utils.buildColumnDescriptor(colSpec); + paths.put(ColumnPath.get(colSpec.getPath()), descriptor); + } + } + private static ParquetReadOptions buildParquetReadOptions( Configuration conf, Map properties, diff --git a/fuzz-testing/pom.xml b/fuzz-testing/pom.xml index a748b18140..72d4dee62d 100644 --- a/fuzz-testing/pom.xml +++ b/fuzz-testing/pom.xml @@ -25,7 +25,7 @@ under the License. org.apache.datafusion comet-parent-spark${spark.version.short}_${scala.binary.version} - 0.9.0-SNAPSHOT + 0.9.0.1-SNAPSHOT ../pom.xml diff --git a/fuzz-testing/pom.xml.versionsBackup b/fuzz-testing/pom.xml.versionsBackup new file mode 100644 index 0000000000..a748b18140 --- /dev/null +++ b/fuzz-testing/pom.xml.versionsBackup @@ -0,0 +1,117 @@ + + + + 4.0.0 + + + org.apache.datafusion + comet-parent-spark${spark.version.short}_${scala.binary.version} + 0.9.0-SNAPSHOT + ../pom.xml + + + comet-fuzz-spark${spark.version.short}_${scala.binary.version} + comet-fuzz + http://maven.apache.org + jar + + + + false + + + + + org.scala-lang + scala-library + ${scala.version} + provided + + + org.apache.spark + spark-sql_${scala.binary.version} + provided + + + org.apache.datafusion + comet-spark-spark${spark.version.short}_${scala.binary.version} + ${project.version} + + + org.rogach + scallop_${scala.binary.version} + + + + + src/main/scala + + + org.apache.maven.plugins + maven-compiler-plugin + ${maven-compiler-plugin.version} + + ${java.version} + ${java.version} + + + + net.alchim31.maven + scala-maven-plugin + ${scala.plugin.version} + + + + compile + testCompile + + + + + + maven-assembly-plugin + ${maven-assembly-plugin.version} + + + jar-with-dependencies + + + + + make-assembly + package + + single + + + + + + org.apache.maven.plugins + maven-install-plugin + + true + + + + + diff --git a/pom.xml b/pom.xml index 6c9f97a8e7..d07075ad5d 100644 --- a/pom.xml +++ b/pom.xml @@ -30,7 +30,7 @@ under the License. org.apache.datafusion comet-parent-spark${spark.version.short}_${scala.binary.version} - 0.9.0-SNAPSHOT + 0.9.0.1-SNAPSHOT pom Comet Project Parent POM diff --git a/pom.xml.versionsBackup b/pom.xml.versionsBackup new file mode 100644 index 0000000000..167df5bb4d --- /dev/null +++ b/pom.xml.versionsBackup @@ -0,0 +1,1135 @@ + + + + + + + 4.0.0 + + org.apache + apache + 23 + + org.apache.datafusion + comet-parent-spark${spark.version.short}_${scala.binary.version} + 0.9.0-SNAPSHOT + pom + Comet Project Parent POM + + + common + spark + spark-integration + fuzz-testing + + + + UTF-8 + UTF-8 + 11 + ${java.version} + ${java.version} + 3.11.0 + 3.6.0 + 3.2.4 + 3.1.2 + 3.3.0 + 3.3.0 + 3.1.0 + 9.1 + 3.4.0 + 1.3.0 + 1.0.0 + 4.9.9 + 3.1.0 + 3.11.4 + 0.1.7_0.10.4 + 1.7.0 + 3.6.1 + 0.16 + 2.12.18 + 2.12 + 4.8.0 + 3.2.16 + 2.2.0 + 3.5.6 + 3.5 + provided + 3.25.5 + 1.13.1 + provided + 3.3.4 + 18.3.0 + 1.9.13 + 2.43.0 + 0.8.11 + 4.8.8 + 2.0.7 + 33.2.1-jre + 1.21.0 + 2.31.51 + ${project.basedir}/../native/target/debug + darwin + x86_64 + org.apache.comet.shaded + + ${session.executionRootDirectory} + + + true + + + + -XX:+IgnoreUnrecognizedVMOptions + --add-opens=java.base/java.lang=ALL-UNNAMED + --add-opens=java.base/java.lang.invoke=ALL-UNNAMED + --add-opens=java.base/java.lang.reflect=ALL-UNNAMED + --add-opens=java.base/java.io=ALL-UNNAMED + --add-opens=java.base/java.net=ALL-UNNAMED + --add-opens=java.base/java.nio=ALL-UNNAMED + --add-opens=java.base/java.util=ALL-UNNAMED + --add-opens=java.base/java.util.concurrent=ALL-UNNAMED + --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED + --add-opens=java.base/jdk.internal.ref=ALL-UNNAMED + --add-opens=java.base/sun.nio.ch=ALL-UNNAMED + --add-opens=java.base/sun.nio.cs=ALL-UNNAMED + --add-opens=java.base/sun.security.action=ALL-UNNAMED + --add-opens=java.base/sun.util.calendar=ALL-UNNAMED + -Djdk.reflect.useDirectMethodHandle=false + + -ea -Xmx4g -Xss4m ${extraJavaTestArgs} + spark-3.x + spark-3.5 + + + + + + + org.apache.spark + spark-sql_${scala.binary.version} + ${spark.version} + ${spark.maven.scope} + + + org.apache.parquet + parquet-hadoop + + + org.apache.parquet + parquet-column + + + com.google.guava + guava + + + + + commons-logging + commons-logging + + + + org.apache.arrow + * + + + + + org.apache.spark + spark-catalyst_${scala.binary.version} + ${spark.version} + ${spark.maven.scope} + + + + + commons-logging + commons-logging + + + + + org.apache.arrow + * + + + + + + + org.apache.arrow + arrow-vector + ${arrow.version} + + + + com.fasterxml.jackson.core + jackson-annotations + + + com.fasterxml.jackson.core + jackson-core + + + io.netty + netty-common + + + com.google.code.findbugs + jsr305 + + + + + org.apache.arrow + arrow-memory-unsafe + ${arrow.version} + + + org.apache.arrow + arrow-c-data + ${arrow.version} + + + + + org.apache.parquet + parquet-column + ${parquet.version} + ${parquet.maven.scope} + + + org.apache.parquet + parquet-hadoop + ${parquet.version} + ${parquet.maven.scope} + + + + javax.annotation + javax.annotation-api + + + + + org.apache.parquet + parquet-avro + ${parquet.version} + test + + + org.apache.parquet + parquet-hadoop + ${parquet.version} + tests + test + + + + javax.annotation + javax.annotation-api + + + + + + + org.scala-lang + scala-library + ${scala.version} + + + com.google.protobuf + protobuf-java + ${protobuf.version} + + + org.slf4j + slf4j-api + ${slf4j.version} + ${spark.maven.scope} + + + + + com.google.guava + guava + ${guava.version} + + + + + + org.apache.spark + spark-core_${scala.binary.version} + ${spark.version} + tests + test + + + org.apache.datafusion + * + + + + + org.apache.arrow + * + + + + + commons-logging + commons-logging + + + + com.google.code.findbugs + jsr305 + + + + + org.apache.spark + spark-catalyst_${scala.binary.version} + ${spark.version} + tests + test + + + + + org.apache.arrow + * + + + + + commons-logging + commons-logging + + + + com.google.code.findbugs + jsr305 + + + + + org.apache.spark + spark-sql_${scala.binary.version} + ${spark.version} + tests + test + + + org.apache.parquet + parquet-hadoop + + + org.apache.parquet + parquet-column + + + org.apache.datafusion + * + + + + + commons-logging + commons-logging + + + + org.apache.arrow + * + + + + + org.apache.spark + spark-hive_${scala.binary.version} + ${spark.version} + test + + + org.apache.datafusion + * + + + commons-logging + commons-logging + + + org.apache.arrow + * + + + + + junit + junit + 4.13.2 + test + + + org.assertj + assertj-core + 3.23.1 + test + + + org.scalatest + scalatest_${scala.binary.version} + ${scalatest.version} + test + + + org.scala-lang + scala-reflect + ${scala.version} + test + + + org.scalatestplus + junit-4-13_${scala.binary.version} + 3.2.16.0 + test + + + + + org.apache.spark + spark-hadoop-cloud_${scala.binary.version} + ${spark.version} + tests + test + + + + org.apache.hadoop.thirdparty + hadoop-shaded-guava + + + org.apache.hadoop + hadoop-annotations + + + javax.xml.bind + jaxb-api + + + + + commons-logging + commons-logging + + + + org.apache.arrow + * + + + + + + + org.testcontainers + minio + ${testcontainers.version} + test + + + software.amazon.awssdk + s3 + ${amazon-awssdk-v2.version} + test + + + + org.codehaus.jackson + jackson-mapper-asl + ${codehaus.jackson.version} + test + + + + org.rogach + scallop_${scala.binary.version} + 5.1.0 + + + + org.apache.hadoop + hadoop-client-minicluster + ${hadoop.version} + test + + + + + + + + + release + + ${project.basedir}/../native/target/release + + + + + Win-x86 + + + Windows + x86 + + + + win32 + x86_64 + + + + + Win-amd64 + + + Windows + amd64 + + + + win32 + amd64 + + + + + Darwin-x86 + + + mac + x86 + + + + darwin + x86_64 + + + + + Darwin-aarch64 + + + mac + aarch64 + + + + darwin + aarch64 + + + + + Linux-amd64 + + + Linux + amd64 + + + + linux + amd64 + + + + + Linux-aarch64 + + + Linux + aarch64 + + + + linux + aarch64 + + + + + spark-3.4 + + 2.12.17 + 3.4.3 + 3.4 + 1.13.1 + 2.0.6 + spark-3.4 + + + + + spark-3.5 + + 2.12.18 + 3.5.6 + 3.5 + 1.13.1 + 2.0.7 + spark-3.5 + + + + + + spark-4.0 + + + 2.13.14 + 2.13 + 4.0.0-preview1 + 4.0 + 1.13.1 + 4.9.5 + 2.0.13 + spark-4.0 + not-needed-yet + + 17 + ${java.version} + ${java.version} + + + + + scala-2.12 + + + + scala-2.13 + + 2.13.14 + 2.13 + 4.9.5 + + + + + jdk11 + + 11 + + + 11 + ${java.version} + ${java.version} + + + + + jdk17 + + 17 + + + 17 + ${java.version} + ${java.version} + + + + + semanticdb + + true + true + true + + + + + + net.alchim31.maven + scala-maven-plugin + ${scala.plugin.version} + + + + compile + testCompile + + + + + + -Ywarn-unused + + + -source + ${java.version} + -target + ${java.version} + -Xlint:all,-serial,-path,-try + + + + org.scalameta + semanticdb-scalac_${scala.version} + ${semanticdb.version} + + + + + + io.github.evis + scalafix-maven-plugin_${scala.binary.version} + ${scalafix-maven-plugin.version} + + + + + + + + + + + + net.alchim31.maven + scala-maven-plugin + ${scala.plugin.version} + + + eclipse-add-source + + add-source + + + + scala-compile-first + process-resources + + compile + add-source + + + + scala-test-compile-first + process-test-resources + + testCompile + + + + + ${scala.version} + true + true + incremental + + -unchecked + -deprecation + -feature + -explaintypes + -Xlint:adapted-args + + + -Xms1024m + -Xmx1024m + + + -source + ${maven.compiler.source} + -target + ${maven.compiler.target} + -Xlint:all,-serial,-path,-try + + + + + org.scalatest + scalatest-maven-plugin + ${scalatest-maven-plugin.version} + + ${project.build.directory}/surefire-reports + . + SparkTestSuite.txt + D + + org.apache.comet.IntegrationTestSuite + + + file:src/test/resources/log4j2.properties + true + ${project.build.directory}/tmp + + + + + test + + test + + + + + + org.apache.maven.plugins + maven-shade-plugin + ${maven-shade-plugin.version} + + + org.ow2.asm + asm + ${asm.version} + + + org.ow2.asm + asm-commons + ${asm.version} + + + + + org.apache.maven.plugins + maven-surefire-plugin + ${maven-surefire-plugin.version} + + + file:src/test/resources/log4j2.properties + + false + + + + org.apache.maven.plugins + maven-source-plugin + ${maven-source-plugin.version} + + true + + + + create-source-jar + + jar-no-fork + test-jar-no-fork + + + + + + org.apache.maven.plugins + maven-compiler-plugin + ${maven-compiler-plugin.version} + + ${java.version} + ${java.version} + true + true + + + + org.apache.maven.plugins + maven-failsafe-plugin + ${maven-failsafe-plugin.version} + + + com.diffplug.spotless + spotless-maven-plugin + ${spotless.version} + + + + + + + java|javax,scala,org,org.apache,com,org.apache.comet,\#,\#org.apache.comet + + + ${maven.multiModuleProjectDirectory}/dev/copyright/java-header.txt + + + + + + ${scalafmt.version} + ${maven.multiModuleProjectDirectory}/scalafmt.conf + + + ${maven.multiModuleProjectDirectory}/dev/copyright/scala-header.txt + + + + + + org.codehaus.mojo + flatten-maven-plugin + ${flatten-maven-plugin.version} + + + org.jacoco + jacoco-maven-plugin + ${jacoco.version} + + + org.codehaus.mojo + build-helper-maven-plugin + ${build-helper-maven-plugin.version} + + + + + + org.scalastyle + scalastyle-maven-plugin + ${scalastyle-maven-plugin.version} + + false + true + false + false + ${basedir}/src/main/scala + ${basedir}/src/test/scala + ${maven.multiModuleProjectDirectory}/dev/scalastyle-config.xml + ${basedir}/target/scalastyle-output.xml + ${project.build.sourceEncoding} + ${project.reporting.outputEncoding} + + + + + check + + compile + + + + + com.diffplug.spotless + spotless-maven-plugin + + + + check + + compile + + + + + org.apache.maven.plugins + maven-source-plugin + + + org.apache.maven.plugins + maven-failsafe-plugin + + + + integration-test + verify + + + false + + + + + + org.apache.rat + apache-rat-plugin + ${apache-rat-plugin.version} + + + verify + + check + + + + + true + + **/*.iml + **/*.log + **/*.md.vm + **/.classpath + **/.project + **/.settings/** + **/build/** + **/target/** + **/apache-spark/** + .dockerignore + .git/** + .github/** + .gitignore + .gitmodules + **/.idea/** + **/dependency-reduced-pom.xml + **/testdata/** + **/.lldbinit + rust-toolchain + Makefile + dev/Dockerfile* + dev/diffs/** + dev/deploy-file + **/test/resources/** + **/benchmarks/*.txt + **/inspections/*.txt + tpcds-kit/** + tpcds-sf-1/** + tpch/** + docs/*.txt + docs/logos/*.png + docs/logos/*.svg + docs/source/_static/images/** + dev/release/rat_exclude_files.txt + dev/release/requirements.txt + native/proto/src/generated/** + + + + + org.apache.maven.plugins + maven-enforcer-plugin + ${maven-enforcer-plugin.version} + + + no-duplicate-declared-dependencies + + enforce + + + + + + + + compile + provided + + + org.apache.spark.unused.UnusedStubClass + + + + org.apache.spark + spark-sql_${scala.binary.version} + + + javax.annotation.meta.TypeQualifier + javax.annotation.Nonnull + javax.annotation.meta.When + javax.annotation.Nonnull$Checker + javax.annotation.meta.TypeQualifierValidator + + org.apache.parquet.filter2.predicate.SparkFilterApi + + + org.apache.spark.sql.ExtendedExplainGenerator + + + + com.google.code.findbugs + jsr305 + + + javax.annotation.meta.TypeQualifier + javax.annotation.Nonnull + javax.annotation.meta.When + javax.annotation.Nonnull$Checker + javax.annotation.meta.TypeQualifierValidator + javax.annotation.Nullable + javax.annotation.meta.TypeQualifierNickname + + + + true + true + + + + + + + + org.codehaus.mojo + extra-enforcer-rules + ${extra-enforcer-rules.version} + + + + + org.codehaus.mojo + flatten-maven-plugin + + + + flatten + process-resources + + flatten + + + + + flatten.clean + clean + + clean + + + + + + org.jacoco + jacoco-maven-plugin + + + default-prepare-agent + + prepare-agent + + + + report + test + + report + + + + + + + diff --git a/spark-integration/pom.xml b/spark-integration/pom.xml index d381359f6a..4c3c28073e 100644 --- a/spark-integration/pom.xml +++ b/spark-integration/pom.xml @@ -26,7 +26,7 @@ under the License. org.apache.datafusion comet-parent-spark${spark.version.short}_${scala.binary.version} - 0.9.0-SNAPSHOT + 0.9.0.1-SNAPSHOT ../pom.xml diff --git a/spark-integration/pom.xml.versionsBackup b/spark-integration/pom.xml.versionsBackup new file mode 100644 index 0000000000..d381359f6a --- /dev/null +++ b/spark-integration/pom.xml.versionsBackup @@ -0,0 +1,113 @@ + + + + + + + 4.0.0 + + org.apache.datafusion + comet-parent-spark${spark.version.short}_${scala.binary.version} + 0.9.0-SNAPSHOT + ../pom.xml + + + comet-spark-integration-spark${spark.version.short}_${scala.binary.version} + comet-spark-integration + pom + + + + false + + + + + org.apache.datafusion + comet-spark-spark${spark.version.short}_${scala.binary.version} + ${project.version} + + + + org.apache.datafusion + comet-common-spark${spark.version.short}_${scala.binary.version} + + + + + + + + + org.apache.maven.plugins + maven-dependency-plugin + + + + put-client-artifacts-in-a-property + pre-integration-test + + build-classpath + + + true + ; + comet-artifacts + + + + + + org.codehaus.mojo + exec-maven-plugin + + + check-jar-contents + integration-test + + exec + + + bash + ${project.build.testOutputDirectory} + false + + ${project.basedir}/../dev/ensure-jars-have-correct-contents.sh + ${comet-artifacts} + + + + + + + org.apache.maven.plugins + maven-install-plugin + + true + + + + + + diff --git a/spark/pom.xml b/spark/pom.xml index 95ea6971b9..79bc6f83e6 100644 --- a/spark/pom.xml +++ b/spark/pom.xml @@ -26,7 +26,7 @@ under the License. org.apache.datafusion comet-parent-spark${spark.version.short}_${scala.binary.version} - 0.9.0-SNAPSHOT + 0.9.0.1-SNAPSHOT ../pom.xml diff --git a/spark/pom.xml.versionsBackup b/spark/pom.xml.versionsBackup new file mode 100644 index 0000000000..95ea6971b9 --- /dev/null +++ b/spark/pom.xml.versionsBackup @@ -0,0 +1,336 @@ + + + + + + + 4.0.0 + + org.apache.datafusion + comet-parent-spark${spark.version.short}_${scala.binary.version} + 0.9.0-SNAPSHOT + ../pom.xml + + + comet-spark-spark${spark.version.short}_${scala.binary.version} + comet-spark + + + + false + + + + + org.apache.datafusion + comet-common-spark${spark.version.short}_${scala.binary.version} + ${project.version} + + + org.apache.arrow + * + + + + + org.apache.spark + spark-sql_${scala.binary.version} + + + org.scala-lang + scala-library + + + org.scala-lang + scala-reflect + provided + + + com.google.protobuf + protobuf-java + + + org.scalatest + scalatest_${scala.binary.version} + test + + + org.scalatestplus + junit-4-13_${scala.binary.version} + test + + + org.apache.spark + spark-core_${scala.binary.version} + tests + + + org.apache.spark + spark-catalyst_${scala.binary.version} + tests + + + org.apache.spark + spark-sql_${scala.binary.version} + tests + + + org.apache.spark + spark-hadoop-cloud_${scala.binary.version} + tests + + + junit + junit + test + + + com.google.guava + guava + + + * + * + + + + + org.codehaus.jackson + jackson-mapper-asl + test + + + org.apache.parquet + parquet-hadoop + + + org.apache.parquet + parquet-hadoop + tests + + + + + org.apache.arrow + arrow-memory-unsafe + test + + + org.apache.arrow + arrow-c-data + test + + + org.apache.hadoop + hadoop-client-minicluster + test + + + + hadoop-client-api + org.apache.hadoop + + + hadoop-client-runtime + org.apache.hadoop + + + snappy-java + org.xerial.snappy + + + junit + junit + + + + + org.testcontainers + minio + + + software.amazon.awssdk + s3 + + + + + + + com.github.os72 + protoc-jar-maven-plugin + ${protoc-jar-maven-plugin.version} + + + generate-sources + + run + + + com.google.protobuf:protoc:${protobuf.version} + + ../native/proto/src/proto + + + + + + + org.scalatest + scalatest-maven-plugin + + + org.apache.maven.plugins + maven-shade-plugin + + + package + + shade + + + true + true + false + true + + + org.apache.datafusion:comet-common-spark${spark.version.short}_${scala.binary.version} + + com.google.protobuf:protobuf-java + com.google.guava:guava + + + + + *:* + + **/*.proto + **/*.thrift + git.properties + log4j.properties + log4j2.properties + **/SparkFilterApi.* + + + + org.apache.parquet:parquet-hadoop:tests + + + org/apache/parquet/crypto/keytools/mocks/InMemoryKMS* + + + + + + com.google.protobuf + ${comet.shade.packageName}.protobuf + + + com.google.common + ${comet.shade.packageName}.guava + + + com.google.thirdparty + ${comet.shade.packageName}.guava.thirdparty + + + + + + + + org.apache.maven.plugins + maven-failsafe-plugin + + + + integration-test + verify + + + false + -ea ${extraJavaTestArgs} + + + + + + net.alchim31.maven + scala-maven-plugin + + + org.codehaus.mojo + build-helper-maven-plugin + + + add-test-source + generate-test-sources + + add-test-source + + + + src/test/${shims.majorVerSrc} + src/test/${shims.minorVerSrc} + + + + + add-shim-source + generate-sources + + add-source + + + + src/main/${shims.majorVerSrc} + src/main/${shims.minorVerSrc} + + + + + + + org.codehaus.mojo + exec-maven-plugin + ${exec-maven-plugin.version} + + + generate-user-guide-reference-docs + package + + java + + + org.apache.comet.GenerateDocs + compile + + + + + + + + From 068ea7307d20224928f43aff2e41dd03ec1313e1 Mon Sep 17 00:00:00 2001 From: huaxingao Date: Wed, 25 Jun 2025 14:08:02 -0700 Subject: [PATCH 4/6] remove accidetally changes --- common/pom.xml | 2 +- common/pom.xml.versionsBackup | 220 ----- fuzz-testing/pom.xml | 2 +- fuzz-testing/pom.xml.versionsBackup | 117 --- pom.xml | 2 +- pom.xml.versionsBackup | 1135 ---------------------- spark-integration/pom.xml | 2 +- spark-integration/pom.xml.versionsBackup | 113 --- spark/pom.xml | 2 +- spark/pom.xml.versionsBackup | 336 ------- 10 files changed, 5 insertions(+), 1926 deletions(-) delete mode 100644 common/pom.xml.versionsBackup delete mode 100644 fuzz-testing/pom.xml.versionsBackup delete mode 100644 pom.xml.versionsBackup delete mode 100644 spark-integration/pom.xml.versionsBackup delete mode 100644 spark/pom.xml.versionsBackup diff --git a/common/pom.xml b/common/pom.xml index 68a238432b..36bc706053 100644 --- a/common/pom.xml +++ b/common/pom.xml @@ -26,7 +26,7 @@ under the License. org.apache.datafusion comet-parent-spark${spark.version.short}_${scala.binary.version} - 0.9.0.1-SNAPSHOT + 0.9.0-SNAPSHOT ../pom.xml diff --git a/common/pom.xml.versionsBackup b/common/pom.xml.versionsBackup deleted file mode 100644 index 36bc706053..0000000000 --- a/common/pom.xml.versionsBackup +++ /dev/null @@ -1,220 +0,0 @@ - - - - - - - 4.0.0 - - org.apache.datafusion - comet-parent-spark${spark.version.short}_${scala.binary.version} - 0.9.0-SNAPSHOT - ../pom.xml - - - comet-common-spark${spark.version.short}_${scala.binary.version} - comet-common - - - - false - - - - - org.apache.spark - spark-sql_${scala.binary.version} - - - org.apache.parquet - parquet-column - - - org.apache.parquet - parquet-hadoop - - - org.apache.arrow - arrow-vector - - - org.apache.arrow - arrow-memory-unsafe - - - org.apache.arrow - arrow-c-data - - - junit - junit - test - - - org.assertj - assertj-core - test - - - - - - - io.github.git-commit-id - git-commit-id-maven-plugin - ${git-commit-id-maven-plugin.version} - - - get-the-git-infos - - revision - - initialize - - - - true - ${project.build.outputDirectory}/comet-git-info.properties - full - - ^git.branch$ - ^git.build.*$ - ^git.commit.id.(abbrev|full)$ - ^git.remote.*$ - - - - - org.apache.maven.plugins - maven-shade-plugin - - - package - - shade - - - true - true - false - true - - - - org.apache.arrow:* - - - - - *:* - - **/*.thrift - git.properties - log4j.properties - log4j2.properties - arrow-git.properties - - - - org.apache.arrow:arrow-vector - - - codegen/** - - - - - - org.apache.arrow - ${comet.shade.packageName}.arrow - - - org/apache/arrow/c/jni/JniWrapper - org/apache/arrow/c/jni/PrivateData - org/apache/arrow/c/jni/CDataJniException - - org/apache/arrow/c/ArrayStreamExporter$ExportedArrayStreamPrivateData - - - - - - - - - - net.alchim31.maven - scala-maven-plugin - - - org.codehaus.mojo - build-helper-maven-plugin - - - add-shim-source - generate-sources - - add-source - - - - src/main/${shims.majorVerSrc} - src/main/${shims.minorVerSrc} - - - - - - - - - ${project.basedir}/src/main/resources - - - ${project.basedir}/../native/target/x86_64-apple-darwin/release - - libcomet.dylib - - org/apache/comet/darwin/x86_64 - - - ${project.basedir}/../native/target/aarch64-apple-darwin/release - - libcomet.dylib - - org/apache/comet/darwin/aarch64 - - - ${jni.dir} - - libcomet.dylib - libcomet.so - comet.dll - - org/apache/comet/${platform}/${arch} - - - - - diff --git a/fuzz-testing/pom.xml b/fuzz-testing/pom.xml index 72d4dee62d..a748b18140 100644 --- a/fuzz-testing/pom.xml +++ b/fuzz-testing/pom.xml @@ -25,7 +25,7 @@ under the License. org.apache.datafusion comet-parent-spark${spark.version.short}_${scala.binary.version} - 0.9.0.1-SNAPSHOT + 0.9.0-SNAPSHOT ../pom.xml diff --git a/fuzz-testing/pom.xml.versionsBackup b/fuzz-testing/pom.xml.versionsBackup deleted file mode 100644 index a748b18140..0000000000 --- a/fuzz-testing/pom.xml.versionsBackup +++ /dev/null @@ -1,117 +0,0 @@ - - - - 4.0.0 - - - org.apache.datafusion - comet-parent-spark${spark.version.short}_${scala.binary.version} - 0.9.0-SNAPSHOT - ../pom.xml - - - comet-fuzz-spark${spark.version.short}_${scala.binary.version} - comet-fuzz - http://maven.apache.org - jar - - - - false - - - - - org.scala-lang - scala-library - ${scala.version} - provided - - - org.apache.spark - spark-sql_${scala.binary.version} - provided - - - org.apache.datafusion - comet-spark-spark${spark.version.short}_${scala.binary.version} - ${project.version} - - - org.rogach - scallop_${scala.binary.version} - - - - - src/main/scala - - - org.apache.maven.plugins - maven-compiler-plugin - ${maven-compiler-plugin.version} - - ${java.version} - ${java.version} - - - - net.alchim31.maven - scala-maven-plugin - ${scala.plugin.version} - - - - compile - testCompile - - - - - - maven-assembly-plugin - ${maven-assembly-plugin.version} - - - jar-with-dependencies - - - - - make-assembly - package - - single - - - - - - org.apache.maven.plugins - maven-install-plugin - - true - - - - - diff --git a/pom.xml b/pom.xml index d07075ad5d..6c9f97a8e7 100644 --- a/pom.xml +++ b/pom.xml @@ -30,7 +30,7 @@ under the License. org.apache.datafusion comet-parent-spark${spark.version.short}_${scala.binary.version} - 0.9.0.1-SNAPSHOT + 0.9.0-SNAPSHOT pom Comet Project Parent POM diff --git a/pom.xml.versionsBackup b/pom.xml.versionsBackup deleted file mode 100644 index 167df5bb4d..0000000000 --- a/pom.xml.versionsBackup +++ /dev/null @@ -1,1135 +0,0 @@ - - - - - - - 4.0.0 - - org.apache - apache - 23 - - org.apache.datafusion - comet-parent-spark${spark.version.short}_${scala.binary.version} - 0.9.0-SNAPSHOT - pom - Comet Project Parent POM - - - common - spark - spark-integration - fuzz-testing - - - - UTF-8 - UTF-8 - 11 - ${java.version} - ${java.version} - 3.11.0 - 3.6.0 - 3.2.4 - 3.1.2 - 3.3.0 - 3.3.0 - 3.1.0 - 9.1 - 3.4.0 - 1.3.0 - 1.0.0 - 4.9.9 - 3.1.0 - 3.11.4 - 0.1.7_0.10.4 - 1.7.0 - 3.6.1 - 0.16 - 2.12.18 - 2.12 - 4.8.0 - 3.2.16 - 2.2.0 - 3.5.6 - 3.5 - provided - 3.25.5 - 1.13.1 - provided - 3.3.4 - 18.3.0 - 1.9.13 - 2.43.0 - 0.8.11 - 4.8.8 - 2.0.7 - 33.2.1-jre - 1.21.0 - 2.31.51 - ${project.basedir}/../native/target/debug - darwin - x86_64 - org.apache.comet.shaded - - ${session.executionRootDirectory} - - - true - - - - -XX:+IgnoreUnrecognizedVMOptions - --add-opens=java.base/java.lang=ALL-UNNAMED - --add-opens=java.base/java.lang.invoke=ALL-UNNAMED - --add-opens=java.base/java.lang.reflect=ALL-UNNAMED - --add-opens=java.base/java.io=ALL-UNNAMED - --add-opens=java.base/java.net=ALL-UNNAMED - --add-opens=java.base/java.nio=ALL-UNNAMED - --add-opens=java.base/java.util=ALL-UNNAMED - --add-opens=java.base/java.util.concurrent=ALL-UNNAMED - --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED - --add-opens=java.base/jdk.internal.ref=ALL-UNNAMED - --add-opens=java.base/sun.nio.ch=ALL-UNNAMED - --add-opens=java.base/sun.nio.cs=ALL-UNNAMED - --add-opens=java.base/sun.security.action=ALL-UNNAMED - --add-opens=java.base/sun.util.calendar=ALL-UNNAMED - -Djdk.reflect.useDirectMethodHandle=false - - -ea -Xmx4g -Xss4m ${extraJavaTestArgs} - spark-3.x - spark-3.5 - - - - - - - org.apache.spark - spark-sql_${scala.binary.version} - ${spark.version} - ${spark.maven.scope} - - - org.apache.parquet - parquet-hadoop - - - org.apache.parquet - parquet-column - - - com.google.guava - guava - - - - - commons-logging - commons-logging - - - - org.apache.arrow - * - - - - - org.apache.spark - spark-catalyst_${scala.binary.version} - ${spark.version} - ${spark.maven.scope} - - - - - commons-logging - commons-logging - - - - - org.apache.arrow - * - - - - - - - org.apache.arrow - arrow-vector - ${arrow.version} - - - - com.fasterxml.jackson.core - jackson-annotations - - - com.fasterxml.jackson.core - jackson-core - - - io.netty - netty-common - - - com.google.code.findbugs - jsr305 - - - - - org.apache.arrow - arrow-memory-unsafe - ${arrow.version} - - - org.apache.arrow - arrow-c-data - ${arrow.version} - - - - - org.apache.parquet - parquet-column - ${parquet.version} - ${parquet.maven.scope} - - - org.apache.parquet - parquet-hadoop - ${parquet.version} - ${parquet.maven.scope} - - - - javax.annotation - javax.annotation-api - - - - - org.apache.parquet - parquet-avro - ${parquet.version} - test - - - org.apache.parquet - parquet-hadoop - ${parquet.version} - tests - test - - - - javax.annotation - javax.annotation-api - - - - - - - org.scala-lang - scala-library - ${scala.version} - - - com.google.protobuf - protobuf-java - ${protobuf.version} - - - org.slf4j - slf4j-api - ${slf4j.version} - ${spark.maven.scope} - - - - - com.google.guava - guava - ${guava.version} - - - - - - org.apache.spark - spark-core_${scala.binary.version} - ${spark.version} - tests - test - - - org.apache.datafusion - * - - - - - org.apache.arrow - * - - - - - commons-logging - commons-logging - - - - com.google.code.findbugs - jsr305 - - - - - org.apache.spark - spark-catalyst_${scala.binary.version} - ${spark.version} - tests - test - - - - - org.apache.arrow - * - - - - - commons-logging - commons-logging - - - - com.google.code.findbugs - jsr305 - - - - - org.apache.spark - spark-sql_${scala.binary.version} - ${spark.version} - tests - test - - - org.apache.parquet - parquet-hadoop - - - org.apache.parquet - parquet-column - - - org.apache.datafusion - * - - - - - commons-logging - commons-logging - - - - org.apache.arrow - * - - - - - org.apache.spark - spark-hive_${scala.binary.version} - ${spark.version} - test - - - org.apache.datafusion - * - - - commons-logging - commons-logging - - - org.apache.arrow - * - - - - - junit - junit - 4.13.2 - test - - - org.assertj - assertj-core - 3.23.1 - test - - - org.scalatest - scalatest_${scala.binary.version} - ${scalatest.version} - test - - - org.scala-lang - scala-reflect - ${scala.version} - test - - - org.scalatestplus - junit-4-13_${scala.binary.version} - 3.2.16.0 - test - - - - - org.apache.spark - spark-hadoop-cloud_${scala.binary.version} - ${spark.version} - tests - test - - - - org.apache.hadoop.thirdparty - hadoop-shaded-guava - - - org.apache.hadoop - hadoop-annotations - - - javax.xml.bind - jaxb-api - - - - - commons-logging - commons-logging - - - - org.apache.arrow - * - - - - - - - org.testcontainers - minio - ${testcontainers.version} - test - - - software.amazon.awssdk - s3 - ${amazon-awssdk-v2.version} - test - - - - org.codehaus.jackson - jackson-mapper-asl - ${codehaus.jackson.version} - test - - - - org.rogach - scallop_${scala.binary.version} - 5.1.0 - - - - org.apache.hadoop - hadoop-client-minicluster - ${hadoop.version} - test - - - - - - - - - release - - ${project.basedir}/../native/target/release - - - - - Win-x86 - - - Windows - x86 - - - - win32 - x86_64 - - - - - Win-amd64 - - - Windows - amd64 - - - - win32 - amd64 - - - - - Darwin-x86 - - - mac - x86 - - - - darwin - x86_64 - - - - - Darwin-aarch64 - - - mac - aarch64 - - - - darwin - aarch64 - - - - - Linux-amd64 - - - Linux - amd64 - - - - linux - amd64 - - - - - Linux-aarch64 - - - Linux - aarch64 - - - - linux - aarch64 - - - - - spark-3.4 - - 2.12.17 - 3.4.3 - 3.4 - 1.13.1 - 2.0.6 - spark-3.4 - - - - - spark-3.5 - - 2.12.18 - 3.5.6 - 3.5 - 1.13.1 - 2.0.7 - spark-3.5 - - - - - - spark-4.0 - - - 2.13.14 - 2.13 - 4.0.0-preview1 - 4.0 - 1.13.1 - 4.9.5 - 2.0.13 - spark-4.0 - not-needed-yet - - 17 - ${java.version} - ${java.version} - - - - - scala-2.12 - - - - scala-2.13 - - 2.13.14 - 2.13 - 4.9.5 - - - - - jdk11 - - 11 - - - 11 - ${java.version} - ${java.version} - - - - - jdk17 - - 17 - - - 17 - ${java.version} - ${java.version} - - - - - semanticdb - - true - true - true - - - - - - net.alchim31.maven - scala-maven-plugin - ${scala.plugin.version} - - - - compile - testCompile - - - - - - -Ywarn-unused - - - -source - ${java.version} - -target - ${java.version} - -Xlint:all,-serial,-path,-try - - - - org.scalameta - semanticdb-scalac_${scala.version} - ${semanticdb.version} - - - - - - io.github.evis - scalafix-maven-plugin_${scala.binary.version} - ${scalafix-maven-plugin.version} - - - - - - - - - - - - net.alchim31.maven - scala-maven-plugin - ${scala.plugin.version} - - - eclipse-add-source - - add-source - - - - scala-compile-first - process-resources - - compile - add-source - - - - scala-test-compile-first - process-test-resources - - testCompile - - - - - ${scala.version} - true - true - incremental - - -unchecked - -deprecation - -feature - -explaintypes - -Xlint:adapted-args - - - -Xms1024m - -Xmx1024m - - - -source - ${maven.compiler.source} - -target - ${maven.compiler.target} - -Xlint:all,-serial,-path,-try - - - - - org.scalatest - scalatest-maven-plugin - ${scalatest-maven-plugin.version} - - ${project.build.directory}/surefire-reports - . - SparkTestSuite.txt - D - - org.apache.comet.IntegrationTestSuite - - - file:src/test/resources/log4j2.properties - true - ${project.build.directory}/tmp - - - - - test - - test - - - - - - org.apache.maven.plugins - maven-shade-plugin - ${maven-shade-plugin.version} - - - org.ow2.asm - asm - ${asm.version} - - - org.ow2.asm - asm-commons - ${asm.version} - - - - - org.apache.maven.plugins - maven-surefire-plugin - ${maven-surefire-plugin.version} - - - file:src/test/resources/log4j2.properties - - false - - - - org.apache.maven.plugins - maven-source-plugin - ${maven-source-plugin.version} - - true - - - - create-source-jar - - jar-no-fork - test-jar-no-fork - - - - - - org.apache.maven.plugins - maven-compiler-plugin - ${maven-compiler-plugin.version} - - ${java.version} - ${java.version} - true - true - - - - org.apache.maven.plugins - maven-failsafe-plugin - ${maven-failsafe-plugin.version} - - - com.diffplug.spotless - spotless-maven-plugin - ${spotless.version} - - - - - - - java|javax,scala,org,org.apache,com,org.apache.comet,\#,\#org.apache.comet - - - ${maven.multiModuleProjectDirectory}/dev/copyright/java-header.txt - - - - - - ${scalafmt.version} - ${maven.multiModuleProjectDirectory}/scalafmt.conf - - - ${maven.multiModuleProjectDirectory}/dev/copyright/scala-header.txt - - - - - - org.codehaus.mojo - flatten-maven-plugin - ${flatten-maven-plugin.version} - - - org.jacoco - jacoco-maven-plugin - ${jacoco.version} - - - org.codehaus.mojo - build-helper-maven-plugin - ${build-helper-maven-plugin.version} - - - - - - org.scalastyle - scalastyle-maven-plugin - ${scalastyle-maven-plugin.version} - - false - true - false - false - ${basedir}/src/main/scala - ${basedir}/src/test/scala - ${maven.multiModuleProjectDirectory}/dev/scalastyle-config.xml - ${basedir}/target/scalastyle-output.xml - ${project.build.sourceEncoding} - ${project.reporting.outputEncoding} - - - - - check - - compile - - - - - com.diffplug.spotless - spotless-maven-plugin - - - - check - - compile - - - - - org.apache.maven.plugins - maven-source-plugin - - - org.apache.maven.plugins - maven-failsafe-plugin - - - - integration-test - verify - - - false - - - - - - org.apache.rat - apache-rat-plugin - ${apache-rat-plugin.version} - - - verify - - check - - - - - true - - **/*.iml - **/*.log - **/*.md.vm - **/.classpath - **/.project - **/.settings/** - **/build/** - **/target/** - **/apache-spark/** - .dockerignore - .git/** - .github/** - .gitignore - .gitmodules - **/.idea/** - **/dependency-reduced-pom.xml - **/testdata/** - **/.lldbinit - rust-toolchain - Makefile - dev/Dockerfile* - dev/diffs/** - dev/deploy-file - **/test/resources/** - **/benchmarks/*.txt - **/inspections/*.txt - tpcds-kit/** - tpcds-sf-1/** - tpch/** - docs/*.txt - docs/logos/*.png - docs/logos/*.svg - docs/source/_static/images/** - dev/release/rat_exclude_files.txt - dev/release/requirements.txt - native/proto/src/generated/** - - - - - org.apache.maven.plugins - maven-enforcer-plugin - ${maven-enforcer-plugin.version} - - - no-duplicate-declared-dependencies - - enforce - - - - - - - - compile - provided - - - org.apache.spark.unused.UnusedStubClass - - - - org.apache.spark - spark-sql_${scala.binary.version} - - - javax.annotation.meta.TypeQualifier - javax.annotation.Nonnull - javax.annotation.meta.When - javax.annotation.Nonnull$Checker - javax.annotation.meta.TypeQualifierValidator - - org.apache.parquet.filter2.predicate.SparkFilterApi - - - org.apache.spark.sql.ExtendedExplainGenerator - - - - com.google.code.findbugs - jsr305 - - - javax.annotation.meta.TypeQualifier - javax.annotation.Nonnull - javax.annotation.meta.When - javax.annotation.Nonnull$Checker - javax.annotation.meta.TypeQualifierValidator - javax.annotation.Nullable - javax.annotation.meta.TypeQualifierNickname - - - - true - true - - - - - - - - org.codehaus.mojo - extra-enforcer-rules - ${extra-enforcer-rules.version} - - - - - org.codehaus.mojo - flatten-maven-plugin - - - - flatten - process-resources - - flatten - - - - - flatten.clean - clean - - clean - - - - - - org.jacoco - jacoco-maven-plugin - - - default-prepare-agent - - prepare-agent - - - - report - test - - report - - - - - - - diff --git a/spark-integration/pom.xml b/spark-integration/pom.xml index 4c3c28073e..d381359f6a 100644 --- a/spark-integration/pom.xml +++ b/spark-integration/pom.xml @@ -26,7 +26,7 @@ under the License. org.apache.datafusion comet-parent-spark${spark.version.short}_${scala.binary.version} - 0.9.0.1-SNAPSHOT + 0.9.0-SNAPSHOT ../pom.xml diff --git a/spark-integration/pom.xml.versionsBackup b/spark-integration/pom.xml.versionsBackup deleted file mode 100644 index d381359f6a..0000000000 --- a/spark-integration/pom.xml.versionsBackup +++ /dev/null @@ -1,113 +0,0 @@ - - - - - - - 4.0.0 - - org.apache.datafusion - comet-parent-spark${spark.version.short}_${scala.binary.version} - 0.9.0-SNAPSHOT - ../pom.xml - - - comet-spark-integration-spark${spark.version.short}_${scala.binary.version} - comet-spark-integration - pom - - - - false - - - - - org.apache.datafusion - comet-spark-spark${spark.version.short}_${scala.binary.version} - ${project.version} - - - - org.apache.datafusion - comet-common-spark${spark.version.short}_${scala.binary.version} - - - - - - - - - org.apache.maven.plugins - maven-dependency-plugin - - - - put-client-artifacts-in-a-property - pre-integration-test - - build-classpath - - - true - ; - comet-artifacts - - - - - - org.codehaus.mojo - exec-maven-plugin - - - check-jar-contents - integration-test - - exec - - - bash - ${project.build.testOutputDirectory} - false - - ${project.basedir}/../dev/ensure-jars-have-correct-contents.sh - ${comet-artifacts} - - - - - - - org.apache.maven.plugins - maven-install-plugin - - true - - - - - - diff --git a/spark/pom.xml b/spark/pom.xml index 79bc6f83e6..95ea6971b9 100644 --- a/spark/pom.xml +++ b/spark/pom.xml @@ -26,7 +26,7 @@ under the License. org.apache.datafusion comet-parent-spark${spark.version.short}_${scala.binary.version} - 0.9.0.1-SNAPSHOT + 0.9.0-SNAPSHOT ../pom.xml diff --git a/spark/pom.xml.versionsBackup b/spark/pom.xml.versionsBackup deleted file mode 100644 index 95ea6971b9..0000000000 --- a/spark/pom.xml.versionsBackup +++ /dev/null @@ -1,336 +0,0 @@ - - - - - - - 4.0.0 - - org.apache.datafusion - comet-parent-spark${spark.version.short}_${scala.binary.version} - 0.9.0-SNAPSHOT - ../pom.xml - - - comet-spark-spark${spark.version.short}_${scala.binary.version} - comet-spark - - - - false - - - - - org.apache.datafusion - comet-common-spark${spark.version.short}_${scala.binary.version} - ${project.version} - - - org.apache.arrow - * - - - - - org.apache.spark - spark-sql_${scala.binary.version} - - - org.scala-lang - scala-library - - - org.scala-lang - scala-reflect - provided - - - com.google.protobuf - protobuf-java - - - org.scalatest - scalatest_${scala.binary.version} - test - - - org.scalatestplus - junit-4-13_${scala.binary.version} - test - - - org.apache.spark - spark-core_${scala.binary.version} - tests - - - org.apache.spark - spark-catalyst_${scala.binary.version} - tests - - - org.apache.spark - spark-sql_${scala.binary.version} - tests - - - org.apache.spark - spark-hadoop-cloud_${scala.binary.version} - tests - - - junit - junit - test - - - com.google.guava - guava - - - * - * - - - - - org.codehaus.jackson - jackson-mapper-asl - test - - - org.apache.parquet - parquet-hadoop - - - org.apache.parquet - parquet-hadoop - tests - - - - - org.apache.arrow - arrow-memory-unsafe - test - - - org.apache.arrow - arrow-c-data - test - - - org.apache.hadoop - hadoop-client-minicluster - test - - - - hadoop-client-api - org.apache.hadoop - - - hadoop-client-runtime - org.apache.hadoop - - - snappy-java - org.xerial.snappy - - - junit - junit - - - - - org.testcontainers - minio - - - software.amazon.awssdk - s3 - - - - - - - com.github.os72 - protoc-jar-maven-plugin - ${protoc-jar-maven-plugin.version} - - - generate-sources - - run - - - com.google.protobuf:protoc:${protobuf.version} - - ../native/proto/src/proto - - - - - - - org.scalatest - scalatest-maven-plugin - - - org.apache.maven.plugins - maven-shade-plugin - - - package - - shade - - - true - true - false - true - - - org.apache.datafusion:comet-common-spark${spark.version.short}_${scala.binary.version} - - com.google.protobuf:protobuf-java - com.google.guava:guava - - - - - *:* - - **/*.proto - **/*.thrift - git.properties - log4j.properties - log4j2.properties - **/SparkFilterApi.* - - - - org.apache.parquet:parquet-hadoop:tests - - - org/apache/parquet/crypto/keytools/mocks/InMemoryKMS* - - - - - - com.google.protobuf - ${comet.shade.packageName}.protobuf - - - com.google.common - ${comet.shade.packageName}.guava - - - com.google.thirdparty - ${comet.shade.packageName}.guava.thirdparty - - - - - - - - org.apache.maven.plugins - maven-failsafe-plugin - - - - integration-test - verify - - - false - -ea ${extraJavaTestArgs} - - - - - - net.alchim31.maven - scala-maven-plugin - - - org.codehaus.mojo - build-helper-maven-plugin - - - add-test-source - generate-test-sources - - add-test-source - - - - src/test/${shims.majorVerSrc} - src/test/${shims.minorVerSrc} - - - - - add-shim-source - generate-sources - - add-source - - - - src/main/${shims.majorVerSrc} - src/main/${shims.minorVerSrc} - - - - - - - org.codehaus.mojo - exec-maven-plugin - ${exec-maven-plugin.version} - - - generate-user-guide-reference-docs - package - - java - - - org.apache.comet.GenerateDocs - compile - - - - - - - - From 7ce8466d0b4113f45a5d75d726bd5e7b2872fa95 Mon Sep 17 00:00:00 2001 From: huaxingao Date: Wed, 25 Jun 2025 17:49:44 -0700 Subject: [PATCH 5/6] address comments --- .../java/org/apache/comet/parquet/FileReader.java | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/common/src/main/java/org/apache/comet/parquet/FileReader.java b/common/src/main/java/org/apache/comet/parquet/FileReader.java index 59218ec10f..76ae6333d9 100644 --- a/common/src/main/java/org/apache/comet/parquet/FileReader.java +++ b/common/src/main/java/org/apache/comet/parquet/FileReader.java @@ -58,6 +58,7 @@ import org.apache.parquet.column.page.PageReadStore; import org.apache.parquet.compression.CompressionCodecFactory; import org.apache.parquet.crypto.AesCipher; +import org.apache.parquet.crypto.EncryptionPropertiesFactory; import org.apache.parquet.crypto.FileDecryptionProperties; import org.apache.parquet.crypto.InternalColumnDecryptionSetup; import org.apache.parquet.crypto.InternalFileDecryptor; @@ -72,12 +73,12 @@ import org.apache.parquet.format.PageHeader; import org.apache.parquet.format.Util; import org.apache.parquet.format.converter.ParquetMetadataConverter; +import org.apache.parquet.hadoop.ParquetInputFormat; import org.apache.parquet.hadoop.metadata.BlockMetaData; import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; import org.apache.parquet.hadoop.metadata.ColumnPath; import org.apache.parquet.hadoop.metadata.FileMetaData; import org.apache.parquet.hadoop.metadata.ParquetMetadata; -import org.apache.parquet.hadoop.util.HadoopInputFile; import org.apache.parquet.hadoop.util.counters.BenchmarkCounter; import org.apache.parquet.internal.column.columnindex.OffsetIndex; import org.apache.parquet.internal.filter2.columnindex.ColumnIndexFilter; @@ -148,7 +149,7 @@ public FileReader( ParquetReadOptions options = buildParquetReadOptions(conf, properties, start, length, fileEncryptionKey, fileAADPrefix); this.converter = new ParquetMetadataConverter(options); - this.file = HadoopInputFile.fromPath(path, conf); + this.file = CometInputFile.fromPath(path, conf); this.f = file.newStream(); this.options = options; this.cometOptions = cometOptions; @@ -274,12 +275,14 @@ private static ParquetReadOptions buildParquetReadOptions( byte[] fileEncryptionKey, byte[] fileAADPrefix) { + // Iceberg remove these read properties when building the ParquetReadOptions. + // We want build the exact same ParquetReadOptions as Iceberg's. Collection readPropertiesToRemove = Sets.newHashSet( - "parquet.read.filter", - "parquet.private.read.filter.predicate", - "parquet.read.support.class", - "parquet.crypto.factory.class"); + ParquetInputFormat.UNBOUND_RECORD_FILTER, + ParquetInputFormat.FILTER_PREDICATE, + ParquetInputFormat.READ_SUPPORT_CLASS, + EncryptionPropertiesFactory.CRYPTO_FACTORY_CLASS_PROPERTY_NAME); for (String property : readPropertiesToRemove) { conf.unset(property); From ea08f9ac19dc48fc6244caffb6f7495b55c587c2 Mon Sep 17 00:00:00 2001 From: huaxingao Date: Thu, 26 Jun 2025 14:17:10 -0700 Subject: [PATCH 6/6] change to java.util.Set --- common/src/main/java/org/apache/comet/parquet/FileReader.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/common/src/main/java/org/apache/comet/parquet/FileReader.java b/common/src/main/java/org/apache/comet/parquet/FileReader.java index 76ae6333d9..af6c5b3c0b 100644 --- a/common/src/main/java/org/apache/comet/parquet/FileReader.java +++ b/common/src/main/java/org/apache/comet/parquet/FileReader.java @@ -31,6 +31,7 @@ import java.util.HashMap; import java.util.List; import java.util.Map; +import java.util.Set; import java.util.concurrent.ExecutionException; import java.util.concurrent.ExecutorService; import java.util.concurrent.Future; @@ -41,7 +42,6 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.apache.commons.compress.utils.Sets; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.parquet.HadoopReadOptions; @@ -278,7 +278,7 @@ private static ParquetReadOptions buildParquetReadOptions( // Iceberg remove these read properties when building the ParquetReadOptions. // We want build the exact same ParquetReadOptions as Iceberg's. Collection readPropertiesToRemove = - Sets.newHashSet( + Set.of( ParquetInputFormat.UNBOUND_RECORD_FILTER, ParquetInputFormat.FILTER_PREDICATE, ParquetInputFormat.READ_SUPPORT_CLASS,