diff --git a/contrib/storage-hive/hive-exec-shade/pom.xml b/contrib/storage-hive/hive-exec-shade/pom.xml
index 1cd5980b0b8..cc98078a9d5 100644
--- a/contrib/storage-hive/hive-exec-shade/pom.xml
+++ b/contrib/storage-hive/hive-exec-shade/pom.xml
@@ -32,7 +32,7 @@
Drill : Contrib : Storage : Hive : Exec Shaded
- 1.8.3
+ 1.15.1
@@ -219,6 +219,12 @@
                    <exclude>META-INF/versions/22/**</exclude>
+                  <filter>
+                    <artifact>org.apache.parquet:parquet-hadoop-bundle</artifact>
+                    <excludes>
+                      <exclude>META-INF/versions/22/**</exclude>
+                    </excludes>
+                  </filter>
diff --git a/contrib/storage-kafka/pom.xml b/contrib/storage-kafka/pom.xml
index 35cb6022e62..8ff48673f35 100644
--- a/contrib/storage-kafka/pom.xml
+++ b/contrib/storage-kafka/pom.xml
@@ -97,7 +97,7 @@
      <groupId>io.confluent</groupId>
      <artifactId>kafka-avro-serializer</artifactId>
-      <version>6.1.1</version>
+      <version>7.9.0</version>
diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/ColumnDataReader.java b/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/ColumnDataReader.java
deleted file mode 100644
index 1d9ccdad509..00000000000
--- a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/ColumnDataReader.java
+++ /dev/null
@@ -1,103 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.drill.exec.store.parquet;
-
-import io.netty.buffer.DrillBuf;
-
-import java.io.IOException;
-import java.io.OutputStream;
-
-import org.apache.hadoop.fs.FSDataInputStream;
-
-import org.apache.parquet.bytes.BytesInput;
-import org.apache.parquet.format.PageHeader;
-import org.apache.parquet.format.Util;
-import org.apache.parquet.hadoop.util.HadoopStreams;
-
-/**
- * @deprecated it is never used. So can be removed in Drill 1.21.0
- */
-public class ColumnDataReader {
- static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(ColumnDataReader.class);
-
- private final long endPosition;
- public final FSDataInputStream input;
-
- public ColumnDataReader(FSDataInputStream input, long start, long length) throws IOException{
- this.input = input;
- this.input.seek(start);
- this.endPosition = start + length;
- }
-
- public PageHeader readPageHeader() throws IOException{
- return Util.readPageHeader(input);
- }
-
- public FSDataInputStream getInputStream() {
- return input;
- }
-
- public BytesInput getPageAsBytesInput(int pageLength) throws IOException{
- byte[] b = new byte[pageLength];
- input.read(b);
- return new HadoopBytesInput(b);
- }
-
- public void loadPage(DrillBuf target, int pageLength) throws IOException {
- target.clear();
- HadoopStreams.wrap(input).read(target.nioBuffer(0, pageLength));
- target.writerIndex(pageLength);
- }
-
- public void clear(){
- try{
- input.close();
- }catch(IOException ex){
- logger.warn("Error while closing input stream.", ex);
- }
- }
-
- public boolean hasRemainder() throws IOException{
- return input.getPos() < endPosition;
- }
-
- public class HadoopBytesInput extends BytesInput{
-
- private final byte[] pageBytes;
-
- public HadoopBytesInput(byte[] pageBytes) {
- super();
- this.pageBytes = pageBytes;
- }
-
- @Override
- public byte[] toByteArray() throws IOException {
- return pageBytes;
- }
-
- @Override
- public long size() {
- return pageBytes.length;
- }
-
- @Override
- public void writeAllTo(OutputStream out) throws IOException {
- out.write(pageBytes);
- }
- }
-}
diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/ParquetRecordWriter.java b/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/ParquetRecordWriter.java
index c57a7c2e9d7..14e9a2b9d6a 100644
--- a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/ParquetRecordWriter.java
+++ b/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/ParquetRecordWriter.java
@@ -69,7 +69,7 @@
import org.apache.parquet.column.values.factory.DefaultV2ValuesWriterFactory;
import org.apache.parquet.column.values.factory.ValuesWriterFactory;
import org.apache.parquet.compression.CompressionCodecFactory;
-import org.apache.parquet.hadoop.ParquetColumnChunkPageWriteStore;
+import org.apache.parquet.hadoop.ColumnChunkPageWriteStore;
import org.apache.parquet.hadoop.ParquetFileWriter;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
import org.apache.parquet.io.ColumnIOFactory;
@@ -120,7 +120,7 @@ public class ParquetRecordWriter extends ParquetOutputRecordWriter {
private long recordCountForNextMemCheck = MINIMUM_RECORD_COUNT_FOR_CHECK;
private ColumnWriteStore store;
- private ParquetColumnChunkPageWriteStore pageStore;
+ private ColumnChunkPageWriteStore pageStore;
private RecordConsumer consumer;
private BatchSchema batchSchema;
@@ -285,10 +285,7 @@ private void newSchema() {
.withWriterVersion(writerVersion)
.build();
- // TODO: Replace ParquetColumnChunkPageWriteStore with ColumnChunkPageWriteStore from parquet library
- // once DRILL-7906 (PARQUET-1006) will be resolved
- pageStore = new ParquetColumnChunkPageWriteStore(codecFactory.getCompressor(codec), schema,
- parquetProperties.getInitialSlabSize(), pageSize, parquetProperties.getAllocator(),
+ pageStore = new ColumnChunkPageWriteStore(codecFactory.getCompressor(codec), schema, parquetProperties.getAllocator(),
parquetProperties.getColumnIndexTruncateLength(), parquetProperties.getPageWriteChecksumEnabled());
store = writerVersion == WriterVersion.PARQUET_1_0
diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet2/DrillParquetReader.java b/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet2/DrillParquetReader.java
index ae76971a373..f35c2323ce6 100644
--- a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet2/DrillParquetReader.java
+++ b/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet2/DrillParquetReader.java
@@ -55,7 +55,6 @@
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
@@ -417,13 +416,9 @@ public void close() {
recordReader = null;
recordMaterializer = null;
nullFilledVectors = null;
- try {
- if (pageReadStore != null) {
- pageReadStore.close();
- pageReadStore = null;
- }
- } catch (IOException e) {
- logger.warn("Failure while closing PageReadStore", e);
+ if (pageReadStore != null) {
+ pageReadStore.close();
+ pageReadStore = null;
}
}
diff --git a/exec/java-exec/src/main/java/org/apache/parquet/hadoop/ColumnChunkIncReadStore.java b/exec/java-exec/src/main/java/org/apache/parquet/hadoop/ColumnChunkIncReadStore.java
index 7834eaa8166..5dc6658a24f 100644
--- a/exec/java-exec/src/main/java/org/apache/parquet/hadoop/ColumnChunkIncReadStore.java
+++ b/exec/java-exec/src/main/java/org/apache/parquet/hadoop/ColumnChunkIncReadStore.java
@@ -295,9 +295,13 @@ public void addColumn(ColumnDescriptor descriptor, ColumnChunkMetaData metaData)
columns.put(descriptor, reader);
}
- public void close() throws IOException {
+ public void close() {
for (FSDataInputStream stream : streams) {
- stream.close();
+ try {
+ stream.close();
+ } catch (IOException e) {
+ logger.warn("Error closing stream: {}", e.getMessage(), e);
+ }
}
for (ColumnChunkIncPageReader reader : columns.values()) {
reader.close();
diff --git a/exec/java-exec/src/main/java/org/apache/parquet/hadoop/ParquetColumnChunkPageWriteStore.java b/exec/java-exec/src/main/java/org/apache/parquet/hadoop/ParquetColumnChunkPageWriteStore.java
deleted file mode 100644
index 28dfc278596..00000000000
--- a/exec/java-exec/src/main/java/org/apache/parquet/hadoop/ParquetColumnChunkPageWriteStore.java
+++ /dev/null
@@ -1,483 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.parquet.hadoop;
-
-import java.io.Closeable;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-import java.util.zip.CRC32;
-
-import org.apache.parquet.bytes.BytesInput;
-import org.apache.parquet.bytes.CapacityByteArrayOutputStream;
-import org.apache.parquet.column.ColumnDescriptor;
-import org.apache.parquet.column.Encoding;
-import org.apache.parquet.column.ParquetProperties;
-import org.apache.parquet.column.page.DictionaryPage;
-import org.apache.parquet.column.page.PageWriteStore;
-import org.apache.parquet.column.page.PageWriter;
-import org.apache.parquet.column.statistics.Statistics;
-import org.apache.parquet.column.values.bloomfilter.BloomFilter;
-import org.apache.parquet.column.values.bloomfilter.BloomFilterWriteStore;
-import org.apache.parquet.column.values.bloomfilter.BloomFilterWriter;
-import org.apache.parquet.compression.CompressionCodecFactory.BytesInputCompressor;
-import org.apache.parquet.crypto.AesCipher;
-import org.apache.parquet.crypto.InternalColumnEncryptionSetup;
-import org.apache.parquet.crypto.InternalFileEncryptor;
-import org.apache.parquet.crypto.ModuleCipherFactory.ModuleType;
-import org.apache.parquet.format.BlockCipher;
-import org.apache.parquet.format.converter.ParquetMetadataConverter;
-import org.apache.parquet.hadoop.metadata.ColumnPath;
-import org.apache.parquet.internal.column.columnindex.ColumnIndexBuilder;
-import org.apache.parquet.internal.column.columnindex.OffsetIndexBuilder;
-import org.apache.parquet.io.ParquetEncodingException;
-import org.apache.parquet.schema.MessageType;
-import org.apache.parquet.bytes.ByteBufferAllocator;
-import org.apache.yetus.audience.InterfaceAudience;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-@InterfaceAudience.Private
-public class ParquetColumnChunkPageWriteStore implements PageWriteStore, BloomFilterWriteStore,
-AutoCloseable {
- private static final Logger LOG = LoggerFactory.getLogger(ParquetColumnChunkPageWriteStore.class);
-
- private static ParquetMetadataConverter parquetMetadataConverter = new ParquetMetadataConverter();
-
- private static final class ColumnChunkPageWriter implements PageWriter, BloomFilterWriter, Closeable {
-
- private final ColumnDescriptor path;
- private final BytesInputCompressor compressor;
- private final CapacityByteArrayOutputStream tempOutputStream;
- private final CapacityByteArrayOutputStream buf;
- private DictionaryPage dictionaryPage;
-
- private long uncompressedLength;
- private long compressedLength;
- private long totalValueCount;
- private int pageCount;
-
- // repetition and definition level encodings are used only for v1 pages and don't change
- private Set<Encoding> rlEncodings = new HashSet<>();
- private Set<Encoding> dlEncodings = new HashSet<>();
- private List<Encoding> dataEncodings = new ArrayList<>();
-
- private BloomFilter bloomFilter;
- private ColumnIndexBuilder columnIndexBuilder;
- private OffsetIndexBuilder offsetIndexBuilder;
- private Statistics totalStatistics;
- private final ByteBufferAllocator allocator;
-
- private final CRC32 crc;
- boolean pageWriteChecksumEnabled;
-
- private final BlockCipher.Encryptor headerBlockEncryptor;
- private final BlockCipher.Encryptor pageBlockEncryptor;
- private final int rowGroupOrdinal;
- private final int columnOrdinal;
- private int pageOrdinal;
- private final byte[] dataPageAAD;
- private final byte[] dataPageHeaderAAD;
- private final byte[] fileAAD;
-
- private ColumnChunkPageWriter(ColumnDescriptor path,
- BytesInputCompressor compressor,
- int initialSlabSize,
- int maxCapacityHint,
- ByteBufferAllocator allocator,
- int columnIndexTruncateLength,
- boolean pageWriteChecksumEnabled,
- BlockCipher.Encryptor headerBlockEncryptor,
- BlockCipher.Encryptor pageBlockEncryptor,
- byte[] fileAAD,
- int rowGroupOrdinal,
- int columnOrdinal) {
- this.path = path;
- this.compressor = compressor;
- this.allocator = allocator;
- this.tempOutputStream = new CapacityByteArrayOutputStream(initialSlabSize, maxCapacityHint, allocator);
- this.buf = new CapacityByteArrayOutputStream(initialSlabSize, maxCapacityHint, allocator);
- this.columnIndexBuilder = ColumnIndexBuilder.getBuilder(path.getPrimitiveType(), columnIndexTruncateLength);
- this.offsetIndexBuilder = OffsetIndexBuilder.getBuilder();
- this.pageWriteChecksumEnabled = pageWriteChecksumEnabled;
- this.crc = pageWriteChecksumEnabled ? new CRC32() : null;
-
- this.headerBlockEncryptor = headerBlockEncryptor;
- this.pageBlockEncryptor = pageBlockEncryptor;
- this.fileAAD = fileAAD;
- this.rowGroupOrdinal = rowGroupOrdinal;
- this.columnOrdinal = columnOrdinal;
- this.pageOrdinal = -1;
- if (null != headerBlockEncryptor) {
- dataPageHeaderAAD = AesCipher.createModuleAAD(fileAAD, ModuleType.DataPageHeader,
- rowGroupOrdinal, columnOrdinal, 0);
- } else {
- dataPageHeaderAAD = null;
- }
- if (null != pageBlockEncryptor) {
- dataPageAAD = AesCipher.createModuleAAD(fileAAD, ModuleType.DataPage,
- rowGroupOrdinal, columnOrdinal, 0);
- } else {
- dataPageAAD = null;
- }
- }
-
- @Override
- @Deprecated
- public void writePage(BytesInput bytesInput, int valueCount, Statistics<?> statistics, Encoding rlEncoding,
- Encoding dlEncoding, Encoding valuesEncoding) throws IOException {
- // Setting the builders to the no-op ones so no column/offset indexes will be written for this column chunk
- columnIndexBuilder = ColumnIndexBuilder.getNoOpBuilder();
- offsetIndexBuilder = OffsetIndexBuilder.getNoOpBuilder();
-
- writePage(bytesInput, valueCount, -1, statistics, rlEncoding, dlEncoding, valuesEncoding);
- }
-
- @Override
- public void writePage(BytesInput bytes,
- int valueCount,
- int rowCount,
- Statistics statistics,
- Encoding rlEncoding,
- Encoding dlEncoding,
- Encoding valuesEncoding) throws IOException {
- pageOrdinal++;
- long uncompressedSize = bytes.size();
- if (uncompressedSize > Integer.MAX_VALUE) {
- throw new ParquetEncodingException(
- "Cannot write page larger than Integer.MAX_VALUE bytes: " +
- uncompressedSize);
- }
- BytesInput compressedBytes = compressor.compress(bytes);
- if (null != pageBlockEncryptor) {
- AesCipher.quickUpdatePageAAD(dataPageAAD, pageOrdinal);
- compressedBytes = BytesInput.from(pageBlockEncryptor.encrypt(compressedBytes.toByteArray(), dataPageAAD));
- }
- long compressedSize = compressedBytes.size();
- if (compressedSize > Integer.MAX_VALUE) {
- throw new ParquetEncodingException(
- "Cannot write compressed page larger than Integer.MAX_VALUE bytes: "
- + compressedSize);
- }
- tempOutputStream.reset();
- if (null != headerBlockEncryptor) {
- AesCipher.quickUpdatePageAAD(dataPageHeaderAAD, pageOrdinal);
- }
- if (pageWriteChecksumEnabled) {
- crc.reset();
- crc.update(compressedBytes.toByteArray());
- parquetMetadataConverter.writeDataPageV1Header(
- (int)uncompressedSize,
- (int)compressedSize,
- valueCount,
- rlEncoding,
- dlEncoding,
- valuesEncoding,
- (int) crc.getValue(),
- tempOutputStream,
- headerBlockEncryptor,
- dataPageHeaderAAD);
- } else {
- parquetMetadataConverter.writeDataPageV1Header(
- (int)uncompressedSize,
- (int)compressedSize,
- valueCount,
- rlEncoding,
- dlEncoding,
- valuesEncoding,
- tempOutputStream,
- headerBlockEncryptor,
- dataPageHeaderAAD);
- }
- this.uncompressedLength += uncompressedSize;
- this.compressedLength += compressedSize;
- this.totalValueCount += valueCount;
- this.pageCount += 1;
-
- // Copying the statistics if it is not initialized yet so we have the correct typed one
- if (totalStatistics == null) {
- totalStatistics = statistics.copy();
- } else {
- totalStatistics.mergeStatistics(statistics);
- }
-
- columnIndexBuilder.add(statistics);
- offsetIndexBuilder.add(toIntWithCheck(tempOutputStream.size() + compressedSize), rowCount);
-
- // by concatenating before writing instead of writing twice,
- // we only allocate one buffer to copy into instead of multiple.
- BytesInput.concat(BytesInput.from(tempOutputStream), compressedBytes).writeAllTo(buf); // is used instead of above
- rlEncodings.add(rlEncoding);
- dlEncodings.add(dlEncoding);
- dataEncodings.add(valuesEncoding);
- }
-
- @Override
- public void writePageV2(
- int rowCount, int nullCount, int valueCount,
- BytesInput repetitionLevels, BytesInput definitionLevels,
- Encoding dataEncoding, BytesInput data,
- Statistics<?> statistics) throws IOException {
- pageOrdinal++;
-
- int rlByteLength = toIntWithCheck(repetitionLevels.size());
- int dlByteLength = toIntWithCheck(definitionLevels.size());
- int uncompressedSize = toIntWithCheck(
- data.size() + repetitionLevels.size() + definitionLevels.size()
- );
- // TODO: decide if we compress
- BytesInput compressedData = compressor.compress(data);
- if (null != pageBlockEncryptor) {
- AesCipher.quickUpdatePageAAD(dataPageAAD, pageOrdinal);
- compressedData = BytesInput.from(pageBlockEncryptor.encrypt(compressedData.toByteArray(), dataPageAAD));
- }
- int compressedSize = toIntWithCheck(
- compressedData.size() + repetitionLevels.size() + definitionLevels.size()
- );
- tempOutputStream.reset();
- if (null != headerBlockEncryptor) {
- AesCipher.quickUpdatePageAAD(dataPageHeaderAAD, pageOrdinal);
- }
- parquetMetadataConverter.writeDataPageV2Header(
- uncompressedSize, compressedSize,
- valueCount, nullCount, rowCount,
- dataEncoding,
- rlByteLength,
- dlByteLength,
- tempOutputStream,
- headerBlockEncryptor,
- dataPageHeaderAAD);
- this.uncompressedLength += uncompressedSize;
- this.compressedLength += compressedSize;
- this.totalValueCount += valueCount;
- this.pageCount += 1;
-
- // Copying the statistics if it is not initialized yet so we have the correct typed one
- if (totalStatistics == null) {
- totalStatistics = statistics.copy();
- } else {
- totalStatistics.mergeStatistics(statistics);
- }
-
- columnIndexBuilder.add(statistics);
- offsetIndexBuilder.add(toIntWithCheck((long) tempOutputStream.size() + compressedSize), rowCount);
-
- // by concatenating before writing instead of writing twice,
- // we only allocate one buffer to copy into instead of multiple.
- BytesInput.concat(
- BytesInput.from(tempOutputStream),
- repetitionLevels,
- definitionLevels,
- compressedData).writeAllTo(buf);
- dataEncodings.add(dataEncoding);
- }
-
- private int toIntWithCheck(long size) {
- if (size > Integer.MAX_VALUE) {
- throw new ParquetEncodingException(
- "Cannot write page larger than " + Integer.MAX_VALUE + " bytes: " +
- size);
- }
- return (int)size;
- }
-
- @Override
- public long getMemSize() {
- return buf.size();
- }
-
- public void writeToFileWriter(ParquetFileWriter writer) throws IOException {
- if (null == headerBlockEncryptor) {
- writer.writeColumnChunk(
- path,
- totalValueCount,
- compressor.getCodecName(),
- dictionaryPage,
- BytesInput.from(buf),
- uncompressedLength,
- compressedLength,
- totalStatistics,
- columnIndexBuilder,
- offsetIndexBuilder,
- bloomFilter,
- rlEncodings,
- dlEncodings,
- dataEncodings);
- } else {
- writer.writeColumnChunk(
- path,
- totalValueCount,
- compressor.getCodecName(),
- dictionaryPage,
- BytesInput.from(buf),
- uncompressedLength,
- compressedLength,
- totalStatistics,
- columnIndexBuilder,
- offsetIndexBuilder,
- bloomFilter,
- rlEncodings,
- dlEncodings,
- dataEncodings,
- headerBlockEncryptor,
- rowGroupOrdinal,
- columnOrdinal,
- fileAAD);
- }
- if (LOG.isDebugEnabled()) {
- LOG.debug(
- String.format(
- "written %,dB for %s: %,d values, %,dB raw, %,dB comp, %d pages, encodings: %s",
- buf.size(), path, totalValueCount, uncompressedLength, compressedLength, pageCount, new HashSet(dataEncodings))
- + (dictionaryPage != null ? String.format(
- ", dic { %,d entries, %,dB raw, %,dB comp}",
- dictionaryPage.getDictionarySize(), dictionaryPage.getUncompressedSize(), dictionaryPage.getDictionarySize())
- : ""));
- }
- rlEncodings.clear();
- dlEncodings.clear();
- dataEncodings.clear();
- pageCount = 0;
- pageOrdinal = -1;
- }
-
- @Override
- public long allocatedSize() {
- return buf.size();
- }
-
- @Override
- public void writeDictionaryPage(DictionaryPage dictionaryPage) throws IOException {
- if (this.dictionaryPage != null) {
- throw new ParquetEncodingException("Only one dictionary page is allowed");
- }
- BytesInput dictionaryBytes = dictionaryPage.getBytes();
- int uncompressedSize = (int)dictionaryBytes.size();
- BytesInput compressedBytes = compressor.compress(dictionaryBytes);
- if (null != pageBlockEncryptor) {
- byte[] dictonaryPageAAD = AesCipher.createModuleAAD(fileAAD, ModuleType.DictionaryPage,
- rowGroupOrdinal, columnOrdinal, -1);
- compressedBytes = BytesInput.from(pageBlockEncryptor.encrypt(compressedBytes.toByteArray(), dictonaryPageAAD));
- }
- this.dictionaryPage = new DictionaryPage(BytesInput.copy(compressedBytes), uncompressedSize,
- dictionaryPage.getDictionarySize(), dictionaryPage.getEncoding());
- }
-
- @Override
- public String memUsageString(String prefix) {
- return buf.memUsageString(prefix + " ColumnChunkPageWriter");
- }
-
- @Override
- public void writeBloomFilter(BloomFilter bloomFilter) {
- this.bloomFilter = bloomFilter;
- }
-
- @Override
- public void close() {
- tempOutputStream.close();
- buf.close();
- }
- }
-
- private final Map<ColumnDescriptor, ColumnChunkPageWriter> writers = new HashMap<>();
- private final MessageType schema;
-
- public ParquetColumnChunkPageWriteStore(BytesInputCompressor compressor, MessageType schema, int initialSlabSize,
- int maxCapacityHint, ByteBufferAllocator allocator,
- int columnIndexTruncateLength) {
- this(compressor, schema, initialSlabSize, maxCapacityHint, allocator, columnIndexTruncateLength,
- ParquetProperties.DEFAULT_PAGE_WRITE_CHECKSUM_ENABLED);
- }
-
- public ParquetColumnChunkPageWriteStore(BytesInputCompressor compressor, MessageType schema, int initialSlabSize,
- int maxCapacityHint, ByteBufferAllocator allocator,
- int columnIndexTruncateLength, boolean pageWriteChecksumEnabled) {
- this.schema = schema;
- for (ColumnDescriptor path : schema.getColumns()) {
- writers.put(path, new ColumnChunkPageWriter(path, compressor, initialSlabSize, maxCapacityHint, allocator, columnIndexTruncateLength,
- pageWriteChecksumEnabled, null, null, null, -1, -1));
- }
- }
-
- public ParquetColumnChunkPageWriteStore(BytesInputCompressor compressor, MessageType schema, int initialSlabSize,
- int maxCapacityHint, ByteBufferAllocator allocator,
- int columnIndexTruncateLength, boolean pageWriteChecksumEnabled,
- InternalFileEncryptor fileEncryptor, int rowGroupOrdinal) {
- this.schema = schema;
- if (null == fileEncryptor) {
- for (ColumnDescriptor path : schema.getColumns()) {
- writers.put(path, new ColumnChunkPageWriter(path, compressor, initialSlabSize, maxCapacityHint, allocator,
- columnIndexTruncateLength, pageWriteChecksumEnabled, null, null,
- null, -1, -1));
- }
- return;
- }
-
- // Encrypted file
- int columnOrdinal = -1;
- byte[] fileAAD = fileEncryptor.getFileAAD();
- for (ColumnDescriptor path : schema.getColumns()) {
- columnOrdinal++;
- BlockCipher.Encryptor headerBlockEncryptor = null;
- BlockCipher.Encryptor pageBlockEncryptor = null;
- ColumnPath columnPath = ColumnPath.get(path.getPath());
-
- InternalColumnEncryptionSetup columnSetup = fileEncryptor.getColumnSetup(columnPath, true, columnOrdinal);
- if (columnSetup.isEncrypted()) {
- headerBlockEncryptor = columnSetup.getMetaDataEncryptor();
- pageBlockEncryptor = columnSetup.getDataEncryptor();
- }
-
- writers.put(path, new ColumnChunkPageWriter(path, compressor, initialSlabSize, maxCapacityHint, allocator,
- columnIndexTruncateLength, pageWriteChecksumEnabled, headerBlockEncryptor, pageBlockEncryptor, fileAAD,
- rowGroupOrdinal, columnOrdinal));
- }
- }
-
- @Override
- public PageWriter getPageWriter(ColumnDescriptor path) {
- return writers.get(path);
- }
-
- @Override
- public BloomFilterWriter getBloomFilterWriter(ColumnDescriptor path) {
- return writers.get(path);
- }
-
-
-
- public void flushToFileWriter(ParquetFileWriter writer) throws IOException {
- for (ColumnDescriptor path : schema.getColumns()) {
- ColumnChunkPageWriter pageWriter = writers.get(path);
- pageWriter.writeToFileWriter(writer);
- }
- }
-
- @Override
- public void close() {
- for (ColumnChunkPageWriter pageWriter : writers.values()) {
- pageWriter.close();
- }
- }
-
-}
diff --git a/exec/java-exec/src/main/java/org/apache/parquet/hadoop/ParquetFileWriter.java b/exec/java-exec/src/main/java/org/apache/parquet/hadoop/ParquetFileWriter.java
index f90f4c84131..275cb6654a8 100644
--- a/exec/java-exec/src/main/java/org/apache/parquet/hadoop/ParquetFileWriter.java
+++ b/exec/java-exec/src/main/java/org/apache/parquet/hadoop/ParquetFileWriter.java
@@ -7,13 +7,14 @@
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
- * http://www.apache.org/licenses/LICENSE-2.0
+ * http://www.apache.org/licenses/LICENSE-2.0
*
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
*/
package org.apache.parquet.hadoop;
@@ -27,30 +28,34 @@
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
-import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
+import java.util.Optional;
import java.util.Set;
import java.util.zip.CRC32;
-
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
-
+import org.apache.parquet.ParquetSizeOverflowException;
import org.apache.parquet.Preconditions;
import org.apache.parquet.Version;
+import org.apache.parquet.bytes.ByteBufferAllocator;
+import org.apache.parquet.bytes.ByteBufferReleaser;
import org.apache.parquet.bytes.BytesInput;
import org.apache.parquet.bytes.BytesUtils;
+import org.apache.parquet.bytes.HeapByteBufferAllocator;
+import org.apache.parquet.bytes.ReusingByteBufferAllocator;
import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.column.Encoding;
import org.apache.parquet.column.EncodingStats;
import org.apache.parquet.column.ParquetProperties;
import org.apache.parquet.column.page.DictionaryPage;
+import org.apache.parquet.column.statistics.SizeStatistics;
import org.apache.parquet.column.statistics.Statistics;
import org.apache.parquet.column.values.bloomfilter.BloomFilter;
import org.apache.parquet.crypto.AesCipher;
@@ -61,19 +66,19 @@
import org.apache.parquet.crypto.ModuleCipherFactory;
import org.apache.parquet.crypto.ModuleCipherFactory.ModuleType;
import org.apache.parquet.crypto.ParquetCryptoRuntimeException;
-import org.apache.parquet.hadoop.ParquetOutputFormat.JobSummaryLevel;
-import org.apache.parquet.hadoop.metadata.ColumnPath;
import org.apache.parquet.format.BlockCipher;
import org.apache.parquet.format.Util;
import org.apache.parquet.format.converter.ParquetMetadataConverter;
+import org.apache.parquet.hadoop.ParquetOutputFormat.JobSummaryLevel;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
+import org.apache.parquet.hadoop.metadata.ColumnPath;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
-import org.apache.parquet.hadoop.metadata.StrictKeyValueMetadataMergeStrategy;
import org.apache.parquet.hadoop.metadata.FileMetaData;
import org.apache.parquet.hadoop.metadata.GlobalMetaData;
import org.apache.parquet.hadoop.metadata.KeyValueMetadataMergeStrategy;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
+import org.apache.parquet.hadoop.metadata.StrictKeyValueMetadataMergeStrategy;
import org.apache.parquet.hadoop.util.HadoopOutputFile;
import org.apache.parquet.hadoop.util.HadoopStreams;
import org.apache.parquet.internal.column.columnindex.ColumnIndex;
@@ -83,9 +88,9 @@
import org.apache.parquet.internal.hadoop.metadata.IndexReference;
import org.apache.parquet.io.InputFile;
import org.apache.parquet.io.OutputFile;
-import org.apache.parquet.io.SeekableInputStream;
import org.apache.parquet.io.ParquetEncodingException;
import org.apache.parquet.io.PositionOutputStream;
+import org.apache.parquet.io.SeekableInputStream;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.PrimitiveType;
import org.apache.parquet.schema.TypeUtil;
@@ -94,1541 +99,2196 @@
/**
* Internal implementation of the Parquet file writer as a block container
- * Note: this is temporary Drill-Parquet class needed to write empty parquet files. Details in
+ * Note: this is a temporary Drill-Parquet class needed to write empty Parquet files.
+ * This is a full copy of the Parquet library implementation with the lines that throw an error
+ * on writing empty Parquet files commented out. See details in:
* PARQUET-2026 and
* DRILL-7907
*/
-public class ParquetFileWriter {
- private static final Logger LOG = LoggerFactory.getLogger(ParquetFileWriter.class);
-
- private final ParquetMetadataConverter metadataConverter;
-
- public static final String PARQUET_METADATA_FILE = "_metadata";
- public static final String MAGIC_STR = "PAR1";
- public static final byte[] MAGIC = MAGIC_STR.getBytes(StandardCharsets.US_ASCII);
- public static final String EF_MAGIC_STR = "PARE";
- public static final byte[] EFMAGIC = EF_MAGIC_STR.getBytes(StandardCharsets.US_ASCII);
- public static final String PARQUET_COMMON_METADATA_FILE = "_common_metadata";
- public static final int CURRENT_VERSION = 1;
-
- // File creation modes
- public static enum Mode {
- CREATE,
- OVERWRITE
- }
-
- protected final PositionOutputStream out;
-
- private final MessageType schema;
- private final AlignmentStrategy alignment;
- private final int columnIndexTruncateLength;
-
- // file data
- private List<BlockMetaData> blocks = new ArrayList<>();
-
- // The column/offset indexes per blocks per column chunks
- private final List<List<ColumnIndex>> columnIndexes = new ArrayList<>();
- private final List<List<OffsetIndex>> offsetIndexes = new ArrayList<>();
-
- // The Bloom filters
- private final List