diff --git a/exec/java-exec/pom.xml b/exec/java-exec/pom.xml
index f30700c9f3d..a08e17207d1 100644
--- a/exec/java-exec/pom.xml
+++ b/exec/java-exec/pom.xml
@@ -257,10 +257,6 @@
-    <dependency>
-      <groupId>org.apache.parquet</groupId>
-      <artifactId>parquet-format</artifactId>
-    </dependency>
    <groupId>org.apache.parquet</groupId>
    <artifactId>parquet-common</artifactId>
diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/ParquetRecordWriter.java b/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/ParquetRecordWriter.java
index 4fd5064468d..45a2c7fa252 100644
--- a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/ParquetRecordWriter.java
+++ b/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/ParquetRecordWriter.java
@@ -57,7 +57,6 @@
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
-import org.apache.parquet.bytes.CapacityByteArrayOutputStream;
import org.apache.parquet.column.ColumnWriteStore;
import org.apache.parquet.column.ParquetProperties;
import org.apache.parquet.column.ParquetProperties.WriterVersion;
@@ -251,10 +250,6 @@ private void newSchema() throws IOException {
// We don't want this number to be too small either. Ideally, slightly bigger than the page size,
// but not bigger than the block buffer
int initialPageBufferSize = max(MINIMUM_BUFFER_SIZE, min(pageSize + pageSize / 10, initialBlockBufferSize));
- // TODO: Use initialSlabSize from ParquetProperties once drill will be updated to the latest version of Parquet library
- int initialSlabSize = CapacityByteArrayOutputStream.initialSlabSizeHeuristic(64, pageSize, 10);
- // TODO: Replace ParquetColumnChunkPageWriteStore with ColumnChunkPageWriteStore from parquet library
- // once PARQUET-1006 will be resolved
ParquetProperties parquetProperties = ParquetProperties.builder()
.withPageSize(pageSize)
.withDictionaryEncoding(enableDictionary)
@@ -263,10 +258,11 @@ private void newSchema() throws IOException {
.withAllocator(new ParquetDirectByteBufferAllocator(oContext))
.withValuesWriterFactory(new DefaultV1ValuesWriterFactory())
.build();
- pageStore = new ParquetColumnChunkPageWriteStore(codecFactory.getCompressor(codec), schema, initialSlabSize,
- pageSize, parquetProperties.getAllocator(), parquetProperties.getPageWriteChecksumEnabled(),
- parquetProperties.getColumnIndexTruncateLength()
- );
+ // TODO: Replace ParquetColumnChunkPageWriteStore with ColumnChunkPageWriteStore from the Parquet library
+ // once DRILL-7906 (PARQUET-1006) is resolved
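+ // initialSlabSize is now taken from ParquetProperties instead of the local
+ // CapacityByteArrayOutputStream heuristic that was removed above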
+ pageStore = new ParquetColumnChunkPageWriteStore(codecFactory.getCompressor(codec), schema,
+ parquetProperties.getInitialSlabSize(), pageSize, parquetProperties.getAllocator(),
+ parquetProperties.getColumnIndexTruncateLength(), parquetProperties.getPageWriteChecksumEnabled());
store = new ColumnWriteStoreV1(pageStore, parquetProperties);
MessageColumnIO columnIO = new ColumnIOFactory(false).getColumnIO(this.schema);
consumer = columnIO.getRecordWriter(store);
diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/columnreaders/ColumnReaderFactory.java b/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/columnreaders/ColumnReaderFactory.java
index fcb61f69f5e..25c99044c56 100644
--- a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/columnreaders/ColumnReaderFactory.java
+++ b/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/columnreaders/ColumnReaderFactory.java
@@ -201,13 +201,15 @@ static ColumnReader<?> createFixedColumnReader(ParquetRecordReader recordReader,
} else if (convertedType == ConvertedType.INTERVAL) {
return new NullableFixedByteAlignedReaders.NullableIntervalReader(recordReader, descriptor,
columnChunkMetaData, fixedLength, (NullableIntervalVector) v, schemaElement);
+ } else {
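+ // Fixed-length binary columns without a recognized converted type are now read as
+ // nullable VarBinary instead of failing with "Unexpected parquet metadata configuration."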
+ return new NullableFixedByteAlignedReaders.NullableFixedBinaryReader(recordReader, descriptor,
+ columnChunkMetaData, fixedLength, (NullableVarBinaryVector) v, schemaElement);
}
} else {
return getNullableColumnReader(recordReader, descriptor,
columnChunkMetaData, fixedLength, v, schemaElement);
}
}
- throw new Exception("Unexpected parquet metadata configuration.");
}
static VarLengthValuesColumn<?> getReader(ParquetRecordReader parentReader, ColumnDescriptor descriptor,
diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet2/DrillParquetGroupConverter.java b/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet2/DrillParquetGroupConverter.java
index fbe3ae32eda..fdb6e79d0e8 100644
--- a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet2/DrillParquetGroupConverter.java
+++ b/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet2/DrillParquetGroupConverter.java
@@ -22,6 +22,7 @@
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
+import java.util.Optional;
import java.util.function.BiFunction;
import java.util.function.Function;
@@ -71,6 +72,7 @@
import org.apache.parquet.io.api.GroupConverter;
import org.apache.parquet.io.api.PrimitiveConverter;
import org.apache.parquet.schema.GroupType;
+import org.apache.parquet.schema.LogicalTypeAnnotation;
import org.apache.parquet.schema.PrimitiveType;
import org.apache.parquet.schema.Type;
import org.apache.parquet.schema.Type.Repetition;
@@ -328,23 +330,30 @@ protected PrimitiveConverter getConverterForType(String name, PrimitiveType type
}
}
case FIXED_LEN_BYTE_ARRAY:
- switch (type.getOriginalType()) {
- case DECIMAL: {
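+ // The switch on the deprecated OriginalType is replaced with a LogicalTypeAnnotation visitor;
+ // annotations without a dedicated converter fall through to the VarBinary converter below.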
+ LogicalTypeAnnotation.LogicalTypeAnnotationVisitor<PrimitiveConverter> typeAnnotationVisitor = new LogicalTypeAnnotation.LogicalTypeAnnotationVisitor<PrimitiveConverter>() {
+ @Override
+ public Optional<PrimitiveConverter> visit(LogicalTypeAnnotation.DecimalLogicalTypeAnnotation decimalLogicalType) {
ParquetReaderUtility.checkDecimalTypeEnabled(options);
- return getVarDecimalConverter(name, type);
+ return Optional.of(getVarDecimalConverter(name, type));
}
- case INTERVAL: {
+
+ @Override
+ public Optional<PrimitiveConverter> visit(LogicalTypeAnnotation.IntervalLogicalTypeAnnotation intervalLogicalType) {
IntervalWriter writer = type.isRepetition(Repetition.REPEATED)
? getWriter(name, (m, f) -> m.list(f).interval(), l -> l.list().interval())
- : getWriter(name, (m, f) -> m.interval(f), l -> l.interval());
- return new DrillFixedLengthByteArrayToInterval(writer);
+ : getWriter(name, MapWriter::interval, ListWriter::interval);
+ return Optional.of(new DrillFixedLengthByteArrayToInterval(writer));
}
- default: {
+ };
+
+ LogicalTypeAnnotation logicalTypeAnnotation = type.getLogicalTypeAnnotation();
+ if (logicalTypeAnnotation != null) {
+ return logicalTypeAnnotation.accept(typeAnnotationVisitor).orElseGet(() -> {
VarBinaryWriter writer = type.isRepetition(Repetition.REPEATED)
? getWriter(name, (m, f) -> m.list(f).varBinary(), l -> l.list().varBinary())
- : getWriter(name, (m, f) -> m.varBinary(f), l -> l.varBinary());
+ : getWriter(name, MapWriter::varBinary, ListWriter::varBinary);
return new DrillFixedBinaryToVarbinaryConverter(writer, type.getTypeLength(), mutator.getManagedBuffer());
- }
+ });
}
default:
throw new UnsupportedOperationException("Unsupported type: " + type.getPrimitiveTypeName());
diff --git a/exec/java-exec/src/main/java/org/apache/parquet/hadoop/ParquetColumnChunkPageWriteStore.java b/exec/java-exec/src/main/java/org/apache/parquet/hadoop/ParquetColumnChunkPageWriteStore.java
index b8f707d21c3..790e3c344fe 100644
--- a/exec/java-exec/src/main/java/org/apache/parquet/hadoop/ParquetColumnChunkPageWriteStore.java
+++ b/exec/java-exec/src/main/java/org/apache/parquet/hadoop/ParquetColumnChunkPageWriteStore.java
@@ -31,77 +31,43 @@
import org.apache.parquet.bytes.CapacityByteArrayOutputStream;
import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.column.Encoding;
+import org.apache.parquet.column.ParquetProperties;
import org.apache.parquet.column.page.DictionaryPage;
import org.apache.parquet.column.page.PageWriteStore;
import org.apache.parquet.column.page.PageWriter;
import org.apache.parquet.column.statistics.Statistics;
+import org.apache.parquet.column.values.bloomfilter.BloomFilter;
+import org.apache.parquet.column.values.bloomfilter.BloomFilterWriteStore;
+import org.apache.parquet.column.values.bloomfilter.BloomFilterWriter;
+import org.apache.parquet.crypto.AesCipher;
+import org.apache.parquet.crypto.InternalColumnEncryptionSetup;
+import org.apache.parquet.crypto.InternalFileEncryptor;
+import org.apache.parquet.crypto.ModuleCipherFactory.ModuleType;
+import org.apache.parquet.format.BlockCipher;
import org.apache.parquet.format.converter.ParquetMetadataConverter;
import org.apache.parquet.hadoop.CodecFactory.BytesCompressor;
+import org.apache.parquet.hadoop.metadata.ColumnPath;
import org.apache.parquet.internal.column.columnindex.ColumnIndexBuilder;
import org.apache.parquet.internal.column.columnindex.OffsetIndexBuilder;
import org.apache.parquet.io.ParquetEncodingException;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.bytes.ByteBufferAllocator;
+import org.apache.yetus.audience.InterfaceAudience;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-/**
- * This is a copy of ColumnChunkPageWriteStore from parquet library except of OutputStream that is used here.
- * Using of CapacityByteArrayOutputStream allows to use different ByteBuffer allocators.
- * It will be no need in this class once PARQUET-1006 is resolved.
- */
-public class ParquetColumnChunkPageWriteStore implements PageWriteStore, Closeable {
-
- private static final Logger logger = LoggerFactory.getLogger(ParquetColumnChunkPageWriteStore.class);
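+/**
+ * Drill copy of the Parquet library's ColumnChunkPageWriteStore. It writes page data into a
+ * CapacityByteArrayOutputStream so that Drill's ByteBuffer allocators can be used
+ * (see the TODO in ParquetRecordWriter and DRILL-7906).
+ */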
+@InterfaceAudience.Private
+public class ParquetColumnChunkPageWriteStore implements PageWriteStore, BloomFilterWriteStore, Closeable {
+ private static final Logger LOG = LoggerFactory.getLogger(ParquetColumnChunkPageWriteStore.class);
private static ParquetMetadataConverter parquetMetadataConverter = new ParquetMetadataConverter();
- private final Map<ColumnDescriptor, ColumnChunkPageWriter> writers = new HashMap<>();
- private final MessageType schema;
-
- public ParquetColumnChunkPageWriteStore(BytesCompressor compressor,
- MessageType schema,
- int initialSlabSize,
- int maxCapacityHint,
- ByteBufferAllocator allocator,
- boolean pageWriteChecksumEnabled,
- int columnIndexTruncateLength) {
- this.schema = schema;
- for (ColumnDescriptor path : schema.getColumns()) {
- writers.put(path, new ColumnChunkPageWriter(path, compressor, initialSlabSize,
- maxCapacityHint, allocator, pageWriteChecksumEnabled, columnIndexTruncateLength));
- }
- }
-
- @Override
- public PageWriter getPageWriter(ColumnDescriptor path) {
- return writers.get(path);
- }
-
- /**
- * Writes the column chunks in the corresponding row group
- * @param writer the parquet file writer
- * @throws IOException if the file can not be created
- */
- public void flushToFileWriter(ParquetFileWriter writer) throws IOException {
- for (ColumnDescriptor path : schema.getColumns()) {
- ColumnChunkPageWriter pageWriter = writers.get(path);
- pageWriter.writeToFileWriter(writer);
- }
- }
-
- @Override
- public void close() {
- for (ColumnChunkPageWriter pageWriter : writers.values()) {
- pageWriter.close();
- }
- }
-
- private static final class ColumnChunkPageWriter implements PageWriter, Closeable {
+ private static final class ColumnChunkPageWriter implements PageWriter, BloomFilterWriter, Closeable {
private final ColumnDescriptor path;
private final BytesCompressor compressor;
+ private final CapacityByteArrayOutputStream tempOutputStream;
private final CapacityByteArrayOutputStream buf;
private DictionaryPage dictionaryPage;
@@ -111,37 +77,74 @@ private static final class ColumnChunkPageWriter implements PageWriter, Closeabl
private int pageCount;
// repetition and definition level encodings are used only for v1 pages and don't change
- private Set<Encoding> rlEncodings = new HashSet<>();
- private Set<Encoding> dlEncodings = new HashSet<>();
- private List<Encoding> dataEncodings = new ArrayList<>();
+ private Set<Encoding> rlEncodings = new HashSet<Encoding>();
+ private Set<Encoding> dlEncodings = new HashSet<Encoding>();
+ private List<Encoding> dataEncodings = new ArrayList<Encoding>();
+ private BloomFilter bloomFilter;
private ColumnIndexBuilder columnIndexBuilder;
private OffsetIndexBuilder offsetIndexBuilder;
private Statistics totalStatistics;
+ private final ByteBufferAllocator allocator;
private final CRC32 crc;
boolean pageWriteChecksumEnabled;
+ private final BlockCipher.Encryptor headerBlockEncryptor;
+ private final BlockCipher.Encryptor pageBlockEncryptor;
+ private final int rowGroupOrdinal;
+ private final int columnOrdinal;
+ private int pageOrdinal;
+ private final byte[] dataPageAAD;
+ private final byte[] dataPageHeaderAAD;
+ private final byte[] fileAAD;
+
private ColumnChunkPageWriter(ColumnDescriptor path,
BytesCompressor compressor,
int initialSlabSize,
int maxCapacityHint,
ByteBufferAllocator allocator,
+ int columnIndexTruncateLength,
boolean pageWriteChecksumEnabled,
- int columnIndexTruncateLength) {
+ BlockCipher.Encryptor headerBlockEncryptor,
+ BlockCipher.Encryptor pageBlockEncryptor,
+ byte[] fileAAD,
+ int rowGroupOrdinal,
+ int columnOrdinal) {
this.path = path;
this.compressor = compressor;
+ this.allocator = allocator;
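+ // tempOutputStream buffers the serialized page header so that header and page bytes can be
+ // concatenated and copied into buf in a single pass (see writePage/writePageV2)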
+ this.tempOutputStream = new CapacityByteArrayOutputStream(initialSlabSize, maxCapacityHint, allocator);
this.buf = new CapacityByteArrayOutputStream(initialSlabSize, maxCapacityHint, allocator);
- this.totalStatistics = Statistics.createStats(this.path.getPrimitiveType());
this.columnIndexBuilder = ColumnIndexBuilder.getBuilder(path.getPrimitiveType(), columnIndexTruncateLength);
this.offsetIndexBuilder = OffsetIndexBuilder.getBuilder();
this.pageWriteChecksumEnabled = pageWriteChecksumEnabled;
this.crc = pageWriteChecksumEnabled ? new CRC32() : null;
+
+ this.headerBlockEncryptor = headerBlockEncryptor;
+ this.pageBlockEncryptor = pageBlockEncryptor;
+ this.fileAAD = fileAAD;
+ this.rowGroupOrdinal = rowGroupOrdinal;
+ this.columnOrdinal = columnOrdinal;
+ this.pageOrdinal = -1;
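+ // Module AADs (additional authenticated data) bind each encrypted page header/page to its
+ // row group, column and page ordinal; they are created only when column encryption is configured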
+ if (null != headerBlockEncryptor) {
+ dataPageHeaderAAD = AesCipher.createModuleAAD(fileAAD, ModuleType.DataPageHeader,
+ rowGroupOrdinal, columnOrdinal, 0);
+ } else {
+ dataPageHeaderAAD = null;
+ }
+ if (null != pageBlockEncryptor) {
+ dataPageAAD = AesCipher.createModuleAAD(fileAAD, ModuleType.DataPage,
+ rowGroupOrdinal, columnOrdinal, 0);
+ } else {
+ dataPageAAD = null;
+ }
}
@Override
+ @Deprecated
public void writePage(BytesInput bytesInput, int valueCount, Statistics<?> statistics, Encoding rlEncoding,
- Encoding dlEncoding, Encoding valuesEncoding) throws IOException {
+ Encoding dlEncoding, Encoding valuesEncoding) throws IOException {
// Setting the builders to the no-op ones so no column/offset indexes will be written for this column chunk
columnIndexBuilder = ColumnIndexBuilder.getNoOpBuilder();
offsetIndexBuilder = OffsetIndexBuilder.getNoOpBuilder();
@@ -150,49 +153,66 @@ public void writePage(BytesInput bytesInput, int valueCount, Statistics<?> stati
}
@Override
- public void writePage(BytesInput bytes, int valueCount, int rowCount, Statistics statistics,
- Encoding rlEncoding, Encoding dlEncoding, Encoding valuesEncoding) throws IOException {
+ public void writePage(BytesInput bytes,
+ int valueCount,
+ int rowCount,
+ Statistics statistics,
+ Encoding rlEncoding,
+ Encoding dlEncoding,
+ Encoding valuesEncoding) throws IOException {
+ pageOrdinal++;
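+ // pageOrdinal is folded into the page AAD below so that, when encryption is enabled,
+ // every page in the column chunk gets a unique AAD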
long uncompressedSize = bytes.size();
if (uncompressedSize > Integer.MAX_VALUE) {
throw new ParquetEncodingException(
- "Cannot write page larger than Integer.MAX_VALUE bytes: " +
- uncompressedSize);
+ "Cannot write page larger than Integer.MAX_VALUE bytes: " +
+ uncompressedSize);
}
-
BytesInput compressedBytes = compressor.compress(bytes);
+ if (null != pageBlockEncryptor) {
+ AesCipher.quickUpdatePageAAD(dataPageAAD, pageOrdinal);
+ compressedBytes = BytesInput.from(pageBlockEncryptor.encrypt(compressedBytes.toByteArray(), dataPageAAD));
+ }
long compressedSize = compressedBytes.size();
if (compressedSize > Integer.MAX_VALUE) {
throw new ParquetEncodingException(
- "Cannot write compressed page larger than Integer.MAX_VALUE bytes: "
- + compressedSize);
+ "Cannot write compressed page larger than Integer.MAX_VALUE bytes: "
+ + compressedSize);
+ }
+ tempOutputStream.reset();
+ if (null != headerBlockEncryptor) {
+ AesCipher.quickUpdatePageAAD(dataPageHeaderAAD, pageOrdinal);
}
-
if (pageWriteChecksumEnabled) {
crc.reset();
crc.update(compressedBytes.toByteArray());
- parquetMetadataConverter.writeDataPageV1Header((int) uncompressedSize, (int) compressedSize,
- valueCount, rlEncoding, dlEncoding, valuesEncoding, (int) crc.getValue(), buf);
+ parquetMetadataConverter.writeDataPageV1Header(
+ (int)uncompressedSize,
+ (int)compressedSize,
+ valueCount,
+ rlEncoding,
+ dlEncoding,
+ valuesEncoding,
+ (int) crc.getValue(),
+ tempOutputStream,
+ headerBlockEncryptor,
+ dataPageHeaderAAD);
} else {
- parquetMetadataConverter.writeDataPageV1Header((int) uncompressedSize, (int) compressedSize,
- valueCount, rlEncoding, dlEncoding, valuesEncoding, buf);
+ parquetMetadataConverter.writeDataPageV1Header(
+ (int)uncompressedSize,
+ (int)compressedSize,
+ valueCount,
+ rlEncoding,
+ dlEncoding,
+ valuesEncoding,
+ tempOutputStream,
+ headerBlockEncryptor,
+ dataPageHeaderAAD);
}
-
this.uncompressedLength += uncompressedSize;
this.compressedLength += compressedSize;
this.totalValueCount += valueCount;
this.pageCount += 1;
- addStatistics(statistics);
-
- offsetIndexBuilder.add(toIntWithCheck(buf.size() + compressedSize), rowCount);
-
- compressedBytes.writeAllTo(buf);
- rlEncodings.add(rlEncoding);
- dlEncodings.add(dlEncoding);
- dataEncodings.add(valuesEncoding);
- }
-
- private void addStatistics(Statistics statistics) {
// Copying the statistics if it is not initialized yet so we have the correct typed one
if (totalStatistics == null) {
totalStatistics = statistics.copy();
@@ -201,55 +221,81 @@ private void addStatistics(Statistics statistics) {
}
columnIndexBuilder.add(statistics);
+ offsetIndexBuilder.add(toIntWithCheck(tempOutputStream.size() + compressedSize), rowCount);
+
+ // by concatenating before writing instead of writing twice,
+ // we only allocate one buffer to copy into instead of multiple.
+ BytesInput.concat(BytesInput.from(tempOutputStream), compressedBytes).writeAllTo(buf); // buf (CapacityByteArrayOutputStream) is used here instead of the output collector used upstream
+ rlEncodings.add(rlEncoding);
+ dlEncodings.add(dlEncoding);
+ dataEncodings.add(valuesEncoding);
}
@Override
- public void writePageV2(int rowCount,
- int nullCount,
- int valueCount,
- BytesInput repetitionLevels,
- BytesInput definitionLevels,
- Encoding dataEncoding,
- BytesInput data,
- Statistics<?> statistics) throws IOException {
+ public void writePageV2(
+ int rowCount, int nullCount, int valueCount,
+ BytesInput repetitionLevels, BytesInput definitionLevels,
+ Encoding dataEncoding, BytesInput data,
+ Statistics<?> statistics) throws IOException {
+ pageOrdinal++;
+
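+ // V2 pages keep repetition/definition levels uncompressed; only the data part is
+ // compressed and, when a page encryptor is configured, encrypted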
int rlByteLength = toIntWithCheck(repetitionLevels.size());
int dlByteLength = toIntWithCheck(definitionLevels.size());
int uncompressedSize = toIntWithCheck(
- data.size() + repetitionLevels.size() + definitionLevels.size()
+ data.size() + repetitionLevels.size() + definitionLevels.size()
);
+ // TODO: decide if we compress
BytesInput compressedData = compressor.compress(data);
+ if (null != pageBlockEncryptor) {
+ AesCipher.quickUpdatePageAAD(dataPageAAD, pageOrdinal);
+ compressedData = BytesInput.from(pageBlockEncryptor.encrypt(compressedData.toByteArray(), dataPageAAD));
+ }
int compressedSize = toIntWithCheck(
- compressedData.size() + repetitionLevels.size() + definitionLevels.size()
+ compressedData.size() + repetitionLevels.size() + definitionLevels.size()
);
+ tempOutputStream.reset();
+ if (null != headerBlockEncryptor) {
+ AesCipher.quickUpdatePageAAD(dataPageHeaderAAD, pageOrdinal);
+ }
parquetMetadataConverter.writeDataPageV2Header(
- uncompressedSize, compressedSize,
- valueCount, nullCount, rowCount,
- statistics,
- dataEncoding,
- rlByteLength,
- dlByteLength,
- buf);
+ uncompressedSize, compressedSize,
+ valueCount, nullCount, rowCount,
+ dataEncoding,
+ rlByteLength,
+ dlByteLength,
+ tempOutputStream,
+ headerBlockEncryptor,
+ dataPageHeaderAAD);
this.uncompressedLength += uncompressedSize;
this.compressedLength += compressedSize;
this.totalValueCount += valueCount;
this.pageCount += 1;
- addStatistics(statistics);
-
- offsetIndexBuilder.add(toIntWithCheck(buf.size() + compressedSize), rowCount);
-
- repetitionLevels.writeAllTo(buf);
- definitionLevels.writeAllTo(buf);
- compressedData.writeAllTo(buf);
+ // Copying the statistics if it is not initialized yet so we have the correct typed one
+ if (totalStatistics == null) {
+ totalStatistics = statistics.copy();
+ } else {
+ totalStatistics.mergeStatistics(statistics);
+ }
+ columnIndexBuilder.add(statistics);
+ offsetIndexBuilder.add(toIntWithCheck((long) tempOutputStream.size() + compressedSize), rowCount);
+
+ // by concatenating before writing instead of writing twice,
+ // we only allocate one buffer to copy into instead of multiple.
+ BytesInput.concat(
+ BytesInput.from(tempOutputStream),
+ repetitionLevels,
+ definitionLevels,
+ compressedData).writeAllTo(buf);
dataEncodings.add(dataEncoding);
}
private int toIntWithCheck(long size) {
if (size > Integer.MAX_VALUE) {
throw new ParquetEncodingException(
- "Cannot write page larger than " + Integer.MAX_VALUE + " bytes: " +
- size);
+ "Cannot write page larger than " + Integer.MAX_VALUE + " bytes: " +
+ size);
}
return (int)size;
}
@@ -259,35 +305,64 @@ public long getMemSize() {
return buf.size();
}
- /**
- * Writes a number of pages within corresponding column chunk
- * @param writer the parquet file writer
- * @throws IOException if the file can not be created
- */
public void writeToFileWriter(ParquetFileWriter writer) throws IOException {
- writer.writeColumnChunk(path, totalValueCount, compressor.getCodecName(),
- dictionaryPage, BytesInput.from(buf), uncompressedLength, compressedLength, totalStatistics,
- columnIndexBuilder, offsetIndexBuilder, rlEncodings, dlEncodings, dataEncodings);
- if (logger.isDebugEnabled()) {
- logger.debug(
- String.format(
- "written %,dB for %s: %,d values, %,dB raw, %,dB comp, %d pages, encodings: %s",
- buf.size(), path, totalValueCount, uncompressedLength, compressedLength, pageCount, new HashSet<>(dataEncodings))
- + (dictionaryPage != null ? String.format(
- ", dic { %,d entries, %,dB raw, %,dB comp}",
- dictionaryPage.getDictionarySize(), dictionaryPage.getUncompressedSize(), dictionaryPage.getDictionarySize())
- : "")
- );
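+ // The encrypted path additionally hands the header (metadata) encryptor, the ordinals and the
+ // file AAD to the file writer so that the column chunk metadata can be encrypted as well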
+ if (null == headerBlockEncryptor) {
+ writer.writeColumnChunk(
+ path,
+ totalValueCount,
+ compressor.getCodecName(),
+ dictionaryPage,
+ BytesInput.from(buf),
+ uncompressedLength,
+ compressedLength,
+ totalStatistics,
+ columnIndexBuilder,
+ offsetIndexBuilder,
+ bloomFilter,
+ rlEncodings,
+ dlEncodings,
+ dataEncodings);
+ } else {
+ writer.writeColumnChunk(
+ path,
+ totalValueCount,
+ compressor.getCodecName(),
+ dictionaryPage,
+ BytesInput.from(buf),
+ uncompressedLength,
+ compressedLength,
+ totalStatistics,
+ columnIndexBuilder,
+ offsetIndexBuilder,
+ bloomFilter,
+ rlEncodings,
+ dlEncodings,
+ dataEncodings,
+ headerBlockEncryptor,
+ rowGroupOrdinal,
+ columnOrdinal,
+ fileAAD);
+ }
+ if (LOG.isDebugEnabled()) {
+ LOG.debug(
+ String.format(
+ "written %,dB for %s: %,d values, %,dB raw, %,dB comp, %d pages, encodings: %s",
+ buf.size(), path, totalValueCount, uncompressedLength, compressedLength, pageCount, new HashSet<Encoding>(dataEncodings))
+ + (dictionaryPage != null ? String.format(
+ ", dic { %,d entries, %,dB raw, %,dB comp}",
+ dictionaryPage.getDictionarySize(), dictionaryPage.getUncompressedSize(), dictionaryPage.getDictionarySize())
+ : ""));
}
rlEncodings.clear();
dlEncodings.clear();
dataEncodings.clear();
pageCount = 0;
+ pageOrdinal = -1;
}
@Override
public long allocatedSize() {
- return buf.getCapacity();
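+ // report the bytes currently buffered for this column chunk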
+ return buf.size();
}
@Override
@@ -298,8 +373,13 @@ public void writeDictionaryPage(DictionaryPage dictionaryPage) throws IOExceptio
BytesInput dictionaryBytes = dictionaryPage.getBytes();
int uncompressedSize = (int)dictionaryBytes.size();
BytesInput compressedBytes = compressor.compress(dictionaryBytes);
+ if (null != pageBlockEncryptor) {
+ byte[] dictionaryPageAAD = AesCipher.createModuleAAD(fileAAD, ModuleType.DictionaryPage,
+ rowGroupOrdinal, columnOrdinal, -1);
+ compressedBytes = BytesInput.from(pageBlockEncryptor.encrypt(compressedBytes.toByteArray(), dictionaryPageAAD));
+ }
this.dictionaryPage = new DictionaryPage(BytesInput.copy(compressedBytes), uncompressedSize,
- dictionaryPage.getDictionarySize(), dictionaryPage.getEncoding());
+ dictionaryPage.getDictionarySize(), dictionaryPage.getEncoding());
}
@Override
@@ -307,10 +387,97 @@ public String memUsageString(String prefix) {
return buf.memUsageString(prefix + " ColumnChunkPageWriter");
}
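+ // the Bloom filter is stored here and handed to the file writer together with the
+ // page data in writeToFileWriter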
+ @Override
+ public void writeBloomFilter(BloomFilter bloomFilter) {
+ this.bloomFilter = bloomFilter;
+ }
+
@Override
public void close() {
+ tempOutputStream.close();
buf.close();
}
}
+ private final Map<ColumnDescriptor, ColumnChunkPageWriter> writers = new HashMap<ColumnDescriptor, ColumnChunkPageWriter>();
+ private final MessageType schema;
+
+ public ParquetColumnChunkPageWriteStore(BytesCompressor compressor, MessageType schema, int initialSlabSize,
+ int maxCapacityHint, ByteBufferAllocator allocator,
+ int columnIndexTruncateLength) {
+ this(compressor, schema, initialSlabSize, maxCapacityHint, allocator, columnIndexTruncateLength,
+ ParquetProperties.DEFAULT_PAGE_WRITE_CHECKSUM_ENABLED);
+ }
+
+ public ParquetColumnChunkPageWriteStore(BytesCompressor compressor, MessageType schema, int initialSlabSize,
+ int maxCapacityHint, ByteBufferAllocator allocator,
+ int columnIndexTruncateLength, boolean pageWriteChecksumEnabled) {
+ this.schema = schema;
+ for (ColumnDescriptor path : schema.getColumns()) {
+ writers.put(path, new ColumnChunkPageWriter(path, compressor, initialSlabSize, maxCapacityHint, allocator, columnIndexTruncateLength,
+ pageWriteChecksumEnabled, null, null, null, -1, -1));
+ }
+ }
+
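+ // Constructor used when the file may be encrypted: per-column encryptors are looked up
+ // from the InternalFileEncryptor and passed to each page writer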
+ public ParquetColumnChunkPageWriteStore(BytesCompressor compressor, MessageType schema, int initialSlabSize,
+ int maxCapacityHint, ByteBufferAllocator allocator,
+ int columnIndexTruncateLength, boolean pageWriteChecksumEnabled,
+ InternalFileEncryptor fileEncryptor, int rowGroupOrdinal) {
+ this.schema = schema;
+ if (null == fileEncryptor) {
+ for (ColumnDescriptor path : schema.getColumns()) {
+ writers.put(path, new ColumnChunkPageWriter(path, compressor, initialSlabSize, maxCapacityHint, allocator,
+ columnIndexTruncateLength, pageWriteChecksumEnabled, null, null,
+ null, -1, -1));
+ }
+ return;
+ }
+
+ // Encrypted file
+ int columnOrdinal = -1;
+ byte[] fileAAD = fileEncryptor.getFileAAD();
+ for (ColumnDescriptor path : schema.getColumns()) {
+ columnOrdinal++;
+ BlockCipher.Encryptor headerBlockEncryptor = null;
+ BlockCipher.Encryptor pageBlockEncryptor = null;
+ ColumnPath columnPath = ColumnPath.get(path.getPath());
+
+ InternalColumnEncryptionSetup columnSetup = fileEncryptor.getColumnSetup(columnPath, true, columnOrdinal);
+ if (columnSetup.isEncrypted()) {
+ headerBlockEncryptor = columnSetup.getMetaDataEncryptor();
+ pageBlockEncryptor = columnSetup.getDataEncryptor();
+ }
+
+ writers.put(path, new ColumnChunkPageWriter(path, compressor, initialSlabSize, maxCapacityHint, allocator,
+ columnIndexTruncateLength, pageWriteChecksumEnabled, headerBlockEncryptor, pageBlockEncryptor, fileAAD,
+ rowGroupOrdinal, columnOrdinal));
+ }
+ }
+
+ @Override
+ public PageWriter getPageWriter(ColumnDescriptor path) {
+ return writers.get(path);
+ }
+
+ @Override
+ public BloomFilterWriter getBloomFilterWriter(ColumnDescriptor path) {
+ return writers.get(path);
+ }
+
+ public void flushToFileWriter(ParquetFileWriter writer) throws IOException {
+ for (ColumnDescriptor path : schema.getColumns()) {
+ ColumnChunkPageWriter pageWriter = writers.get(path);
+ pageWriter.writeToFileWriter(writer);
+ }
+ }
+
+ @Override
+ public void close() {
+ for (ColumnChunkPageWriter pageWriter : writers.values()) {
+ pageWriter.close();
+ }
+ }
+
}
diff --git a/exec/java-exec/src/main/java/org/apache/parquet/hadoop/ParquetFileWriter.java b/exec/java-exec/src/main/java/org/apache/parquet/hadoop/ParquetFileWriter.java
new file mode 100644
index 00000000000..f90f4c84131
--- /dev/null
+++ b/exec/java-exec/src/main/java/org/apache/parquet/hadoop/ParquetFileWriter.java
@@ -0,0 +1,1634 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.parquet.hadoop;
+
+import static org.apache.parquet.format.Util.writeFileCryptoMetaData;
+import static org.apache.parquet.format.Util.writeFileMetaData;
+import static org.apache.parquet.format.converter.ParquetMetadataConverter.MAX_STATS_SIZE;
+import static org.apache.parquet.hadoop.ParquetWriter.DEFAULT_BLOCK_SIZE;
+import static org.apache.parquet.hadoop.ParquetWriter.MAX_PADDING_SIZE_DEFAULT;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.LinkedHashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Map.Entry;
+import java.util.Set;
+import java.util.zip.CRC32;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataInputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+
+import org.apache.parquet.Preconditions;
+import org.apache.parquet.Version;
+import org.apache.parquet.bytes.BytesInput;
+import org.apache.parquet.bytes.BytesUtils;
+import org.apache.parquet.column.ColumnDescriptor;
+import org.apache.parquet.column.Encoding;
+import org.apache.parquet.column.EncodingStats;
+import org.apache.parquet.column.ParquetProperties;
+import org.apache.parquet.column.page.DictionaryPage;
+import org.apache.parquet.column.statistics.Statistics;
+import org.apache.parquet.column.values.bloomfilter.BloomFilter;
+import org.apache.parquet.crypto.AesCipher;
+import org.apache.parquet.crypto.ColumnEncryptionProperties;
+import org.apache.parquet.crypto.FileEncryptionProperties;
+import org.apache.parquet.crypto.InternalColumnEncryptionSetup;
+import org.apache.parquet.crypto.InternalFileEncryptor;
+import org.apache.parquet.crypto.ModuleCipherFactory;
+import org.apache.parquet.crypto.ModuleCipherFactory.ModuleType;
+import org.apache.parquet.crypto.ParquetCryptoRuntimeException;
+import org.apache.parquet.hadoop.ParquetOutputFormat.JobSummaryLevel;
+import org.apache.parquet.hadoop.metadata.ColumnPath;
+import org.apache.parquet.format.BlockCipher;
+import org.apache.parquet.format.Util;
+import org.apache.parquet.format.converter.ParquetMetadataConverter;
+import org.apache.parquet.hadoop.metadata.BlockMetaData;
+import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
+import org.apache.parquet.hadoop.metadata.CompressionCodecName;
+import org.apache.parquet.hadoop.metadata.StrictKeyValueMetadataMergeStrategy;
+import org.apache.parquet.hadoop.metadata.FileMetaData;
+import org.apache.parquet.hadoop.metadata.GlobalMetaData;
+import org.apache.parquet.hadoop.metadata.KeyValueMetadataMergeStrategy;
+import org.apache.parquet.hadoop.metadata.ParquetMetadata;
+import org.apache.parquet.hadoop.util.HadoopOutputFile;
+import org.apache.parquet.hadoop.util.HadoopStreams;
+import org.apache.parquet.internal.column.columnindex.ColumnIndex;
+import org.apache.parquet.internal.column.columnindex.ColumnIndexBuilder;
+import org.apache.parquet.internal.column.columnindex.OffsetIndex;
+import org.apache.parquet.internal.column.columnindex.OffsetIndexBuilder;
+import org.apache.parquet.internal.hadoop.metadata.IndexReference;
+import org.apache.parquet.io.InputFile;
+import org.apache.parquet.io.OutputFile;
+import org.apache.parquet.io.SeekableInputStream;
+import org.apache.parquet.io.ParquetEncodingException;
+import org.apache.parquet.io.PositionOutputStream;
+import org.apache.parquet.schema.MessageType;
+import org.apache.parquet.schema.PrimitiveType;
+import org.apache.parquet.schema.TypeUtil;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Internal implementation of the Parquet file writer as a block container
+ * Note: this is a temporary Drill-Parquet class needed to write empty Parquet files. Details in
+ * PARQUET-2026 and DRILL-7907.
+ */
+public class ParquetFileWriter {
+ private static final Logger LOG = LoggerFactory.getLogger(ParquetFileWriter.class);
+
+ private final ParquetMetadataConverter metadataConverter;
+
+ public static final String PARQUET_METADATA_FILE = "_metadata";
+ public static final String MAGIC_STR = "PAR1";
+ public static final byte[] MAGIC = MAGIC_STR.getBytes(StandardCharsets.US_ASCII);
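+ // "PARE" replaces the "PAR1" magic bytes when the Parquet file footer is encrypted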
+ public static final String EF_MAGIC_STR = "PARE";
+ public static final byte[] EFMAGIC = EF_MAGIC_STR.getBytes(StandardCharsets.US_ASCII);
+ public static final String PARQUET_COMMON_METADATA_FILE = "_common_metadata";
+ public static final int CURRENT_VERSION = 1;
+
+ // File creation modes
+ public static enum Mode {
+ CREATE,
+ OVERWRITE
+ }
+
+ protected final PositionOutputStream out;
+
+ private final MessageType schema;
+ private final AlignmentStrategy alignment;
+ private final int columnIndexTruncateLength;
+
+ // file data
+ private List<BlockMetaData> blocks = new ArrayList<BlockMetaData>();
+
+ // The column/offset indexes per blocks per column chunks
+ private final List<List<ColumnIndex>> columnIndexes = new ArrayList<>();
+ private final List<List<OffsetIndex>> offsetIndexes = new ArrayList<>();
+
+ // The Bloom filters
+ private final List