From b373fc2649a3169616e4f0f2625d0e9ea01b334e Mon Sep 17 00:00:00 2001
From: Fokko Driesprong
Date: Wed, 20 Aug 2025 18:45:15 +0200
Subject: [PATCH] Spark 3.4: Support Parquet dictionary encoded UUIDs

Backport of https://github.com/apache/iceberg/pull/13324
---
 .../vectorized/ArrowVectorAccessorFactory.java     |  8 ++++++++
 .../vectorized/TestParquetVectorizedReads.java     | 14 ++++++++++++++
 2 files changed, 22 insertions(+)

diff --git a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ArrowVectorAccessorFactory.java b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ArrowVectorAccessorFactory.java
index 29e938bb092e..b4bb9a918732 100644
--- a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ArrowVectorAccessorFactory.java
+++ b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ArrowVectorAccessorFactory.java
@@ -22,11 +22,13 @@
 import java.nio.ByteBuffer;
 import org.apache.arrow.memory.ArrowBuf;
 import org.apache.arrow.vector.FixedSizeBinaryVector;
+import org.apache.arrow.vector.IntVector;
 import org.apache.arrow.vector.ValueVector;
 import org.apache.arrow.vector.VarCharVector;
 import org.apache.arrow.vector.complex.ListVector;
 import org.apache.iceberg.arrow.vectorized.GenericArrowVectorAccessorFactory;
 import org.apache.iceberg.util.UUIDUtil;
+import org.apache.parquet.column.Dictionary;
 import org.apache.spark.sql.types.Decimal;
 import org.apache.spark.sql.vectorized.ArrowColumnVector;
 import org.apache.spark.sql.vectorized.ColumnarArray;
@@ -81,6 +83,12 @@ public UTF8String ofRow(FixedSizeBinaryVector vector, int rowId) {
       return UTF8String.fromString(UUIDUtil.convert(vector.get(rowId)).toString());
     }
 
+    @Override
+    public UTF8String ofRow(IntVector offsetVector, Dictionary dictionary, int rowId) {
+      byte[] bytes = dictionary.decodeToBinary(offsetVector.get(rowId)).getBytes();
+      return UTF8String.fromString(UUIDUtil.convert(bytes).toString());
+    }
+
     @Override
     public UTF8String ofBytes(byte[] bytes) {
       return UTF8String.fromBytes(bytes);
diff --git a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetVectorizedReads.java b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetVectorizedReads.java
index 288304f306ab..fbe0b6254d8a 100644
--- a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetVectorizedReads.java
+++ b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetVectorizedReads.java
@@ -322,4 +322,18 @@ public void testUnsupportedReadsForParquetV2() throws Exception {
         .hasMessageStartingWith("Cannot support vectorized reads for column")
         .hasMessageEndingWith("Disable vectorized reads to read this table/file");
   }
+
+  @Test
+  public void testUuidReads() throws Exception {
+    // Just one row to maintain dictionary encoding
+    int numRows = 1;
+    Schema schema = new Schema(optional(100, "uuid", Types.UUIDType.get()));
+
+    InMemoryOutputFile dataFile = new InMemoryOutputFile();
+    Iterable<GenericData.Record> data = generateData(schema, numRows, 0L, 0, IDENTITY);
+    try (FileAppender<GenericData.Record> writer = getParquetV2Writer(schema, dataFile)) {
+      writer.addAll(data);
+    }
+    assertRecordsMatch(schema, numRows, data, dataFile.toInputFile(), false, BATCH_SIZE);
+  }
 }
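
Reviewer note (not part of the patch): the new ofRow overload covers the case
where the Parquet column is dictionary encoded, so the Arrow vector carries
integer dictionary offsets rather than the 16-byte UUID values themselves;
Dictionary.decodeToBinary(offset) recovers the raw bytes, and UUIDUtil.convert
turns them into a java.util.UUID. Below is a minimal standalone sketch of that
byte round trip, assuming Iceberg's 16-byte big-endian UUID encoding; the
class name is hypothetical and the Iceberg/Parquet dependencies are replaced
by plain JDK calls:

    import java.nio.ByteBuffer;
    import java.nio.ByteOrder;
    import java.util.UUID;

    public class UuidRoundTripSketch {
      public static void main(String[] args) {
        UUID original = UUID.randomUUID();

        // Encode as 16 big-endian bytes, matching how Iceberg stores UUIDs in Parquet.
        byte[] bytes =
            ByteBuffer.allocate(16)
                .order(ByteOrder.BIG_ENDIAN)
                .putLong(original.getMostSignificantBits())
                .putLong(original.getLeastSignificantBits())
                .array();

        // Decode the same way UUIDUtil.convert(byte[]) does: two big-endian longs.
        ByteBuffer buf = ByteBuffer.wrap(bytes).order(ByteOrder.BIG_ENDIAN);
        UUID decoded = new UUID(buf.getLong(), buf.getLong());

        System.out.println(original.equals(decoded)); // prints: true
      }
    }

The test keeps numRows at 1 so the column stays dictionary encoded end to end;
with enough distinct values Parquet may fall back to plain encoding, which
would bypass the code path under test.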