diff --git a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/data/vectorized/parquet/TestParquetVectorizedReads.java b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/data/vectorized/parquet/TestParquetVectorizedReads.java
index 757331dcb5dc..3d7fb6966c23 100644
--- a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/data/vectorized/parquet/TestParquetVectorizedReads.java
+++ b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/data/vectorized/parquet/TestParquetVectorizedReads.java
@@ -37,6 +37,7 @@
 import org.apache.iceberg.Schema;
 import org.apache.iceberg.data.RandomGenericData;
 import org.apache.iceberg.data.Record;
+import org.apache.iceberg.data.parquet.GenericParquetReaders;
 import org.apache.iceberg.data.parquet.GenericParquetWriter;
 import org.apache.iceberg.io.CloseableIterable;
 import org.apache.iceberg.io.FileAppender;
@@ -73,7 +74,7 @@ public class TestParquetVectorizedReads extends AvroDataTestBase {
   private static final String PLAIN = "PLAIN";
   private static final List<String> GOLDEN_FILE_ENCODINGS =
-      ImmutableList.of("PLAIN_DICTIONARY", "RLE", "RLE_DICTIONARY", "DELTA_BINARY_PACKED");
+      ImmutableList.of("PLAIN_DICTIONARY", "RLE_DICTIONARY", "DELTA_BINARY_PACKED");
   private static final Map<String, PrimitiveType> GOLDEN_FILE_TYPES =
       ImmutableMap.of(
           "string", Types.StringType.get(),
@@ -440,31 +441,30 @@ public void testUuidReads() throws Exception {
     assertRecordsMatch(schema, numRows, data, dataFile, false, BATCH_SIZE);
   }
 
-  private void assertIdenticalFileContents(File actual, File expected, Schema schema)
-      throws IOException {
-    try (CloseableIterable<InternalRow> actualReader =
-        Parquet.read(Files.localInput(actual))
+  private void assertIdenticalFileContents(
+      File actual, File expected, Schema schema, boolean vectorized) throws IOException {
+    try (CloseableIterable<Record> expectedIterator =
+        Parquet.read(Files.localInput(expected))
             .project(schema)
-            .createReaderFunc(t -> SparkParquetReaders.buildReader(schema, t, ID_TO_CONSTANT))
+            .createReaderFunc(msgType -> GenericParquetReaders.buildReader(schema, msgType))
             .build()) {
-      Iterator<InternalRow> actualIterator = actualReader.iterator();
-      try (CloseableIterable<InternalRow> plainReader =
-          Parquet.read(Files.localInput(expected))
-              .project(schema)
-              .createReaderFunc(t -> SparkParquetReaders.buildReader(schema, t, ID_TO_CONSTANT))
-              .build()) {
-        Iterator<InternalRow> expectedIterator = plainReader.iterator();
-
-        List<InternalRow> expectedList = Lists.newArrayList();
-        expectedIterator.forEachRemaining(expectedList::add);
-        List<InternalRow> actualList = Lists.newArrayList();
-        actualIterator.forEachRemaining(actualList::add);
-
-        assertThat(actualList)
-            .as("Comparison between files failed %s <-> %s", actual, expected)
-            .isNotEmpty()
-            .hasSameSizeAs(expectedList)
-            .hasSameElementsAs(expectedList);
+      List<Record> expectedRecords = Lists.newArrayList(expectedIterator);
+      if (vectorized) {
+        assertRecordsMatch(
+            schema, expectedRecords.size(), expectedRecords, actual, false, BATCH_SIZE);
+      } else {
+        try (CloseableIterable<InternalRow> actualIterator =
+            Parquet.read(Files.localInput(actual))
+                .project(schema)
+                .createReaderFunc(msgType -> SparkParquetReaders.buildReader(schema, msgType))
+                .build()) {
+          List<InternalRow> actualRecords = Lists.newArrayList(actualIterator);
+          assertThat(actualRecords).hasSameSizeAs(expectedRecords);
+          for (int i = 0; i < actualRecords.size(); i++) {
+            GenericsHelpers.assertEqualsUnsafe(
+                schema.asStruct(), expectedRecords.get(i), actualRecords.get(i));
+          }
+        }
       }
     }
   }
@@ -474,14 +474,19 @@ static Stream<Arguments> goldenFilesAndEncodings() {
         .flatMap(
             encoding ->
                 GOLDEN_FILE_TYPES.entrySet().stream()
-                    .map(
-                        typeEntry ->
-                            Arguments.of(encoding, typeEntry.getKey(), typeEntry.getValue())));
+                    .flatMap(
+                        e ->
+                            Stream.of(true, false)
+                                .map(
+                                    vectorized ->
+                                        Arguments.of(
+                                            encoding, e.getKey(), e.getValue(), vectorized))));
   }
 
   @ParameterizedTest
   @MethodSource("goldenFilesAndEncodings")
-  public void testGoldenFiles(String encoding, String typeName, PrimitiveType primitiveType)
+  public void testGoldenFiles(
+      String encoding, String typeName, PrimitiveType primitiveType, boolean vectorized)
       throws Exception {
     Path goldenResourcePath = Paths.get("encodings", encoding, typeName + ".parquet");
     URL goldenFileUrl = getClass().getClassLoader().getResource(goldenResourcePath.toString());
@@ -495,6 +500,9 @@ public void testGoldenFiles(String encoding, String typeName, PrimitiveType prim
 
     Schema expectedSchema = new Schema(optional(1, "data", primitiveType));
     assertIdenticalFileContents(
-        new File(goldenFileUrl.toURI()), new File(plainFileUrl.toURI()), expectedSchema);
+        new File(goldenFileUrl.toURI()),
+        new File(plainFileUrl.toURI()),
+        expectedSchema,
+        vectorized);
   }
 }
diff --git a/spark/v4.0/spark/src/test/resources/encodings/RLE/boolean.parquet b/spark/v4.0/spark/src/test/resources/encodings/RLE/boolean.parquet
deleted file mode 100644
index fe4f62455d08..000000000000
Binary files a/spark/v4.0/spark/src/test/resources/encodings/RLE/boolean.parquet and /dev/null differ