From e1719bb3b54b5a89ebc92aa7c11f0f198d292022 Mon Sep 17 00:00:00 2001 From: Gabor Szadovszky Date: Thu, 9 Nov 2017 12:52:32 +0100 Subject: [PATCH 01/17] PARQUET-1025: Refactor Binary to prepare from custom comparators Also update parquet-format version --- .../org/apache/parquet/io/api/Binary.java | 175 +++++++++++------- .../org/apache/parquet/io/api/TestBinary.java | 27 ++- 2 files changed, 135 insertions(+), 67 deletions(-) diff --git a/parquet-column/src/main/java/org/apache/parquet/io/api/Binary.java b/parquet-column/src/main/java/org/apache/parquet/io/api/Binary.java index 50b98c202e..d0fcc92b56 100644 --- a/parquet-column/src/main/java/org/apache/parquet/io/api/Binary.java +++ b/parquet-column/src/main/java/org/apache/parquet/io/api/Binary.java @@ -38,6 +38,12 @@ abstract public class Binary implements Comparable, Serializable { + public interface ComparatorHelper { + int compare(byte[] array1, int offset1, int length1, byte[] array2, int offset2, int length2); + int compare(ByteBuffer buffer1, int offset1, int length1, ByteBuffer buffer2, int offset2, int length2); + int compare(byte[] array1, int offset1, int length1, ByteBuffer buffer2, int offset2, int length2); + } + protected boolean isBackingBytesReused; // this isn't really something others should extend @@ -73,9 +79,19 @@ private Binary() { } abstract public int compareTo(Binary other); - abstract int compareTo(byte[] bytes, int offset, int length); + abstract public int compareTo(Binary other, ComparatorHelper helper); + + int compareTo(byte[] bytes, int offset, int length) { + return compareTo(bytes, offset, length, DEFAULT_COMPARATOR_HELPER); + } + + abstract int compareTo(byte[] bytes, int offset, int length, ComparatorHelper helper); - abstract int compareTo(ByteBuffer bytes, int offset, int length); + int compareTo(ByteBuffer bytes, int offset, int length) { + return compareTo(bytes, offset, length, DEFAULT_COMPARATOR_HELPER); + } + + abstract int compareTo(ByteBuffer bytes, int offset, int 
length, ComparatorHelper helper); abstract public ByteBuffer toByteBuffer(); @@ -189,17 +205,22 @@ boolean equals(ByteBuffer bytes, int otherOffset, int otherLength) { @Override public int compareTo(Binary other) { - return other.compareTo(value, offset, length); + return -other.compareTo(value, offset, length); } @Override - int compareTo(byte[] other, int otherOffset, int otherLength) { - return Binary.compareTwoByteArrays(value, offset, length, other, otherOffset, otherLength); + public int compareTo(Binary other, ComparatorHelper helper) { + return -other.compareTo(value, offset, length, helper); } @Override - int compareTo(ByteBuffer bytes, int otherOffset, int otherLength) { - return Binary.compareByteArrayToByteBuffer(value, offset, length, bytes, otherOffset, otherLength); + int compareTo(byte[] other, int otherOffset, int otherLength, ComparatorHelper helper) { + return helper.compare(value, offset, length, other, otherOffset, otherLength); + } + + @Override + int compareTo(ByteBuffer bytes, int otherOffset, int otherLength, ComparatorHelper helper) { + return helper.compare(value, offset, length, bytes, otherOffset, otherLength); } @Override @@ -345,17 +366,22 @@ boolean equals(ByteBuffer bytes, int otherOffset, int otherLength) { @Override public int compareTo(Binary other) { - return other.compareTo(value, 0, value.length); + return -other.compareTo(value, 0, value.length); } @Override - int compareTo(byte[] other, int otherOffset, int otherLength) { - return Binary.compareTwoByteArrays(value, 0, value.length, other, otherOffset, otherLength); + public int compareTo(Binary other, ComparatorHelper helper) { + return -other.compareTo(value, 0, value.length, helper); } @Override - int compareTo(ByteBuffer bytes, int otherOffset, int otherLength) { - return Binary.compareByteArrayToByteBuffer(value, 0, value.length, bytes, otherOffset, otherLength); + int compareTo(byte[] other, int otherOffset, int otherLength, ComparatorHelper helper) { + return 
helper.compare(value, 0, value.length, other, otherOffset, otherLength); + } + + @Override + int compareTo(ByteBuffer bytes, int otherOffset, int otherLength, ComparatorHelper helper) { + return helper.compare(value, 0, value.length, bytes, otherOffset, otherLength); } @Override @@ -506,25 +532,34 @@ boolean equals(ByteBuffer otherBytes, int otherOffset, int otherLength) { @Override public int compareTo(Binary other) { if (value.hasArray()) { - return other.compareTo(value.array(), value.arrayOffset() + offset, length); + return -other.compareTo(value.array(), value.arrayOffset() + offset, length); + } else { + return -other.compareTo(value, offset, length); + } + } + + @Override + public int compareTo(Binary other, ComparatorHelper helper) { + if (value.hasArray()) { + return -other.compareTo(value.array(), value.arrayOffset() + offset, length, helper); } else { - return other.compareTo(value, offset, length); + return -other.compareTo(value, offset, length, helper); } } @Override - int compareTo(byte[] other, int otherOffset, int otherLength) { + int compareTo(byte[] other, int otherOffset, int otherLength, ComparatorHelper helper) { if (value.hasArray()) { - return Binary.compareTwoByteArrays(value.array(), value.arrayOffset() + offset, length, + return helper.compare(value.array(), value.arrayOffset() + offset, length, other, otherOffset, otherLength); } { - return Binary.compareByteBufferToByteArray(value, offset, length, other, otherOffset, otherLength); + return -helper.compare(other, otherOffset, otherLength, value, offset, length); } } @Override - int compareTo(ByteBuffer bytes, int otherOffset, int otherLength) { - return Binary.compareTwoByteBuffers(value, offset, length, bytes, otherOffset, otherLength); + int compareTo(ByteBuffer bytes, int otherOffset, int otherLength, ComparatorHelper helper) { + return helper.compare(value, offset, length, bytes, otherOffset, otherLength); } @Override @@ -666,63 +701,75 @@ private static final boolean equals(byte[] 
array1, int offset1, int length1, byt return true; } - private static final int compareByteBufferToByteArray(ByteBuffer buf, int offset1, int length1, - byte[] array, int offset2, int length2) { - return -1 * Binary.compareByteArrayToByteBuffer(array, offset1, length1, buf, offset2, length2); - } - - private static final int compareByteArrayToByteBuffer(byte[] array1, int offset1, int length1, - ByteBuffer buf, int offset2, int length2) { - if (array1 == null && buf == null) return 0; - int min_length = (length1 < length2) ? length1 : length2; - for (int i = 0; i < min_length; i++) { - if (array1[i + offset1] < buf.get(i + offset2)) { - return 1; + private static final ComparatorHelper DEFAULT_COMPARATOR_HELPER = new ComparatorHelper() { + @Override + public int compare(byte[] array1, int offset1, int length1, + ByteBuffer buf, int offset2, int length2) { + if (array1 == null && buf == null) return 0; + int min_length = (length1 < length2) ? length1 : length2; + for (int i = 0; i < min_length; i++) { + if (array1[i + offset1] < buf.get(i + offset2)) { + return -1; + } + if (array1[i + offset1] > buf.get(i + offset2)) { + return 1; + } } - if (array1[i + offset1] > buf.get(i + offset2)) { + // check remainder + if (length1 == length2) { + return 0; + } else if (length1 < length2) { return -1; + } else { + return 1; } } - // check remainder - if (length1 == length2) { return 0; } - else if (length1 < length2) { return 1;} - else { return -1; } - } - private static final int compareTwoByteBuffers(ByteBuffer buf1, int offset1, int length1, - ByteBuffer buf2, int offset2, int length2) { - if (buf1 == null && buf2 == null) return 0; - int min_length = (length1 < length2) ? 
length1 : length2; - for (int i = 0; i < min_length; i++) { - if (buf1.get(i + offset1) < buf2.get(i + offset2)) { - return 1; + @Override + public int compare(ByteBuffer buf1, int offset1, int length1, + ByteBuffer buf2, int offset2, int length2) { + if (buf1 == null && buf2 == null) return 0; + int min_length = (length1 < length2) ? length1 : length2; + for (int i = 0; i < min_length; i++) { + if (buf1.get(i + offset1) < buf2.get(i + offset2)) { + return -1; + } + if (buf1.get(i + offset1) > buf2.get(i + offset2)) { + return 1; + } } - if (buf1.get(i + offset1) > buf2.get(i + offset2)) { + // check remainder + if (length1 == length2) { + return 0; + } else if (length1 < length2) { return -1; + } else { + return 1; } } - // check remainder - if (length1 == length2) { return 0; } - else if (length1 < length2) { return 1;} - else { return -1; } - } - private static final int compareTwoByteArrays(byte[] array1, int offset1, int length1, - byte[] array2, int offset2, int length2) { - if (array1 == null && array2 == null) return 0; - if (array1 == array2 && offset1 == offset2 && length1 == length2) return 0; - int min_length = (length1 < length2) ? length1 : length2; - for (int i = 0; i < min_length; i++) { - if (array1[i + offset1] < array2[i + offset2]) { - return 1; + @Override + public int compare(byte[] array1, int offset1, int length1, + byte[] array2, int offset2, int length2) { + if (array1 == null && array2 == null) return 0; + if (array1 == array2 && offset1 == offset2 && length1 == length2) return 0; + int min_length = (length1 < length2) ? 
length1 : length2; + for (int i = 0; i < min_length; i++) { + if (array1[i + offset1] < array2[i + offset2]) { + return -1; + } + if (array1[i + offset1] > array2[i + offset2]) { + return 1; + } } - if (array1[i + offset1] > array2[i + offset2]) { + // check remainder + if (length1 == length2) { + return 0; + } else if (length1 < length2) { return -1; + } else { + return 1; } } - // check remainder - if (length1 == length2) { return 0; } - else if (length1 < length2) { return 1;} - else { return -1; } - } + }; } diff --git a/parquet-column/src/test/java/org/apache/parquet/io/api/TestBinary.java b/parquet-column/src/test/java/org/apache/parquet/io/api/TestBinary.java index a541e1bd13..0045c900c0 100644 --- a/parquet-column/src/test/java/org/apache/parquet/io/api/TestBinary.java +++ b/parquet-column/src/test/java/org/apache/parquet/io/api/TestBinary.java @@ -18,6 +18,9 @@ */ package org.apache.parquet.io.api; +import org.apache.parquet.io.api.TestBinary.BinaryFactory.BinaryAndOriginal; +import org.junit.Test; + import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.IOException; @@ -26,13 +29,11 @@ import java.nio.ByteBuffer; import java.util.Arrays; -import org.apache.parquet.io.api.TestBinary.BinaryFactory.BinaryAndOriginal; -import org.junit.Test; - import static org.junit.Assert.assertArrayEquals; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertSame; import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; public class TestBinary { @@ -248,4 +249,24 @@ private void testBinary(BinaryFactory bf, boolean reused) throws Exception { testSerializable(bf, reused); } + + @Test + public void testCompare() { + Binary b1 = Binary.fromCharSequence("aaaaaaaa"); + Binary b2 = Binary.fromString("aaaaaaab"); + Binary b3 = Binary.fromReusedByteArray("aaaaaaaaaaa".getBytes(), 1, 8); + Binary b4 = Binary.fromConstantByteBuffer(ByteBuffer.wrap("aaaaaaac".getBytes())); + + 
assertTrue(b1.compareTo(b2) < 0); + assertTrue(b2.compareTo(b1) > 0); + assertTrue(b3.compareTo(b4) < 0); + assertTrue(b4.compareTo(b3) > 0); + assertTrue(b1.compareTo(b4) < 0); + assertTrue(b4.compareTo(b1) > 0); + assertTrue(b2.compareTo(b4) < 0); + assertTrue(b4.compareTo(b2) > 0); + + assertTrue(b1.compareTo(b3) == 0); + assertTrue(b3.compareTo(b1) == 0); + } } From 3378b6d34a7d4d3227889b18b54bb69d265d4b9a Mon Sep 17 00:00:00 2001 From: Gabor Szadovszky Date: Thu, 9 Nov 2017 16:35:33 +0100 Subject: [PATCH 02/17] PARQUET-1025: Implement comparators and use them with statistics --- .../java/org/apache/parquet/cli/Util.java | 50 +--- .../cli/commands/CheckParquet251Command.java | 7 +- .../apache/parquet/column/Comparators.java | 273 ++++++++++++++++++ .../column/statistics/BinaryStatistics.java | 43 ++- .../column/statistics/BooleanStatistics.java | 51 +++- .../column/statistics/DoubleStatistics.java | 51 +++- .../column/statistics/FloatStatistics.java | 51 +++- .../column/statistics/IntStatistics.java | 51 +++- .../column/statistics/LongStatistics.java | 53 +++- .../parquet/column/statistics/Statistics.java | 92 +++++- .../parquet/filter2/predicate/Statistics.java | 18 +- .../column/statistics/TestStatistics.java | 28 ++ .../statisticslevel/StatisticsFilter.java | 17 +- 13 files changed, 654 insertions(+), 131 deletions(-) create mode 100644 parquet-column/src/main/java/org/apache/parquet/column/Comparators.java diff --git a/parquet-cli/src/main/java/org/apache/parquet/cli/Util.java b/parquet-cli/src/main/java/org/apache/parquet/cli/Util.java index 07a5364c64..04b390162b 100644 --- a/parquet-cli/src/main/java/org/apache/parquet/cli/Util.java +++ b/parquet-cli/src/main/java/org/apache/parquet/cli/Util.java @@ -29,10 +29,6 @@ import org.apache.parquet.column.EncodingStats; import org.apache.parquet.column.statistics.BinaryStatistics; import org.apache.parquet.column.statistics.BooleanStatistics; -import org.apache.parquet.column.statistics.DoubleStatistics; -import 
org.apache.parquet.column.statistics.FloatStatistics; -import org.apache.parquet.column.statistics.IntStatistics; -import org.apache.parquet.column.statistics.LongStatistics; import org.apache.parquet.column.statistics.Statistics; import org.apache.parquet.hadoop.metadata.CompressionCodecName; import org.apache.parquet.schema.MessageType; @@ -40,7 +36,6 @@ import org.apache.parquet.schema.PrimitiveType; import org.apache.parquet.schema.Type; import java.nio.charset.StandardCharsets; -import java.util.Locale; import java.util.Set; import static org.apache.parquet.column.Encoding.BIT_PACKED; @@ -96,34 +91,14 @@ public static String minMaxAsString(Statistics stats, OriginalType annotation) { return ""; } // TODO: use original types when showing decimal, timestamp, etc. - if (stats instanceof BooleanStatistics) { - return String.format("%s / %s", - ((BooleanStatistics) stats).getMin(), - ((BooleanStatistics) stats).getMax()); - } else if (stats instanceof IntStatistics) { - return String.format("%d / %d", - ((IntStatistics) stats).getMin(), - ((IntStatistics) stats).getMax()); - } else if (stats instanceof LongStatistics) { - return String.format("%d / %d", - ((LongStatistics) stats).getMin(), - ((LongStatistics) stats).getMax()); - } else if (stats instanceof FloatStatistics) { - return String.format("%f / %f", - ((FloatStatistics) stats).getMin(), - ((FloatStatistics) stats).getMax()); - } else if (stats instanceof DoubleStatistics) { - return String.format("%f / %f", - ((DoubleStatistics) stats).getMin(), - ((DoubleStatistics) stats).getMax()); - } else if (stats instanceof BinaryStatistics) { + if (stats instanceof BinaryStatistics) { byte[] minBytes = stats.getMinBytes(); byte[] maxBytes = stats.getMaxBytes(); return String.format("%s / %s", printable(minBytes, annotation == OriginalType.UTF8, 30), printable(maxBytes, annotation == OriginalType.UTF8, 30)); } else { - throw new RuntimeException("Unknown stats type: " + stats); + return String.format("%s / %s", 
stats.minAsString(), stats.maxAsString()); } } @@ -134,24 +109,6 @@ public static String toString(Statistics stats, long count, OriginalType annotat // TODO: use original types when showing decimal, timestamp, etc. if (stats instanceof BooleanStatistics) { return String.format("nulls: %d/%d", stats.getNumNulls(), count); - } else if (stats instanceof IntStatistics) { - return String.format("min: %d max: %d nulls: %d/%d", - ((IntStatistics) stats).getMin(), ((IntStatistics) stats).getMax(), - stats.getNumNulls(), count); - } else if (stats instanceof LongStatistics) { - return String.format("min: %d max: %d nulls: %d/%d", - ((LongStatistics) stats).getMin(), ((LongStatistics) stats).getMax(), - stats.getNumNulls(), count); - } else if (stats instanceof FloatStatistics) { - return String.format("min: %f max: %f nulls: %d/%d", - ((FloatStatistics) stats).getMin(), - ((FloatStatistics) stats).getMax(), - stats.getNumNulls(), count); - } else if (stats instanceof DoubleStatistics) { - return String.format("min: %f max: %f nulls: %d/%d", - ((DoubleStatistics) stats).getMin(), - ((DoubleStatistics) stats).getMax(), - stats.getNumNulls(), count); } else if (stats instanceof BinaryStatistics) { byte[] minBytes = stats.getMinBytes(); byte[] maxBytes = stats.getMaxBytes(); @@ -160,7 +117,8 @@ public static String toString(Statistics stats, long count, OriginalType annotat printable(maxBytes, annotation == OriginalType.UTF8, 30), stats.getNumNulls(), count); } else { - throw new RuntimeException("Unknown stats type: " + stats); + return String.format("min: %s max: %s nulls: %d/%d", + stats.minAsString(), stats.maxAsString(), stats.getNumNulls(), count); } } diff --git a/parquet-cli/src/main/java/org/apache/parquet/cli/commands/CheckParquet251Command.java b/parquet-cli/src/main/java/org/apache/parquet/cli/commands/CheckParquet251Command.java index 8f6082122b..3d7d283899 100644 --- a/parquet-cli/src/main/java/org/apache/parquet/cli/commands/CheckParquet251Command.java +++ 
b/parquet-cli/src/main/java/org/apache/parquet/cli/commands/CheckParquet251Command.java @@ -53,6 +53,7 @@ import javax.annotation.Nullable; import java.io.IOException; import java.util.Arrays; +import java.util.Comparator; import java.util.List; import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY; @@ -184,9 +185,11 @@ private class StatsValidator> { private final boolean hasNonNull; private final T min; private final T max; + private final Comparator comparator; public StatsValidator(DataPage page) { Statistics stats = getStatisticsFromPageHeader(page); + this.comparator = stats.comparator(); this.hasNonNull = stats.hasNonNullValue(); if (hasNonNull) { this.min = stats.genericGetMin(); @@ -199,10 +202,10 @@ public StatsValidator(DataPage page) { public void validate(T value) { if (hasNonNull) { - if (min.compareTo(value) > 0) { + if (comparator.compare(min, value) > 0) { throw new BadStatsException("Min should be <= all values."); } - if (max.compareTo(value) < 0) { + if (comparator.compare(max, value) < 0) { throw new BadStatsException("Max should be >= all values."); } } diff --git a/parquet-column/src/main/java/org/apache/parquet/column/Comparators.java b/parquet-column/src/main/java/org/apache/parquet/column/Comparators.java new file mode 100644 index 0000000000..03956e41b9 --- /dev/null +++ b/parquet-column/src/main/java/org/apache/parquet/column/Comparators.java @@ -0,0 +1,273 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.column; + +import org.apache.parquet.io.api.Binary; +import org.apache.parquet.schema.OriginalType; +import org.apache.parquet.schema.PrimitiveType; + +import java.util.Comparator; + +/** + * Utility class to provide {@link java.util.Comparator} implementations for logical types. + */ +public class Comparators { + + public static class BooleanComparator implements Comparator { + @Override + public int compare(Boolean o1, Boolean o2) { + return compare(o1.booleanValue(), o2.booleanValue()); + } + + public int compare(boolean b1, boolean b2) { + return Boolean.compare(b1, b2); + } + + /** + * Returns the string representation of the specified value for debugging/logging purposes + */ + public String toString(boolean b) { + return Boolean.toString(b); + } + } + + public static class IntComparator implements Comparator { + @Override + public int compare(Integer o1, Integer o2) { + return compare(o1.intValue(), o2.intValue()); + } + + public int compare(int i1, int i2) { + return Integer.compare(i1, i2); + } + + /** + * Returns the string representation of the specified value for debugging/logging purposes + */ + public String toString(int i) { + return Integer.toString(i); + } + } + + public static class LongComparator implements Comparator { + @Override + public int compare(Long o1, Long o2) { + return compare(o1.longValue(), o2.longValue()); + } + + public int compare(long l1, long l2) { + return Long.compare(l1, l2); + } + + /** + * Returns the string representation of the specified value for debugging/logging 
purposes + */ + public String toString(long l) { + return Long.toString(l); + } + } + + public static class FloatComparator implements Comparator { + @Override + public int compare(Float o1, Float o2) { + return compare(o1.floatValue(), o2.floatValue()); + } + + public int compare(float f1, float f2) { + return Float.compare(f1, f2); + } + + /** + * Returns the string representation of the specified value for debugging/logging purposes + */ + public String toString(float f) { + return String.format("%.5f", f); + } + } + + public static class DoubleComparator implements Comparator { + @Override + public int compare(Double o1, Double o2) { + return compare(o1.doubleValue(), o2.doubleValue()); + } + + public int compare(double d1, double d2) { + return Double.compare(d1, d2); + } + + /** + * Returns the string representation of the specified value for debugging/logging purposes + */ + public String toString(double d) { + return String.format("%.5f", d); + } + } + + public static class BinaryComparator implements Comparator { + @Override + public int compare(Binary o1, Binary o2) { + return o1.compareTo(o2); + } + + /** + * Returns the string representation of the specified value for debugging/logging purposes + */ + public String toString(Binary binary) { + return binary.toStringUsingUTF8(); + } + } + + private static final BooleanComparator BOOLEAN_COMPARATOR = new BooleanComparator(); + private static final IntComparator INT_COMPARATOR = new IntComparator(); + private static final LongComparator LONG_COMPARATOR = new LongComparator(); + private static final FloatComparator FLOAT_COMPARATOR = new FloatComparator(); + private static final DoubleComparator DOUBLE_COMPARATOR = new DoubleComparator(); + private static final BinaryComparator BINARY_COMPARATOR = new BinaryComparator(); + + /** + * Returns the proper {@link Comparator} implementation for the specified primitive and logical types. {@code + * logicalType} might be {@code null}. 
In case of the specification does not allow a logical type for the related + * primitive type, {@code logicalType} must be {@code null}. + */ + public static Comparator comparator(PrimitiveType.PrimitiveTypeName type, OriginalType logicalType) { + switch (type) { + case BOOLEAN: + return booleanComparator(logicalType); + case INT32: + return int32Comparator(logicalType); + case INT64: + return int64Comparator(logicalType); + case FLOAT: + return floatComparator(logicalType); + case DOUBLE: + return doubleComparator(logicalType); + case INT96: + case FIXED_LEN_BYTE_ARRAY: + case BINARY: + return binaryComparator(type, logicalType); + default: + throw new UnknownColumnTypeException(type); + } + } + + public static BooleanComparator booleanComparator(OriginalType logicalType) { + if (logicalType != null) + throw new IllegalArgumentException("Invalid logical type for BOOLEAN: " + logicalType); + return BOOLEAN_COMPARATOR; + } + + public static LongComparator int64Comparator(OriginalType logicalType) { + if (logicalType == null) + return LONG_COMPARATOR; + switch (logicalType) { + case INT_64: + case DECIMAL: + case TIME_MICROS: + case TIMESTAMP_MILLIS: + case TIMESTAMP_MICROS: + return LONG_COMPARATOR; + case UINT_64: + // TODO: return unsigned comparator + return LONG_COMPARATOR; + default: + throw new IllegalArgumentException("Invalid logical type for INT64: " + logicalType); + } + } + + public static IntComparator int32Comparator(OriginalType logicalType) { + if (logicalType == null) + return INT_COMPARATOR; + switch (logicalType) { + case INT_8: + case INT_16: + case INT_32: + case DECIMAL: + case DATE: + case TIME_MILLIS: + return INT_COMPARATOR; + case UINT_8: + case UINT_16: + case UINT_32: + // TODO: return unsigned comparator + return INT_COMPARATOR; + default: + throw new IllegalArgumentException("Invalid logical type for INT32: " + logicalType); + } + } + + public static FloatComparator floatComparator(OriginalType logicalType) { + if (logicalType != null) + 
throw new IllegalArgumentException("Invalid logical type for FLOAT: " + logicalType); + return FLOAT_COMPARATOR; + } + + public static DoubleComparator doubleComparator(OriginalType logicalType) { + if (logicalType != null) + throw new IllegalArgumentException("Invalid logical type for DOUBLE: " + logicalType); + return DOUBLE_COMPARATOR; + } + + public static BinaryComparator binaryComparator(PrimitiveType.PrimitiveTypeName type, OriginalType logicalType) { + switch (type) { + case INT96: + if (logicalType == null) + // TODO: what to return here? + return BINARY_COMPARATOR; + break; + case FIXED_LEN_BYTE_ARRAY: + if (logicalType == null) + // TODO: return lexicographical comparator + return BINARY_COMPARATOR; + switch (logicalType) { + case DECIMAL: + // TODO: return signed comparator + return BINARY_COMPARATOR; + case INTERVAL: + // TODO: return lexicographical comparator + return BINARY_COMPARATOR; + } + break; + case BINARY: + if (logicalType == null) + // TODO: return lexicographical comparator + return BINARY_COMPARATOR; + switch (logicalType) { + case UTF8: + case ENUM: + case INTERVAL: + // TODO: return lexicographical comparator + return BINARY_COMPARATOR; + case JSON: + case BSON: + // TODO: Based on specs we do not have ordering for these while we specified lexicographical in ColumnOrder + return BINARY_COMPARATOR; + case DECIMAL: + // TODO: return signed comparator + return BINARY_COMPARATOR; + } + break; + default: + throw new IllegalArgumentException("Not a binary type: " + type); + } + throw new IllegalArgumentException("Invalid logical type for " + type + ": " + logicalType); + } + + private Comparators() { + } +} diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/BinaryStatistics.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/BinaryStatistics.java index c319b4adb0..250f15600a 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/BinaryStatistics.java +++ 
b/parquet-column/src/main/java/org/apache/parquet/column/statistics/BinaryStatistics.java @@ -18,13 +18,29 @@ */ package org.apache.parquet.column.statistics; +import org.apache.parquet.column.Comparators; import org.apache.parquet.io.api.Binary; +import org.apache.parquet.schema.OriginalType; +import org.apache.parquet.schema.PrimitiveType; public class BinaryStatistics extends Statistics { + private final Comparators.BinaryComparator comparator; private Binary max; private Binary min; + /** + * @deprecated Use {@link Statistics#getStatsBasedOnType(PrimitiveType.PrimitiveTypeName, OriginalType)} instead + */ + @Deprecated + public BinaryStatistics() { + this(PrimitiveType.PrimitiveTypeName.BINARY, null); + } + + BinaryStatistics(PrimitiveType.PrimitiveTypeName type, OriginalType logicalType) { + comparator = Comparators.binaryComparator(type, logicalType); + } + @Override public void updateStats(Binary value) { if (!this.hasNonNullValue()) { @@ -68,18 +84,18 @@ public byte[] getMinBytes() { } @Override - public boolean isSmallerThan(long size) { - return !hasNonNullValue() || ((min.length() + max.length()) < size); + public String minAsString() { + return comparator.toString(min); } @Override - public String toString() { - if (this.hasNonNullValue()) - return String.format("min: %s, max: %s, num_nulls: %d", min.toStringUsingUTF8(), max.toStringUsingUTF8(), this.getNumNulls()); - else if (!this.isEmpty()) - return String.format("num_nulls: %d, min/max not defined", this.getNumNulls()); - else - return "no stats for this column"; + public String maxAsString() { + return comparator.toString(max); + } + + @Override + public boolean isSmallerThan(long size) { + return !hasNonNullValue() || ((min.length() + max.length()) < size); } /** @@ -87,8 +103,8 @@ else if (!this.isEmpty()) */ @Deprecated public void updateStats(Binary min_value, Binary max_value) { - if (min.compareTo(min_value) > 0) { min = min_value.copy(); } - if (max.compareTo(max_value) < 0) { max = 
max_value.copy(); } + if (comparator.compare(min, min_value) > 0) { min = min_value.copy(); } + if (comparator.compare(max, max_value) < 0) { max = max_value.copy(); } } /** @@ -111,6 +127,11 @@ public Binary genericGetMax() { return max; } + @Override + public Comparators.BinaryComparator comparator() { + return comparator; + } + /** * @deprecated use {@link #genericGetMax()}, will be removed in 2.0.0 */ diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/BooleanStatistics.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/BooleanStatistics.java index 22c23933bd..074ad0896f 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/BooleanStatistics.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/BooleanStatistics.java @@ -19,12 +19,28 @@ package org.apache.parquet.column.statistics; import org.apache.parquet.bytes.BytesUtils; +import org.apache.parquet.column.Comparators; +import org.apache.parquet.schema.OriginalType; +import org.apache.parquet.schema.PrimitiveType; public class BooleanStatistics extends Statistics { + private final Comparators.BooleanComparator comparator; private boolean max; private boolean min; + /** + * @deprecated Use {@link Statistics#getStatsBasedOnType(PrimitiveType.PrimitiveTypeName, OriginalType)} instead + */ + @Deprecated + public BooleanStatistics() { + this(null); + } + + BooleanStatistics(OriginalType logicalType) { + this.comparator = Comparators.booleanComparator(logicalType); + } + @Override public void updateStats(boolean value) { if (!this.hasNonNullValue()) { @@ -62,23 +78,23 @@ public byte[] getMinBytes() { } @Override - public boolean isSmallerThan(long size) { - return !hasNonNullValue() || (2 < size); + public String minAsString() { + return comparator.toString(min); + } + + @Override + public String maxAsString() { + return comparator.toString(max); } @Override - public String toString() { - if (this.hasNonNullValue()) - 
return String.format("min: %b, max: %b, num_nulls: %d", min, max, this.getNumNulls()); - else if(!this.isEmpty()) - return String.format("num_nulls: %d, min/max not defined", this.getNumNulls()); - else - return "no stats for this column"; + public boolean isSmallerThan(long size) { + return !hasNonNullValue() || (2 < size); } public void updateStats(boolean min_value, boolean max_value) { - if (min && !min_value) { min = min_value; } - if (!max && max_value) { max = max_value; } + if (comparator.compare(min, min_value) > 0) { min = min_value; } + if (comparator.compare(max, max_value) < 0) { max = max_value; } } public void initializeStats(boolean min_value, boolean max_value) { @@ -97,6 +113,19 @@ public Boolean genericGetMax() { return max; } + @Override + public Comparators.BooleanComparator comparator() { + return comparator; + } + + public int compareToMin(boolean value) { + return comparator.compare(min, value); + } + + public int compareToMax(boolean value) { + return comparator.compare(max, value); + } + public boolean getMax() { return max; } diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/DoubleStatistics.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/DoubleStatistics.java index d67a550a6f..67c3eaff77 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/DoubleStatistics.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/DoubleStatistics.java @@ -19,12 +19,28 @@ package org.apache.parquet.column.statistics; import org.apache.parquet.bytes.BytesUtils; +import org.apache.parquet.column.Comparators; +import org.apache.parquet.schema.OriginalType; +import org.apache.parquet.schema.PrimitiveType; public class DoubleStatistics extends Statistics { + private final Comparators.DoubleComparator comparator; private double max; private double min; + /** + * @deprecated Use {@link Statistics#getStatsBasedOnType(PrimitiveType.PrimitiveTypeName, OriginalType)} 
instead + */ + @Deprecated + public DoubleStatistics() { + this(null); + } + + DoubleStatistics(OriginalType logicalType) { + this.comparator = Comparators.doubleComparator(logicalType); + } + @Override public void updateStats(double value) { if (!this.hasNonNullValue()) { @@ -62,23 +78,23 @@ public byte[] getMinBytes() { } @Override - public boolean isSmallerThan(long size) { - return !hasNonNullValue() || (16 < size); + public String minAsString() { + return comparator.toString(min); + } + + @Override + public String maxAsString() { + return comparator.toString(max); } @Override - public String toString() { - if(this.hasNonNullValue()) - return String.format("min: %.5f, max: %.5f, num_nulls: %d", min, max, this.getNumNulls()); - else if (!this.isEmpty()) - return String.format("num_nulls: %d, min/max not defined", this.getNumNulls()); - else - return "no stats for this column"; + public boolean isSmallerThan(long size) { + return !hasNonNullValue() || (16 < size); } public void updateStats(double min_value, double max_value) { - if (min_value < min) { min = min_value; } - if (max_value > max) { max = max_value; } + if (comparator.compare(min, min_value) > 0) { min = min_value; } + if (comparator.compare(max, max_value) < 0) { max = max_value; } } public void initializeStats(double min_value, double max_value) { @@ -97,6 +113,19 @@ public Double genericGetMax() { return max; } + @Override + public Comparators.DoubleComparator comparator() { + return comparator; + } + + public int compareToMin(double value) { + return comparator.compare(min, value); + } + + public int compareToMax(double value) { + return comparator.compare(max, value); + } + public double getMax() { return max; } diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/FloatStatistics.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/FloatStatistics.java index dffc2077ed..733ca9321a 100644 --- 
a/parquet-column/src/main/java/org/apache/parquet/column/statistics/FloatStatistics.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/FloatStatistics.java @@ -19,12 +19,28 @@ package org.apache.parquet.column.statistics; import org.apache.parquet.bytes.BytesUtils; +import org.apache.parquet.column.Comparators; +import org.apache.parquet.schema.OriginalType; +import org.apache.parquet.schema.PrimitiveType; public class FloatStatistics extends Statistics { + private final Comparators.FloatComparator comparator; private float max; private float min; + /** + * @deprecated Use {@link Statistics#getStatsBasedOnType(PrimitiveType.PrimitiveTypeName, OriginalType)} instead + */ + @Deprecated + public FloatStatistics() { + this(null); + } + + FloatStatistics(OriginalType logicalType) { + this.comparator = Comparators.floatComparator(logicalType); + } + @Override public void updateStats(float value) { if (!this.hasNonNullValue()) { @@ -62,23 +78,23 @@ public byte[] getMinBytes() { } @Override - public boolean isSmallerThan(long size) { - return !hasNonNullValue() || (8 < size); + public String minAsString() { + return comparator.toString(min); + } + + @Override + public String maxAsString() { + return comparator.toString(max); } @Override - public String toString() { - if (this.hasNonNullValue()) - return String.format("min: %.5f, max: %.5f, num_nulls: %d", min, max, this.getNumNulls()); - else if (!this.isEmpty()) - return String.format("num_nulls: %d, min/max not defined", this.getNumNulls()); - else - return "no stats for this column"; + public boolean isSmallerThan(long size) { + return !hasNonNullValue() || (8 < size); } public void updateStats(float min_value, float max_value) { - if (min_value < min) { min = min_value; } - if (max_value > max) { max = max_value; } + if (comparator.compare(min, min_value) > 0) { min = min_value; } + if (comparator.compare(max, max_value) < 0) { max = max_value; } } public void initializeStats(float min_value, 
float max_value) { @@ -97,6 +113,19 @@ public Float genericGetMax() { return max; } + @Override + public Comparators.FloatComparator comparator() { + return comparator; + } + + public int compareToMin(float value) { + return comparator.compare(min, value); + } + + public int compareToMax(float value) { + return comparator.compare(max, value); + } + public float getMax() { return max; } diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/IntStatistics.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/IntStatistics.java index a5d7ba196e..ee303a4304 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/IntStatistics.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/IntStatistics.java @@ -19,12 +19,28 @@ package org.apache.parquet.column.statistics; import org.apache.parquet.bytes.BytesUtils; +import org.apache.parquet.column.Comparators; +import org.apache.parquet.schema.OriginalType; +import org.apache.parquet.schema.PrimitiveType; public class IntStatistics extends Statistics { + private final Comparators.IntComparator comparator; private int max; private int min; + /** + * @deprecated Use {@link Statistics#getStatsBasedOnType(PrimitiveType.PrimitiveTypeName, OriginalType)} instead + */ + @Deprecated + public IntStatistics() { + this(null); + } + + IntStatistics(OriginalType logicalType) { + comparator = Comparators.int32Comparator(logicalType); + } + @Override public void updateStats(int value) { if (!this.hasNonNullValue()) { @@ -62,23 +78,23 @@ public byte[] getMinBytes() { } @Override - public boolean isSmallerThan(long size) { - return !hasNonNullValue() || (8 < size); + public String minAsString() { + return comparator.toString(min); + } + + @Override + public String maxAsString() { + return comparator.toString(max); } @Override - public String toString() { - if (this.hasNonNullValue()) - return String.format("min: %d, max: %d, num_nulls: %d", min, max, 
this.getNumNulls()); - else if (!this.isEmpty()) - return String.format("num_nulls: %d, min/max is not defined", this.getNumNulls()); - else - return "no stats for this column"; + public boolean isSmallerThan(long size) { + return !hasNonNullValue() || (8 < size); } public void updateStats(int min_value, int max_value) { - if (min_value < min) { min = min_value; } - if (max_value > max) { max = max_value; } + if (comparator.compare(min, min_value) > 0) { min = min_value; } + if (comparator.compare(max, max_value) < 0) { max = max_value; } } public void initializeStats(int min_value, int max_value) { @@ -97,6 +113,19 @@ public Integer genericGetMax() { return max; } + @Override + public Comparators.IntComparator comparator() { + return comparator; + } + + public int compareToMin(int value) { + return comparator.compare(min, value); + } + + public int compareToMax(int value) { + return comparator.compare(max, value); + } + public int getMax() { return max; } diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/LongStatistics.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/LongStatistics.java index f7971efdd8..e521110623 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/LongStatistics.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/LongStatistics.java @@ -18,13 +18,31 @@ */ package org.apache.parquet.column.statistics; +import it.unimi.dsi.fastutil.longs.LongComparator; +import it.unimi.dsi.fastutil.longs.LongComparators; import org.apache.parquet.bytes.BytesUtils; +import org.apache.parquet.column.Comparators; +import org.apache.parquet.schema.OriginalType; +import org.apache.parquet.schema.PrimitiveType; public class LongStatistics extends Statistics { + private final Comparators.LongComparator comparator; private long max; private long min; + /** + * @deprecated Use {@link Statistics#getStatsBasedOnType(PrimitiveType.PrimitiveTypeName, OriginalType)} instead 
+ */ + @Deprecated + public LongStatistics() { + this(null); + } + + LongStatistics(OriginalType logicalType) { + this.comparator = Comparators.int64Comparator(logicalType); + } + @Override public void updateStats(long value) { if (!this.hasNonNullValue()) { @@ -62,23 +80,23 @@ public byte[] getMinBytes() { } @Override - public boolean isSmallerThan(long size) { - return !hasNonNullValue() || (16 < size); + public String minAsString() { + return comparator.toString(min); + } + + @Override + public String maxAsString() { + return comparator.toString(max); } @Override - public String toString() { - if (this.hasNonNullValue()) - return String.format("min: %d, max: %d, num_nulls: %d", min, max, this.getNumNulls()); - else if (!this.isEmpty()) - return String.format("num_nulls: %d, min/max not defined", this.getNumNulls()); - else - return "no stats for this column"; + public boolean isSmallerThan(long size) { + return !hasNonNullValue() || (16 < size); } public void updateStats(long min_value, long max_value) { - if (min_value < min) { min = min_value; } - if (max_value > max) { max = max_value; } + if (comparator.compare(min, min_value) > 0) { min = min_value; } + if (comparator.compare(max, max_value) < 0) { max = max_value; } } public void initializeStats(long min_value, long max_value) { @@ -97,6 +115,19 @@ public Long genericGetMax() { return max; } + @Override + public LongComparator comparator() { + return LongComparators.NATURAL_COMPARATOR; + } + + public int compareToMin(long value) { + return comparator.compare(min, value); + } + + public int compareToMax(long value) { + return comparator.compare(max, value); + } + public long getMax() { return max; } diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java index 30153c0743..5a3d036b1c 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java +++ 
b/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java @@ -20,8 +20,10 @@ import org.apache.parquet.column.UnknownColumnTypeException; import org.apache.parquet.io.api.Binary; +import org.apache.parquet.schema.OriginalType; import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName; import java.util.Arrays; +import java.util.Comparator; /** @@ -29,7 +31,7 @@ * * @author Katya Gonina */ -public abstract class Statistics> { +public abstract class Statistics { private boolean hasNonNullValue; private long num_nulls; @@ -43,7 +45,9 @@ public Statistics() { * Returns the typed statistics object based on the passed type parameter * @param type PrimitiveTypeName type of the column * @return instance of a typed statistics class + * @deprecated Use {@link #getStatsBasedOnType(PrimitiveTypeName, OriginalType)} instead */ + @Deprecated public static Statistics getStatsBasedOnType(PrimitiveTypeName type) { switch(type) { case INT32: @@ -67,6 +71,27 @@ public static Statistics getStatsBasedOnType(PrimitiveTypeName type) { } } + public static Statistics getStatsBasedOnType(PrimitiveTypeName type, OriginalType logicalType) { + switch(type) { + case INT32: + return new IntStatistics(logicalType); + case INT64: + return new LongStatistics(logicalType); + case FLOAT: + return new FloatStatistics(logicalType); + case DOUBLE: + return new DoubleStatistics(logicalType); + case BOOLEAN: + return new BooleanStatistics(logicalType); + case BINARY: + case INT96: + case FIXED_LEN_BYTE_ARRAY: + return new BinaryStatistics(type, logicalType); + default: + throw new UnknownColumnTypeException(type); + } + } + /** * updates statistics min and max using the passed value * @param value value to use to update min and max @@ -175,9 +200,46 @@ public void mergeStatistics(Statistics stats) { */ abstract public void setMinMaxFromBytes(byte[] minBytes, byte[] maxBytes); + /** + * Returns the generic object representing the min value in the statistics. 
The self comparing logic of the returned + * object might not be the proper one (e.g. unsigned comparison for int/long) therefore it is strongly recommended to + * use the related comparing method {@link #compareToMin(Object)} or the comparator returned by {@link + * #comparator()}. + */ abstract public T genericGetMin(); + + /** + * Returns the generic object representing the max value in the statistics. The self comparing logic of the returned + * object might not be the proper one (e.g. unsigned comparison for int/long) therefore it is strongly recommended to + * use the related comparing method {@link #compareToMax(Object)} or the comparator returned by {@link + * #comparator()}. + */ abstract public T genericGetMax(); + /** + * Returns the comparator to be used to compare two generic values in the proper way (e.g. unsigned comparison for + * int/long) + */ + public abstract Comparator comparator(); + + /** + * Compares the specified value to min in the proper way. + * + * @see {@link Comparable#compareTo(Object)} + */ + public int compareToMin(T value) { + return comparator().compare(genericGetMin(), value); + } + + /** + * Compares the specified value to max in the proper way. + * + * @see {@link Comparable#compareTo(Object)} + */ + public int compareToMax(T value) { + return comparator().compare(genericGetMax(), value); + } + /** * Abstract method to return the max value as a byte array * @return byte array corresponding to the max value @@ -190,6 +252,16 @@ public void mergeStatistics(Statistics stats) { */ abstract public byte[] getMinBytes(); + /** + * Returns the string representation of min for debugging/logging purposes. + */ + public abstract String minAsString(); + + /** + * Returns the string representation of max for debugging/logging purposes. + */ + public abstract String maxAsString(); + /** * Abstract method to return whether the min and max values fit in the given * size. 
@@ -198,11 +270,15 @@ public void mergeStatistics(Statistics stats) { */ abstract public boolean isSmallerThan(long size); - /** - * toString() to display min, max, num_nulls in a string - */ - abstract public String toString(); - + @Override + public String toString() { + if (this.hasNonNullValue()) + return String.format("min: %s, max: %s, num_nulls: %d", minAsString(), maxAsString(), this.getNumNulls()); + else if (!this.isEmpty()) + return String.format("num_nulls: %d, min/max not defined", this.getNumNulls()); + else + return "no stats for this column"; + } /** * Increments the null count by one @@ -250,11 +326,11 @@ public boolean isEmpty() { public boolean hasNonNullValue() { return hasNonNullValue; } - + /** * Sets the page/column as having a valid non-null value * kind of misnomer here - */ + */ protected void markAsNotEmpty() { hasNonNullValue = true; } diff --git a/parquet-column/src/main/java/org/apache/parquet/filter2/predicate/Statistics.java b/parquet-column/src/main/java/org/apache/parquet/filter2/predicate/Statistics.java index 22e4027e3c..db8d85cba7 100644 --- a/parquet-column/src/main/java/org/apache/parquet/filter2/predicate/Statistics.java +++ b/parquet-column/src/main/java/org/apache/parquet/filter2/predicate/Statistics.java @@ -18,6 +18,8 @@ */ package org.apache.parquet.filter2.predicate; +import java.util.Comparator; + import static org.apache.parquet.Preconditions.checkNotNull; /** @@ -26,17 +28,31 @@ public class Statistics { private final T min; private final T max; + private final Comparator comparator; - public Statistics(T min, T max) { + public Statistics(T min, T max, Comparator comparator) { this.min = checkNotNull(min, "min"); this.max = checkNotNull(max, "max"); + this.comparator = checkNotNull(comparator, "comparator"); } + /** + * The self-comparison logic of {@code T} might not proper for the actual logical type (e.g. unsigned int). Use {@link + * #getComparator()} for comparing. 
+ */ public T getMin() { return min; } + /** + * The self-comparison logic of {@code T} might not proper for the actual logical type (e.g. unsigned int). Use {@link + * #getComparator()} for comparing. + */ public T getMax() { return max; } + + public Comparator getComparator() { + return comparator; + } } diff --git a/parquet-column/src/test/java/org/apache/parquet/column/statistics/TestStatistics.java b/parquet-column/src/test/java/org/apache/parquet/column/statistics/TestStatistics.java index 690c7e1730..721e407b14 100644 --- a/parquet-column/src/test/java/org/apache/parquet/column/statistics/TestStatistics.java +++ b/parquet-column/src/test/java/org/apache/parquet/column/statistics/TestStatistics.java @@ -74,6 +74,13 @@ public void testIntMinMax() { assertEquals(statsNeg.getMax(), 54); assertEquals(statsNeg.getMin(), -66); + assertEquals(-1, statsNeg.compareToMax(55)); + assertEquals(0, statsNeg.compareToMax(54)); + assertEquals(1, statsNeg.compareToMax(5)); + assertEquals(-1, statsNeg.compareToMin(0)); + assertEquals(0, statsNeg.compareToMin(-66)); + assertEquals(1, statsNeg.compareToMin(-67)); + // Test converting to and from byte[] byte[] intMaxBytes = statsNeg.getMaxBytes(); byte[] intMinBytes = statsNeg.getMinBytes(); @@ -135,6 +142,13 @@ public void testLongMinMax() { assertEquals(statsNeg.getMax(), 993); assertEquals(statsNeg.getMin(), -9914); + assertEquals(-1, statsNeg.compareToMax(994)); + assertEquals(0, statsNeg.compareToMax(993)); + assertEquals(1, statsNeg.compareToMax(-1000)); + assertEquals(-1, statsNeg.compareToMin(10000)); + assertEquals(0, statsNeg.compareToMin(-9914)); + assertEquals(1, statsNeg.compareToMin(-9915)); + // Test converting to and from byte[] byte[] longMaxBytes = statsNeg.getMaxBytes(); byte[] longMinBytes = statsNeg.getMinBytes(); @@ -196,6 +210,13 @@ public void testFloatMinMax() { assertEquals(statsNeg.getMax(), 0.65f, 1e-10); assertEquals(statsNeg.getMin(), -412.99f, 1e-10); + assertEquals(-1, statsNeg.compareToMax(1)); + 
assertEquals(0, statsNeg.compareToMax(0.65F)); + assertEquals(1, statsNeg.compareToMax(0.649F)); + assertEquals(-1, statsNeg.compareToMin(-412.98F)); + assertEquals(0, statsNeg.compareToMin(-412.99F)); + assertEquals(1, statsNeg.compareToMin(-450)); + // Test converting to and from byte[] byte[] floatMaxBytes = statsNeg.getMaxBytes(); byte[] floatMinBytes = statsNeg.getMinBytes(); @@ -257,6 +278,13 @@ public void testDoubleMinMax() { assertEquals(statsNeg.getMax(), 23.0d, 1e-10); assertEquals(statsNeg.getMin(), -944.5d, 1e-10); + assertEquals(-1, statsNeg.compareToMax(23.0001D)); + assertEquals(0, statsNeg.compareToMax(23D)); + assertEquals(1, statsNeg.compareToMax(0D)); + assertEquals(-1, statsNeg.compareToMin(-400D)); + assertEquals(0, statsNeg.compareToMin(-944.5D)); + assertEquals(1, statsNeg.compareToMin(-944.500001D)); + // Test converting to and from byte[] byte[] doubleMaxBytes = statsNeg.getMaxBytes(); byte[] doubleMinBytes = statsNeg.getMinBytes(); diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/filter2/statisticslevel/StatisticsFilter.java b/parquet-hadoop/src/main/java/org/apache/parquet/filter2/statisticslevel/StatisticsFilter.java index ac7132e74e..b75c47b635 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/filter2/statisticslevel/StatisticsFilter.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/filter2/statisticslevel/StatisticsFilter.java @@ -134,7 +134,7 @@ public > Boolean visit(Eq eq) { } // drop if value < min || value > max - return value.compareTo(stats.genericGetMin()) < 0 || value.compareTo(stats.genericGetMax()) > 0; + return stats.compareToMin(value) > 0 || stats.compareToMax(value) < 0; } @Override @@ -173,7 +173,7 @@ public > Boolean visit(NotEq notEq) { } // drop if this is a column where min = max = value - return value.compareTo(stats.genericGetMin()) == 0 && value.compareTo(stats.genericGetMax()) == 0; + return stats.compareToMin(value) == 0 && stats.compareToMax(value) == 0; } @Override @@ -204,7 
+204,7 @@ public > Boolean visit(Lt lt) { T value = lt.getValue(); // drop if value <= min - return value.compareTo(stats.genericGetMin()) <= 0; + return stats.compareToMin(value) >= 0; } @Override @@ -235,7 +235,7 @@ public > Boolean visit(LtEq ltEq) { T value = ltEq.getValue(); // drop if value < min - return value.compareTo(stats.genericGetMin()) < 0; + return stats.compareToMin(value) > 0; } @Override @@ -266,7 +266,7 @@ public > Boolean visit(Gt gt) { T value = gt.getValue(); // drop if value >= max - return value.compareTo(stats.genericGetMax()) >= 0; + return stats.compareToMax(value) <= 0; } @Override @@ -296,8 +296,8 @@ public > Boolean visit(GtEq gtEq) { T value = gtEq.getValue(); - // drop if value >= max - return value.compareTo(stats.genericGetMax()) > 0; + // drop if value > max + return stats.compareToMax(value) < 0; } @Override @@ -356,7 +356,8 @@ private , U extends UserDefinedPredicate> Boolean vis } org.apache.parquet.filter2.predicate.Statistics udpStats = - new org.apache.parquet.filter2.predicate.Statistics(stats.genericGetMin(), stats.genericGetMax()); + new org.apache.parquet.filter2.predicate.Statistics(stats.genericGetMin(), stats.genericGetMax(), + stats.comparator()); if (inverted) { return udp.inverseCanDrop(udpStats); From 52cd58f6179c85dcae78892d1075c480d0674fac Mon Sep 17 00:00:00 2001 From: Gabor Szadovszky Date: Fri, 10 Nov 2017 15:03:09 +0100 Subject: [PATCH 03/17] PARQUET-1025: Move comparators to Type --- .../apache/parquet/column/Comparators.java | 273 ------------------ .../column/statistics/BinaryStatistics.java | 27 +- .../column/statistics/BooleanStatistics.java | 27 +- .../column/statistics/DoubleStatistics.java | 26 +- .../column/statistics/FloatStatistics.java | 26 +- .../column/statistics/IntStatistics.java | 27 +- .../column/statistics/LongStatistics.java | 22 +- .../parquet/column/statistics/Statistics.java | 81 ++++-- .../parquet/filter2/predicate/Statistics.java | 6 + .../parquet/schema/PrimitiveComparator.java | 
119 ++++++++ .../apache/parquet/schema/PrimitiveType.java | 74 +++++ .../java/org/apache/parquet/schema/Type.java | 8 + 12 files changed, 301 insertions(+), 415 deletions(-) delete mode 100644 parquet-column/src/main/java/org/apache/parquet/column/Comparators.java create mode 100644 parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveComparator.java diff --git a/parquet-column/src/main/java/org/apache/parquet/column/Comparators.java b/parquet-column/src/main/java/org/apache/parquet/column/Comparators.java deleted file mode 100644 index 03956e41b9..0000000000 --- a/parquet-column/src/main/java/org/apache/parquet/column/Comparators.java +++ /dev/null @@ -1,273 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.parquet.column; - -import org.apache.parquet.io.api.Binary; -import org.apache.parquet.schema.OriginalType; -import org.apache.parquet.schema.PrimitiveType; - -import java.util.Comparator; - -/** - * Utility class to provide {@link java.util.Comparator} implementations for logical types. 
- */ -public class Comparators { - - public static class BooleanComparator implements Comparator { - @Override - public int compare(Boolean o1, Boolean o2) { - return compare(o1.booleanValue(), o2.booleanValue()); - } - - public int compare(boolean b1, boolean b2) { - return Boolean.compare(b1, b2); - } - - /** - * Returns the string representation of the specified value for debugging/logging purposes - */ - public String toString(boolean b) { - return Boolean.toString(b); - } - } - - public static class IntComparator implements Comparator { - @Override - public int compare(Integer o1, Integer o2) { - return compare(o1.intValue(), o2.intValue()); - } - - public int compare(int i1, int i2) { - return Integer.compare(i1, i2); - } - - /** - * Returns the string representation of the specified value for debugging/logging purposes - */ - public String toString(int i) { - return Integer.toString(i); - } - } - - public static class LongComparator implements Comparator { - @Override - public int compare(Long o1, Long o2) { - return compare(o1.longValue(), o2.longValue()); - } - - public int compare(long l1, long l2) { - return Long.compare(l1, l2); - } - - /** - * Returns the string representation of the specified value for debugging/logging purposes - */ - public String toString(long l) { - return Long.toString(l); - } - } - - public static class FloatComparator implements Comparator { - @Override - public int compare(Float o1, Float o2) { - return compare(o1.floatValue(), o2.floatValue()); - } - - public int compare(float f1, float f2) { - return Float.compare(f1, f2); - } - - /** - * Returns the string representation of the specified value for debugging/logging purposes - */ - public String toString(float f) { - return String.format("%.5f", f); - } - } - - public static class DoubleComparator implements Comparator { - @Override - public int compare(Double o1, Double o2) { - return compare(o1.doubleValue(), o2.doubleValue()); - } - - public int compare(double d1, double 
d2) { - return Double.compare(d1, d2); - } - - /** - * Returns the string representation of the specified value for debugging/logging purposes - */ - public String toString(double d) { - return String.format("%.5f", d); - } - } - - public static class BinaryComparator implements Comparator { - @Override - public int compare(Binary o1, Binary o2) { - return o1.compareTo(o2); - } - - /** - * Returns the string representation of the specified value for debugging/logging purposes - */ - public String toString(Binary binary) { - return binary.toStringUsingUTF8(); - } - } - - private static final BooleanComparator BOOLEAN_COMPARATOR = new BooleanComparator(); - private static final IntComparator INT_COMPARATOR = new IntComparator(); - private static final LongComparator LONG_COMPARATOR = new LongComparator(); - private static final FloatComparator FLOAT_COMPARATOR = new FloatComparator(); - private static final DoubleComparator DOUBLE_COMPARATOR = new DoubleComparator(); - private static final BinaryComparator BINARY_COMPARATOR = new BinaryComparator(); - - /** - * Returns the proper {@link Comparator} implementation for the specified primitive and logical types. {@code - * logicalType} might be {@code null}. In case of the specification does not allow a logical type for the related - * primitive type, {@code logicalType} must be {@code null}. 
- */ - public static Comparator comparator(PrimitiveType.PrimitiveTypeName type, OriginalType logicalType) { - switch (type) { - case BOOLEAN: - return booleanComparator(logicalType); - case INT32: - return int32Comparator(logicalType); - case INT64: - return int64Comparator(logicalType); - case FLOAT: - return floatComparator(logicalType); - case DOUBLE: - return doubleComparator(logicalType); - case INT96: - case FIXED_LEN_BYTE_ARRAY: - case BINARY: - return binaryComparator(type, logicalType); - default: - throw new UnknownColumnTypeException(type); - } - } - - public static BooleanComparator booleanComparator(OriginalType logicalType) { - if (logicalType != null) - throw new IllegalArgumentException("Invalid logical type for BOOLEAN: " + logicalType); - return BOOLEAN_COMPARATOR; - } - - public static LongComparator int64Comparator(OriginalType logicalType) { - if (logicalType == null) - return LONG_COMPARATOR; - switch (logicalType) { - case INT_64: - case DECIMAL: - case TIME_MICROS: - case TIMESTAMP_MILLIS: - case TIMESTAMP_MICROS: - return LONG_COMPARATOR; - case UINT_64: - // TODO: return unsigned comparator - return LONG_COMPARATOR; - default: - throw new IllegalArgumentException("Invalid logical type for INT64: " + logicalType); - } - } - - public static IntComparator int32Comparator(OriginalType logicalType) { - if (logicalType == null) - return INT_COMPARATOR; - switch (logicalType) { - case INT_8: - case INT_16: - case INT_32: - case DECIMAL: - case DATE: - case TIME_MILLIS: - return INT_COMPARATOR; - case UINT_8: - case UINT_16: - case UINT_32: - // TODO: return unsigned comparator - return INT_COMPARATOR; - default: - throw new IllegalArgumentException("Invalid logical type for INT32: " + logicalType); - } - } - - public static FloatComparator floatComparator(OriginalType logicalType) { - if (logicalType != null) - throw new IllegalArgumentException("Invalid logical type for FLOAT: " + logicalType); - return FLOAT_COMPARATOR; - } - - public static 
DoubleComparator doubleComparator(OriginalType logicalType) { - if (logicalType != null) - throw new IllegalArgumentException("Invalid logical type for DOUBLE: " + logicalType); - return DOUBLE_COMPARATOR; - } - - public static BinaryComparator binaryComparator(PrimitiveType.PrimitiveTypeName type, OriginalType logicalType) { - switch (type) { - case INT96: - if (logicalType == null) - // TODO: what to return here? - return BINARY_COMPARATOR; - break; - case FIXED_LEN_BYTE_ARRAY: - if (logicalType == null) - // TODO: return lexicographical comparator - return BINARY_COMPARATOR; - switch (logicalType) { - case DECIMAL: - // TODO: return signed comparator - return BINARY_COMPARATOR; - case INTERVAL: - // TODO: return lexicographical comparator - return BINARY_COMPARATOR; - } - break; - case BINARY: - if (logicalType == null) - // TODO: return lexicographical comparator - return BINARY_COMPARATOR; - switch (logicalType) { - case UTF8: - case ENUM: - case INTERVAL: - // TODO: return lexicographical comparator - return BINARY_COMPARATOR; - case JSON: - case BSON: - // TODO: Based on specs we do not have ordering for these while we specified lexicographical in ColumnOrder - return BINARY_COMPARATOR; - case DECIMAL: - // TODO: return signed comparator - return BINARY_COMPARATOR; - } - break; - default: - throw new IllegalArgumentException("Not a binary type: " + type); - } - throw new IllegalArgumentException("Invalid logical type for " + type + ": " + logicalType); - } - - private Comparators() { - } -} diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/BinaryStatistics.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/BinaryStatistics.java index 250f15600a..3222a86301 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/BinaryStatistics.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/BinaryStatistics.java @@ -18,27 +18,25 @@ */ package 
org.apache.parquet.column.statistics; -import org.apache.parquet.column.Comparators; import org.apache.parquet.io.api.Binary; -import org.apache.parquet.schema.OriginalType; import org.apache.parquet.schema.PrimitiveType; +import org.apache.parquet.schema.Type; public class BinaryStatistics extends Statistics { - private final Comparators.BinaryComparator comparator; private Binary max; private Binary min; /** - * @deprecated Use {@link Statistics#getStatsBasedOnType(PrimitiveType.PrimitiveTypeName, OriginalType)} instead + * @deprecated Use {@link Statistics#getStatsBasedOnType(Type)} instead */ @Deprecated public BinaryStatistics() { - this(PrimitiveType.PrimitiveTypeName.BINARY, null); + this(new PrimitiveType(Type.Repetition.REQUIRED, PrimitiveType.PrimitiveTypeName.BINARY, "")); } - BinaryStatistics(PrimitiveType.PrimitiveTypeName type, OriginalType logicalType) { - comparator = Comparators.binaryComparator(type, logicalType); + BinaryStatistics(Type type) { + super(type); } @Override @@ -84,13 +82,9 @@ public byte[] getMinBytes() { } @Override - public String minAsString() { - return comparator.toString(min); - } - - @Override - public String maxAsString() { - return comparator.toString(max); + String toString(Binary value) { + // TODO: have separate toString for different logical types? 
+ return value.toStringUsingUTF8(); } @Override @@ -127,11 +121,6 @@ public Binary genericGetMax() { return max; } - @Override - public Comparators.BinaryComparator comparator() { - return comparator; - } - /** * @deprecated use {@link #genericGetMax()}, will be removed in 2.0.0 */ diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/BooleanStatistics.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/BooleanStatistics.java index 074ad0896f..44556576ed 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/BooleanStatistics.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/BooleanStatistics.java @@ -19,26 +19,24 @@ package org.apache.parquet.column.statistics; import org.apache.parquet.bytes.BytesUtils; -import org.apache.parquet.column.Comparators; -import org.apache.parquet.schema.OriginalType; import org.apache.parquet.schema.PrimitiveType; +import org.apache.parquet.schema.Type; public class BooleanStatistics extends Statistics { - private final Comparators.BooleanComparator comparator; private boolean max; private boolean min; /** - * @deprecated Use {@link Statistics#getStatsBasedOnType(PrimitiveType.PrimitiveTypeName, OriginalType)} instead + * @deprecated Use {@link Statistics#getStatsBasedOnType(Type)} instead */ @Deprecated public BooleanStatistics() { - this(null); + this(new PrimitiveType(Type.Repetition.REQUIRED, PrimitiveType.PrimitiveTypeName.BOOLEAN, "")); } - BooleanStatistics(OriginalType logicalType) { - this.comparator = Comparators.booleanComparator(logicalType); + BooleanStatistics(Type type) { + super(type); } @Override @@ -77,16 +75,6 @@ public byte[] getMinBytes() { return BytesUtils.booleanToBytes(min); } - @Override - public String minAsString() { - return comparator.toString(min); - } - - @Override - public String maxAsString() { - return comparator.toString(max); - } - @Override public boolean isSmallerThan(long size) { return 
!hasNonNullValue() || (2 < size); @@ -113,11 +101,6 @@ public Boolean genericGetMax() { return max; } - @Override - public Comparators.BooleanComparator comparator() { - return comparator; - } - public int compareToMin(boolean value) { return comparator.compare(min, value); } diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/DoubleStatistics.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/DoubleStatistics.java index 67c3eaff77..18d594bd74 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/DoubleStatistics.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/DoubleStatistics.java @@ -19,26 +19,24 @@ package org.apache.parquet.column.statistics; import org.apache.parquet.bytes.BytesUtils; -import org.apache.parquet.column.Comparators; -import org.apache.parquet.schema.OriginalType; import org.apache.parquet.schema.PrimitiveType; +import org.apache.parquet.schema.Type; public class DoubleStatistics extends Statistics { - private final Comparators.DoubleComparator comparator; private double max; private double min; /** - * @deprecated Use {@link Statistics#getStatsBasedOnType(PrimitiveType.PrimitiveTypeName, OriginalType)} instead + * @deprecated Use {@link Statistics#getStatsBasedOnType(Type)} instead */ @Deprecated public DoubleStatistics() { - this(null); + this(new PrimitiveType(Type.Repetition.REQUIRED, PrimitiveType.PrimitiveTypeName.DOUBLE, "")); } - DoubleStatistics(OriginalType logicalType) { - this.comparator = Comparators.doubleComparator(logicalType); + DoubleStatistics(Type type) { + super(type); } @Override @@ -78,13 +76,8 @@ public byte[] getMinBytes() { } @Override - public String minAsString() { - return comparator.toString(min); - } - - @Override - public String maxAsString() { - return comparator.toString(max); + String toString(Double value) { + return String.format("%.5f", value); } @Override @@ -113,11 +106,6 @@ public Double genericGetMax() { 
return max; } - @Override - public Comparators.DoubleComparator comparator() { - return comparator; - } - public int compareToMin(double value) { return comparator.compare(min, value); } diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/FloatStatistics.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/FloatStatistics.java index 733ca9321a..34a76dd0ef 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/FloatStatistics.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/FloatStatistics.java @@ -19,26 +19,24 @@ package org.apache.parquet.column.statistics; import org.apache.parquet.bytes.BytesUtils; -import org.apache.parquet.column.Comparators; -import org.apache.parquet.schema.OriginalType; import org.apache.parquet.schema.PrimitiveType; +import org.apache.parquet.schema.Type; public class FloatStatistics extends Statistics { - private final Comparators.FloatComparator comparator; private float max; private float min; /** - * @deprecated Use {@link Statistics#getStatsBasedOnType(PrimitiveType.PrimitiveTypeName, OriginalType)} instead + * @deprecated Use {@link Statistics#getStatsBasedOnType(Type)} instead */ @Deprecated public FloatStatistics() { - this(null); + this(new PrimitiveType(Type.Repetition.REQUIRED, PrimitiveType.PrimitiveTypeName.FLOAT, "")); } - FloatStatistics(OriginalType logicalType) { - this.comparator = Comparators.floatComparator(logicalType); + FloatStatistics(Type type) { + super(type); } @Override @@ -78,13 +76,8 @@ public byte[] getMinBytes() { } @Override - public String minAsString() { - return comparator.toString(min); - } - - @Override - public String maxAsString() { - return comparator.toString(max); + String toString(Float value) { + return String.format("%.5f", value); } @Override @@ -113,11 +106,6 @@ public Float genericGetMax() { return max; } - @Override - public Comparators.FloatComparator comparator() { - return comparator; - } - 
public int compareToMin(float value) { return comparator.compare(min, value); } diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/IntStatistics.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/IntStatistics.java index ee303a4304..f69ef556b8 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/IntStatistics.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/IntStatistics.java @@ -19,26 +19,24 @@ package org.apache.parquet.column.statistics; import org.apache.parquet.bytes.BytesUtils; -import org.apache.parquet.column.Comparators; -import org.apache.parquet.schema.OriginalType; import org.apache.parquet.schema.PrimitiveType; +import org.apache.parquet.schema.Type; public class IntStatistics extends Statistics { - private final Comparators.IntComparator comparator; private int max; private int min; /** - * @deprecated Use {@link Statistics#getStatsBasedOnType(PrimitiveType.PrimitiveTypeName, OriginalType)} instead + * @deprecated Use {@link Statistics#getStatsBasedOnType(Type)} instead */ @Deprecated public IntStatistics() { - this(null); + this(new PrimitiveType(Type.Repetition.REQUIRED, PrimitiveType.PrimitiveTypeName.INT32, "")); } - IntStatistics(OriginalType logicalType) { - comparator = Comparators.int32Comparator(logicalType); + IntStatistics(Type type) { + super(type); } @Override @@ -78,13 +76,9 @@ public byte[] getMinBytes() { } @Override - public String minAsString() { - return comparator.toString(min); - } - - @Override - public String maxAsString() { - return comparator.toString(max); + String toString(Integer value) { + // TODO: implement unsigned int as required + return value.toString(); } @Override @@ -113,11 +107,6 @@ public Integer genericGetMax() { return max; } - @Override - public Comparators.IntComparator comparator() { - return comparator; - } - public int compareToMin(int value) { return comparator.compare(min, value); } diff --git 
a/parquet-column/src/main/java/org/apache/parquet/column/statistics/LongStatistics.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/LongStatistics.java index e521110623..fbfc3a2930 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/LongStatistics.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/LongStatistics.java @@ -21,26 +21,24 @@ import it.unimi.dsi.fastutil.longs.LongComparator; import it.unimi.dsi.fastutil.longs.LongComparators; import org.apache.parquet.bytes.BytesUtils; -import org.apache.parquet.column.Comparators; -import org.apache.parquet.schema.OriginalType; import org.apache.parquet.schema.PrimitiveType; +import org.apache.parquet.schema.Type; public class LongStatistics extends Statistics { - private final Comparators.LongComparator comparator; private long max; private long min; /** - * @deprecated Use {@link Statistics#getStatsBasedOnType(PrimitiveType.PrimitiveTypeName, OriginalType)} instead + * @deprecated Use {@link Statistics#getStatsBasedOnType(Type)} instead */ @Deprecated public LongStatistics() { - this(null); + this(new PrimitiveType(Type.Repetition.REQUIRED, PrimitiveType.PrimitiveTypeName.INT64, "")); } - LongStatistics(OriginalType logicalType) { - this.comparator = Comparators.int64Comparator(logicalType); + LongStatistics(Type type) { + super(type); } @Override @@ -80,13 +78,9 @@ public byte[] getMinBytes() { } @Override - public String minAsString() { - return comparator.toString(min); - } - - @Override - public String maxAsString() { - return comparator.toString(max); + String toString(Long value) { + // TODO: implement unsigned int as required + return value.toString(); } @Override diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java index 5a3d036b1c..e332f50489 100644 --- 
a/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java @@ -20,8 +20,10 @@ import org.apache.parquet.column.UnknownColumnTypeException; import org.apache.parquet.io.api.Binary; -import org.apache.parquet.schema.OriginalType; +import org.apache.parquet.schema.PrimitiveComparator; import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName; +import org.apache.parquet.schema.Type; + import java.util.Arrays; import java.util.Comparator; @@ -31,21 +33,23 @@ * * @author Katya Gonina */ -public abstract class Statistics { +public abstract class Statistics> { + final PrimitiveComparator comparator; private boolean hasNonNullValue; private long num_nulls; - public Statistics() { + Statistics(Type type) { hasNonNullValue = false; num_nulls = 0; + this.comparator = type.comparator(); } /** * Returns the typed statistics object based on the passed type parameter * @param type PrimitiveTypeName type of the column * @return instance of a typed statistics class - * @deprecated Use {@link #getStatsBasedOnType(PrimitiveTypeName, OriginalType)} instead + * @deprecated Use {@link #getStatsBasedOnType(Type)} instead */ @Deprecated public static Statistics getStatsBasedOnType(PrimitiveTypeName type) { @@ -71,24 +75,31 @@ public static Statistics getStatsBasedOnType(PrimitiveTypeName type) { } } - public static Statistics getStatsBasedOnType(PrimitiveTypeName type, OriginalType logicalType) { - switch(type) { + /** + * Returns the typed statistics object based on the passed type parameter + * + * @param type type of the column + * @return instance of a typed statistics class + */ + public static Statistics getStatsBasedOnType(Type type) { + PrimitiveTypeName primitive = type.asPrimitiveType().getPrimitiveTypeName(); + switch (primitive) { case INT32: - return new IntStatistics(logicalType); + return new IntStatistics(type); case INT64: - return new 
LongStatistics(logicalType); + return new LongStatistics(type); case FLOAT: - return new FloatStatistics(logicalType); + return new FloatStatistics(type); case DOUBLE: - return new DoubleStatistics(logicalType); + return new DoubleStatistics(type); case BOOLEAN: - return new BooleanStatistics(logicalType); + return new BooleanStatistics(type); case BINARY: case INT96: case FIXED_LEN_BYTE_ARRAY: - return new BinaryStatistics(type, logicalType); + return new BinaryStatistics(type); default: - throw new UnknownColumnTypeException(type); + throw new UnknownColumnTypeException(primitive); } } @@ -201,43 +212,45 @@ public void mergeStatistics(Statistics stats) { abstract public void setMinMaxFromBytes(byte[] minBytes, byte[] maxBytes); /** - * Returns the generic object representing the min value in the statistics. The self comparing logic of the returned - * object might not be the proper one (e.g. unsigned comparison for int/long) therefore it is strongly recommended to - * use the related comparing method {@link #compareToMin(Object)} or the comparator returned by {@link - * #comparator()}. + * Returns the min value in the statistics. The java natural order of the returned type defined by {@link + * T#compareTo(Object)} might not be the proper one. For example, UINT_32 requires unsigned comparison instead of the + * natural signed one. Use {@link #compareToMin(Comparable)} or the comparator returned by {@link #comparator()} to + * always get the proper ordering. */ abstract public T genericGetMin(); /** - * Returns the generic object representing the max value in the statistics. The self comparing logic of the returned - * object might not be the proper one (e.g. unsigned comparison for int/long) therefore it is strongly recommended to - * use the related comparing method {@link #compareToMax(Object)} or the comparator returned by {@link - * #comparator()}. + * Returns the max value in the statistics. 
The java natural order of the returned type defined by {@link + * T#compareTo(Object)} might not be the proper one. For example, UINT_32 requires unsigned comparison instead of the + * natural signed one. Use {@link #compareToMax(Comparable)} or the comparator returned by {@link #comparator()} to + * always get the proper ordering. */ abstract public T genericGetMax(); /** - * Returns the comparator to be used to compare two generic values in the proper way (e.g. unsigned comparison for - * int/long) + * Returns the comparator to be used to compare two generic values in the proper way (for example, unsigned comparison + * for UINT_32). */ - public abstract Comparator comparator(); + public Comparator comparator() { + return comparator; + } /** * Compares the specified value to min in the proper way. * - * @see {@link Comparable#compareTo(Object)} + * @see Comparable#compareTo(Object) */ public int compareToMin(T value) { - return comparator().compare(genericGetMin(), value); + return comparator.compare(genericGetMin(), value); } /** * Compares the specified value to max in the proper way. * - * @see {@link Comparable#compareTo(Object)} + * @see Comparable#compareTo(Object) */ public int compareToMax(T value) { - return comparator().compare(genericGetMax(), value); + return comparator.compare(genericGetMax(), value); } /** @@ -255,12 +268,20 @@ public int compareToMax(T value) { /** * Returns the string representation of min for debugging/logging purposes. */ - public abstract String minAsString(); + public String minAsString() { + return toString(genericGetMin()); + } /** * Returns the string representation of max for debugging/logging purposes. 
*/ - public abstract String maxAsString(); + public String maxAsString() { + return toString(genericGetMax()); + } + + String toString(T value) { + return value.toString(); + } /** * Abstract method to return whether the min and max values fit in the given diff --git a/parquet-column/src/main/java/org/apache/parquet/filter2/predicate/Statistics.java b/parquet-column/src/main/java/org/apache/parquet/filter2/predicate/Statistics.java index db8d85cba7..92358a7702 100644 --- a/parquet-column/src/main/java/org/apache/parquet/filter2/predicate/Statistics.java +++ b/parquet-column/src/main/java/org/apache/parquet/filter2/predicate/Statistics.java @@ -37,6 +37,7 @@ public Statistics(T min, T max, Comparator comparator) { } /** + * Returns the generic object representing the min value in the statistics. * The self-comparison logic of {@code T} might not proper for the actual logical type (e.g. unsigned int). Use {@link * #getComparator()} for comparing. */ @@ -45,6 +46,7 @@ public T getMin() { } /** + * Returns the generic object representing the max value in the statistics. * The self-comparison logic of {@code T} might not proper for the actual logical type (e.g. unsigned int). Use {@link * #getComparator()} for comparing. */ @@ -52,6 +54,10 @@ public T getMax() { return max; } + /** + * Returns the comparator to be used to compare two generic values in the proper way (e.g. unsigned comparison for + * UINT_32) + */ public Comparator getComparator() { return comparator; } diff --git a/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveComparator.java b/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveComparator.java new file mode 100644 index 0000000000..79eff9b9bb --- /dev/null +++ b/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveComparator.java @@ -0,0 +1,119 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.schema; + +import org.apache.parquet.io.api.Binary; + +import java.util.Comparator; + +/** + * {@link Comparator} implementation that also supports the comparison of the related primitive type to avoid the + * performance penalty of boxing/unboxing. The {@code compare} methods for the not supported primitive types throw + * {@link UnsupportedOperationException}. 
+ */ +public abstract class PrimitiveComparator implements Comparator { + + public int compare(boolean b1, boolean b2) { + throw new UnsupportedOperationException(); + } + + public int compare(int i1, int i2) { + throw new UnsupportedOperationException(); + } + + public int compare(long l1, long l2) { + throw new UnsupportedOperationException(); + } + + public int compare(float f1, float f2) { + throw new UnsupportedOperationException(); + } + + public int compare(double d1, double d2) { + throw new UnsupportedOperationException(); + } + + static PrimitiveComparator BOOLEAN_COMPARATOR = new PrimitiveComparator() { + @Override + public int compare(Boolean o1, Boolean o2) { + return compare(o1.booleanValue(), o2.booleanValue()); + } + + @Override + public int compare(boolean b1, boolean b2) { + return Boolean.compare(b1, b2); + } + }; + + static PrimitiveComparator SIGNED_INT32_COMPARATOR = new PrimitiveComparator() { + @Override + public int compare(Integer o1, Integer o2) { + return compare(o1.intValue(), o2.intValue()); + } + + @Override + public int compare(int i1, int i2) { + return Integer.compare(i1, i2); + } + }; + + static PrimitiveComparator SIGNED_INT64_COMPARATOR = new PrimitiveComparator() { + @Override + public int compare(Long o1, Long o2) { + return compare(o1.longValue(), o2.longValue()); + } + + @Override + public int compare(long l1, long l2) { + return Long.compare(l1, l2); + } + }; + + static PrimitiveComparator FLOAT_COMPARATOR = new PrimitiveComparator() { + @Override + public int compare(Float o1, Float o2) { + return compare(o1.floatValue(), o2.floatValue()); + } + + @Override + public int compare(float f1, float f2) { + return Float.compare(f1, f2); + } + }; + + static PrimitiveComparator DOUBLE_COMPARATOR = new PrimitiveComparator() { + @Override + public int compare(Double o1, Double o2) { + return compare(o1.doubleValue(), o2.doubleValue()); + } + + @Override + public int compare(double d1, double d2) { + return Double.compare(d1, d2); + 
} + }; + + // TODO: this one is temporary as the self-comparison of Binary is not proper + static PrimitiveComparator BINARY_COMPARATOR = new PrimitiveComparator() { + @Override + public int compare(Binary o1, Binary o2) { + return o1.compareTo(o2); + } + }; +} diff --git a/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveType.java b/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveType.java index 8056188d25..3c91383179 100644 --- a/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveType.java +++ b/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveType.java @@ -86,6 +86,14 @@ public void addValueToPrimitiveConverter( public T convert(PrimitiveTypeNameConverter converter) throws E { return converter.convertINT64(this); } + + @Override + PrimitiveComparator comparator(OriginalType logicalType) { + if (logicalType == OriginalType.UINT_64) + // TODO: return unsigned comparator + return PrimitiveComparator.SIGNED_INT64_COMPARATOR; + return PrimitiveComparator.SIGNED_INT64_COMPARATOR; + } }, INT32("getInteger", Integer.TYPE) { @Override @@ -109,6 +117,19 @@ public void addValueToPrimitiveConverter( public T convert(PrimitiveTypeNameConverter converter) throws E { return converter.convertINT32(this); } + + @Override + PrimitiveComparator comparator(OriginalType logicalType) { + if (logicalType != null) + switch (logicalType) { + case UINT_8: + case UINT_16: + case UINT_32: + // TODO: return unsigned comparator + return PrimitiveComparator.SIGNED_INT32_COMPARATOR; + } + return PrimitiveComparator.SIGNED_INT32_COMPARATOR; + } }, BOOLEAN("getBoolean", Boolean.TYPE) { @Override @@ -132,6 +153,11 @@ public void addValueToPrimitiveConverter( public T convert(PrimitiveTypeNameConverter converter) throws E { return converter.convertBOOLEAN(this); } + + @Override + PrimitiveComparator comparator(OriginalType logicalType) { + return PrimitiveComparator.BOOLEAN_COMPARATOR; + } }, BINARY("getBinary", Binary.class) { @Override 
@@ -155,6 +181,22 @@ public void addValueToPrimitiveConverter( public T convert(PrimitiveTypeNameConverter converter) throws E { return converter.convertBINARY(this); } + + @Override + PrimitiveComparator comparator(OriginalType logicalType) { + if (logicalType != null) + switch (logicalType) { + case JSON: + case BSON: + // TODO: Based on specs we do not have ordering for these while we specified lexicographical in ColumnOrder + return PrimitiveComparator.BINARY_COMPARATOR; + case DECIMAL: + // TODO: return signed comparator + return PrimitiveComparator.BINARY_COMPARATOR; + } + // TODO: return lexicographical comparator + return PrimitiveComparator.BINARY_COMPARATOR; + } }, FLOAT("getFloat", Float.TYPE) { @Override @@ -178,6 +220,11 @@ public void addValueToPrimitiveConverter( public T convert(PrimitiveTypeNameConverter converter) throws E { return converter.convertFLOAT(this); } + + @Override + PrimitiveComparator comparator(OriginalType logicalType) { + return PrimitiveComparator.FLOAT_COMPARATOR; + } }, DOUBLE("getDouble", Double.TYPE) { @Override @@ -201,6 +248,11 @@ public void addValueToPrimitiveConverter( public T convert(PrimitiveTypeNameConverter converter) throws E { return converter.convertDOUBLE(this); } + + @Override + PrimitiveComparator comparator(OriginalType logicalType) { + return PrimitiveComparator.DOUBLE_COMPARATOR; + } }, INT96("getBinary", Binary.class) { @Override @@ -222,6 +274,12 @@ public void addValueToPrimitiveConverter( public T convert(PrimitiveTypeNameConverter converter) throws E { return converter.convertINT96(this); } + + @Override + PrimitiveComparator comparator(OriginalType logicalType) { + // TODO: what to return here? 
+ return PrimitiveComparator.BINARY_COMPARATOR; + } }, FIXED_LEN_BYTE_ARRAY("getBinary", Binary.class) { @Override @@ -245,6 +303,15 @@ public void addValueToPrimitiveConverter( public T convert(PrimitiveTypeNameConverter converter) throws E { return converter.convertFIXED_LEN_BYTE_ARRAY(this); } + + @Override + PrimitiveComparator comparator(OriginalType logicalType) { + if (logicalType == OriginalType.DECIMAL) + // TODO: return signed comparator + return PrimitiveComparator.BINARY_COMPARATOR; + // TODO: return lexicographical comparator + return PrimitiveComparator.BINARY_COMPARATOR; + } }; public final String getMethod; @@ -275,6 +342,8 @@ abstract public void addValueToPrimitiveConverter( abstract public T convert(PrimitiveTypeNameConverter converter) throws E; + abstract PrimitiveComparator comparator(OriginalType logicalType); + } private final PrimitiveTypeName primitive; @@ -547,4 +616,9 @@ protected Type union(Type toMerge, boolean strict) { return builder.as(getOriginalType()).named(getName()); } + + @Override + public PrimitiveComparator comparator() { + return getPrimitiveTypeName().comparator(getOriginalType()); + } } diff --git a/parquet-column/src/main/java/org/apache/parquet/schema/Type.java b/parquet-column/src/main/java/org/apache/parquet/schema/Type.java index 176b9a6e27..a5f7c59e70 100644 --- a/parquet-column/src/main/java/org/apache/parquet/schema/Type.java +++ b/parquet-column/src/main/java/org/apache/parquet/schema/Type.java @@ -324,4 +324,12 @@ void checkContains(Type subType) { */ abstract T convert(List path, TypeConverter converter); + /** + * Returns the {@link Type} specific comparator for properly comparing values. The natural ordering of the values + * might not be proper in certain cases (e.g. {@code UINT_32} requires unsigned comparison of {@code int} values while + * the natural ordering is signed.)
+ */ + public PrimitiveComparator comparator() { + throw new UnsupportedOperationException("No comparator is implemented for type: " + this); + } } From 20b937f465cf2fae16fb25a78a233c86cac5e015 Mon Sep 17 00:00:00 2001 From: Gabor Szadovszky Date: Thu, 16 Nov 2017 11:05:59 +0100 Subject: [PATCH 04/17] PARQUET-1025: reading/writing new min-max statistics; use the comparators as needed --- .../cli/commands/CheckParquet251Command.java | 4 +- .../parquet/column/ColumnDescriptor.java | 41 ++++++--- .../parquet/column/impl/ColumnWriterV1.java | 2 +- .../parquet/column/impl/ColumnWriterV2.java | 2 +- .../column/statistics/BinaryStatistics.java | 14 ++- .../column/statistics/BooleanStatistics.java | 17 ++-- .../column/statistics/DoubleStatistics.java | 17 ++-- .../column/statistics/FloatStatistics.java | 17 ++-- .../column/statistics/IntStatistics.java | 17 ++-- .../column/statistics/LongStatistics.java | 24 ++---- .../parquet/column/statistics/Statistics.java | 85 ++++++++++++------- ...allyUpdatedFilterPredicateBuilderBase.java | 19 +++++ .../apache/parquet/io/MessageColumnIO.java | 2 +- .../apache/parquet/io/PrimitiveColumnIO.java | 7 +- .../org/apache/parquet/io/api/Binary.java | 7 ++ .../apache/parquet/schema/MessageType.java | 6 +- .../parquet/schema/PrimitiveComparator.java | 24 ++++-- .../apache/parquet/schema/PrimitiveType.java | 19 +++-- ...ntallyUpdatedFilterPredicateGenerator.java | 43 +++++----- .../dictionarylevel/DictionaryFilter.java | 15 ++-- .../converter/ParquetMetadataConverter.java | 55 ++++++++---- .../hadoop/ColumnChunkPageWriteStore.java | 20 +++-- .../parquet/hadoop/ParquetFileWriter.java | 21 +++-- .../hadoop/metadata/ColumnChunkMetaData.java | 30 ++++++- .../metadata/ColumnChunkProperties.java | 17 +++- .../TestParquetMetadataConverter.java | 10 ++- .../parquet/statistics/TestStatistics.java | 11 ++- 27 files changed, 343 insertions(+), 203 deletions(-) diff --git 
a/parquet-cli/src/main/java/org/apache/parquet/cli/commands/CheckParquet251Command.java b/parquet-cli/src/main/java/org/apache/parquet/cli/commands/CheckParquet251Command.java index 3d7d283899..fbeebdfba6 100644 --- a/parquet-cli/src/main/java/org/apache/parquet/cli/commands/CheckParquet251Command.java +++ b/parquet-cli/src/main/java/org/apache/parquet/cli/commands/CheckParquet251Command.java @@ -346,8 +346,8 @@ private void validateStatsForPage(DataPage page, DictionaryPage dict, console.debug(String.format( "Validated stats min=%s max=%s nulls=%d for page=%s col=%s", - String.valueOf(stats.genericGetMin()), - String.valueOf(stats.genericGetMax()), stats.getNumNulls(), page, + stats.minAsString(), + stats.maxAsString(), stats.getNumNulls(), page, Arrays.toString(desc.getPath()))); } } diff --git a/parquet-column/src/main/java/org/apache/parquet/column/ColumnDescriptor.java b/parquet-column/src/main/java/org/apache/parquet/column/ColumnDescriptor.java index 61f13a2740..8c9ba46989 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/ColumnDescriptor.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/ColumnDescriptor.java @@ -18,9 +18,12 @@ */ package org.apache.parquet.column; -import java.util.Arrays; - +import org.apache.parquet.example.data.simple.Primitive; +import org.apache.parquet.schema.PrimitiveType; import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName; +import org.apache.parquet.schema.Type; + +import java.util.Arrays; /** * Describes a column's type as well as its position in its containing schema. 
@@ -31,8 +34,7 @@ public class ColumnDescriptor implements Comparable { private final String[] path; - private final PrimitiveTypeName type; - private final int typeLength; + private final PrimitiveType type; private final int maxRep; private final int maxDef; @@ -42,8 +44,10 @@ public class ColumnDescriptor implements Comparable { * @param type the type of the field * @param maxRep the maximum repetition level for that path * @param maxDef the maximum definition level for that path + * @deprecated Use {@link #ColumnDescriptor(String[], PrimitiveTypeName, int, int)} */ - public ColumnDescriptor(String[] path, PrimitiveTypeName type, int maxRep, + @Deprecated + public ColumnDescriptor(String[] path, PrimitiveTypeName type, int maxRep, int maxDef) { this(path, type, 0, maxRep, maxDef); } @@ -54,13 +58,23 @@ public ColumnDescriptor(String[] path, PrimitiveTypeName type, int maxRep, * @param type the type of the field * @param maxRep the maximum repetition level for that path * @param maxDef the maximum definition level for that path + * @deprecated Use {@link #ColumnDescriptor(String[], PrimitiveTypeName, int, int)} */ - public ColumnDescriptor(String[] path, PrimitiveTypeName type, + @Deprecated + public ColumnDescriptor(String[] path, PrimitiveTypeName type, int typeLength, int maxRep, int maxDef) { - super(); + this(path, new PrimitiveType(Type.Repetition.OPTIONAL, type, typeLength,""), maxRep, maxDef); + } + + /** + * @param path the path to the leaf field in the schema + * @param type the type of the field + * @param maxRep the maximum repetition level for that path + * @param maxDef the maximum definition level for that path + */ + public ColumnDescriptor(String[] path, PrimitiveType type, int maxRep, int maxDef) { this.path = path; this.type = type; - this.typeLength = typeLength; this.maxRep = maxRep; this.maxDef = maxDef; } @@ -90,14 +104,21 @@ public int getMaxDefinitionLevel() { * @return the type of that column */ public PrimitiveTypeName getType() { - 
return type; + return type.getPrimitiveTypeName(); } /** * @return the size of the type **/ public int getTypeLength() { - return typeLength; + return type.getTypeLength(); + } + + /** + * Returns the full type object of the column + */ + public PrimitiveType getFullType() { + return type; } @Override diff --git a/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterV1.java b/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterV1.java index c5b3884194..8e975aaaa1 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterV1.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterV1.java @@ -80,7 +80,7 @@ private void log(Object value, int r, int d) { } private void resetStatistics() { - this.statistics = Statistics.getStatsBasedOnType(this.path.getType()); + this.statistics = Statistics.createLegacyStats(this.path.getType()); } /** diff --git a/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterV2.java b/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterV2.java index c6fd91b5eb..a68fb6c3d1 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterV2.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterV2.java @@ -77,7 +77,7 @@ private void log(Object value, int r, int d) { } private void resetStatistics() { - this.statistics = Statistics.getStatsBasedOnType(this.path.getType()); + this.statistics = Statistics.createStats(path.getFullType()); } private void definitionLevel(int definitionLevel) { diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/BinaryStatistics.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/BinaryStatistics.java index 3222a86301..7c8e30d397 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/BinaryStatistics.java +++ 
b/parquet-column/src/main/java/org/apache/parquet/column/statistics/BinaryStatistics.java @@ -27,16 +27,12 @@ public class BinaryStatistics extends Statistics { private Binary max; private Binary min; - /** - * @deprecated Use {@link Statistics#getStatsBasedOnType(Type)} instead - */ - @Deprecated public BinaryStatistics() { - this(new PrimitiveType(Type.Repetition.REQUIRED, PrimitiveType.PrimitiveTypeName.BINARY, "")); + super(); } BinaryStatistics(Type type) { - super(type); + super(type.comparator()); } @Override @@ -84,7 +80,7 @@ public byte[] getMinBytes() { @Override String toString(Binary value) { // TODO: have separate toString for different logical types? - return value.toStringUsingUTF8(); + return value == null ? "null" : value.toStringUsingUTF8(); } @Override @@ -97,8 +93,8 @@ public boolean isSmallerThan(long size) { */ @Deprecated public void updateStats(Binary min_value, Binary max_value) { - if (comparator.compare(min, min_value) > 0) { min = min_value.copy(); } - if (comparator.compare(max, max_value) < 0) { max = max_value.copy(); } + if (comparator().compare(min, min_value) > 0) { min = min_value.copy(); } + if (comparator().compare(max, max_value) < 0) { max = max_value.copy(); } } /** diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/BooleanStatistics.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/BooleanStatistics.java index 44556576ed..6f893db64c 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/BooleanStatistics.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/BooleanStatistics.java @@ -19,7 +19,6 @@ package org.apache.parquet.column.statistics; import org.apache.parquet.bytes.BytesUtils; -import org.apache.parquet.schema.PrimitiveType; import org.apache.parquet.schema.Type; public class BooleanStatistics extends Statistics { @@ -27,16 +26,12 @@ public class BooleanStatistics extends Statistics { private boolean max; private 
boolean min; - /** - * @deprecated Use {@link Statistics#getStatsBasedOnType(Type)} instead - */ - @Deprecated public BooleanStatistics() { - this(new PrimitiveType(Type.Repetition.REQUIRED, PrimitiveType.PrimitiveTypeName.BOOLEAN, "")); + super(); } BooleanStatistics(Type type) { - super(type); + super(type.comparator()); } @Override @@ -81,8 +76,8 @@ public boolean isSmallerThan(long size) { } public void updateStats(boolean min_value, boolean max_value) { - if (comparator.compare(min, min_value) > 0) { min = min_value; } - if (comparator.compare(max, max_value) < 0) { max = max_value; } + if (comparator().compare(min, min_value) > 0) { min = min_value; } + if (comparator().compare(max, max_value) < 0) { max = max_value; } } public void initializeStats(boolean min_value, boolean max_value) { @@ -102,11 +97,11 @@ public Boolean genericGetMax() { } public int compareToMin(boolean value) { - return comparator.compare(min, value); + return comparator().compare(min, value); } public int compareToMax(boolean value) { - return comparator.compare(max, value); + return comparator().compare(max, value); } public boolean getMax() { diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/DoubleStatistics.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/DoubleStatistics.java index 18d594bd74..b1cf3f0958 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/DoubleStatistics.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/DoubleStatistics.java @@ -19,7 +19,6 @@ package org.apache.parquet.column.statistics; import org.apache.parquet.bytes.BytesUtils; -import org.apache.parquet.schema.PrimitiveType; import org.apache.parquet.schema.Type; public class DoubleStatistics extends Statistics { @@ -27,16 +26,12 @@ public class DoubleStatistics extends Statistics { private double max; private double min; - /** - * @deprecated Use {@link Statistics#getStatsBasedOnType(Type)} instead - */ - 
@Deprecated public DoubleStatistics() { - this(new PrimitiveType(Type.Repetition.REQUIRED, PrimitiveType.PrimitiveTypeName.DOUBLE, "")); + super(); } DoubleStatistics(Type type) { - super(type); + super(type.comparator()); } @Override @@ -86,8 +81,8 @@ public boolean isSmallerThan(long size) { } public void updateStats(double min_value, double max_value) { - if (comparator.compare(min, min_value) > 0) { min = min_value; } - if (comparator.compare(max, max_value) < 0) { max = max_value; } + if (comparator().compare(min, min_value) > 0) { min = min_value; } + if (comparator().compare(max, max_value) < 0) { max = max_value; } } public void initializeStats(double min_value, double max_value) { @@ -107,11 +102,11 @@ public Double genericGetMax() { } public int compareToMin(double value) { - return comparator.compare(min, value); + return comparator().compare(min, value); } public int compareToMax(double value) { - return comparator.compare(max, value); + return comparator().compare(max, value); } public double getMax() { diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/FloatStatistics.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/FloatStatistics.java index 34a76dd0ef..6fcf3df65a 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/FloatStatistics.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/FloatStatistics.java @@ -19,7 +19,6 @@ package org.apache.parquet.column.statistics; import org.apache.parquet.bytes.BytesUtils; -import org.apache.parquet.schema.PrimitiveType; import org.apache.parquet.schema.Type; public class FloatStatistics extends Statistics { @@ -27,16 +26,12 @@ public class FloatStatistics extends Statistics { private float max; private float min; - /** - * @deprecated Use {@link Statistics#getStatsBasedOnType(Type)} instead - */ - @Deprecated public FloatStatistics() { - this(new PrimitiveType(Type.Repetition.REQUIRED, 
PrimitiveType.PrimitiveTypeName.FLOAT, "")); + super(); } FloatStatistics(Type type) { - super(type); + super(type.comparator()); } @Override @@ -86,8 +81,8 @@ public boolean isSmallerThan(long size) { } public void updateStats(float min_value, float max_value) { - if (comparator.compare(min, min_value) > 0) { min = min_value; } - if (comparator.compare(max, max_value) < 0) { max = max_value; } + if (comparator().compare(min, min_value) > 0) { min = min_value; } + if (comparator().compare(max, max_value) < 0) { max = max_value; } } public void initializeStats(float min_value, float max_value) { @@ -107,11 +102,11 @@ public Float genericGetMax() { } public int compareToMin(float value) { - return comparator.compare(min, value); + return comparator().compare(min, value); } public int compareToMax(float value) { - return comparator.compare(max, value); + return comparator().compare(max, value); } public float getMax() { diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/IntStatistics.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/IntStatistics.java index f69ef556b8..fd3063ee38 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/IntStatistics.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/IntStatistics.java @@ -19,7 +19,6 @@ package org.apache.parquet.column.statistics; import org.apache.parquet.bytes.BytesUtils; -import org.apache.parquet.schema.PrimitiveType; import org.apache.parquet.schema.Type; public class IntStatistics extends Statistics { @@ -27,16 +26,12 @@ public class IntStatistics extends Statistics { private int max; private int min; - /** - * @deprecated Use {@link Statistics#getStatsBasedOnType(Type)} instead - */ - @Deprecated public IntStatistics() { - this(new PrimitiveType(Type.Repetition.REQUIRED, PrimitiveType.PrimitiveTypeName.INT32, "")); + super(); } IntStatistics(Type type) { - super(type); + super(type.comparator()); } @Override @@ 
-87,8 +82,8 @@ public boolean isSmallerThan(long size) { } public void updateStats(int min_value, int max_value) { - if (comparator.compare(min, min_value) > 0) { min = min_value; } - if (comparator.compare(max, max_value) < 0) { max = max_value; } + if (comparator().compare(min, min_value) > 0) { min = min_value; } + if (comparator().compare(max, max_value) < 0) { max = max_value; } } public void initializeStats(int min_value, int max_value) { @@ -108,11 +103,11 @@ public Integer genericGetMax() { } public int compareToMin(int value) { - return comparator.compare(min, value); + return comparator().compare(min, value); } public int compareToMax(int value) { - return comparator.compare(max, value); + return comparator().compare(max, value); } public int getMax() { diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/LongStatistics.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/LongStatistics.java index fbfc3a2930..1102574339 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/LongStatistics.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/LongStatistics.java @@ -18,10 +18,7 @@ */ package org.apache.parquet.column.statistics; -import it.unimi.dsi.fastutil.longs.LongComparator; -import it.unimi.dsi.fastutil.longs.LongComparators; import org.apache.parquet.bytes.BytesUtils; -import org.apache.parquet.schema.PrimitiveType; import org.apache.parquet.schema.Type; public class LongStatistics extends Statistics { @@ -29,16 +26,12 @@ public class LongStatistics extends Statistics { private long max; private long min; - /** - * @deprecated Use {@link Statistics#getStatsBasedOnType(Type)} instead - */ - @Deprecated public LongStatistics() { - this(new PrimitiveType(Type.Repetition.REQUIRED, PrimitiveType.PrimitiveTypeName.INT64, "")); + super(); } LongStatistics(Type type) { - super(type); + super(type.comparator()); } @Override @@ -89,8 +82,8 @@ public boolean 
isSmallerThan(long size) { } public void updateStats(long min_value, long max_value) { - if (comparator.compare(min, min_value) > 0) { min = min_value; } - if (comparator.compare(max, max_value) < 0) { max = max_value; } + if (comparator().compare(min, min_value) > 0) { min = min_value; } + if (comparator().compare(max, max_value) < 0) { max = max_value; } } public void initializeStats(long min_value, long max_value) { @@ -109,17 +102,12 @@ public Long genericGetMax() { return max; } - @Override - public LongComparator comparator() { - return LongComparators.NATURAL_COMPARATOR; - } - public int compareToMin(long value) { - return comparator.compare(min, value); + return comparator().compare(min, value); } public int compareToMax(long value) { - return comparator.compare(max, value); + return comparator().compare(max, value); } public long getMax() { diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java index e332f50489..ce27be511e 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java @@ -18,6 +18,7 @@ */ package org.apache.parquet.column.statistics; +import org.apache.parquet.ShouldNeverHappenException; import org.apache.parquet.column.UnknownColumnTypeException; import org.apache.parquet.io.api.Binary; import org.apache.parquet.schema.PrimitiveComparator; @@ -26,6 +27,7 @@ import java.util.Arrays; import java.util.Comparator; +import java.util.Objects; /** @@ -33,55 +35,71 @@ * * @author Katya Gonina */ -public abstract class Statistics> { +public abstract class Statistics> implements Cloneable { - final PrimitiveComparator comparator; + private final PrimitiveComparator comparator; private boolean hasNonNullValue; private long num_nulls; - Statistics(Type type) { + Statistics() { + 
this(PrimitiveComparator.comparableComparator()); + } + + Statistics(PrimitiveComparator comparator) { hasNonNullValue = false; num_nulls = 0; - this.comparator = type.comparator(); + this.comparator = comparator; } /** * Returns the typed statistics object based on the passed type parameter * @param type PrimitiveTypeName type of the column * @return instance of a typed statistics class - * @deprecated Use {@link #getStatsBasedOnType(Type)} instead + * @deprecated Use {@link #createStats(Type)} or {@link #createLegacyStats(PrimitiveTypeName)} instead */ @Deprecated public static Statistics getStatsBasedOnType(PrimitiveTypeName type) { switch(type) { - case INT32: - return new IntStatistics(); - case INT64: - return new LongStatistics(); - case FLOAT: - return new FloatStatistics(); - case DOUBLE: - return new DoubleStatistics(); - case BOOLEAN: - return new BooleanStatistics(); - case BINARY: - return new BinaryStatistics(); - case INT96: - return new BinaryStatistics(); - case FIXED_LEN_BYTE_ARRAY: - return new BinaryStatistics(); - default: - throw new UnknownColumnTypeException(type); + case INT32: + return new IntStatistics(); + case INT64: + return new LongStatistics(); + case FLOAT: + return new FloatStatistics(); + case DOUBLE: + return new DoubleStatistics(); + case BOOLEAN: + return new BooleanStatistics(); + case BINARY: + return new BinaryStatistics(); + case INT96: + return new BinaryStatistics(); + case FIXED_LEN_BYTE_ARRAY: + return new BinaryStatistics(); + default: + throw new UnknownColumnTypeException(type); } } /** - * Returns the typed statistics object based on the passed type parameter + * Creates an empty {@code Statistics} instance for the specified type to be used for reading/writing the legacy + * min/max statistics. 
* * @param type type of the column * @return instance of a typed statistics class */ - public static Statistics getStatsBasedOnType(Type type) { + public static Statistics createLegacyStats(PrimitiveTypeName type) { + return getStatsBasedOnType(type); + } + + /** + * Creates an empty {@code Statistics} instance for the specified type to be used for reading/writing the new min/max + * statistics used in the V2 format. + * + * @param type type of the column + * @return instance of a typed statistics class + */ + public static Statistics createStats(Type type) { PrimitiveTypeName primitive = type.asPrimitiveType().getPrimitiveTypeName(); switch (primitive) { case INT32: @@ -231,7 +249,7 @@ public void mergeStatistics(Statistics stats) { * Returns the comparator to be used to compare two generic values in the proper way (for example, unsigned comparison * for UINT_32). */ - public Comparator comparator() { + public final Comparator comparator() { return comparator; } @@ -240,7 +258,7 @@ public Comparator comparator() { * * @see Comparable#compareTo(Object) */ - public int compareToMin(T value) { + public final int compareToMin(T value) { return comparator.compare(genericGetMin(), value); } @@ -249,7 +267,7 @@ public int compareToMin(T value) { * * @see Comparable#compareTo(Object) */ - public int compareToMax(T value) { + public final int compareToMax(T value) { return comparator.compare(genericGetMax(), value); } @@ -280,7 +298,7 @@ public String maxAsString() { } String toString(T value) { - return value.toString(); + return Objects.toString(value); } /** @@ -355,5 +373,14 @@ public boolean hasNonNullValue() { protected void markAsNotEmpty() { hasNonNullValue = true; } + + @Override + public Statistics clone() { + try { + return (Statistics) super.clone(); + } catch(CloneNotSupportedException e) { + throw new ShouldNeverHappenException(e); + } + } } diff --git 
a/parquet-column/src/main/java/org/apache/parquet/filter2/recordlevel/IncrementallyUpdatedFilterPredicateBuilderBase.java b/parquet-column/src/main/java/org/apache/parquet/filter2/recordlevel/IncrementallyUpdatedFilterPredicateBuilderBase.java index 8def88eec4..0c5160bad2 100644 --- a/parquet-column/src/main/java/org/apache/parquet/filter2/recordlevel/IncrementallyUpdatedFilterPredicateBuilderBase.java +++ b/parquet-column/src/main/java/org/apache/parquet/filter2/recordlevel/IncrementallyUpdatedFilterPredicateBuilderBase.java @@ -23,6 +23,7 @@ import java.util.List; import java.util.Map; +import org.apache.parquet.column.ColumnDescriptor; import org.apache.parquet.hadoop.metadata.ColumnPath; import org.apache.parquet.filter2.predicate.FilterPredicate; import org.apache.parquet.filter2.predicate.FilterPredicate.Visitor; @@ -30,6 +31,8 @@ import org.apache.parquet.filter2.predicate.Operators.Not; import org.apache.parquet.filter2.predicate.Operators.Or; import org.apache.parquet.filter2.recordlevel.IncrementallyUpdatedFilterPredicate.ValueInspector; +import org.apache.parquet.io.PrimitiveColumnIO; +import org.apache.parquet.schema.PrimitiveComparator; import static org.apache.parquet.Preconditions.checkArgument; @@ -55,9 +58,20 @@ public abstract class IncrementallyUpdatedFilterPredicateBuilderBase implements Visitor { private boolean built = false; private final Map> valueInspectorsByColumn = new HashMap>(); + private final Map> comparatorsByColumn = new HashMap<>(); + @Deprecated public IncrementallyUpdatedFilterPredicateBuilderBase() { } + public IncrementallyUpdatedFilterPredicateBuilderBase(List leaves) { + for (PrimitiveColumnIO leaf : leaves) { + ColumnDescriptor descriptor = leaf.getColumnDescriptor(); + ColumnPath path = ColumnPath.get(descriptor.getPath()); + PrimitiveComparator comparator = descriptor.getFullType().comparator(); + comparatorsByColumn.put(path, comparator); + } + } + public final IncrementallyUpdatedFilterPredicate build(FilterPredicate 
pred) { checkArgument(!built, "This builder has already been used"); IncrementallyUpdatedFilterPredicate incremental = pred.accept(this); @@ -78,6 +92,11 @@ public Map> getValueInspectorsByColumn() { return valueInspectorsByColumn; } + @SuppressWarnings("unchecked") + protected final PrimitiveComparator getComparator(ColumnPath path) { + return (PrimitiveComparator) comparatorsByColumn.get(path); + } + @Override public final IncrementallyUpdatedFilterPredicate visit(And and) { return new IncrementallyUpdatedFilterPredicate.And(and.getLeft().accept(this), and.getRight().accept(this)); diff --git a/parquet-column/src/main/java/org/apache/parquet/io/MessageColumnIO.java b/parquet-column/src/main/java/org/apache/parquet/io/MessageColumnIO.java index 67efdb3a37..7346c5a35f 100644 --- a/parquet-column/src/main/java/org/apache/parquet/io/MessageColumnIO.java +++ b/parquet-column/src/main/java/org/apache/parquet/io/MessageColumnIO.java @@ -109,7 +109,7 @@ public RecordReader getRecordReader(final PageReadStore columns, public RecordReader visit(FilterPredicateCompat filterPredicateCompat) { FilterPredicate predicate = filterPredicateCompat.getFilterPredicate(); - IncrementallyUpdatedFilterPredicateBuilder builder = new IncrementallyUpdatedFilterPredicateBuilder(); + IncrementallyUpdatedFilterPredicateBuilder builder = new IncrementallyUpdatedFilterPredicateBuilder(leaves); IncrementallyUpdatedFilterPredicate streamingPredicate = builder.build(predicate); RecordMaterializer filteringRecordMaterializer = new FilteringRecordMaterializer( recordMaterializer, diff --git a/parquet-column/src/main/java/org/apache/parquet/io/PrimitiveColumnIO.java b/parquet-column/src/main/java/org/apache/parquet/io/PrimitiveColumnIO.java index 15c28c8cc9..e40b24f133 100644 --- a/parquet-column/src/main/java/org/apache/parquet/io/PrimitiveColumnIO.java +++ b/parquet-column/src/main/java/org/apache/parquet/io/PrimitiveColumnIO.java @@ -52,10 +52,9 @@ void setLevels(int r, int d, String[] fieldPath, 
int[] fieldIndexPath, List getPaths() { @@ -111,8 +110,7 @@ public List getColumns() { PrimitiveType primitiveType = getType(path).asPrimitiveType(); columns.add(new ColumnDescriptor( path, - primitiveType.getPrimitiveTypeName(), - primitiveType.getTypeLength(), + primitiveType, getMaxRepetitionLevel(path), getMaxDefinitionLevel(path))); } diff --git a/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveComparator.java b/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveComparator.java index 79eff9b9bb..f307b599e1 100644 --- a/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveComparator.java +++ b/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveComparator.java @@ -49,7 +49,19 @@ public int compare(double d1, double d2) { throw new UnsupportedOperationException(); } - static PrimitiveComparator BOOLEAN_COMPARATOR = new PrimitiveComparator() { + private static final PrimitiveComparator> COMPARABLE_COMPARATOR = new PrimitiveComparator>() { + @Override + public int compare(Comparable o1, Comparable o2) { + return o1.compareTo(o2); + } + }; + + @SuppressWarnings("unchecked") + public static > PrimitiveComparator comparableComparator() { + return (PrimitiveComparator) COMPARABLE_COMPARATOR; + } + + static final PrimitiveComparator BOOLEAN_COMPARATOR = new PrimitiveComparator() { @Override public int compare(Boolean o1, Boolean o2) { return compare(o1.booleanValue(), o2.booleanValue()); @@ -61,7 +73,7 @@ public int compare(boolean b1, boolean b2) { } }; - static PrimitiveComparator SIGNED_INT32_COMPARATOR = new PrimitiveComparator() { + static final PrimitiveComparator SIGNED_INT32_COMPARATOR = new PrimitiveComparator() { @Override public int compare(Integer o1, Integer o2) { return compare(o1.intValue(), o2.intValue()); @@ -73,7 +85,7 @@ public int compare(int i1, int i2) { } }; - static PrimitiveComparator SIGNED_INT64_COMPARATOR = new PrimitiveComparator() { + static final PrimitiveComparator 
SIGNED_INT64_COMPARATOR = new PrimitiveComparator() { @Override public int compare(Long o1, Long o2) { return compare(o1.longValue(), o2.longValue()); @@ -85,7 +97,7 @@ public int compare(long l1, long l2) { } }; - static PrimitiveComparator FLOAT_COMPARATOR = new PrimitiveComparator() { + static final PrimitiveComparator FLOAT_COMPARATOR = new PrimitiveComparator() { @Override public int compare(Float o1, Float o2) { return compare(o1.floatValue(), o2.floatValue()); @@ -97,7 +109,7 @@ public int compare(float f1, float f2) { } }; - static PrimitiveComparator DOUBLE_COMPARATOR = new PrimitiveComparator() { + static final PrimitiveComparator DOUBLE_COMPARATOR = new PrimitiveComparator() { @Override public int compare(Double o1, Double o2) { return compare(o1.doubleValue(), o2.doubleValue()); @@ -110,7 +122,7 @@ public int compare(double d1, double d2) { }; // TODO: this one is temporary as the self-comparison of Binary is not proper - static PrimitiveComparator BINARY_COMPARATOR = new PrimitiveComparator() { + static final PrimitiveComparator BINARY_COMPARATOR = new PrimitiveComparator() { @Override public int compare(Binary o1, Binary o2) { return o1.compareTo(o2); diff --git a/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveType.java b/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveType.java index 3c91383179..6439db1868 100644 --- a/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveType.java +++ b/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveType.java @@ -89,9 +89,10 @@ public T convert(PrimitiveTypeNameConverter conve @Override PrimitiveComparator comparator(OriginalType logicalType) { - if (logicalType == OriginalType.UINT_64) + if (logicalType == OriginalType.UINT_64) { // TODO: return unsigned comparator return PrimitiveComparator.SIGNED_INT64_COMPARATOR; + } return PrimitiveComparator.SIGNED_INT64_COMPARATOR; } }, @@ -120,7 +121,7 @@ public T convert(PrimitiveTypeNameConverter conve @Override 
PrimitiveComparator comparator(OriginalType logicalType) { - if (logicalType != null) + if (logicalType != null) { switch (logicalType) { case UINT_8: case UINT_16: @@ -128,6 +129,7 @@ PrimitiveComparator comparator(OriginalType logicalType) { // TODO: return unsigned comparator return PrimitiveComparator.SIGNED_INT32_COMPARATOR; } + } return PrimitiveComparator.SIGNED_INT32_COMPARATOR; } }, @@ -184,7 +186,7 @@ public T convert(PrimitiveTypeNameConverter conve @Override PrimitiveComparator comparator(OriginalType logicalType) { - if (logicalType != null) + if (logicalType != null) { switch (logicalType) { case JSON: case BSON: @@ -194,6 +196,7 @@ PrimitiveComparator comparator(OriginalType logicalType) { // TODO: return signed comparator return PrimitiveComparator.BINARY_COMPARATOR; } + } // TODO: return lexicographical comparator return PrimitiveComparator.BINARY_COMPARATOR; } @@ -277,7 +280,7 @@ public T convert(PrimitiveTypeNameConverter conve @Override PrimitiveComparator comparator(OriginalType logicalType) { - // TODO: what to return here? 
+ // TODO: return signed comparator return PrimitiveComparator.BINARY_COMPARATOR; } }, @@ -306,9 +309,10 @@ public T convert(PrimitiveTypeNameConverter conve @Override PrimitiveComparator comparator(OriginalType logicalType) { - if (logicalType == OriginalType.DECIMAL) + if (logicalType == OriginalType.DECIMAL) { // TODO: return signed comparator return PrimitiveComparator.BINARY_COMPARATOR; + } // TODO: return lexicographical comparator return PrimitiveComparator.BINARY_COMPARATOR; } @@ -618,7 +622,8 @@ protected Type union(Type toMerge, boolean strict) { } @Override - public PrimitiveComparator comparator() { - return getPrimitiveTypeName().comparator(getOriginalType()); + @SuppressWarnings("unchecked") + public PrimitiveComparator comparator() { + return (PrimitiveComparator) getPrimitiveTypeName().comparator(getOriginalType()); } } diff --git a/parquet-generator/src/main/java/org/apache/parquet/filter2/IncrementallyUpdatedFilterPredicateGenerator.java b/parquet-generator/src/main/java/org/apache/parquet/filter2/IncrementallyUpdatedFilterPredicateGenerator.java index 1dfaf6f03d..fc5413e11a 100644 --- a/parquet-generator/src/main/java/org/apache/parquet/filter2/IncrementallyUpdatedFilterPredicateGenerator.java +++ b/parquet-generator/src/main/java/org/apache/parquet/filter2/IncrementallyUpdatedFilterPredicateGenerator.java @@ -45,28 +45,28 @@ public IncrementallyUpdatedFilterPredicateGenerator(File file) throws IOExceptio private static class TypeInfo { public final String className; public final String primitiveName; - public final boolean useComparable; public final boolean supportsInequality; - private TypeInfo(String className, String primitiveName, boolean useComparable, boolean supportsInequality) { + private TypeInfo(String className, String primitiveName, boolean supportsInequality) { this.className = className; this.primitiveName = primitiveName; - this.useComparable = useComparable; this.supportsInequality = supportsInequality; } } private static final 
TypeInfo[] TYPES = new TypeInfo[]{ - new TypeInfo("Integer", "int", false, true), - new TypeInfo("Long", "long", false, true), - new TypeInfo("Boolean", "boolean", false, false), - new TypeInfo("Float", "float", false, true), - new TypeInfo("Double", "double", false, true), - new TypeInfo("Binary", "Binary", true, true), + new TypeInfo("Integer", "int", true), + new TypeInfo("Long", "long", true), + new TypeInfo("Boolean", "boolean", false), + new TypeInfo("Float", "float", true), + new TypeInfo("Double", "double", true), + new TypeInfo("Binary", "Binary", true), }; public void run() throws IOException { add("package org.apache.parquet.filter2.recordlevel;\n" + + "\n" + + "import java.util.List;\n" + "\n" + "import org.apache.parquet.hadoop.metadata.ColumnPath;\n" + "import org.apache.parquet.filter2.predicate.Operators.Eq;\n" + @@ -79,7 +79,9 @@ public void run() throws IOException { "import org.apache.parquet.filter2.predicate.Operators.UserDefined;\n" + "import org.apache.parquet.filter2.predicate.UserDefinedPredicate;\n" + "import org.apache.parquet.filter2.recordlevel.IncrementallyUpdatedFilterPredicate.ValueInspector;\n" + - "import org.apache.parquet.io.api.Binary;\n\n" + + "import org.apache.parquet.io.api.Binary;\n" + + "import org.apache.parquet.io.PrimitiveColumnIO;\n" + + "import org.apache.parquet.schema.PrimitiveComparator;\n\n" + "/**\n" + " * This class is auto-generated by {@link parquet.filter2.IncrementallyUpdatedFilterPredicateGenerator}\n" + " * Do not manually edit!\n" + @@ -88,6 +90,10 @@ public void run() throws IOException { add("public class IncrementallyUpdatedFilterPredicateBuilder extends IncrementallyUpdatedFilterPredicateBuilderBase {\n\n"); + add(" public IncrementallyUpdatedFilterPredicateBuilder(List leaves) {\n" + + " super(leaves);\n" + + " }\n\n"); + addVisitBegin("Eq"); for (TypeInfo info : TYPES) { addEqNotEqCase(info, true); @@ -180,6 +186,7 @@ private void addEqNotEqCase(TypeInfo info, boolean isEq) throws IOException { " 
};\n" + " } else {\n" + " final " + info.primitiveName + " target = (" + info.className + ") (Object) pred.getValue();\n" + + " final PrimitiveComparator<" + info.className + "> comparator = getComparator(columnPath);\n" + "\n" + " valueInspector = new ValueInspector() {\n" + " @Override\n" + @@ -190,11 +197,7 @@ private void addEqNotEqCase(TypeInfo info, boolean isEq) throws IOException { " @Override\n" + " public void update(" + info.primitiveName + " value) {\n"); - if (info.useComparable) { - add(" setResult(" + compareEquality("value", "target", isEq) + ");\n"); - } else { - add(" setResult(" + (isEq ? "value == target" : "value != target" ) + ");\n"); - } + add(" setResult(" + compareEquality("value", "target", isEq) + ");\n"); add(" }\n" + " };\n" + @@ -212,6 +215,7 @@ private void addInequalityCase(TypeInfo info, String op) throws IOException { add(" if (clazz.equals(" + info.className + ".class)) {\n" + " final " + info.primitiveName + " target = (" + info.className + ") (Object) pred.getValue();\n" + + " final PrimitiveComparator<" + info.className + "> comparator = getComparator(columnPath);\n" + "\n" + " valueInspector = new ValueInspector() {\n" + " @Override\n" + @@ -222,11 +226,8 @@ private void addInequalityCase(TypeInfo info, String op) throws IOException { " @Override\n" + " public void update(" + info.primitiveName + " value) {\n"); - if (info.useComparable) { - add(" setResult(value.compareTo(target) " + op + " 0);\n"); - } else { - add(" setResult(value " + op + " target);\n"); - } + add(" setResult(comparator.compare(value, target) " + op + " 0);\n"); + add(" }\n" + " };\n" + " }\n\n"); @@ -260,7 +261,7 @@ private void addUdpCase(TypeInfo info, boolean invert)throws IOException { } private String compareEquality(String var, String target, boolean eq) { - return var + ".compareTo(" + target + ")" + (eq ? " == 0 " : " != 0"); + return "comparator.compare(" + var + ", " + target + ")" + (eq ? 
" == 0 " : " != 0"); } private void add(String s) throws IOException { diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/filter2/dictionarylevel/DictionaryFilter.java b/parquet-hadoop/src/main/java/org/apache/parquet/filter2/dictionarylevel/DictionaryFilter.java index 19604ec98e..e410b3fd13 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/filter2/dictionarylevel/DictionaryFilter.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/filter2/dictionarylevel/DictionaryFilter.java @@ -33,6 +33,7 @@ import org.slf4j.LoggerFactory; import java.io.IOException; +import java.util.Comparator; import java.util.HashMap; import java.util.HashSet; import java.util.List; @@ -75,7 +76,7 @@ private ColumnChunkMetaData getColumnChunk(ColumnPath columnPath) { @SuppressWarnings("unchecked") private > Set expandDictionary(ColumnChunkMetaData meta) throws IOException { - ColumnDescriptor col = new ColumnDescriptor(meta.getPath().toArray(), meta.getType(), -1, -1); + ColumnDescriptor col = new ColumnDescriptor(meta.getPath().toArray(), meta.getFullType(), -1, -1); DictionaryPage page = dictionaries.readDictionaryPage(col); // the chunk may not be dictionary-encoded @@ -212,8 +213,9 @@ public > Boolean visit(Lt lt) { return BLOCK_MIGHT_MATCH; } + Comparator comparator = meta.getFullType().comparator(); for (T entry : dictSet) { - if (value.compareTo(entry) > 0) { + if (comparator.compare(value, entry) > 0) { return BLOCK_MIGHT_MATCH; } } @@ -253,8 +255,9 @@ public > Boolean visit(LtEq ltEq) { return BLOCK_MIGHT_MATCH; } + Comparator comparator = meta.getFullType().comparator(); for (T entry : dictSet) { - if (value.compareTo(entry) >= 0) { + if (comparator.compare(value, entry) >= 0) { return BLOCK_MIGHT_MATCH; } } @@ -292,8 +295,9 @@ public > Boolean visit(Gt gt) { return BLOCK_MIGHT_MATCH; } + Comparator comparator = meta.getFullType().comparator(); for (T entry : dictSet) { - if (value.compareTo(entry) < 0) { + if (comparator.compare(value, entry) < 0) { 
return BLOCK_MIGHT_MATCH; } } @@ -333,8 +337,9 @@ public > Boolean visit(GtEq gtEq) { return BLOCK_MIGHT_MATCH; } + Comparator comparator = meta.getFullType().comparator(); for (T entry : dictSet) { - if (value.compareTo(entry) <= 0) { + if (comparator.compare(value, entry) <= 0) { return BLOCK_MIGHT_MATCH; } } diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java index 163056c4dc..3ea91e066b 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java @@ -335,8 +335,12 @@ public static Statistics toParquetStatistics( if (!statistics.isEmpty() && statistics.isSmallerThan(MAX_STATS_SIZE)) { stats.setNull_count(statistics.getNumNulls()); if (statistics.hasNonNullValue()) { + byte[] maxBytes = statistics.getMaxBytes(); stats.setMax(statistics.getMaxBytes()); + stats.setMax_value(maxBytes); + byte[] minBytes = statistics.getMinBytes(); stats.setMin(statistics.getMinBytes()); + stats.setMin_value(minBytes); } } return stats; @@ -357,29 +361,44 @@ public static org.apache.parquet.column.statistics.Statistics fromParquetStatist @Deprecated public static org.apache.parquet.column.statistics.Statistics fromParquetStatistics (String createdBy, Statistics statistics, PrimitiveTypeName type) { - return fromParquetStatisticsInternal(createdBy, statistics, type, defaultSortOrder(type)); + return fromParquetStatisticsInternal(createdBy, statistics, new PrimitiveType(Repetition.OPTIONAL, type, ""), + defaultSortOrder(type)); } // Visible for testing static org.apache.parquet.column.statistics.Statistics fromParquetStatisticsInternal - (String createdBy, Statistics statistics, PrimitiveTypeName type, SortOrder typeSortOrder) { - // create stats object based on the column type - 
org.apache.parquet.column.statistics.Statistics stats = org.apache.parquet.column.statistics.Statistics.getStatsBasedOnType(type); + (String createdBy, Statistics statistics, PrimitiveType type, SortOrder typeSortOrder) { // If there was no statistics written to the footer, create an empty Statistics object and return + if (statistics == null) { + return org.apache.parquet.column.statistics.Statistics + .createLegacyStats(type.asPrimitiveType().getPrimitiveTypeName()); + } - boolean isSet = statistics != null && statistics.isSetMax() && statistics.isSetMin(); - boolean maxEqualsMin = isSet ? Arrays.equals(statistics.getMin(), statistics.getMax()) : false; - boolean sortOrdersMatch = SortOrder.SIGNED == typeSortOrder; - // NOTE: See docs in CorruptStatistics for explanation of why this check is needed - // The sort order is checked to avoid returning min/max stats that are not - // valid with the type's sort order. Currently, all stats are aggregated - // using a signed ordering, which isn't valid for strings or unsigned ints. - if (statistics != null && !CorruptStatistics.shouldIgnoreStatistics(createdBy, type) && - ( sortOrdersMatch || maxEqualsMin)) { - if (isSet) { - stats.setMinMaxFromBytes(statistics.min.array(), statistics.max.array()); - } + org.apache.parquet.column.statistics.Statistics stats; + // Use the new V2 min-max statistics over the former one if it is filled + if (statistics.isSetMin_value() && statistics.isSetMax_value()) { + stats = org.apache.parquet.column.statistics.Statistics.createStats(type); + stats.setMinMaxFromBytes(statistics.min_value.array(), statistics.max_value.array()); stats.setNumNulls(statistics.null_count); + } else { + // create stats object based on the column type + stats = org.apache.parquet.column.statistics.Statistics + .createLegacyStats(type.asPrimitiveType().getPrimitiveTypeName()); + + boolean isSet = statistics.isSetMax() && statistics.isSetMin(); + boolean maxEqualsMin = isSet ? 
Arrays.equals(statistics.getMin(), statistics.getMax()) : false; + boolean sortOrdersMatch = SortOrder.SIGNED == typeSortOrder; + // NOTE: See docs in CorruptStatistics for explanation of why this check is needed + // The sort order is checked to avoid returning min/max stats that are not + // valid with the type's sort order. Currently, all stats are aggregated + // using a signed ordering, which isn't valid for strings or unsigned ints. + if (!CorruptStatistics.shouldIgnoreStatistics(createdBy, type.getPrimitiveTypeName()) && + (sortOrdersMatch || maxEqualsMin)) { + if (isSet) { + stats.setMinMaxFromBytes(statistics.min.array(), statistics.max.array()); + } + stats.setNumNulls(statistics.null_count); + } } return stats; } @@ -389,7 +408,7 @@ public org.apache.parquet.column.statistics.Statistics fromParquetStatistics( SortOrder expectedOrder = overrideSortOrderToSigned(type) ? SortOrder.SIGNED : sortOrder(type); return fromParquetStatisticsInternal( - createdBy, statistics, type.getPrimitiveTypeName(), expectedOrder); + createdBy, statistics, type, expectedOrder); } /** @@ -846,7 +865,7 @@ public ParquetMetadata fromParquetMetadata(FileMetaData parquetMetadata) throws ColumnPath path = getPath(metaData); ColumnChunkMetaData column = ColumnChunkMetaData.get( path, - messageType.getType(path.toArray()).asPrimitiveType().getPrimitiveTypeName(), + messageType.getType(path.toArray()).asPrimitiveType(), fromFormatCodec(metaData.codec), convertEncodingStats(metaData.getEncoding_stats()), fromFormatEncodings(metaData.encodings), diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ColumnChunkPageWriteStore.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ColumnChunkPageWriteStore.java index ac3cd3b8b2..4d887dfa9b 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ColumnChunkPageWriteStore.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ColumnChunkPageWriteStore.java @@ -18,8 +18,6 @@ */ package 
org.apache.parquet.hadoop; -import static org.apache.parquet.column.statistics.Statistics.getStatsBasedOnType; - import java.io.ByteArrayOutputStream; import java.io.IOException; import java.util.ArrayList; @@ -79,7 +77,6 @@ private ColumnChunkPageWriter(ColumnDescriptor path, this.compressor = compressor; this.allocator = allocator; this.buf = new ConcatenatingByteArrayCollector(); - this.totalStatistics = getStatsBasedOnType(this.path.getType()); } @Override @@ -116,7 +113,14 @@ public void writePage(BytesInput bytes, this.compressedLength += compressedSize; this.totalValueCount += valueCount; this.pageCount += 1; - this.totalStatistics.mergeStatistics(statistics); + + // Cloning the statistics if it is not initialized yet so we have the correct typed one + if (totalStatistics == null) { + totalStatistics = statistics.clone(); + } else { + totalStatistics.mergeStatistics(statistics); + } + // by concatenating before collecting instead of collecting twice, // we only allocate one buffer to copy into instead of multiple. buf.collect(BytesInput.concat(BytesInput.from(tempOutputStream), compressedBytes)); @@ -154,7 +158,13 @@ public void writePageV2( this.compressedLength += compressedSize; this.totalValueCount += valueCount; this.pageCount += 1; - this.totalStatistics.mergeStatistics(statistics); + + // Cloning the statistics if it is not initialized yet so we have the correct typed one + if (totalStatistics == null) { + totalStatistics = statistics.clone(); + } else { + totalStatistics.mergeStatistics(statistics); + } // by concatenating before collecting instead of collecting twice, // we only allocate one buffer to copy into instead of multiple. 
diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileWriter.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileWriter.java index da8635d099..be0b66bdd2 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileWriter.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileWriter.java @@ -65,6 +65,7 @@ import org.apache.parquet.io.ParquetEncodingException; import org.apache.parquet.io.PositionOutputStream; import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.PrimitiveType; import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName; import org.apache.parquet.schema.TypeUtil; import org.slf4j.Logger; @@ -116,7 +117,7 @@ public static enum Mode { // column chunk data set at the start of a column private CompressionCodecName currentChunkCodec; // set in startColumn private ColumnPath currentChunkPath; // set in startColumn - private PrimitiveTypeName currentChunkType; // set in startColumn + private PrimitiveType currentChunkType; // set in startColumn private long currentChunkValueCount; // set in startColumn private long currentChunkFirstDataPage; // set in startColumn (out.pos()) private long currentChunkDictionaryPageOffset; // set in writeDictionaryPage @@ -317,15 +318,14 @@ public void startColumn(ColumnDescriptor descriptor, encodingStatsBuilder.clear(); currentEncodings = new HashSet(); currentChunkPath = ColumnPath.get(descriptor.getPath()); - currentChunkType = descriptor.getType(); + currentChunkType = descriptor.getFullType(); currentChunkCodec = compressionCodecName; currentChunkValueCount = valueCount; currentChunkFirstDataPage = out.getPos(); compressedLength = 0; uncompressedLength = 0; - // need to know what type of stats to initialize to - // better way to do this? 
- currentStatistics = Statistics.getStatsBasedOnType(currentChunkType); + // The statistics will be cloned from the first one added at writeDataPage(s) so we have the correct typed one + currentStatistics = null; } /** @@ -425,7 +425,14 @@ public void writeDataPage( this.compressedLength += compressedPageSize + headerSize; LOG.debug("{}: write data page content {}", out.getPos(), compressedPageSize); bytes.writeAllTo(out); - currentStatistics.mergeStatistics(statistics); + + // Cloning the statistics if it is not initialized yet so we have the correct typed one + if (currentStatistics == null) { + currentStatistics = statistics.clone(); + } else { + currentStatistics.mergeStatistics(statistics); + } + encodingStatsBuilder.addDataEncoding(valuesEncoding); currentEncodings.add(rlEncoding); currentEncodings.add(dlEncoding); @@ -599,7 +606,7 @@ public void appendRowGroup(SeekableInputStream from, BlockMetaData rowGroup, currentBlock.addColumn(ColumnChunkMetaData.get( chunk.getPath(), - chunk.getType(), + chunk.getFullType(), chunk.getCodec(), chunk.getEncodingStats(), chunk.getEncodings(), diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/metadata/ColumnChunkMetaData.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/metadata/ColumnChunkMetaData.java index 720bd77924..178a49a5fd 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/metadata/ColumnChunkMetaData.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/metadata/ColumnChunkMetaData.java @@ -24,7 +24,9 @@ import org.apache.parquet.column.EncodingStats; import org.apache.parquet.column.statistics.BooleanStatistics; import org.apache.parquet.column.statistics.Statistics; +import org.apache.parquet.schema.PrimitiveType; import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName; +import org.apache.parquet.schema.Type; /** * Column meta data for a block stored in the file footer and passed in the InputSplit @@ -65,6 +67,7 @@ public static 
ColumnChunkMetaData get( valueCount, totalSize, totalUncompressedSize); } + @Deprecated public static ColumnChunkMetaData get( ColumnPath path, PrimitiveTypeName type, @@ -77,6 +80,22 @@ public static ColumnChunkMetaData get( long valueCount, long totalSize, long totalUncompressedSize) { + return get(path, new PrimitiveType(Type.Repetition.OPTIONAL, type, ""), codec, encodingStats, encodings, statistics, + firstDataPage, dictionaryPageOffset, valueCount, totalSize, totalUncompressedSize); + } + + public static ColumnChunkMetaData get( + ColumnPath path, + PrimitiveType type, + CompressionCodecName codec, + EncodingStats encodingStats, + Set encodings, + Statistics statistics, + long firstDataPage, + long dictionaryPageOffset, + long valueCount, + long totalSize, + long totalUncompressedSize) { // to save space we store those always positive longs in ints when they fit. if (positiveLongFitsInAnInt(firstDataPage) && positiveLongFitsInAnInt(dictionaryPageOffset) @@ -161,6 +180,13 @@ public PrimitiveTypeName getType() { return properties.getType(); } + /** + * @return the full type object of the column + */ + public PrimitiveType getFullType() { + return properties.getFullType(); + } + /** * @return start of the column data offset */ @@ -231,7 +257,7 @@ class IntColumnChunkMetaData extends ColumnChunkMetaData { */ IntColumnChunkMetaData( ColumnPath path, - PrimitiveTypeName type, + PrimitiveType type, CompressionCodecName codec, EncodingStats encodingStats, Set encodings, @@ -336,7 +362,7 @@ class LongColumnChunkMetaData extends ColumnChunkMetaData { */ LongColumnChunkMetaData( ColumnPath path, - PrimitiveTypeName type, + PrimitiveType type, CompressionCodecName codec, EncodingStats encodingStats, Set encodings, diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/metadata/ColumnChunkProperties.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/metadata/ColumnChunkProperties.java index 5e2667501d..dbd3c369fa 100644 --- 
a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/metadata/ColumnChunkProperties.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/metadata/ColumnChunkProperties.java @@ -22,24 +22,31 @@ import java.util.Set; import org.apache.parquet.column.Encoding; +import org.apache.parquet.schema.PrimitiveType; import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName; +import org.apache.parquet.schema.Type; public class ColumnChunkProperties { private static Canonicalizer properties = new Canonicalizer(); + @Deprecated public static ColumnChunkProperties get(ColumnPath path, PrimitiveTypeName type, CompressionCodecName codec, Set encodings) { + return get(path, new PrimitiveType(Type.Repetition.OPTIONAL, type, ""), codec, encodings); + } + + public static ColumnChunkProperties get(ColumnPath path, PrimitiveType type, CompressionCodecName codec, Set encodings) { return properties.canonicalize(new ColumnChunkProperties(codec, path, type, encodings)); } private final CompressionCodecName codec; private final ColumnPath path; - private final PrimitiveTypeName type; + private final PrimitiveType type; private final Set encodings; private ColumnChunkProperties(CompressionCodecName codec, ColumnPath path, - PrimitiveTypeName type, + PrimitiveType type, Set encodings) { super(); this.codec = codec; @@ -57,6 +64,10 @@ public ColumnPath getPath() { } public PrimitiveTypeName getType() { + return type.getPrimitiveTypeName(); + } + + public PrimitiveType getFullType() { return type; } @@ -68,7 +79,7 @@ public Set getEncodings() { public boolean equals(Object obj) { if (obj instanceof ColumnChunkProperties) { ColumnChunkProperties other = (ColumnChunkProperties)obj; - return other.codec == codec && other.path.equals(path) && other.type == type && equals(other.encodings, encodings); + return other.codec == codec && other.path.equals(path) && other.type.equals(type) && equals(other.encodings, encodings); } return false; } diff --git 
a/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java b/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java index 4df45ddc96..2e708f826f 100644 --- a/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java +++ b/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java @@ -61,6 +61,7 @@ import org.apache.parquet.hadoop.metadata.CompressionCodecName; import org.apache.parquet.hadoop.metadata.ParquetMetadata; import org.apache.parquet.io.api.Binary; +import org.apache.parquet.schema.PrimitiveType; import org.junit.Assert; import org.junit.Test; @@ -403,7 +404,7 @@ public void testBinaryStats() { formatStats.isSetNull_count()); Statistics roundTripStats = ParquetMetadataConverter.fromParquetStatisticsInternal( - Version.FULL_VERSION, formatStats, PrimitiveTypeName.BINARY, + Version.FULL_VERSION, formatStats, new PrimitiveType(Repetition.OPTIONAL, PrimitiveTypeName.BINARY, ""), ParquetMetadataConverter.SortOrder.SIGNED); Assert.assertTrue(roundTripStats.isEmpty()); @@ -528,9 +529,14 @@ public void testIgnoreStatsWithSignedSortOrder() { stats.updateStats(Binary.fromString("z")); stats.incrementNumNulls(); + org.apache.parquet.format.Statistics v1Statistics = ParquetMetadataConverter.toParquetStatistics(stats); + // Unsetting the min/max values the enforce the v1 ignoring logic + v1Statistics.unsetMin_value(); + v1Statistics.unsetMax_value(); + Statistics convertedStats = converter.fromParquetStatistics( Version.FULL_VERSION, - ParquetMetadataConverter.toParquetStatistics(stats), + v1Statistics, Types.required(PrimitiveTypeName.BINARY) .as(OriginalType.UTF8).named("b")); diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/statistics/TestStatistics.java b/parquet-hadoop/src/test/java/org/apache/parquet/statistics/TestStatistics.java index d157cc3719..bdd858fe0b 100644 --- 
a/parquet-hadoop/src/test/java/org/apache/parquet/statistics/TestStatistics.java +++ b/parquet-hadoop/src/test/java/org/apache/parquet/statistics/TestStatistics.java @@ -52,6 +52,7 @@ import java.io.File; import java.io.IOException; import java.util.Arrays; +import java.util.Comparator; import java.util.List; import java.util.Random; @@ -163,9 +164,11 @@ private static class StatsValidator> { private final boolean hasNonNull; private final T min; private final T max; + private final Comparator comparator; public StatsValidator(DataPage page) { Statistics stats = getStatisticsFromPageHeader(page); + this.comparator = stats.comparator(); this.hasNonNull = stats.hasNonNullValue(); if (hasNonNull) { this.min = stats.genericGetMin(); @@ -178,8 +181,8 @@ public StatsValidator(DataPage page) { public void validate(T value) { if (hasNonNull) { - assertTrue("min should be <= all values", min.compareTo(value) <= 0); - assertTrue("min should be >= all values", max.compareTo(value) >= 0); + assertTrue("min should be <= all values", comparator.compare(min, value) <= 0); + assertTrue("min should be >= all values", comparator.compare(max, value) >= 0); } } } @@ -306,8 +309,8 @@ private void validateStatsForPage(DataPage page, DictionaryPage dict, ColumnDesc System.err.println(String.format( "Validated stats min=%s max=%s nulls=%d for page=%s col=%s", - String.valueOf(stats.genericGetMin()), - String.valueOf(stats.genericGetMax()), stats.getNumNulls(), page, + stats.minAsString(), + stats.maxAsString(), stats.getNumNulls(), page, Arrays.toString(desc.getPath()))); } } From 51bc1f827d116cc9f13b9d2dd295632bb5c92974 Mon Sep 17 00:00:00 2001 From: Gabor Szadovszky Date: Mon, 20 Nov 2017 11:28:36 +0100 Subject: [PATCH 05/17] PARQUET-1025: Add the proper comparators as required; revert Binary related changes --- .../column/statistics/BinaryStatistics.java | 4 +- .../column/statistics/BooleanStatistics.java | 5 +- .../column/statistics/DoubleStatistics.java | 5 +- 
.../column/statistics/FloatStatistics.java | 5 +- .../column/statistics/IntStatistics.java | 5 +- .../column/statistics/LongStatistics.java | 5 +- .../parquet/column/statistics/Statistics.java | 6 +- .../org/apache/parquet/io/api/Binary.java | 178 ++++------- .../parquet/schema/PrimitiveComparator.java | 111 +++++-- .../apache/parquet/schema/PrimitiveType.java | 30 +- .../schema/TestPrimitiveComparator.java | 286 ++++++++++++++++++ 11 files changed, 474 insertions(+), 166 deletions(-) create mode 100644 parquet-column/src/test/java/org/apache/parquet/schema/TestPrimitiveComparator.java diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/BinaryStatistics.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/BinaryStatistics.java index 7c8e30d397..46adf41c3b 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/BinaryStatistics.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/BinaryStatistics.java @@ -21,6 +21,7 @@ import org.apache.parquet.io.api.Binary; import org.apache.parquet.schema.PrimitiveType; import org.apache.parquet.schema.Type; +import org.apache.parquet.schema.Types; public class BinaryStatistics extends Statistics { @@ -28,7 +29,8 @@ public class BinaryStatistics extends Statistics { private Binary min; public BinaryStatistics() { - super(); + // Creating a fake primitive type to have the proper comparator + this(Types.optional(PrimitiveType.PrimitiveTypeName.BINARY).named("")); } BinaryStatistics(Type type) { diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/BooleanStatistics.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/BooleanStatistics.java index 6f893db64c..c62606b305 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/BooleanStatistics.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/BooleanStatistics.java @@ -19,7 +19,9 @@ package 
org.apache.parquet.column.statistics; import org.apache.parquet.bytes.BytesUtils; +import org.apache.parquet.schema.PrimitiveType; import org.apache.parquet.schema.Type; +import org.apache.parquet.schema.Types; public class BooleanStatistics extends Statistics { @@ -27,7 +29,8 @@ public class BooleanStatistics extends Statistics { private boolean min; public BooleanStatistics() { - super(); + // Creating a fake primitive type to have the proper comparator + this(Types.optional(PrimitiveType.PrimitiveTypeName.BOOLEAN).named("")); } BooleanStatistics(Type type) { diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/DoubleStatistics.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/DoubleStatistics.java index b1cf3f0958..8e9f4540f0 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/DoubleStatistics.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/DoubleStatistics.java @@ -19,7 +19,9 @@ package org.apache.parquet.column.statistics; import org.apache.parquet.bytes.BytesUtils; +import org.apache.parquet.schema.PrimitiveType; import org.apache.parquet.schema.Type; +import org.apache.parquet.schema.Types; public class DoubleStatistics extends Statistics { @@ -27,7 +29,8 @@ public class DoubleStatistics extends Statistics { private double min; public DoubleStatistics() { - super(); + // Creating a fake primitive type to have the proper comparator + this(Types.optional(PrimitiveType.PrimitiveTypeName.DOUBLE).named("")); } DoubleStatistics(Type type) { diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/FloatStatistics.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/FloatStatistics.java index 6fcf3df65a..16ba380dce 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/FloatStatistics.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/FloatStatistics.java @@ -19,7 +19,9 @@ 
package org.apache.parquet.column.statistics; import org.apache.parquet.bytes.BytesUtils; +import org.apache.parquet.schema.PrimitiveType; import org.apache.parquet.schema.Type; +import org.apache.parquet.schema.Types; public class FloatStatistics extends Statistics { @@ -27,7 +29,8 @@ public class FloatStatistics extends Statistics { private float min; public FloatStatistics() { - super(); + // Creating a fake primitive type to have the proper comparator + this(Types.optional(PrimitiveType.PrimitiveTypeName.FLOAT).named("")); } FloatStatistics(Type type) { diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/IntStatistics.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/IntStatistics.java index fd3063ee38..9c55a6c674 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/IntStatistics.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/IntStatistics.java @@ -19,7 +19,9 @@ package org.apache.parquet.column.statistics; import org.apache.parquet.bytes.BytesUtils; +import org.apache.parquet.schema.PrimitiveType; import org.apache.parquet.schema.Type; +import org.apache.parquet.schema.Types; public class IntStatistics extends Statistics { @@ -27,7 +29,8 @@ public class IntStatistics extends Statistics { private int min; public IntStatistics() { - super(); + // Creating a fake primitive type to have the proper comparator + this(Types.optional(PrimitiveType.PrimitiveTypeName.INT32).named("")); } IntStatistics(Type type) { diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/LongStatistics.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/LongStatistics.java index 1102574339..7d5d94a3f6 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/LongStatistics.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/LongStatistics.java @@ -19,7 +19,9 @@ package 
org.apache.parquet.column.statistics; import org.apache.parquet.bytes.BytesUtils; +import org.apache.parquet.schema.PrimitiveType; import org.apache.parquet.schema.Type; +import org.apache.parquet.schema.Types; public class LongStatistics extends Statistics { @@ -27,7 +29,8 @@ public class LongStatistics extends Statistics { private long min; public LongStatistics() { - super(); + // Creating a fake primitive type to have the proper comparator + this(Types.optional(PrimitiveType.PrimitiveTypeName.INT64).named("")); } LongStatistics(Type type) { diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java index ce27be511e..acab8b75d8 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java @@ -41,10 +41,6 @@ public abstract class Statistics> implements Cloneable { private boolean hasNonNullValue; private long num_nulls; - Statistics() { - this(PrimitiveComparator.comparableComparator()); - } - Statistics(PrimitiveComparator comparator) { hasNonNullValue = false; num_nulls = 0; @@ -249,7 +245,7 @@ public void mergeStatistics(Statistics stats) { * Returns the comparator to be used to compare two generic values in the proper way (for example, unsigned comparison * for UINT_32). 
*/ - public final Comparator comparator() { + public final PrimitiveComparator comparator() { return comparator; } diff --git a/parquet-column/src/main/java/org/apache/parquet/io/api/Binary.java b/parquet-column/src/main/java/org/apache/parquet/io/api/Binary.java index 6886b60280..46fa336b36 100644 --- a/parquet-column/src/main/java/org/apache/parquet/io/api/Binary.java +++ b/parquet-column/src/main/java/org/apache/parquet/io/api/Binary.java @@ -33,18 +33,11 @@ import org.apache.parquet.io.ParquetDecodingException; import org.apache.parquet.io.ParquetEncodingException; -import org.apache.parquet.schema.Type; import static org.apache.parquet.bytes.BytesUtils.UTF8; abstract public class Binary implements Comparable, Serializable { - public interface ComparatorHelper { - int compare(byte[] array1, int offset1, int length1, byte[] array2, int offset2, int length2); - int compare(ByteBuffer buffer1, int offset1, int length1, ByteBuffer buffer2, int offset2, int length2); - int compare(byte[] array1, int offset1, int length1, ByteBuffer buffer2, int offset2, int length2); - } - protected boolean isBackingBytesReused; // this isn't really something others should extend @@ -86,19 +79,9 @@ private Binary() { } @Deprecated abstract public int compareTo(Binary other); - abstract public int compareTo(Binary other, ComparatorHelper helper); - - int compareTo(byte[] bytes, int offset, int length) { - return compareTo(bytes, offset, length, DEFAULT_COMPARATOR_HELPER); - } - - abstract int compareTo(byte[] bytes, int offset, int length, ComparatorHelper helper); + abstract int compareTo(byte[] bytes, int offset, int length); - int compareTo(ByteBuffer bytes, int offset, int length) { - return compareTo(bytes, offset, length, DEFAULT_COMPARATOR_HELPER); - } - - abstract int compareTo(ByteBuffer bytes, int offset, int length, ComparatorHelper helper); + abstract int compareTo(ByteBuffer bytes, int offset, int length); abstract public ByteBuffer toByteBuffer(); @@ -212,22 +195,17 @@ 
boolean equals(ByteBuffer bytes, int otherOffset, int otherLength) { @Override public int compareTo(Binary other) { - return -other.compareTo(value, offset, length); + return other.compareTo(value, offset, length); } @Override - public int compareTo(Binary other, ComparatorHelper helper) { - return -other.compareTo(value, offset, length, helper); + int compareTo(byte[] other, int otherOffset, int otherLength) { + return Binary.compareTwoByteArrays(value, offset, length, other, otherOffset, otherLength); } @Override - int compareTo(byte[] other, int otherOffset, int otherLength, ComparatorHelper helper) { - return helper.compare(value, offset, length, other, otherOffset, otherLength); - } - - @Override - int compareTo(ByteBuffer bytes, int otherOffset, int otherLength, ComparatorHelper helper) { - return helper.compare(value, offset, length, bytes, otherOffset, otherLength); + int compareTo(ByteBuffer bytes, int otherOffset, int otherLength) { + return Binary.compareByteArrayToByteBuffer(value, offset, length, bytes, otherOffset, otherLength); } @Override @@ -373,22 +351,17 @@ boolean equals(ByteBuffer bytes, int otherOffset, int otherLength) { @Override public int compareTo(Binary other) { - return -other.compareTo(value, 0, value.length); + return other.compareTo(value, 0, value.length); } @Override - public int compareTo(Binary other, ComparatorHelper helper) { - return -other.compareTo(value, 0, value.length, helper); + int compareTo(byte[] other, int otherOffset, int otherLength) { + return Binary.compareTwoByteArrays(value, 0, value.length, other, otherOffset, otherLength); } @Override - int compareTo(byte[] other, int otherOffset, int otherLength, ComparatorHelper helper) { - return helper.compare(value, 0, value.length, other, otherOffset, otherLength); - } - - @Override - int compareTo(ByteBuffer bytes, int otherOffset, int otherLength, ComparatorHelper helper) { - return helper.compare(value, 0, value.length, bytes, otherOffset, otherLength); + int 
compareTo(ByteBuffer bytes, int otherOffset, int otherLength) { + return Binary.compareByteArrayToByteBuffer(value, 0, value.length, bytes, otherOffset, otherLength); } @Override @@ -539,39 +512,30 @@ boolean equals(ByteBuffer otherBytes, int otherOffset, int otherLength) { @Override public int compareTo(Binary other) { if (value.hasArray()) { - return -other.compareTo(value.array(), value.arrayOffset() + offset, length); - } else { - return -other.compareTo(value, offset, length); - } - } - - @Override - public int compareTo(Binary other, ComparatorHelper helper) { - if (value.hasArray()) { - return -other.compareTo(value.array(), value.arrayOffset() + offset, length, helper); + return other.compareTo(value.array(), value.arrayOffset() + offset, length); } else { - return -other.compareTo(value, offset, length, helper); + return other.compareTo(value, offset, length); } } @Override - int compareTo(byte[] other, int otherOffset, int otherLength, ComparatorHelper helper) { + int compareTo(byte[] other, int otherOffset, int otherLength) { if (value.hasArray()) { - return helper.compare(value.array(), value.arrayOffset() + offset, length, + return Binary.compareTwoByteArrays(value.array(), value.arrayOffset() + offset, length, other, otherOffset, otherLength); } { - return -helper.compare(other, otherOffset, otherLength, value, offset, length); + return Binary.compareByteBufferToByteArray(value, offset, length, other, otherOffset, otherLength); } } @Override - int compareTo(ByteBuffer bytes, int otherOffset, int otherLength, ComparatorHelper helper) { - return helper.compare(value, offset, length, bytes, otherOffset, otherLength); + int compareTo(ByteBuffer bytes, int otherOffset, int otherLength) { + return Binary.compareTwoByteBuffers(value, offset, length, bytes, otherOffset, otherLength); } @Override public ByteBuffer toByteBuffer() { - ByteBuffer ret = value.slice(); + ByteBuffer ret = value.duplicate(); ret.position(offset); ret.limit(offset + length); return 
ret; @@ -708,75 +672,63 @@ private static final boolean equals(byte[] array1, int offset1, int length1, byt return true; } - private static final ComparatorHelper DEFAULT_COMPARATOR_HELPER = new ComparatorHelper() { - @Override - public int compare(byte[] array1, int offset1, int length1, - ByteBuffer buf, int offset2, int length2) { - if (array1 == null && buf == null) return 0; - int min_length = (length1 < length2) ? length1 : length2; - for (int i = 0; i < min_length; i++) { - if (array1[i + offset1] < buf.get(i + offset2)) { - return -1; - } - if (array1[i + offset1] > buf.get(i + offset2)) { - return 1; - } + private static final int compareByteBufferToByteArray(ByteBuffer buf, int offset1, int length1, + byte[] array, int offset2, int length2) { + return -1 * Binary.compareByteArrayToByteBuffer(array, offset1, length1, buf, offset2, length2); + } + + private static final int compareByteArrayToByteBuffer(byte[] array1, int offset1, int length1, + ByteBuffer buf, int offset2, int length2) { + if (array1 == null && buf == null) return 0; + int min_length = (length1 < length2) ? length1 : length2; + for (int i = 0; i < min_length; i++) { + if (array1[i + offset1] < buf.get(i + offset2)) { + return 1; } - // check remainder - if (length1 == length2) { - return 0; - } else if (length1 < length2) { + if (array1[i + offset1] > buf.get(i + offset2)) { return -1; - } else { - return 1; } } + // check remainder + if (length1 == length2) { return 0; } + else if (length1 < length2) { return 1;} + else { return -1; } + } - @Override - public int compare(ByteBuffer buf1, int offset1, int length1, - ByteBuffer buf2, int offset2, int length2) { - if (buf1 == null && buf2 == null) return 0; - int min_length = (length1 < length2) ? 
length1 : length2; - for (int i = 0; i < min_length; i++) { - if (buf1.get(i + offset1) < buf2.get(i + offset2)) { - return -1; - } - if (buf1.get(i + offset1) > buf2.get(i + offset2)) { - return 1; - } + private static final int compareTwoByteBuffers(ByteBuffer buf1, int offset1, int length1, + ByteBuffer buf2, int offset2, int length2) { + if (buf1 == null && buf2 == null) return 0; + int min_length = (length1 < length2) ? length1 : length2; + for (int i = 0; i < min_length; i++) { + if (buf1.get(i + offset1) < buf2.get(i + offset2)) { + return 1; } - // check remainder - if (length1 == length2) { - return 0; - } else if (length1 < length2) { + if (buf1.get(i + offset1) > buf2.get(i + offset2)) { return -1; - } else { - return 1; } } + // check remainder + if (length1 == length2) { return 0; } + else if (length1 < length2) { return 1;} + else { return -1; } + } - @Override - public int compare(byte[] array1, int offset1, int length1, - byte[] array2, int offset2, int length2) { - if (array1 == null && array2 == null) return 0; - if (array1 == array2 && offset1 == offset2 && length1 == length2) return 0; - int min_length = (length1 < length2) ? length1 : length2; - for (int i = 0; i < min_length; i++) { - if (array1[i + offset1] < array2[i + offset2]) { - return -1; - } - if (array1[i + offset1] > array2[i + offset2]) { - return 1; - } + private static final int compareTwoByteArrays(byte[] array1, int offset1, int length1, + byte[] array2, int offset2, int length2) { + if (array1 == null && array2 == null) return 0; + if (array1 == array2 && offset1 == offset2 && length1 == length2) return 0; + int min_length = (length1 < length2) ? 
length1 : length2; + for (int i = 0; i < min_length; i++) { + if (array1[i + offset1] < array2[i + offset2]) { + return 1; } - // check remainder - if (length1 == length2) { - return 0; - } else if (length1 < length2) { + if (array1[i + offset1] > array2[i + offset2]) { return -1; - } else { - return 1; } } - }; + // check remainder + if (length1 == length2) { return 0; } + else if (length1 < length2) { return 1;} + else { return -1; } + } } diff --git a/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveComparator.java b/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveComparator.java index f307b599e1..977e614c51 100644 --- a/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveComparator.java +++ b/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveComparator.java @@ -20,6 +20,7 @@ import org.apache.parquet.io.api.Binary; +import java.nio.ByteBuffer; import java.util.Comparator; /** @@ -49,18 +50,6 @@ public int compare(double d1, double d2) { throw new UnsupportedOperationException(); } - private static final PrimitiveComparator> COMPARABLE_COMPARATOR = new PrimitiveComparator>() { - @Override - public int compare(Comparable o1, Comparable o2) { - return o1.compareTo(o2); - } - }; - - @SuppressWarnings("unchecked") - public static > PrimitiveComparator comparableComparator() { - return (PrimitiveComparator) COMPARABLE_COMPARATOR; - } - static final PrimitiveComparator BOOLEAN_COMPARATOR = new PrimitiveComparator() { @Override public int compare(Boolean o1, Boolean o2) { @@ -73,30 +62,50 @@ public int compare(boolean b1, boolean b2) { } }; - static final PrimitiveComparator SIGNED_INT32_COMPARATOR = new PrimitiveComparator() { + private static abstract class IntComparator extends PrimitiveComparator { @Override - public int compare(Integer o1, Integer o2) { + public final int compare(Integer o1, Integer o2) { return compare(o1.intValue(), o2.intValue()); } + } + static final PrimitiveComparator 
SIGNED_INT32_COMPARATOR = new IntComparator() { @Override public int compare(int i1, int i2) { return Integer.compare(i1, i2); } }; - static final PrimitiveComparator SIGNED_INT64_COMPARATOR = new PrimitiveComparator() { + static final PrimitiveComparator UNSIGNED_INT32_COMPARATOR = new IntComparator() { + @Override + public int compare(int i1, int i2) { + // Implemented based on com.google.common.primitives.UnsignedInts.compare(int, int) + return Integer.compare(i1 ^ Integer.MIN_VALUE, i2 ^ Integer.MIN_VALUE); + } + }; + + private static abstract class LongComparator extends PrimitiveComparator { @Override - public int compare(Long o1, Long o2) { + public final int compare(Long o1, Long o2) { return compare(o1.longValue(), o2.longValue()); } + } + static final PrimitiveComparator SIGNED_INT64_COMPARATOR = new LongComparator() { @Override public int compare(long l1, long l2) { return Long.compare(l1, l2); } }; + static final PrimitiveComparator UNSIGNED_INT64_COMPARATOR = new LongComparator() { + @Override + public int compare(long l1, long l2) { + // Implemented based on com.google.common.primitives.UnsignedLongs.compare(long, long) + return Long.compare(l1 ^ Long.MIN_VALUE, l2 ^ Long.MIN_VALUE); + } + }; + static final PrimitiveComparator FLOAT_COMPARATOR = new PrimitiveComparator() { @Override public int compare(Float o1, Float o2) { @@ -121,11 +130,73 @@ public int compare(double d1, double d2) { } }; - // TODO: this one is temporary as the self-comparison of Binary is not proper - static final PrimitiveComparator BINARY_COMPARATOR = new PrimitiveComparator() { + private static abstract class BinaryComparator extends PrimitiveComparator { + @Override + public final int compare(Binary o1, Binary o2) { + return compare(o1.toByteBuffer(), o2.toByteBuffer()); + } + + abstract int compare(ByteBuffer b1, ByteBuffer b2); + + final int toUnsigned(byte b) { + return b & 0xFF; + } + } + + static final PrimitiveComparator LEXICOGRAPHICAL_BINARY_COMPARATOR = new 
BinaryComparator() { + @Override + int compare(ByteBuffer b1, ByteBuffer b2) { + int l1 = b1.remaining(); + int l2 = b2.remaining(); + int p1 = b1.position(); + int p2 = b2.position(); + int minL = Math.min(l1, l2); + + for (int i = 0; i < minL; ++i) { + int result = unsignedCompare(b1.get(p1 + i), b2.get(p2 + i)); + if (result != 0) { + return result; + } + } + + return l1 - l2; + } + + private int unsignedCompare(byte b1, byte b2) { + return toUnsigned(b1) - toUnsigned(b2); + } + }; + + static final PrimitiveComparator SIGNED_BINARY_COMPARATOR = new BinaryComparator() { + private static final int NEGATIVE_PREFIX = 0xFF; + private static final int POSITIVE_PREFIX = 0; + @Override - public int compare(Binary o1, Binary o2) { - return o1.compareTo(o2); + int compare(ByteBuffer b1, ByteBuffer b2) { + int l1 = b1.remaining(); + int l2 = b2.remaining(); + int p1 = b1.position(); + int p2 = b2.position(); + + boolean isNegative1 = l1 > 0 ? b1.get(p1) < 0 : false; + boolean isNegative2 = l2 > 0 ? b2.get(p2) < 0 : false; + if (isNegative1 != isNegative2) { + return isNegative1 ? -1 : 1; + } + + int maxL = Math.max(l1, l2); + int iDiff1 = maxL - l1; + int iDiff2 = maxL - l2; + int prefix = isNegative1 ? NEGATIVE_PREFIX : POSITIVE_PREFIX; + for (int i = 0; i < maxL; ++i) { + int value1 = i < iDiff1 ? prefix : toUnsigned(b1.get(p1 + i - iDiff1)); + int value2 = i < iDiff2 ? 
prefix : toUnsigned(b2.get(p2 + i - iDiff2)); + int result = value1 - value2; + if (result != 0) { + return result; + } + } + return 0; } }; } diff --git a/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveType.java b/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveType.java index 6439db1868..b00d0effee 100644 --- a/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveType.java +++ b/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveType.java @@ -90,8 +90,7 @@ public T convert(PrimitiveTypeNameConverter conve @Override PrimitiveComparator comparator(OriginalType logicalType) { if (logicalType == OriginalType.UINT_64) { - // TODO: return unsigned comparator - return PrimitiveComparator.SIGNED_INT64_COMPARATOR; + return PrimitiveComparator.UNSIGNED_INT64_COMPARATOR; } return PrimitiveComparator.SIGNED_INT64_COMPARATOR; } @@ -126,8 +125,7 @@ PrimitiveComparator comparator(OriginalType logicalType) { case UINT_8: case UINT_16: case UINT_32: - // TODO: return unsigned comparator - return PrimitiveComparator.SIGNED_INT32_COMPARATOR; + return PrimitiveComparator.UNSIGNED_INT32_COMPARATOR; } } return PrimitiveComparator.SIGNED_INT32_COMPARATOR; @@ -186,19 +184,10 @@ public T convert(PrimitiveTypeNameConverter conve @Override PrimitiveComparator comparator(OriginalType logicalType) { - if (logicalType != null) { - switch (logicalType) { - case JSON: - case BSON: - // TODO: Based on specs we do not have ordering for these while we specified lexicographical in ColumnOrder - return PrimitiveComparator.BINARY_COMPARATOR; - case DECIMAL: - // TODO: return signed comparator - return PrimitiveComparator.BINARY_COMPARATOR; - } + if (logicalType == OriginalType.DECIMAL) { + return PrimitiveComparator.SIGNED_BINARY_COMPARATOR; } - // TODO: return lexicographical comparator - return PrimitiveComparator.BINARY_COMPARATOR; + return PrimitiveComparator.LEXICOGRAPHICAL_BINARY_COMPARATOR; } }, FLOAT("getFloat", Float.TYPE) { @@ -280,8 
+269,7 @@ public T convert(PrimitiveTypeNameConverter conve @Override PrimitiveComparator comparator(OriginalType logicalType) { - // TODO: return signed comparator - return PrimitiveComparator.BINARY_COMPARATOR; + return PrimitiveComparator.SIGNED_BINARY_COMPARATOR; } }, FIXED_LEN_BYTE_ARRAY("getBinary", Binary.class) { @@ -310,11 +298,9 @@ public T convert(PrimitiveTypeNameConverter conve @Override PrimitiveComparator comparator(OriginalType logicalType) { if (logicalType == OriginalType.DECIMAL) { - // TODO: return signed comparator - return PrimitiveComparator.BINARY_COMPARATOR; + return PrimitiveComparator.SIGNED_BINARY_COMPARATOR; } - // TODO: return lexicographical comparator - return PrimitiveComparator.BINARY_COMPARATOR; + return PrimitiveComparator.LEXICOGRAPHICAL_BINARY_COMPARATOR; } }; diff --git a/parquet-column/src/test/java/org/apache/parquet/schema/TestPrimitiveComparator.java b/parquet-column/src/test/java/org/apache/parquet/schema/TestPrimitiveComparator.java new file mode 100644 index 0000000000..5cb626b2cd --- /dev/null +++ b/parquet-column/src/test/java/org/apache/parquet/schema/TestPrimitiveComparator.java @@ -0,0 +1,286 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.parquet.schema; + +import org.apache.parquet.io.api.Binary; +import org.junit.Test; + +import java.math.BigInteger; +import java.nio.ByteBuffer; + +import static org.apache.parquet.schema.PrimitiveComparator.BOOLEAN_COMPARATOR; +import static org.apache.parquet.schema.PrimitiveComparator.DOUBLE_COMPARATOR; +import static org.apache.parquet.schema.PrimitiveComparator.FLOAT_COMPARATOR; +import static org.apache.parquet.schema.PrimitiveComparator.LEXICOGRAPHICAL_BINARY_COMPARATOR; +import static org.apache.parquet.schema.PrimitiveComparator.SIGNED_BINARY_COMPARATOR; +import static org.apache.parquet.schema.PrimitiveComparator.SIGNED_INT32_COMPARATOR; +import static org.apache.parquet.schema.PrimitiveComparator.SIGNED_INT64_COMPARATOR; +import static org.apache.parquet.schema.PrimitiveComparator.UNSIGNED_INT32_COMPARATOR; +import static org.apache.parquet.schema.PrimitiveComparator.UNSIGNED_INT64_COMPARATOR; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.fail; + +public class TestPrimitiveComparator { + + @Test + public void testBooleanComparator() { + boolean[] values = {false, true}; + + for (int i = 0; i < values.length; ++i) { + for (int j = 0; j < values.length; ++j) { + boolean vi = values[i]; + boolean vj = values[j]; + int exp = i - j; + assertSignumEquals(vi, vj, exp, BOOLEAN_COMPARATOR.compare(vi, vj)); + assertSignumEquals(vi, vj, exp, BOOLEAN_COMPARATOR.compare(Boolean.valueOf(vi), Boolean.valueOf(vj))); + } + } + + checkThrowingUnsupportedException(BOOLEAN_COMPARATOR, Boolean.TYPE); + } + + @Test + public void testSignedInt32Comparator() { + testInt32Comparator(SIGNED_INT32_COMPARATOR, + Integer.MIN_VALUE, + -12345, + -1, + 0, + 1, + 12345, + Integer.MAX_VALUE); + } + + @Test + public void testUnsignedInt32Comparator() { + testInt32Comparator(UNSIGNED_INT32_COMPARATOR, + 0, + 1, + 12345, + Integer.MAX_VALUE, + Integer.MIN_VALUE, + -12345, + -1); + } + + private void 
testInt32Comparator(PrimitiveComparator comparator, int... values) { + for (int i = 0; i < values.length; ++i) { + for (int j = 0; j < values.length; ++j) { + int vi = values[i]; + int vj = values[j]; + int exp = i - j; + assertSignumEquals(vi, vj, exp, comparator.compare(vi, vj)); + assertSignumEquals(vi, vj, exp, comparator.compare(Integer.valueOf(vi), Integer.valueOf(vj))); + } + } + + checkThrowingUnsupportedException(comparator, Integer.TYPE); + } + + @Test + public void testSignedInt64Comparator() { + testInt64Comparator(SIGNED_INT64_COMPARATOR, + Long.MIN_VALUE, + -12345678901L, + -1, + 0, + 1, + 12345678901L, + Long.MAX_VALUE); + } + + @Test + public void testUnsignedInt64Comparator() { + testInt64Comparator(UNSIGNED_INT64_COMPARATOR, + 0, + 1, + 12345678901L, + Long.MAX_VALUE, + Long.MIN_VALUE, + -12345678901L, + -1); + } + + private void testInt64Comparator(PrimitiveComparator comparator, long... values) { + for (int i = 0; i < values.length; ++i) { + for (int j = 0; j < values.length; ++j) { + long vi = values[i]; + long vj = values[j]; + int exp = i - j; + assertSignumEquals(vi, vj, exp, comparator.compare(vi, vj)); + assertSignumEquals(vi, vj, exp, comparator.compare(Long.valueOf(vi), Long.valueOf(vj))); + } + } + + checkThrowingUnsupportedException(comparator, Long.TYPE); + } + + @Test + public void testFloatComparator() { + float[] values = { + Float.NEGATIVE_INFINITY, + -Float.MAX_VALUE, + -1234.5678F, + -Float.MIN_VALUE, + 0, + Float.MIN_VALUE, + 1234.5678F, + Float.MAX_VALUE, + Float.POSITIVE_INFINITY}; + + for (int i = 0; i < values.length; ++i) { + for (int j = 0; j < values.length; ++j) { + float vi = values[i]; + float vj = values[j]; + int exp = i - j; + assertSignumEquals(vi, vj, exp, FLOAT_COMPARATOR.compare(vi, vj)); + assertSignumEquals(vi, vj, exp, FLOAT_COMPARATOR.compare(Float.valueOf(vi), Float.valueOf(vj))); + } + } + + checkThrowingUnsupportedException(FLOAT_COMPARATOR, Float.TYPE); + } + + @Test + public void testDoubleComparator() 
{ + double[] values = { + Double.NEGATIVE_INFINITY, + -Double.MAX_VALUE, + -123456.7890123456789, + -Double.MIN_VALUE, + 0, + Double.MIN_VALUE, + 123456.7890123456789, + Double.MAX_VALUE, + Double.POSITIVE_INFINITY}; + + for (int i = 0; i < values.length; ++i) { + for (int j = 0; j < values.length; ++j) { + double vi = values[i]; + double vj = values[j]; + int exp = i - j; + assertSignumEquals(vi, vj, exp, DOUBLE_COMPARATOR.compare(vi, vj)); + assertSignumEquals(vi, vj, exp, DOUBLE_COMPARATOR.compare(Double.valueOf(vi), Double.valueOf(vj))); + } + } + + checkThrowingUnsupportedException(DOUBLE_COMPARATOR, Double.TYPE); + } + + @Test + public void testLexicographicalBinaryComparator() { + testObjectComparator(LEXICOGRAPHICAL_BINARY_COMPARATOR, + Binary.fromConstantByteArray(new byte[0]), + Binary.fromConstantByteArray(new byte[]{127, 127, 0, 127}, 2, 1), + Binary.fromCharSequence("aaa"), + Binary.fromString("aaaa"), + Binary.fromReusedByteArray("aaab".getBytes()), + Binary.fromReusedByteArray("azzza".getBytes(), 1, 3), + Binary.fromReusedByteBuffer(ByteBuffer.wrap("zzzzzz".getBytes())), + Binary.fromReusedByteBuffer(ByteBuffer.wrap("aazzzzzzaa".getBytes(), 2, 7)), + Binary.fromConstantByteBuffer(ByteBuffer.wrap(new byte[]{-128, -128, -128})), + Binary.fromConstantByteBuffer(ByteBuffer.wrap(new byte[]{-128, -128, -1}, 1, 2)) + ); + } + + @Test + public void testSignedBinaryComparator() { + testObjectComparator(SIGNED_BINARY_COMPARATOR, + Binary.fromConstantByteArray(new BigInteger("-9999999999999999999999999999999999999999").toByteArray()), + Binary.fromReusedByteArray(new BigInteger("-9999999999999999999999999999999999999998").toByteArray()), + Binary.fromConstantByteArray(BigInteger.valueOf(Long.MIN_VALUE).subtract(BigInteger.ONE).toByteArray()), + Binary.fromConstantByteArray(BigInteger.valueOf(Long.MIN_VALUE).toByteArray()), + Binary.fromConstantByteArray(BigInteger.valueOf(Long.MIN_VALUE).add(BigInteger.ONE).toByteArray()), + Binary.fromReusedByteArray(new 
BigInteger("-1").toByteArray()), + Binary.fromConstantByteBuffer(ByteBuffer.wrap(new BigInteger("0").toByteArray())), + Binary.fromReusedByteBuffer(ByteBuffer.wrap(new BigInteger("1").toByteArray())), + Binary.fromConstantByteBuffer( + ByteBuffer.wrap(BigInteger.valueOf(Long.MAX_VALUE).subtract(BigInteger.ONE).toByteArray())), + Binary.fromConstantByteBuffer(ByteBuffer.wrap(BigInteger.valueOf(Long.MAX_VALUE).toByteArray())), + Binary + .fromConstantByteBuffer(ByteBuffer.wrap(BigInteger.valueOf(Long.MAX_VALUE).add(BigInteger.ONE).toByteArray())), + Binary.fromConstantByteBuffer( + ByteBuffer.wrap(new BigInteger("999999999999999999999999999999999999999").toByteArray())), + Binary.fromReusedByteBuffer( + ByteBuffer.wrap(new BigInteger("9999999999999999999999999999999999999998").toByteArray())), + Binary.fromConstantByteBuffer( + ByteBuffer.wrap(new BigInteger("9999999999999999999999999999999999999999").toByteArray())) + ); + } + + private void testObjectComparator(PrimitiveComparator comparator, T... values) { + for (int i = 0; i < values.length; ++i) { + for (int j = 0; j < values.length; ++j) { + T vi = values[i]; + T vj = values[j]; + int exp = i - j; + assertSignumEquals(vi, vj, exp, comparator.compare(vi, vj)); + } + } + + checkThrowingUnsupportedException(comparator, null); + } + + private void assertSignumEquals(T v1, T v2, int expected, int actual) { + String sign = expected < 0 ? " < " : expected > 0 ? " > " : " = "; + assertEquals("expected: " + v1 + sign + v2, signum(expected), signum(actual)); + } + + private int signum(int i) { + return i < 0 ? -1 : i > 0 ? 
1 : 0; + } + + private void checkThrowingUnsupportedException(PrimitiveComparator comparator, Class exclude) { + if (Integer.TYPE != exclude) { + try { + comparator.compare(0, 0); + fail("An UnsupportedOperationException should have been thrown"); + } catch (UnsupportedOperationException e) { + } + } + if (Long.TYPE != exclude) { + try { + comparator.compare(0L, 0L); + fail("An UnsupportedOperationException should have been thrown"); + } catch (UnsupportedOperationException e) { + } + } + if (Float.TYPE != exclude) { + try { + comparator.compare(0F, 0F); + fail("An UnsupportedOperationException should have been thrown"); + } catch (UnsupportedOperationException e) { + } + } + if (Double.TYPE != exclude) { + try { + comparator.compare(0D, 0D); + fail("An UnsupportedOperationException should have been thrown"); + } catch (UnsupportedOperationException e) { + } + } + if (Boolean.TYPE != exclude) { + try { + comparator.compare(false, false); + fail("An UnsupportedOperationException should have been thrown"); + } catch (UnsupportedOperationException e) { + } + } + } +} From 688ef2efe8a4d066aed43cc818fbf28069a40a30 Mon Sep 17 00:00:00 2001 From: Gabor Szadovszky Date: Wed, 22 Nov 2017 11:21:30 +0100 Subject: [PATCH 06/17] PARQUET-1025: Updates according to zi's and rdblue's comments --- .../column/statistics/BinaryStatistics.java | 19 +++++++++++++++++-- .../column/statistics/BooleanStatistics.java | 19 +++++++++++++++++-- .../column/statistics/DoubleStatistics.java | 19 +++++++++++++++++-- .../column/statistics/FloatStatistics.java | 18 +++++++++++++++++- .../column/statistics/IntStatistics.java | 19 +++++++++++++++++-- .../column/statistics/LongStatistics.java | 19 +++++++++++++++++-- .../parquet/column/statistics/Statistics.java | 17 +++++++---------- .../parquet/filter2/predicate/Statistics.java | 9 +++++++++ .../parquet/schema/PrimitiveComparator.java | 10 +++++----- .../hadoop/ColumnChunkPageWriteStore.java | 4 ++-- .../parquet/hadoop/ParquetFileWriter.java | 2 +- 
.../hadoop/metadata/ColumnChunkMetaData.java | 4 ++-- 12 files changed, 128 insertions(+), 31 deletions(-) diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/BinaryStatistics.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/BinaryStatistics.java index 46adf41c3b..d03c42ee26 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/BinaryStatistics.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/BinaryStatistics.java @@ -25,18 +25,28 @@ public class BinaryStatistics extends Statistics { + // A fake type object to be used to generate the proper comparator + private static final Type DEFAULT_TYPE = Types.optional(PrimitiveType.PrimitiveTypeName.BINARY).named(""); + private Binary max; private Binary min; public BinaryStatistics() { - // Creating a fake primitive type to have the proper comparator - this(Types.optional(PrimitiveType.PrimitiveTypeName.BINARY).named("")); + this(DEFAULT_TYPE); } BinaryStatistics(Type type) { super(type.comparator()); } + private BinaryStatistics(BinaryStatistics other) { + super(other.comparator()); + if (other.hasNonNullValue()) { + initializeStats(other.min, other.max); + } + setNumNulls(other.getNumNulls()); + } + @Override public void updateStats(Binary value) { if (!this.hasNonNullValue()) { @@ -144,4 +154,9 @@ public void setMinMax(Binary min, Binary max) { this.min = min; this.markAsNotEmpty(); } + + @Override + public BinaryStatistics copy() { + return new BinaryStatistics(this); + } } diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/BooleanStatistics.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/BooleanStatistics.java index c62606b305..7e3670073e 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/BooleanStatistics.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/BooleanStatistics.java @@ -25,18 +25,28 @@ public class 
BooleanStatistics extends Statistics { + // A fake type object to be used to generate the proper comparator + private static final Type DEFAULT_TYPE = Types.optional(PrimitiveType.PrimitiveTypeName.BOOLEAN).named(""); + private boolean max; private boolean min; public BooleanStatistics() { - // Creating a fake primitive type to have the proper comparator - this(Types.optional(PrimitiveType.PrimitiveTypeName.BOOLEAN).named("")); + this(DEFAULT_TYPE); } BooleanStatistics(Type type) { super(type.comparator()); } + private BooleanStatistics(BooleanStatistics other) { + super(other.comparator()); + if (other.hasNonNullValue()) { + initializeStats(other.min, other.max); + } + setNumNulls(other.getNumNulls()); + } + @Override public void updateStats(boolean value) { if (!this.hasNonNullValue()) { @@ -120,4 +130,9 @@ public void setMinMax(boolean min, boolean max) { this.min = min; this.markAsNotEmpty(); } + + @Override + public BooleanStatistics copy() { + return new BooleanStatistics(this); + } } diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/DoubleStatistics.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/DoubleStatistics.java index 8e9f4540f0..54f857d8c9 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/DoubleStatistics.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/DoubleStatistics.java @@ -25,18 +25,28 @@ public class DoubleStatistics extends Statistics { + // A fake type object to be used to generate the proper comparator + private static final Type DEFAULT_TYPE = Types.optional(PrimitiveType.PrimitiveTypeName.DOUBLE).named(""); + private double max; private double min; public DoubleStatistics() { - // Creating a fake primitive type to have the proper comparator - this(Types.optional(PrimitiveType.PrimitiveTypeName.DOUBLE).named("")); + this(DEFAULT_TYPE); } DoubleStatistics(Type type) { super(type.comparator()); } + private 
DoubleStatistics(DoubleStatistics other) { + super(other.comparator()); + if (other.hasNonNullValue()) { + initializeStats(other.min, other.max); + } + setNumNulls(other.getNumNulls()); + } + @Override public void updateStats(double value) { if (!this.hasNonNullValue()) { @@ -125,4 +135,9 @@ public void setMinMax(double min, double max) { this.min = min; this.markAsNotEmpty(); } + + @Override + public DoubleStatistics copy() { + return new DoubleStatistics(this); + } } diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/FloatStatistics.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/FloatStatistics.java index 16ba380dce..80458d3291 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/FloatStatistics.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/FloatStatistics.java @@ -25,18 +25,29 @@ public class FloatStatistics extends Statistics { + // A fake type object to be used to generate the proper comparator + private static final Type DEFAULT_TYPE = Types.optional(PrimitiveType.PrimitiveTypeName.FLOAT).named(""); + private float max; private float min; public FloatStatistics() { // Creating a fake primitive type to have the proper comparator - this(Types.optional(PrimitiveType.PrimitiveTypeName.FLOAT).named("")); + this(DEFAULT_TYPE); } FloatStatistics(Type type) { super(type.comparator()); } + private FloatStatistics(FloatStatistics other) { + super(other.comparator()); + if (other.hasNonNullValue()) { + initializeStats(other.min, other.max); + } + setNumNulls(other.getNumNulls()); + } + @Override public void updateStats(float value) { if (!this.hasNonNullValue()) { @@ -125,4 +136,9 @@ public void setMinMax(float min, float max) { this.min = min; this.markAsNotEmpty(); } + + @Override + public FloatStatistics copy() { + return new FloatStatistics(this); + } } diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/IntStatistics.java 
b/parquet-column/src/main/java/org/apache/parquet/column/statistics/IntStatistics.java index 9c55a6c674..929c35330a 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/IntStatistics.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/IntStatistics.java @@ -25,18 +25,28 @@ public class IntStatistics extends Statistics { + // A fake type object to be used to generate the proper comparator + private static final Type DEFAULT_TYPE = Types.optional(PrimitiveType.PrimitiveTypeName.INT32).named(""); + private int max; private int min; public IntStatistics() { - // Creating a fake primitive type to have the proper comparator - this(Types.optional(PrimitiveType.PrimitiveTypeName.INT32).named("")); + this(DEFAULT_TYPE); } IntStatistics(Type type) { super(type.comparator()); } + private IntStatistics(IntStatistics other) { + super(other.comparator()); + if (other.hasNonNullValue()) { + initializeStats(other.min, other.max); + } + setNumNulls(other.getNumNulls()); + } + @Override public void updateStats(int value) { if (!this.hasNonNullValue()) { @@ -126,4 +136,9 @@ public void setMinMax(int min, int max) { this.min = min; this.markAsNotEmpty(); } + + @Override + public IntStatistics copy() { + return new IntStatistics(this); + } } diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/LongStatistics.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/LongStatistics.java index 7d5d94a3f6..e823f26289 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/LongStatistics.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/LongStatistics.java @@ -25,18 +25,28 @@ public class LongStatistics extends Statistics { + // A fake type object to be used to generate the proper comparator + private static final Type DEFAULT_TYPE = Types.optional(PrimitiveType.PrimitiveTypeName.INT64).named(""); + private long max; private long min; public 
LongStatistics() { - // Creating a fake primitive type to have the proper comparator - this(Types.optional(PrimitiveType.PrimitiveTypeName.INT64).named("")); + this(DEFAULT_TYPE); } LongStatistics(Type type) { super(type.comparator()); } + private LongStatistics(LongStatistics other) { + super(other.comparator()); + if (other.hasNonNullValue()) { + initializeStats(other.min, other.max); + } + setNumNulls(other.getNumNulls()); + } + @Override public void updateStats(long value) { if (!this.hasNonNullValue()) { @@ -126,4 +136,9 @@ public void setMinMax(long min, long max) { this.min = min; this.markAsNotEmpty(); } + + @Override + public LongStatistics copy() { + return new LongStatistics(this); + } } diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java index acab8b75d8..f1d9d3bac8 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java @@ -35,7 +35,7 @@ * * @author Katya Gonina */ -public abstract class Statistics> implements Cloneable { +public abstract class Statistics> { private final PrimitiveComparator comparator; private boolean hasNonNullValue; @@ -84,7 +84,7 @@ public static Statistics getStatsBasedOnType(PrimitiveTypeName type) { * @param type type of the column * @return instance of a typed statistics class */ - public static Statistics createLegacyStats(PrimitiveTypeName type) { + public static Statistics createLegacyStats(PrimitiveTypeName type) { return getStatsBasedOnType(type); } @@ -370,13 +370,10 @@ protected void markAsNotEmpty() { hasNonNullValue = true; } - @Override - public Statistics clone() { - try { - return (Statistics) super.clone(); - } catch(CloneNotSupportedException e) { - throw new ShouldNeverHappenException(e); - } - } + /** + * Returns a new independent statistics instance of this 
class. All the values + * are copied. + */ + public abstract Statistics copy(); } diff --git a/parquet-column/src/main/java/org/apache/parquet/filter2/predicate/Statistics.java b/parquet-column/src/main/java/org/apache/parquet/filter2/predicate/Statistics.java index 92358a7702..3325195810 100644 --- a/parquet-column/src/main/java/org/apache/parquet/filter2/predicate/Statistics.java +++ b/parquet-column/src/main/java/org/apache/parquet/filter2/predicate/Statistics.java @@ -30,6 +30,15 @@ public class Statistics { private final T max; private final Comparator comparator; + // Intended for use only within Parquet itself. + @Deprecated + public Statistics(T min, T max) { + this.min = checkNotNull(min, "min"); + this.max = checkNotNull(max, "max"); + this.comparator = null; + } + + // Intended for use only within Parquet itself. public Statistics(T min, T max, Comparator comparator) { this.min = checkNotNull(min, "min"); this.max = checkNotNull(max, "max"); diff --git a/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveComparator.java b/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveComparator.java index 977e614c51..95fe562393 100644 --- a/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveComparator.java +++ b/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveComparator.java @@ -31,23 +31,23 @@ public abstract class PrimitiveComparator implements Comparator { public int compare(boolean b1, boolean b2) { - throw new UnsupportedOperationException(); + throw new UnsupportedOperationException("compare(boolean, boolean) was called on a non-boolean comparator"); } public int compare(int i1, int i2) { - throw new UnsupportedOperationException(); + throw new UnsupportedOperationException("compare(int, int) was called on a non-int comparator"); } public int compare(long l1, long l2) { - throw new UnsupportedOperationException(); + throw new UnsupportedOperationException("compare(long, long) was called on a non-long 
comparator"); } public int compare(float f1, float f2) { - throw new UnsupportedOperationException(); + throw new UnsupportedOperationException("compare(float, float) was called on a non-float comparator"); } public int compare(double d1, double d2) { - throw new UnsupportedOperationException(); + throw new UnsupportedOperationException("compare(double, double) was called on a non-double comparator"); } static final PrimitiveComparator BOOLEAN_COMPARATOR = new PrimitiveComparator() { diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ColumnChunkPageWriteStore.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ColumnChunkPageWriteStore.java index 4d887dfa9b..8eeab5c55f 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ColumnChunkPageWriteStore.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ColumnChunkPageWriteStore.java @@ -116,7 +116,7 @@ public void writePage(BytesInput bytes, // Cloning the statistics if it is not initialized yet so we have the correct typed one if (totalStatistics == null) { - totalStatistics = statistics.clone(); + totalStatistics = statistics.copy(); } else { totalStatistics.mergeStatistics(statistics); } @@ -161,7 +161,7 @@ public void writePageV2( // Cloning the statistics if it is not initialized yet so we have the correct typed one if (totalStatistics == null) { - totalStatistics = statistics.clone(); + totalStatistics = statistics.copy(); } else { totalStatistics.mergeStatistics(statistics); } diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileWriter.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileWriter.java index be0b66bdd2..dcfb88b694 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileWriter.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileWriter.java @@ -428,7 +428,7 @@ public void writeDataPage( // Cloning the statistics if it is not initialized yet so we have 
the correct typed one if (currentStatistics == null) { - currentStatistics = statistics.clone(); + currentStatistics = statistics.copy(); } else { currentStatistics.mergeStatistics(statistics); } diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/metadata/ColumnChunkMetaData.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/metadata/ColumnChunkMetaData.java index 178a49a5fd..4b428e50c0 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/metadata/ColumnChunkMetaData.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/metadata/ColumnChunkMetaData.java @@ -26,7 +26,7 @@ import org.apache.parquet.column.statistics.Statistics; import org.apache.parquet.schema.PrimitiveType; import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName; -import org.apache.parquet.schema.Type; +import org.apache.parquet.schema.Types; /** * Column meta data for a block stored in the file footer and passed in the InputSplit @@ -80,7 +80,7 @@ public static ColumnChunkMetaData get( long valueCount, long totalSize, long totalUncompressedSize) { - return get(path, new PrimitiveType(Type.Repetition.OPTIONAL, type, ""), codec, encodingStats, encodings, statistics, + return get(path, Types.optional(type).named(""), codec, encodingStats, encodings, statistics, firstDataPage, dictionaryPageOffset, valueCount, totalSize, totalUncompressedSize); } From 318e585d9dd6369994f09d4e7717191554e2b707 Mon Sep 17 00:00:00 2001 From: Gabor Szadovszky Date: Thu, 23 Nov 2017 17:34:59 +0100 Subject: [PATCH 07/17] PARQUET-1025: Finalize reading/writing new stats; modify/implement unit tests accordingly --- .../parquet/column/impl/ColumnWriterV1.java | 2 +- .../parquet/column/statistics/Statistics.java | 23 +- .../parquet/schema/PrimitiveComparator.java | 23 ++ .../converter/ParquetMetadataConverter.java | 19 +- .../TestParquetMetadataConverter.java | 235 ++++++++++++++++-- .../parquet/statistics/RandomValues.java | 64 ++++- 
.../parquet/statistics/TestStatistics.java | 215 ++++++++++------ 7 files changed, 463 insertions(+), 118 deletions(-) diff --git a/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterV1.java b/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterV1.java index 8e975aaaa1..80bd1ed5ee 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterV1.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterV1.java @@ -80,7 +80,7 @@ private void log(Object value, int r, int d) { } private void resetStatistics() { - this.statistics = Statistics.createLegacyStats(this.path.getType()); + this.statistics = Statistics.createStats(this.path.getFullType()); } /** diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java index f1d9d3bac8..0a0c93ade5 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java @@ -18,7 +18,6 @@ */ package org.apache.parquet.column.statistics; -import org.apache.parquet.ShouldNeverHappenException; import org.apache.parquet.column.UnknownColumnTypeException; import org.apache.parquet.io.api.Binary; import org.apache.parquet.schema.PrimitiveComparator; @@ -26,7 +25,6 @@ import org.apache.parquet.schema.Type; import java.util.Arrays; -import java.util.Comparator; import java.util.Objects; @@ -49,9 +47,12 @@ public abstract class Statistics> { /** * Returns the typed statistics object based on the passed type parameter - * @param type PrimitiveTypeName type of the column + * + * @param type + * PrimitiveTypeName type of the column * @return instance of a typed statistics class - * @deprecated Use {@link #createStats(Type)} or {@link #createLegacyStats(PrimitiveTypeName)} instead + * @deprecated Use {@link #createStats(Type)} 
or + * {@link #createLegacyStats(PrimitiveTypeName)} instead */ @Deprecated public static Statistics getStatsBasedOnType(PrimitiveTypeName type) { @@ -78,10 +79,11 @@ public static Statistics getStatsBasedOnType(PrimitiveTypeName type) { } /** - * Creates an empty {@code Statistics} instance for the specified type to be used for reading/writing the legacy - * min/max statistics. + * Creates an empty {@code Statistics} instance for the specified type to be + * used for reading/writing the legacy min/max statistics. * - * @param type type of the column + * @param type + * type of the column * @return instance of a typed statistics class */ public static Statistics createLegacyStats(PrimitiveTypeName type) { @@ -89,10 +91,11 @@ public static Statistics createLegacyStats(PrimitiveTypeName type) { } /** - * Creates an empty {@code Statistics} instance for the specified type to be used for reading/writing the new min/max - * statistics used in the V2 format. + * Creates an empty {@code Statistics} instance for the specified type to be + * used for reading/writing the new min/max statistics used in the V2 format. * - * @param type type of the column + * @param type + * type of the column * @return instance of a typed statistics class */ public static Statistics createStats(Type type) { diff --git a/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveComparator.java b/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveComparator.java index 95fe562393..80e60b3fae 100644 --- a/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveComparator.java +++ b/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveComparator.java @@ -50,6 +50,14 @@ public int compare(double d1, double d2) { throw new UnsupportedOperationException("compare(double, double) was called on a non-double comparator"); } + /** + * Returns whether this comparator is compliant with the sorting definition of + * the former V1 min-max statistics. 
+ */ + public boolean isFormerStatsCompliant() { + return true; + } + static final PrimitiveComparator BOOLEAN_COMPARATOR = new PrimitiveComparator() { @Override public int compare(Boolean o1, Boolean o2) { @@ -82,6 +90,11 @@ public int compare(int i1, int i2) { // Implemented based on com.google.common.primitives.UnsignedInts.compare(int, int) return Integer.compare(i1 ^ Integer.MIN_VALUE, i2 ^ Integer.MIN_VALUE); } + + @Override + public boolean isFormerStatsCompliant() { + return false; + } }; private static abstract class LongComparator extends PrimitiveComparator { @@ -104,6 +117,11 @@ public int compare(long l1, long l2) { // Implemented based on com.google.common.primitives.UnsignedLongs.compare(long, long) return Long.compare(l1 ^ Long.MIN_VALUE, l2 ^ Long.MIN_VALUE); } + + @Override + public boolean isFormerStatsCompliant() { + return false; + } }; static final PrimitiveComparator FLOAT_COMPARATOR = new PrimitiveComparator() { @@ -198,5 +216,10 @@ int compare(ByteBuffer b1, ByteBuffer b2) { } return 0; } + + @Override + public boolean isFormerStatsCompliant() { + return false; + } }; } diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java index 3ea91e066b..41a1f167c6 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java @@ -335,12 +335,19 @@ public static Statistics toParquetStatistics( if (!statistics.isEmpty() && statistics.isSmallerThan(MAX_STATS_SIZE)) { stats.setNull_count(statistics.getNumNulls()); if (statistics.hasNonNullValue()) { - byte[] maxBytes = statistics.getMaxBytes(); - stats.setMax(statistics.getMaxBytes()); - stats.setMax_value(maxBytes); - byte[] minBytes = statistics.getMinBytes(); - stats.setMin(statistics.getMinBytes()); - 
stats.setMin_value(minBytes); + byte[] min = statistics.getMinBytes(); + byte[] max = statistics.getMaxBytes(); + + // Fill the former min-max statistics only if comparison logic is the one + // specified in V1 format (e.g. signed comparison for numbers, unsigned + // lexicographical for binary) or the min and max are equal + if (statistics.comparator().isFormerStatsCompliant() || Arrays.equals(min, max)) { + stats.setMin(min); + stats.setMax(max); + } + + stats.setMin_value(min); + stats.setMax_value(max); } } return stats; diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java b/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java index 2e708f826f..48d3a52125 100644 --- a/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java +++ b/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java @@ -22,6 +22,7 @@ import static org.apache.parquet.format.converter.ParquetMetadataConverter.filterFileMetaDataByStart; import static org.apache.parquet.schema.MessageTypeParser.parseMessageType; import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertSame; import static org.junit.Assert.fail; import static org.apache.parquet.format.CompressionCodec.UNCOMPRESSED; @@ -34,6 +35,8 @@ import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.IOException; +import java.math.BigInteger; +import java.nio.ByteBuffer; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; @@ -371,7 +374,16 @@ public void testEncodingsCache() { } @Test - public void testBinaryStats() { + public void testBinaryStatsV1() { + testBinaryStats(StatsHelper.V1); + } + + @Test + public void testBinaryStatsV2() { + testBinaryStats(StatsHelper.V2); + } + + private void testBinaryStats(StatsHelper 
helper) { // make fake stats and verify the size check BinaryStatistics stats = new BinaryStatistics(); stats.incrementNumNulls(3004); @@ -386,7 +398,7 @@ public void testBinaryStats() { stats.isSmallerThan(totalLen + 1)); org.apache.parquet.format.Statistics formatStats = - ParquetMetadataConverter.toParquetStatistics(stats); + helper.toParquetStatistics(stats); Assert.assertArrayEquals("Min should match", min, formatStats.getMin()); Assert.assertArrayEquals("Max should match", max, formatStats.getMax()); @@ -396,7 +408,7 @@ public void testBinaryStats() { // convert to empty stats because the values are too large stats.setMinMaxFromBytes(max, max); - formatStats = ParquetMetadataConverter.toParquetStatistics(stats); + formatStats = helper.toParquetStatistics(stats); Assert.assertFalse("Min should not be set", formatStats.isSetMin()); Assert.assertFalse("Max should not be set", formatStats.isSetMax()); @@ -411,7 +423,16 @@ Version.FULL_VERSION, formatStats, new PrimitiveType(Repetition.OPTIONAL, Primit } @Test - public void testIntegerStats() { + public void testIntegerStatsV1() { + testIntegerStats(StatsHelper.V1); + } + + @Test + public void testIntegerStatsV2() { + testIntegerStats(StatsHelper.V2); + } + + private void testIntegerStats(StatsHelper helper) { // make fake stats and verify the size check IntStatistics stats = new IntStatistics(); stats.incrementNumNulls(3004); @@ -421,7 +442,7 @@ public void testIntegerStats() { stats.updateStats(max); org.apache.parquet.format.Statistics formatStats = - ParquetMetadataConverter.toParquetStatistics(stats); + helper.toParquetStatistics(stats); Assert.assertEquals("Min should match", min, BytesUtils.bytesToInt(formatStats.getMin())); @@ -432,7 +453,16 @@ public void testIntegerStats() { } @Test - public void testLongStats() { + public void testLongStatsV1() { + testLongStats(StatsHelper.V1); + } + + @Test + public void testLongStatsV2() { + testLongStats(StatsHelper.V2); + } + + private void 
testLongStats(StatsHelper helper) { // make fake stats and verify the size check LongStatistics stats = new LongStatistics(); stats.incrementNumNulls(3004); @@ -442,7 +472,7 @@ public void testLongStats() { stats.updateStats(max); org.apache.parquet.format.Statistics formatStats = - ParquetMetadataConverter.toParquetStatistics(stats); + helper.toParquetStatistics(stats); Assert.assertEquals("Min should match", min, BytesUtils.bytesToLong(formatStats.getMin())); @@ -453,7 +483,16 @@ public void testLongStats() { } @Test - public void testFloatStats() { + public void testFloatStatsV1() { + testFloatStats(StatsHelper.V1); + } + + @Test + public void testFloatStatsV2() { + testFloatStats(StatsHelper.V2); + } + + private void testFloatStats(StatsHelper helper) { // make fake stats and verify the size check FloatStatistics stats = new FloatStatistics(); stats.incrementNumNulls(3004); @@ -463,7 +502,7 @@ public void testFloatStats() { stats.updateStats(max); org.apache.parquet.format.Statistics formatStats = - ParquetMetadataConverter.toParquetStatistics(stats); + helper.toParquetStatistics(stats); Assert.assertEquals("Min should match", min, Float.intBitsToFloat(BytesUtils.bytesToInt(formatStats.getMin())), @@ -476,7 +515,16 @@ public void testFloatStats() { } @Test - public void testDoubleStats() { + public void testDoubleStatsV1() { + testDoubleStats(StatsHelper.V1); + } + + @Test + public void testDoubleStatsV2() { + testDoubleStats(StatsHelper.V2); + } + + private void testDoubleStats(StatsHelper helper) { // make fake stats and verify the size check DoubleStatistics stats = new DoubleStatistics(); stats.incrementNumNulls(3004); @@ -486,7 +534,7 @@ public void testDoubleStats() { stats.updateStats(max); org.apache.parquet.format.Statistics formatStats = - ParquetMetadataConverter.toParquetStatistics(stats); + helper.toParquetStatistics(stats); Assert.assertEquals("Min should match", min, Double.longBitsToDouble(BytesUtils.bytesToLong(formatStats.getMin())), @@ -499,7 
+547,16 @@ public void testDoubleStats() { } @Test - public void testBooleanStats() { + public void testBooleanStatsV1() { + testBooleanStats(StatsHelper.V1); + } + + @Test + public void testBooleanStatsV2() { + testBooleanStats(StatsHelper.V2); + } + + private void testBooleanStats(StatsHelper helper) { // make fake stats and verify the size check BooleanStatistics stats = new BooleanStatistics(); stats.incrementNumNulls(3004); @@ -509,7 +566,7 @@ public void testBooleanStats() { stats.updateStats(max); org.apache.parquet.format.Statistics formatStats = - ParquetMetadataConverter.toParquetStatistics(stats); + helper.toParquetStatistics(stats); Assert.assertEquals("Min should match", min, BytesUtils.bytesToBool(formatStats.getMin())); @@ -529,14 +586,9 @@ public void testIgnoreStatsWithSignedSortOrder() { stats.updateStats(Binary.fromString("z")); stats.incrementNumNulls(); - org.apache.parquet.format.Statistics v1Statistics = ParquetMetadataConverter.toParquetStatistics(stats); - // Unsetting the min/max values the enforce the v1 ignoring logic - v1Statistics.unsetMin_value(); - v1Statistics.unsetMax_value(); - Statistics convertedStats = converter.fromParquetStatistics( Version.FULL_VERSION, - v1Statistics, + StatsHelper.V1.toParquetStatistics(stats), Types.required(PrimitiveTypeName.BINARY) .as(OriginalType.UTF8).named("b")); @@ -544,7 +596,16 @@ public void testIgnoreStatsWithSignedSortOrder() { } @Test - public void testStillUseStatsWithSignedSortOrderIfSingleValue() { + public void testStillUseStatsWithSignedSortOrderIfSingleValueV1() { + testStillUseStatsWithSignedSortOrderIfSingleValue(StatsHelper.V1); + } + + @Test + public void testStillUseStatsWithSignedSortOrderIfSingleValueV2() { + testStillUseStatsWithSignedSortOrderIfSingleValue(StatsHelper.V2); + } + + private void testStillUseStatsWithSignedSortOrderIfSingleValue(StatsHelper helper) { ParquetMetadataConverter converter = new ParquetMetadataConverter(); BinaryStatistics stats = new 
BinaryStatistics(); stats.incrementNumNulls(); @@ -564,7 +625,16 @@ public void testStillUseStatsWithSignedSortOrderIfSingleValue() { } @Test - public void testUseStatsWithSignedSortOrder() { + public void testUseStatsWithSignedSortOrderV1() { + testUseStatsWithSignedSortOrder(StatsHelper.V1); + } + + @Test + public void testUseStatsWithSignedSortOrderV2() { + testUseStatsWithSignedSortOrder(StatsHelper.V2); + } + + private void testUseStatsWithSignedSortOrder(StatsHelper helper) { // override defaults and use stats that were accumulated using signed order Configuration conf = new Configuration(); conf.setBoolean("parquet.strings.signed-min-max.enabled", true); @@ -579,7 +649,7 @@ public void testUseStatsWithSignedSortOrder() { Statistics convertedStats = converter.fromParquetStatistics( Version.FULL_VERSION, - ParquetMetadataConverter.toParquetStatistics(stats), + helper.toParquetStatistics(stats), Types.required(PrimitiveTypeName.BINARY) .as(OriginalType.UTF8).named("b")); @@ -590,4 +660,125 @@ public void testUseStatsWithSignedSortOrder() { Assert.assertEquals("Should have correct max (unsigned sort)", Binary.fromString("z"), convertedStats.genericGetMax()); } + + @Test + public void testV2OnlyStats() { + testV2OnlyStats(createStats(Types.optional(PrimitiveTypeName.INT32).as(OriginalType.UINT_8).named(""), + Byte.MAX_VALUE, + Byte.MIN_VALUE)); + testV2OnlyStats(createStats(Types.optional(PrimitiveTypeName.INT32).as(OriginalType.UINT_16).named(""), + Short.MAX_VALUE, + Short.MIN_VALUE)); + testV2OnlyStats(createStats(Types.optional(PrimitiveTypeName.INT32).as(OriginalType.UINT_32).named(""), + Integer.MAX_VALUE, + Integer.MIN_VALUE)); + testV2OnlyStats(createStats(Types.optional(PrimitiveTypeName.INT64).as(OriginalType.UINT_64).named(""), + Long.MAX_VALUE, + Long.MIN_VALUE)); + testV2OnlyStats( + createStats(Types.optional(PrimitiveTypeName.BINARY).as(OriginalType.DECIMAL).precision(6).named(""), + new BigInteger("-123456"), + new BigInteger("123456"))); + 
testV2OnlyStats(createStats( + Types.optional(PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY).length(14).as(OriginalType.DECIMAL).precision(7) + .named(""), + new BigInteger("-1234567"), + new BigInteger("1234567"))); + testV2OnlyStats(createStats(Types.optional(PrimitiveTypeName.INT96).named(""), + new BigInteger("-12345678"), + new BigInteger("12345678"))); + } + + private void testV2OnlyStats(Statistics stats) { + org.apache.parquet.format.Statistics statistics = ParquetMetadataConverter.toParquetStatistics(stats); + assertFalse(statistics.isSetMin()); + assertFalse(statistics.isSetMax()); + assertEquals(ByteBuffer.wrap(stats.getMinBytes()), statistics.min_value); + assertEquals(ByteBuffer.wrap(stats.getMaxBytes()), statistics.max_value); + } + + @Test + public void testV2StatsEqualMinMax() { + testV2StatsEqualMinMax(createStats(Types.optional(PrimitiveTypeName.INT32).as(OriginalType.UINT_8).named(""), + Byte.MAX_VALUE, + Byte.MAX_VALUE)); + testV2StatsEqualMinMax(createStats(Types.optional(PrimitiveTypeName.INT32).as(OriginalType.UINT_16).named(""), + Short.MIN_VALUE, + Short.MIN_VALUE)); + testV2StatsEqualMinMax(createStats(Types.optional(PrimitiveTypeName.INT32).as(OriginalType.UINT_32).named(""), + Integer.MAX_VALUE, + Integer.MAX_VALUE)); + testV2StatsEqualMinMax(createStats(Types.optional(PrimitiveTypeName.INT64).as(OriginalType.UINT_64).named(""), + Long.MIN_VALUE, + Long.MIN_VALUE)); + testV2StatsEqualMinMax(createStats(Types.optional(PrimitiveTypeName.BINARY).as(OriginalType.DECIMAL).precision(6).named(""), + new BigInteger("123456"), + new BigInteger("123456"))); + testV2StatsEqualMinMax(createStats( + Types.optional(PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY).length(14).as(OriginalType.DECIMAL).precision(7).named(""), + new BigInteger("-1234567"), + new BigInteger("-1234567"))); + testV2StatsEqualMinMax(createStats(Types.optional(PrimitiveTypeName.INT96).named(""), + new BigInteger("12345678"), + new BigInteger("12345678"))); + } + + private void 
testV2StatsEqualMinMax(Statistics stats) { + org.apache.parquet.format.Statistics statistics = ParquetMetadataConverter.toParquetStatistics(stats); + assertEquals(ByteBuffer.wrap(stats.getMinBytes()), statistics.min); + assertEquals(ByteBuffer.wrap(stats.getMaxBytes()), statistics.max); + assertEquals(ByteBuffer.wrap(stats.getMinBytes()), statistics.min_value); + assertEquals(ByteBuffer.wrap(stats.getMaxBytes()), statistics.max_value); + } + + private static Statistics createStats(PrimitiveType type, int min, int max) { + Statistics stats = Statistics.createStats(type); + stats.updateStats(max); + stats.updateStats(min); + assertEquals(min, stats.genericGetMin()); + assertEquals(max, stats.genericGetMax()); + return stats; + } + + private static Statistics createStats(PrimitiveType type, long min, long max) { + Statistics stats = Statistics.createStats(type); + stats.updateStats(max); + stats.updateStats(min); + assertEquals(min, stats.genericGetMin()); + assertEquals(max, stats.genericGetMax()); + return stats; + } + + private static Statistics createStats(PrimitiveType type, BigInteger min, BigInteger max) { + Statistics stats = Statistics.createStats(type); + Binary minBinary = Binary.fromConstantByteArray(min.toByteArray()); + Binary maxBinary = Binary.fromConstantByteArray(max.toByteArray()); + stats.updateStats(maxBinary); + stats.updateStats(minBinary); + assertEquals(minBinary, stats.genericGetMin()); + assertEquals(maxBinary, stats.genericGetMax()); + return stats; + } + + private enum StatsHelper { + // Only min and max are filled (min_value and max_value are not) + V1() { + @Override + public org.apache.parquet.format.Statistics toParquetStatistics(Statistics stats) { + org.apache.parquet.format.Statistics statistics = ParquetMetadataConverter.toParquetStatistics(stats); + statistics.unsetMin_value(); + statistics.unsetMax_value(); + return statistics; + } + }, + // min_value and max_value are filled (min and max might be filled as well) + V2() { + 
@Override + public org.apache.parquet.format.Statistics toParquetStatistics(Statistics stats) { + return ParquetMetadataConverter.toParquetStatistics(stats); + } + }; + public abstract org.apache.parquet.format.Statistics toParquetStatistics(Statistics stats); + } + } diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/statistics/RandomValues.java b/parquet-hadoop/src/test/java/org/apache/parquet/statistics/RandomValues.java index cbdd935f29..5d95adb8d4 100644 --- a/parquet-hadoop/src/test/java/org/apache/parquet/statistics/RandomValues.java +++ b/parquet-hadoop/src/test/java/org/apache/parquet/statistics/RandomValues.java @@ -26,7 +26,7 @@ public class RandomValues { private static final String ALPHABET = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz1234567890"; - private static abstract class RandomValueGenerator> { + static abstract class RandomValueGenerator> { private final Random random; protected RandomValueGenerator(long seed) { @@ -82,7 +82,7 @@ public String randomFixedLengthString(int length) { public abstract T nextValue(); } - private static abstract class RandomBinaryBase> extends RandomValueGenerator { + static abstract class RandomBinaryBase> extends RandomValueGenerator { protected final int bufferLength; protected final byte[] buffer; @@ -103,13 +103,21 @@ public Binary asReusedBinary(byte[] data) { } public static class IntGenerator extends RandomValueGenerator { - private final RandomRange randomRange = new RandomRange(randomInt(), randomInt()); - private final int minimum = randomRange.minimum(); - private final int maximum = randomRange.maximum(); - private final int range = (maximum - minimum); + private final int minimum; + private final int range; public IntGenerator(long seed) { super(seed); + RandomRange randomRange = new RandomRange<>(randomInt(), randomInt()); + this.minimum = randomRange.minimum(); + this.range = (randomRange.maximum() - this.minimum); + } + + public IntGenerator(long seed, int minimum, int maximum) { 
+ super(seed); + RandomRange randomRange = new RandomRange<>(minimum, maximum); + this.minimum = randomRange.minimum(); + this.range = randomRange.maximum() - this.minimum; } @Override @@ -118,6 +126,17 @@ public Integer nextValue() { } } + public static class UnconstrainedIntGenerator extends RandomValueGenerator { + public UnconstrainedIntGenerator(long seed) { + super(seed); + } + + @Override + public Integer nextValue() { + return randomInt(); + } + } + public static class LongGenerator extends RandomValueGenerator { private final RandomRange randomRange = new RandomRange(randomLong(), randomLong()); private final long minimum = randomRange.minimum(); @@ -134,6 +153,17 @@ public Long nextValue() { } } + public static class UnconstrainedLongGenerator extends RandomValueGenerator { + public UnconstrainedLongGenerator(long seed) { + super(seed); + } + + @Override + public Long nextValue() { + return randomLong(); + } + } + public static class Int96Generator extends RandomBinaryBase { private final RandomRange randomRange = new RandomRange(randomInt96(), randomInt96()); private final BigInteger minimum = randomRange.minimum(); @@ -173,6 +203,17 @@ public Float nextValue() { } } + public static class UnconstrainedFloatGenerator extends RandomValueGenerator { + public UnconstrainedFloatGenerator(long seed) { + super(seed); + } + + @Override + public Float nextValue() { + return randomFloat(); + } + } + public static class DoubleGenerator extends RandomValueGenerator { private final RandomRange randomRange = new RandomRange(randomDouble(), randomDouble()); private final double minimum = randomRange.minimum(); @@ -189,6 +230,17 @@ public Double nextValue() { } } + public static class UnconstrainedDoubleGenerator extends RandomValueGenerator { + public UnconstrainedDoubleGenerator(long seed) { + super(seed); + } + + @Override + public Double nextValue() { + return randomDouble(); + } + } + public static class StringGenerator extends RandomBinaryBase { private static 
final int MAX_STRING_LENGTH = 16; public StringGenerator(long seed) { diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/statistics/TestStatistics.java b/parquet-hadoop/src/test/java/org/apache/parquet/statistics/TestStatistics.java index bdd858fe0b..047ec30283 100644 --- a/parquet-hadoop/src/test/java/org/apache/parquet/statistics/TestStatistics.java +++ b/parquet-hadoop/src/test/java/org/apache/parquet/statistics/TestStatistics.java @@ -19,6 +19,26 @@ package org.apache.parquet.statistics; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.DOUBLE; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.FLOAT; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT32; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT64; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT96; +import static org.apache.parquet.schema.Type.Repetition.OPTIONAL; +import static org.apache.parquet.schema.Type.Repetition.REQUIRED; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +import java.io.File; +import java.io.IOException; +import java.math.BigInteger; +import java.util.Arrays; +import java.util.Comparator; +import java.util.List; +import java.util.Random; + import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.parquet.column.ColumnDescriptor; @@ -42,25 +62,18 @@ import org.apache.parquet.io.api.Binary; import org.apache.parquet.io.api.PrimitiveConverter; import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.OriginalType; import org.apache.parquet.schema.PrimitiveType; import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName; +import org.apache.parquet.schema.Type; +import 
org.apache.parquet.schema.Types; +import org.apache.parquet.statistics.RandomValues.RandomBinaryBase; +import org.apache.parquet.statistics.RandomValues.RandomValueGenerator; import org.junit.Assert; import org.junit.Rule; import org.junit.Test; import org.junit.rules.TemporaryFolder; -import java.io.File; -import java.io.IOException; -import java.util.Arrays; -import java.util.Comparator; -import java.util.List; -import java.util.Random; - -import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.*; -import static org.apache.parquet.schema.Type.Repetition.OPTIONAL; -import static org.apache.parquet.schema.Type.Repetition.REQUIRED; -import static org.junit.Assert.assertTrue; - public class TestStatistics { private static final int MEGABYTE = 1 << 20; private static final long RANDOM_SEED = 1441990701846L; //System.currentTimeMillis(); @@ -283,7 +296,11 @@ public void validate(MessageType schema, PageReadStore store) { private void validateStatsForPage(DataPage page, DictionaryPage dict, ColumnDescriptor desc) { SingletonPageReader reader = new SingletonPageReader(dict, page); PrimitiveConverter converter = getValidatingConverter(page, desc.getType()); - Statistics stats = getStatisticsFromPageHeader(page); + Statistics stats = getStatisticsFromPageHeader(page); + + assertEquals("Statistics does not use the proper comparator", + desc.getFullType().comparator().getClass(), + stats.comparator().getClass()); if (stats.isEmpty()) { // stats are empty if num nulls = 0 and there are no non-null values @@ -318,92 +335,144 @@ private void validateStatsForPage(DataPage page, DictionaryPage dict, ColumnDesc public static class DataContext extends DataGenerationContext.WriteContext { private static final int MAX_TOTAL_ROWS = 1000000; - private final long seed; private final Random random; private final int recordCount; - private final int fixedLength; - private final RandomValues.IntGenerator intGenerator; - private final RandomValues.LongGenerator 
longGenerator; - private final RandomValues.Int96Generator int96Generator; - private final RandomValues.FloatGenerator floatGenerator; - private final RandomValues.DoubleGenerator doubleGenerator; - private final RandomValues.StringGenerator stringGenerator; - private final RandomValues.BinaryGenerator binaryGenerator; - private final RandomValues.FixedGenerator fixedBinaryGenerator; + private final List> randomGenerators; public DataContext(long seed, File path, int blockSize, int pageSize, boolean enableDictionary, ParquetProperties.WriterVersion version) throws IOException { super(path, buildSchema(seed), blockSize, pageSize, enableDictionary, true, version); - this.seed = seed; this.random = new Random(seed); this.recordCount = random.nextInt(MAX_TOTAL_ROWS); - this.fixedLength = schema.getType("fixed-binary").asPrimitiveType().getTypeLength(); - this.intGenerator = new RandomValues.IntGenerator(random.nextLong()); - this.longGenerator = new RandomValues.LongGenerator(random.nextLong()); - this.int96Generator = new RandomValues.Int96Generator(random.nextLong()); - this.floatGenerator = new RandomValues.FloatGenerator(random.nextLong()); - this.doubleGenerator = new RandomValues.DoubleGenerator(random.nextLong()); - this.stringGenerator = new RandomValues.StringGenerator(random.nextLong()); - this.binaryGenerator = new RandomValues.BinaryGenerator(random.nextLong()); - this.fixedBinaryGenerator = new RandomValues.FixedGenerator(random.nextLong(), fixedLength); + int fixedLength = schema.getType("fixed-binary").asPrimitiveType().getTypeLength(); + + randomGenerators = Arrays.>asList( + new RandomValues.IntGenerator(random.nextLong()), + new RandomValues.LongGenerator(random.nextLong()), + new RandomValues.Int96Generator(random.nextLong()), + new RandomValues.FloatGenerator(random.nextLong()), + new RandomValues.DoubleGenerator(random.nextLong()), + new RandomValues.StringGenerator(random.nextLong()), + new RandomValues.BinaryGenerator(random.nextLong()), + new 
RandomValues.FixedGenerator(random.nextLong(), fixedLength), + new RandomValues.UnconstrainedIntGenerator(random.nextLong()), + new RandomValues.UnconstrainedLongGenerator(random.nextLong()), + new RandomValues.UnconstrainedFloatGenerator(random.nextLong()), + new RandomValues.UnconstrainedDoubleGenerator(random.nextLong()), + new RandomValues.IntGenerator(random.nextLong(), Byte.MIN_VALUE, Byte.MAX_VALUE), + new RandomValues.IntGenerator(random.nextLong(), Byte.MIN_VALUE, Byte.MAX_VALUE), + new RandomValues.IntGenerator(random.nextLong(), Short.MIN_VALUE, Short.MAX_VALUE), + new RandomValues.IntGenerator(random.nextLong(), Short.MIN_VALUE, Short.MAX_VALUE), + new RandomValues.UnconstrainedIntGenerator(random.nextLong()), + new RandomValues.UnconstrainedIntGenerator(random.nextLong()), + new RandomValues.UnconstrainedLongGenerator(random.nextLong()), + new RandomValues.UnconstrainedLongGenerator(random.nextLong()), + new RandomValues.UnconstrainedIntGenerator(random.nextLong()), + new RandomValues.UnconstrainedLongGenerator(random.nextLong()), + new RandomValues.FixedGenerator(random.nextLong(), fixedLength), + new RandomValues.BinaryGenerator(random.nextLong()), + new RandomValues.StringGenerator(random.nextLong()), + new RandomValues.StringGenerator(random.nextLong()), + new RandomValues.StringGenerator(random.nextLong()), + new RandomValues.BinaryGenerator(random.nextLong()), + new RandomValues.IntGenerator(random.nextLong()), + new RandomValues.IntGenerator(random.nextLong()), + new RandomValues.LongGenerator(random.nextLong()), + new RandomValues.LongGenerator(random.nextLong()), + new RandomValues.LongGenerator(random.nextLong()), + new RandomValues.FixedGenerator(random.nextLong(), 12) + ); } private static MessageType buildSchema(long seed) { Random random = new Random(seed); int fixedBinaryLength = random.nextInt(21) + 1; + int fixedPrecision = calculatePrecision(fixedBinaryLength); + int fixedScale = fixedPrecision / 4; + int binaryPrecision = 
calculatePrecision(16); + int binaryScale = binaryPrecision / 4; return new MessageType("schema", - new PrimitiveType(OPTIONAL, INT32, "i32"), - new PrimitiveType(OPTIONAL, INT64, "i64"), - new PrimitiveType(OPTIONAL, INT96, "i96"), - new PrimitiveType(OPTIONAL, FLOAT, "sngl"), - new PrimitiveType(OPTIONAL, DOUBLE, "dbl"), - new PrimitiveType(OPTIONAL, BINARY, "strings"), - new PrimitiveType(OPTIONAL, BINARY, "binary"), - new PrimitiveType(OPTIONAL, FIXED_LEN_BYTE_ARRAY, fixedBinaryLength, "fixed-binary"), - new PrimitiveType(REQUIRED, INT32, "unconstrained-i32"), - new PrimitiveType(REQUIRED, INT64, "unconstrained-i64"), - new PrimitiveType(REQUIRED, FLOAT, "unconstrained-sngl"), - new PrimitiveType(REQUIRED, DOUBLE, "unconstrained-dbl") + new PrimitiveType(OPTIONAL, INT32, "i32"), + new PrimitiveType(OPTIONAL, INT64, "i64"), + new PrimitiveType(OPTIONAL, INT96, "i96"), + new PrimitiveType(OPTIONAL, FLOAT, "sngl"), + new PrimitiveType(OPTIONAL, DOUBLE, "dbl"), + new PrimitiveType(OPTIONAL, BINARY, "strings"), + new PrimitiveType(OPTIONAL, BINARY, "binary"), + new PrimitiveType(OPTIONAL, FIXED_LEN_BYTE_ARRAY, fixedBinaryLength, "fixed-binary"), + new PrimitiveType(REQUIRED, INT32, "unconstrained-i32"), + new PrimitiveType(REQUIRED, INT64, "unconstrained-i64"), + new PrimitiveType(REQUIRED, FLOAT, "unconstrained-sngl"), + new PrimitiveType(REQUIRED, DOUBLE, "unconstrained-dbl"), + Types.optional(INT32).as(OriginalType.INT_8).named("int8"), + Types.optional(INT32).as(OriginalType.UINT_8).named("uint8"), + Types.optional(INT32).as(OriginalType.INT_16).named("int16"), + Types.optional(INT32).as(OriginalType.UINT_16).named("uint16"), + Types.optional(INT32).as(OriginalType.INT_32).named("int32"), + Types.optional(INT32).as(OriginalType.UINT_32).named("uint32"), + Types.optional(INT64).as(OriginalType.INT_64).named("int64"), + Types.optional(INT64).as(OriginalType.UINT_64).named("uint64"), + 
Types.optional(INT32).as(OriginalType.DECIMAL).precision(9).scale(2).named("decimal-int32"), + Types.optional(INT64).as(OriginalType.DECIMAL).precision(18).scale(4).named("decimal-int64"), + Types.optional(FIXED_LEN_BYTE_ARRAY).length(fixedBinaryLength).as(OriginalType.DECIMAL) + .precision(fixedPrecision).scale(fixedScale).named("decimal-fixed"), + Types.optional(BINARY).as(OriginalType.DECIMAL).precision(binaryPrecision).scale(binaryScale) + .named("decimal-binary"), + Types.optional(BINARY).as(OriginalType.UTF8).named("utf8"), + Types.optional(BINARY).as(OriginalType.ENUM).named("enum"), + Types.optional(BINARY).as(OriginalType.JSON).named("json"), + Types.optional(BINARY).as(OriginalType.BSON).named("bson"), + Types.optional(INT32).as(OriginalType.DATE).named("date"), + Types.optional(INT32).as(OriginalType.TIME_MILLIS).named("time-millis"), + Types.optional(INT64).as(OriginalType.TIME_MICROS).named("time-micros"), + Types.optional(INT64).as(OriginalType.TIMESTAMP_MILLIS).named("timestamp-millis"), + Types.optional(INT64).as(OriginalType.TIMESTAMP_MICROS).named("timestamp-micros"), + Types.optional(FIXED_LEN_BYTE_ARRAY).length(12).as(OriginalType.INTERVAL).named("interval") ); } + private static int calculatePrecision(int byteCnt) { + String maxValue = BigInteger.valueOf(2L).pow(8 * byteCnt - 1).toString(); + return maxValue.length() - 1; + } + @Override public void write(ParquetWriter writer) throws IOException { for (int index = 0; index < recordCount; index++) { Group group = new SimpleGroup(super.schema); - if (!intGenerator.shouldGenerateNull()) { - group.append("i32", intGenerator.nextValue()); - } - if (!longGenerator.shouldGenerateNull()) { - group.append("i64", longGenerator.nextValue()); - } - if (!int96Generator.shouldGenerateNull()) { - group.append("i96", int96Generator.nextBinaryValue()); - } - if (!floatGenerator.shouldGenerateNull()) { - group.append("sngl", floatGenerator.nextValue()); - } - if (!doubleGenerator.shouldGenerateNull()) { - 
group.append("dbl", doubleGenerator.nextValue()); - } - if (!stringGenerator.shouldGenerateNull()) { - group.append("strings", stringGenerator.nextBinaryValue()); - } - if (!binaryGenerator.shouldGenerateNull()) { - group.append("binary", binaryGenerator.nextBinaryValue()); - } - if (!fixedBinaryGenerator.shouldGenerateNull()) { - group.append("fixed-binary", fixedBinaryGenerator.nextBinaryValue()); + for (int column = 0, columnCnt = schema.getFieldCount(); column < columnCnt; ++column) { + Type type = schema.getType(column); + RandomValueGenerator generator = randomGenerators.get(column); + if (type.isRepetition(OPTIONAL) && generator.shouldGenerateNull()) { + continue; + } + switch (type.asPrimitiveType().getPrimitiveTypeName()) { + case BINARY: + case FIXED_LEN_BYTE_ARRAY: + case INT96: + group.append(type.getName(), ((RandomBinaryBase) generator).nextBinaryValue()); + break; + case INT32: + group.append(type.getName(), (Integer) generator.nextValue()); + break; + case INT64: + group.append(type.getName(), (Long) generator.nextValue()); + break; + case FLOAT: + group.append(type.getName(), (Float) generator.nextValue()); + break; + case DOUBLE: + group.append(type.getName(), (Double) generator.nextValue()); + break; + case BOOLEAN: + group.append(type.getName(), (Boolean) generator.nextValue()); + break; + } } - group.append("unconstrained-i32", random.nextInt()); - group.append("unconstrained-i64", random.nextLong()); - group.append("unconstrained-sngl", random.nextFloat()); - group.append("unconstrained-dbl", random.nextDouble()); - writer.write(group); } } From c5536a0a3210aa67225b4779403c52585e8f118d Mon Sep 17 00:00:00 2001 From: Gabor Szadovszky Date: Fri, 24 Nov 2017 11:20:28 +0100 Subject: [PATCH 08/17] PARQUET-1025: Some modifications according to zi's comments --- .../parquet/filter2/predicate/Statistics.java | 14 ++-- .../apache/parquet/schema/PrimitiveType.java | 70 +++++++++++++++---- .../schema/TestPrimitiveComparator.java | 4 +- 
.../converter/ParquetMetadataConverter.java | 5 +- .../TestParquetMetadataConverter.java | 56 +++++++-------- 5 files changed, 97 insertions(+), 52 deletions(-) diff --git a/parquet-column/src/main/java/org/apache/parquet/filter2/predicate/Statistics.java b/parquet-column/src/main/java/org/apache/parquet/filter2/predicate/Statistics.java index 3325195810..5b83183c95 100644 --- a/parquet-column/src/main/java/org/apache/parquet/filter2/predicate/Statistics.java +++ b/parquet-column/src/main/java/org/apache/parquet/filter2/predicate/Statistics.java @@ -46,18 +46,20 @@ public Statistics(T min, T max, Comparator comparator) { } /** - * Returns the generic object representing the min value in the statistics. - * The self-comparison logic of {@code T} might not proper for the actual logical type (e.g. unsigned int). Use {@link - * #getComparator()} for comparing. + * Returns the generic object representing the min value in the statistics. The + * natural ordering of type {@code T} defined by the {@code compareTo} method + * might not be appropriate for the actual logical type. Use + * {@link #getComparator()} for comparing. */ public T getMin() { return min; } /** - * Returns the generic object representing the max value in the statistics. - * The self-comparison logic of {@code T} might not proper for the actual logical type (e.g. unsigned int). Use {@link - * #getComparator()} for comparing. + * Returns the generic object representing the max value in the statistics. The + * natural ordering of type {@code T} defined by the {@code compareTo} method + * might not be appropriate for the actual logical type. Use + * {@link #getComparator()} for comparing. 
*/ public T getMax() { return max; diff --git a/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveType.java b/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveType.java index b00d0effee..e14acacb56 100644 --- a/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveType.java +++ b/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveType.java @@ -22,6 +22,7 @@ import java.util.List; import java.util.Locale; +import org.apache.parquet.ShouldNeverHappenException; import org.apache.parquet.column.ColumnReader; import org.apache.parquet.io.InvalidRecordException; import org.apache.parquet.io.api.Binary; @@ -89,10 +90,22 @@ public T convert(PrimitiveTypeNameConverter conve @Override PrimitiveComparator comparator(OriginalType logicalType) { - if (logicalType == OriginalType.UINT_64) { + if (logicalType == null) { + return PrimitiveComparator.SIGNED_INT64_COMPARATOR; + } + switch (logicalType) { + case UINT_64: return PrimitiveComparator.UNSIGNED_INT64_COMPARATOR; + case INT_64: + case DECIMAL: + case TIME_MICROS: + case TIMESTAMP_MILLIS: + case TIMESTAMP_MICROS: + return PrimitiveComparator.SIGNED_INT64_COMPARATOR; + default: + throw new ShouldNeverHappenException( + "No comparator logic implemented for INT64 logical type: " + logicalType); } - return PrimitiveComparator.SIGNED_INT64_COMPARATOR; } }, INT32("getInteger", Integer.TYPE) { @@ -120,15 +133,25 @@ public T convert(PrimitiveTypeNameConverter conve @Override PrimitiveComparator comparator(OriginalType logicalType) { - if (logicalType != null) { - switch (logicalType) { - case UINT_8: - case UINT_16: - case UINT_32: - return PrimitiveComparator.UNSIGNED_INT32_COMPARATOR; - } + if (logicalType == null) { + return PrimitiveComparator.SIGNED_INT32_COMPARATOR; + } + switch (logicalType) { + case UINT_8: + case UINT_16: + case UINT_32: + return PrimitiveComparator.UNSIGNED_INT32_COMPARATOR; + case INT_8: + case INT_16: + case INT_32: + case DECIMAL: + case DATE: + case 
TIME_MILLIS: + return PrimitiveComparator.SIGNED_INT32_COMPARATOR; + default: + throw new ShouldNeverHappenException( + "No comparator logic implemented for INT32 logical type: " + logicalType); } - return PrimitiveComparator.SIGNED_INT32_COMPARATOR; } }, BOOLEAN("getBoolean", Boolean.TYPE) { @@ -184,10 +207,21 @@ public T convert(PrimitiveTypeNameConverter conve @Override PrimitiveComparator comparator(OriginalType logicalType) { - if (logicalType == OriginalType.DECIMAL) { + if (logicalType == null) { + return PrimitiveComparator.LEXICOGRAPHICAL_BINARY_COMPARATOR; + } + switch (logicalType) { + case DECIMAL: return PrimitiveComparator.SIGNED_BINARY_COMPARATOR; + case UTF8: + case ENUM: + case JSON: + case BSON: + return PrimitiveComparator.LEXICOGRAPHICAL_BINARY_COMPARATOR; + default: + throw new ShouldNeverHappenException( + "No comparator logic implemented for BINARY logical type: " + logicalType); } - return PrimitiveComparator.LEXICOGRAPHICAL_BINARY_COMPARATOR; } }, FLOAT("getFloat", Float.TYPE) { @@ -297,10 +331,18 @@ public T convert(PrimitiveTypeNameConverter conve @Override PrimitiveComparator comparator(OriginalType logicalType) { - if (logicalType == OriginalType.DECIMAL) { + if (logicalType == null) { + return PrimitiveComparator.LEXICOGRAPHICAL_BINARY_COMPARATOR; + } + switch (logicalType) { + case DECIMAL: return PrimitiveComparator.SIGNED_BINARY_COMPARATOR; + case INTERVAL: + return PrimitiveComparator.LEXICOGRAPHICAL_BINARY_COMPARATOR; + default: + throw new ShouldNeverHappenException( + "No comparator logic implemented for FIXED_LEN_BYTE_ARRAY logical type: " + logicalType); } - return PrimitiveComparator.LEXICOGRAPHICAL_BINARY_COMPARATOR; } }; diff --git a/parquet-column/src/test/java/org/apache/parquet/schema/TestPrimitiveComparator.java b/parquet-column/src/test/java/org/apache/parquet/schema/TestPrimitiveComparator.java index 5cb626b2cd..212ce22409 100644 --- a/parquet-column/src/test/java/org/apache/parquet/schema/TestPrimitiveComparator.java 
+++ b/parquet-column/src/test/java/org/apache/parquet/schema/TestPrimitiveComparator.java @@ -263,14 +263,14 @@ private void checkThrowingUnsupportedException(PrimitiveComparator comparator } if (Float.TYPE != exclude) { try { - comparator.compare(0F, 0F); + comparator.compare(0.0F, 0.0F); fail("An UnsupportedOperationException should have been thrown"); } catch (UnsupportedOperationException e) { } } if (Double.TYPE != exclude) { try { - comparator.compare(0D, 0D); + comparator.compare(0.0, 0.0); fail("An UnsupportedOperationException should have been thrown"); } catch (UnsupportedOperationException e) { } diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java index 41a1f167c6..32148c54c0 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java @@ -397,8 +397,9 @@ public static org.apache.parquet.column.statistics.Statistics fromParquetStatist boolean sortOrdersMatch = SortOrder.SIGNED == typeSortOrder; // NOTE: See docs in CorruptStatistics for explanation of why this check is needed // The sort order is checked to avoid returning min/max stats that are not - // valid with the type's sort order. Currently, all stats are aggregated - // using a signed ordering, which isn't valid for strings or unsigned ints. + // valid with the type's sort order. In previous releases, all stats were + // aggregated using a signed byte-wise ordering, which isn't valid for all the + // types (e.g. strings, decimals etc.). 
if (!CorruptStatistics.shouldIgnoreStatistics(createdBy, type.getPrimitiveTypeName()) && (sortOrdersMatch || maxEqualsMin)) { if (isSet) { diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java b/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java index 48d3a52125..195d455ede 100644 --- a/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java +++ b/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java @@ -664,29 +664,29 @@ private void testUseStatsWithSignedSortOrder(StatsHelper helper) { @Test public void testV2OnlyStats() { testV2OnlyStats(createStats(Types.optional(PrimitiveTypeName.INT32).as(OriginalType.UINT_8).named(""), - Byte.MAX_VALUE, - Byte.MIN_VALUE)); + 0x7F, + 0x80)); testV2OnlyStats(createStats(Types.optional(PrimitiveTypeName.INT32).as(OriginalType.UINT_16).named(""), - Short.MAX_VALUE, - Short.MIN_VALUE)); + 0x7FFF, + 0x8000)); testV2OnlyStats(createStats(Types.optional(PrimitiveTypeName.INT32).as(OriginalType.UINT_32).named(""), - Integer.MAX_VALUE, - Integer.MIN_VALUE)); + 0x7FFFFFFF, + 0x80000000)); testV2OnlyStats(createStats(Types.optional(PrimitiveTypeName.INT64).as(OriginalType.UINT_64).named(""), - Long.MAX_VALUE, - Long.MIN_VALUE)); + 0x7FFFFFFFFFFFFFFFL, + 0x8000000000000000L)); testV2OnlyStats( createStats(Types.optional(PrimitiveTypeName.BINARY).as(OriginalType.DECIMAL).precision(6).named(""), - new BigInteger("-123456"), - new BigInteger("123456"))); + new BigInteger("-765875"), + new BigInteger("876856"))); testV2OnlyStats(createStats( Types.optional(PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY).length(14).as(OriginalType.DECIMAL).precision(7) .named(""), - new BigInteger("-1234567"), - new BigInteger("1234567"))); + new BigInteger("-6769643"), + new BigInteger("9864675"))); 
testV2OnlyStats(createStats(Types.optional(PrimitiveTypeName.INT96).named(""), - new BigInteger("-12345678"), - new BigInteger("12345678"))); + new BigInteger("-75687987"), + new BigInteger("45367657"))); } private void testV2OnlyStats(Statistics stats) { @@ -700,27 +700,27 @@ private void testV2OnlyStats(Statistics stats) { @Test public void testV2StatsEqualMinMax() { testV2StatsEqualMinMax(createStats(Types.optional(PrimitiveTypeName.INT32).as(OriginalType.UINT_8).named(""), - Byte.MAX_VALUE, - Byte.MAX_VALUE)); + 93, + 93)); testV2StatsEqualMinMax(createStats(Types.optional(PrimitiveTypeName.INT32).as(OriginalType.UINT_16).named(""), - Short.MIN_VALUE, - Short.MIN_VALUE)); + -5892, + -5892)); testV2StatsEqualMinMax(createStats(Types.optional(PrimitiveTypeName.INT32).as(OriginalType.UINT_32).named(""), - Integer.MAX_VALUE, - Integer.MAX_VALUE)); + 234998934, + 234998934)); testV2StatsEqualMinMax(createStats(Types.optional(PrimitiveTypeName.INT64).as(OriginalType.UINT_64).named(""), - Long.MIN_VALUE, - Long.MIN_VALUE)); + -2389943895984985L, + -2389943895984985L)); testV2StatsEqualMinMax(createStats(Types.optional(PrimitiveTypeName.BINARY).as(OriginalType.DECIMAL).precision(6).named(""), - new BigInteger("123456"), - new BigInteger("123456"))); + new BigInteger("823749"), + new BigInteger("823749"))); testV2StatsEqualMinMax(createStats( Types.optional(PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY).length(14).as(OriginalType.DECIMAL).precision(7).named(""), - new BigInteger("-1234567"), - new BigInteger("-1234567"))); + new BigInteger("-8752832"), + new BigInteger("-8752832"))); testV2StatsEqualMinMax(createStats(Types.optional(PrimitiveTypeName.INT96).named(""), - new BigInteger("12345678"), - new BigInteger("12345678"))); + new BigInteger("81032984"), + new BigInteger("81032984"))); } private void testV2StatsEqualMinMax(Statistics stats) { From 2f28c2c0e37e3c5142e109d714261d54ff87a778 Mon Sep 17 00:00:00 2001 From: Gabor Szadovszky Date: Tue, 28 Nov 2017 18:27:47 +0100 
Subject: [PATCH 09/17] PARQUET-1025: Finalize read/write stats updates --- .../parquet/column/statistics/Statistics.java | 21 +-- .../statistics/StatisticsClassException.java | 14 +- .../parquet/schema/PrimitiveComparator.java | 50 +++-- .../converter/ParquetMetadataConverter.java | 142 +++++++++----- .../hadoop/ColumnChunkPageWriteStore.java | 6 +- .../parquet/hadoop/ParquetFileWriter.java | 3 +- .../TestParquetMetadataConverter.java | 177 +++++++++++------- .../parquet/hadoop/TestParquetFileWriter.java | 5 +- 8 files changed, 273 insertions(+), 145 deletions(-) diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java index 0a0c93ade5..ba6b0166a3 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java @@ -51,8 +51,7 @@ public abstract class Statistics> { * @param type * PrimitiveTypeName type of the column * @return instance of a typed statistics class - * @deprecated Use {@link #createStats(Type)} or - * {@link #createLegacyStats(PrimitiveTypeName)} instead + * @deprecated Use {@link #createStats(Type)} instead */ @Deprecated public static Statistics getStatsBasedOnType(PrimitiveTypeName type) { @@ -78,18 +77,6 @@ public static Statistics getStatsBasedOnType(PrimitiveTypeName type) { } } - /** - * Creates an empty {@code Statistics} instance for the specified type to be - * used for reading/writing the legacy min/max statistics. - * - * @param type - * type of the column - * @return instance of a typed statistics class - */ - public static Statistics createLegacyStats(PrimitiveTypeName type) { - return getStatsBasedOnType(type); - } - /** * Creates an empty {@code Statistics} instance for the specified type to be * used for reading/writing the new min/max statistics used in the V2 format. 
@@ -203,14 +190,16 @@ public int hashCode() { public void mergeStatistics(Statistics stats) { if (stats.isEmpty()) return; - if (this.getClass() == stats.getClass()) { + // Merge stats only if they have the same type and comparator (the sorting order + // is the same) + if (this.getClass() == stats.getClass() && Objects.equals(comparator(), stats.comparator())) { incrementNumNulls(stats.getNumNulls()); if (stats.hasNonNullValue()) { mergeStatisticsMinMax(stats); markAsNotEmpty(); } } else { - throw new StatisticsClassException(this.getClass().toString(), stats.getClass().toString()); + throw StatisticsClassException.create(this, stats); } } diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/StatisticsClassException.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/StatisticsClassException.java index a242737616..4c23101074 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/StatisticsClassException.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/StatisticsClassException.java @@ -29,6 +29,18 @@ public class StatisticsClassException extends ParquetRuntimeException { private static final long serialVersionUID = 1L; public StatisticsClassException(String className1, String className2) { - super("Statistics classes mismatched: " + className1 + " vs. " + className2); + this("Statistics classes mismatched: " + className1 + " vs. " + className2); + } + + private StatisticsClassException(String msg) { + super(msg); + } + + static StatisticsClassException create(Statistics stats1, Statistics stats2) { + if (stats1.getClass() != stats2.getClass()) { + return new StatisticsClassException(stats1.getClass().toString(), stats2.getClass().toString()); + } + return new StatisticsClassException( + "Statistics comparator mismatched: " + stats1.comparator() + " vs. 
" + stats2.comparator()); } } diff --git a/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveComparator.java b/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveComparator.java index 80e60b3fae..33f119773a 100644 --- a/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveComparator.java +++ b/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveComparator.java @@ -50,14 +50,6 @@ public int compare(double d1, double d2) { throw new UnsupportedOperationException("compare(double, double) was called on a non-double comparator"); } - /** - * Returns whether this comparator is compliant with the sorting definition of - * the former V1 min-max statistics. - */ - public boolean isFormerStatsCompliant() { - return true; - } - static final PrimitiveComparator BOOLEAN_COMPARATOR = new PrimitiveComparator() { @Override public int compare(Boolean o1, Boolean o2) { @@ -68,6 +60,11 @@ public int compare(Boolean o1, Boolean o2) { public int compare(boolean b1, boolean b2) { return Boolean.compare(b1, b2); } + + @Override + public String toString() { + return "BOOLEAN_COMPARATOR"; + } }; private static abstract class IntComparator extends PrimitiveComparator { @@ -82,6 +79,11 @@ public final int compare(Integer o1, Integer o2) { public int compare(int i1, int i2) { return Integer.compare(i1, i2); } + + @Override + public String toString() { + return "SIGNED_INT32_COMPARATOR"; + } }; static final PrimitiveComparator UNSIGNED_INT32_COMPARATOR = new IntComparator() { @@ -92,8 +94,8 @@ public int compare(int i1, int i2) { } @Override - public boolean isFormerStatsCompliant() { - return false; + public String toString() { + return "UNSIGNED_INT32_COMPARATOR"; } }; @@ -109,6 +111,11 @@ public final int compare(Long o1, Long o2) { public int compare(long l1, long l2) { return Long.compare(l1, l2); } + + @Override + public String toString() { + return "SIGNED_INT64_COMPARATOR"; + } }; static final PrimitiveComparator 
UNSIGNED_INT64_COMPARATOR = new LongComparator() { @@ -119,8 +126,8 @@ public int compare(long l1, long l2) { } @Override - public boolean isFormerStatsCompliant() { - return false; + public String toString() { + return "UNSIGNED_INT64_COMPARATOR"; } }; @@ -134,6 +141,11 @@ public int compare(Float o1, Float o2) { public int compare(float f1, float f2) { return Float.compare(f1, f2); } + + @Override + public String toString() { + return "FLOAT_COMPARATOR"; + } }; static final PrimitiveComparator DOUBLE_COMPARATOR = new PrimitiveComparator() { @@ -146,6 +158,11 @@ public int compare(Double o1, Double o2) { public int compare(double d1, double d2) { return Double.compare(d1, d2); } + + @Override + public String toString() { + return "DOUBLE_COMPARATOR"; + } }; private static abstract class BinaryComparator extends PrimitiveComparator { @@ -183,6 +200,11 @@ int compare(ByteBuffer b1, ByteBuffer b2) { private int unsignedCompare(byte b1, byte b2) { return toUnsigned(b1) - toUnsigned(b2); } + + @Override + public String toString() { + return "LEXICOGRAPHICAL_BINARY_COMPARATOR"; + } }; static final PrimitiveComparator SIGNED_BINARY_COMPARATOR = new BinaryComparator() { @@ -218,8 +240,8 @@ int compare(ByteBuffer b1, ByteBuffer b2) { } @Override - public boolean isFormerStatsCompliant() { - return false; + public String toString() { + return "SIGNED_BINARY_COMPARATOR"; } }; } diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java index 32148c54c0..4b963070f6 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java @@ -219,7 +219,8 @@ private void addRowGroup(ParquetMetadata parquetMetadata, List rowGrou columnMetaData.getFirstDataPageOffset()); 
columnChunk.meta_data.dictionary_page_offset = columnMetaData.getDictionaryPageOffset(); if (!columnMetaData.getStatistics().isEmpty()) { - columnChunk.meta_data.setStatistics(toParquetStatistics(columnMetaData.getStatistics())); + columnChunk.meta_data + .setStatistics(toParquetStatistics(columnMetaData.getStatistics(), columnMetaData.getFullType())); } if (columnMetaData.getEncodingStats() != null) { columnChunk.meta_data.setEncoding_stats(convertEncodingStats(columnMetaData.getEncodingStats())); @@ -325,8 +326,14 @@ dataPageType, getEncoding(encoding), return formatStats; } + @Deprecated public static Statistics toParquetStatistics( org.apache.parquet.column.statistics.Statistics statistics) { + return toParquetStatistics(statistics, null); + } + + public static Statistics toParquetStatistics( + org.apache.parquet.column.statistics.Statistics statistics, PrimitiveType type) { Statistics stats = new Statistics(); // Don't write stats larger than the max size rather than truncating. The // rationale is that some engines may use the minimum value in the page as @@ -338,21 +345,30 @@ public static Statistics toParquetStatistics( byte[] min = statistics.getMinBytes(); byte[] max = statistics.getMaxBytes(); - // Fill the former min-max statistics only if comparison logic is the one - // specified in V1 format (e.g. 
signed comparison for numbers, unsigned - // lexicographical for binary) or the min and max are equal - if (statistics.comparator().isFormerStatsCompliant() || Arrays.equals(min, max)) { + // Fill the former min-max statistics only if the comparison logic is + // signed so the logic of V1 and V2 stats are the same (which is + // trivially true for equal min-max values) + if (sortOrder(type) == SortOrder.SIGNED || Arrays.equals(min, max)) { stats.setMin(min); stats.setMax(max); } - stats.setMin_value(min); - stats.setMax_value(max); + if (isMinMaxStatsSupported(type) || Arrays.equals(min, max)) { + stats.setMin_value(min); + stats.setMax_value(max); + } } } return stats; } + private static boolean isMinMaxStatsSupported(PrimitiveType type) { + // Have to handle null type to support deprecated methods + return type != null + && type.getPrimitiveTypeName() != PrimitiveTypeName.INT96 + && type.getOriginalType() != OriginalType.INTERVAL; + } + /** * @deprecated Replaced by {@link #fromParquetStatistics( * String createdBy, Statistics statistics, PrimitiveTypeName type)} @@ -375,37 +391,36 @@ public static org.apache.parquet.column.statistics.Statistics fromParquetStatist // Visible for testing static org.apache.parquet.column.statistics.Statistics fromParquetStatisticsInternal (String createdBy, Statistics statistics, PrimitiveType type, SortOrder typeSortOrder) { - // If there was no statistics written to the footer, create an empty Statistics object and return - if (statistics == null) { - return org.apache.parquet.column.statistics.Statistics - .createLegacyStats(type.asPrimitiveType().getPrimitiveTypeName()); - } - - org.apache.parquet.column.statistics.Statistics stats; - // Use the new V2 min-max statistics over the former one if it is filled - if (statistics.isSetMin_value() && statistics.isSetMax_value()) { - stats = org.apache.parquet.column.statistics.Statistics.createStats(type); - stats.setMinMaxFromBytes(statistics.min_value.array(), 
statistics.max_value.array()); - stats.setNumNulls(statistics.null_count); - } else { - // create stats object based on the column type - stats = org.apache.parquet.column.statistics.Statistics - .createLegacyStats(type.asPrimitiveType().getPrimitiveTypeName()); - - boolean isSet = statistics.isSetMax() && statistics.isSetMin(); - boolean maxEqualsMin = isSet ? Arrays.equals(statistics.getMin(), statistics.getMax()) : false; - boolean sortOrdersMatch = SortOrder.SIGNED == typeSortOrder; - // NOTE: See docs in CorruptStatistics for explanation of why this check is needed - // The sort order is checked to avoid returning min/max stats that are not - // valid with the type's sort order. In previous releases, all stats were - // aggregated using a signed byte-wise ordering, which isn't valid for all the - // types (e.g. strings, decimals etc.). - if (!CorruptStatistics.shouldIgnoreStatistics(createdBy, type.getPrimitiveTypeName()) && - (sortOrdersMatch || maxEqualsMin)) { - if (isSet) { - stats.setMinMaxFromBytes(statistics.min.array(), statistics.max.array()); + // create stats object based on the column type + org.apache.parquet.column.statistics.Statistics stats = org.apache.parquet.column.statistics.Statistics.createStats(type); + + if (statistics != null) { + // Use the new V2 min-max statistics over the former one if it is filled + if (statistics.isSetMin_value() && statistics.isSetMax_value()) { + byte[] min = statistics.min_value.array(); + byte[] max = statistics.max_value.array(); + // Ordering of INT96 and INTERVAL types is not clear; + // we only support statistics for them if min and max are equal + if (isMinMaxStatsSupported(type) || Arrays.equals(min, max)) { + stats.setMinMaxFromBytes(min, max); } stats.setNumNulls(statistics.null_count); + } else { + boolean isSet = statistics.isSetMax() && statistics.isSetMin(); + boolean maxEqualsMin = isSet ? 
Arrays.equals(statistics.getMin(), statistics.getMax()) : false; + boolean sortOrdersMatch = SortOrder.SIGNED == typeSortOrder; + // NOTE: See docs in CorruptStatistics for explanation of why this check is needed + // The sort order is checked to avoid returning min/max stats that are not + // valid with the type's sort order. In previous releases, all stats were + // aggregated using a signed byte-wise ordering, which isn't valid for all the + // types (e.g. strings, decimals etc.). + if (!CorruptStatistics.shouldIgnoreStatistics(createdBy, type.getPrimitiveTypeName()) && + (sortOrdersMatch || maxEqualsMin)) { + if (isSet) { + stats.setMinMaxFromBytes(statistics.min.array(), statistics.max.array()); + } + stats.setNumNulls(statistics.null_count); + } } } return stats; @@ -484,6 +499,11 @@ private static SortOrder defaultSortOrder(PrimitiveTypeName primitive) { * @return the "correct" sort order of the type that applications assume */ private static SortOrder sortOrder(PrimitiveType primitive) { + // Have to handle null type to support deprecated methods + if (primitive == null) { + return SortOrder.UNKNOWN; + } + OriginalType annotation = primitive.getOriginalType(); if (annotation != null) { switch (annotation) { @@ -988,9 +1008,11 @@ public void writeDataPageHeader( new org.apache.parquet.column.statistics.BooleanStatistics(), rlEncoding, dlEncoding, - valuesEncoding), to); + valuesEncoding, + null), to); } + @Deprecated public void writeDataPageHeader( int uncompressedSize, int compressedSize, @@ -1002,7 +1024,23 @@ public void writeDataPageHeader( OutputStream to) throws IOException { writePageHeader( newDataPageHeader(uncompressedSize, compressedSize, valueCount, statistics, - rlEncoding, dlEncoding, valuesEncoding), + rlEncoding, dlEncoding, valuesEncoding, null), + to); + } + + public void writeDataPageHeader( + int uncompressedSize, + int compressedSize, + int valueCount, + org.apache.parquet.column.statistics.Statistics statistics, + 
org.apache.parquet.column.Encoding rlEncoding, + org.apache.parquet.column.Encoding dlEncoding, + org.apache.parquet.column.Encoding valuesEncoding, + OutputStream to, + PrimitiveType type) throws IOException { + writePageHeader( + newDataPageHeader(uncompressedSize, compressedSize, valueCount, statistics, + rlEncoding, dlEncoding, valuesEncoding, type), to); } @@ -1012,7 +1050,8 @@ private PageHeader newDataPageHeader( org.apache.parquet.column.statistics.Statistics statistics, org.apache.parquet.column.Encoding rlEncoding, org.apache.parquet.column.Encoding dlEncoding, - org.apache.parquet.column.Encoding valuesEncoding) { + org.apache.parquet.column.Encoding valuesEncoding, + PrimitiveType type) { PageHeader pageHeader = new PageHeader(PageType.DATA_PAGE, uncompressedSize, compressedSize); // TODO: pageHeader.crc = ...; pageHeader.setData_page_header(new DataPageHeader( @@ -1022,11 +1061,12 @@ private PageHeader newDataPageHeader( getEncoding(rlEncoding))); if (!statistics.isEmpty()) { pageHeader.getData_page_header().setStatistics( - toParquetStatistics(statistics)); + toParquetStatistics(statistics, type)); } return pageHeader; } + @Deprecated public void writeDataPageV2Header( int uncompressedSize, int compressedSize, int valueCount, int nullCount, int rowCount, @@ -1040,7 +1080,23 @@ public void writeDataPageV2Header( valueCount, nullCount, rowCount, statistics, dataEncoding, - rlByteLength, dlByteLength), to); + rlByteLength, dlByteLength, null), to); + } + + public void writeDataPageV2Header( + int uncompressedSize, int compressedSize, + int valueCount, int nullCount, int rowCount, + org.apache.parquet.column.statistics.Statistics statistics, + org.apache.parquet.column.Encoding dataEncoding, + int rlByteLength, int dlByteLength, + OutputStream to, PrimitiveType type) throws IOException { + writePageHeader( + newDataPageV2Header( + uncompressedSize, compressedSize, + valueCount, nullCount, rowCount, + statistics, + dataEncoding, + rlByteLength, 
dlByteLength, type), to); } private PageHeader newDataPageV2Header( @@ -1048,7 +1104,7 @@ private PageHeader newDataPageV2Header( int valueCount, int nullCount, int rowCount, org.apache.parquet.column.statistics.Statistics statistics, org.apache.parquet.column.Encoding dataEncoding, - int rlByteLength, int dlByteLength) { + int rlByteLength, int dlByteLength, PrimitiveType type) { // TODO: pageHeader.crc = ...; DataPageHeaderV2 dataPageHeaderV2 = new DataPageHeaderV2( valueCount, nullCount, rowCount, @@ -1056,7 +1112,7 @@ private PageHeader newDataPageV2Header( dlByteLength, rlByteLength); if (!statistics.isEmpty()) { dataPageHeaderV2.setStatistics( - toParquetStatistics(statistics)); + toParquetStatistics(statistics, type)); } PageHeader pageHeader = new PageHeader(PageType.DATA_PAGE_V2, uncompressedSize, compressedSize); pageHeader.setData_page_header_v2(dataPageHeaderV2); diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ColumnChunkPageWriteStore.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ColumnChunkPageWriteStore.java index 8eeab5c55f..56fad46718 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ColumnChunkPageWriteStore.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ColumnChunkPageWriteStore.java @@ -108,7 +108,8 @@ public void writePage(BytesInput bytes, rlEncoding, dlEncoding, valuesEncoding, - tempOutputStream); + tempOutputStream, + path.getFullType()); this.uncompressedLength += uncompressedSize; this.compressedLength += compressedSize; this.totalValueCount += valueCount; @@ -153,7 +154,8 @@ public void writePageV2( dataEncoding, rlByteLength, dlByteLength, - tempOutputStream); + tempOutputStream, + path.getFullType()); this.uncompressedLength += uncompressedSize; this.compressedLength += compressedSize; this.totalValueCount += valueCount; diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileWriter.java 
b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileWriter.java index dcfb88b694..0128a9c663 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileWriter.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileWriter.java @@ -419,7 +419,8 @@ public void writeDataPage( rlEncoding, dlEncoding, valuesEncoding, - out); + out, + currentChunkType); long headerSize = out.getPos() - beforeHeader; this.uncompressedLength += uncompressedPageSize + headerSize; this.compressedLength += compressedPageSize + headerSize; diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java b/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java index 195d455ede..235cf4ed4f 100644 --- a/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java +++ b/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java @@ -397,21 +397,28 @@ private void testBinaryStats(StatsHelper helper) { Assert.assertTrue("Should be smaller than min + max size + 1", stats.isSmallerThan(totalLen + 1)); + PrimitiveType type = Types.optional(PrimitiveTypeName.BINARY).named(""); org.apache.parquet.format.Statistics formatStats = - helper.toParquetStatistics(stats); + helper.toParquetStatistics(type, stats); - Assert.assertArrayEquals("Min should match", min, formatStats.getMin()); - Assert.assertArrayEquals("Max should match", max, formatStats.getMax()); + assertFalse("Min should not be set", formatStats.isSetMin()); + assertFalse("Max should not be set", formatStats.isSetMax()); + if (helper == StatsHelper.V2) { + Assert.assertArrayEquals("Min_value should match", min, formatStats.getMin_value()); + Assert.assertArrayEquals("Max_value should match", max, formatStats.getMax_value()); + } Assert.assertEquals("Num nulls should match", 3004, formatStats.getNull_count()); // convert to 
empty stats because the values are too large stats.setMinMaxFromBytes(max, max); - formatStats = helper.toParquetStatistics(stats); + formatStats = helper.toParquetStatistics(type, stats); Assert.assertFalse("Min should not be set", formatStats.isSetMin()); Assert.assertFalse("Max should not be set", formatStats.isSetMax()); + Assert.assertFalse("Min_value should not be set", formatStats.isSetMin_value()); + Assert.assertFalse("Max_value should not be set", formatStats.isSetMax_value()); Assert.assertFalse("Num nulls should not be set", formatStats.isSetNull_count()); @@ -442,7 +449,7 @@ private void testIntegerStats(StatsHelper helper) { stats.updateStats(max); org.apache.parquet.format.Statistics formatStats = - helper.toParquetStatistics(stats); + helper.toParquetStatistics(Types.optional(PrimitiveTypeName.INT32).named(""), stats); Assert.assertEquals("Min should match", min, BytesUtils.bytesToInt(formatStats.getMin())); @@ -472,7 +479,7 @@ private void testLongStats(StatsHelper helper) { stats.updateStats(max); org.apache.parquet.format.Statistics formatStats = - helper.toParquetStatistics(stats); + helper.toParquetStatistics(Types.optional(PrimitiveTypeName.INT64).named(""), stats); Assert.assertEquals("Min should match", min, BytesUtils.bytesToLong(formatStats.getMin())); @@ -502,7 +509,7 @@ private void testFloatStats(StatsHelper helper) { stats.updateStats(max); org.apache.parquet.format.Statistics formatStats = - helper.toParquetStatistics(stats); + helper.toParquetStatistics(Types.optional(PrimitiveTypeName.FLOAT).named(""), stats); Assert.assertEquals("Min should match", min, Float.intBitsToFloat(BytesUtils.bytesToInt(formatStats.getMin())), @@ -534,7 +541,7 @@ private void testDoubleStats(StatsHelper helper) { stats.updateStats(max); org.apache.parquet.format.Statistics formatStats = - helper.toParquetStatistics(stats); + helper.toParquetStatistics(Types.optional(PrimitiveTypeName.DOUBLE).named(""), stats); Assert.assertEquals("Min should match", min, 
Double.longBitsToDouble(BytesUtils.bytesToLong(formatStats.getMin())), @@ -566,7 +573,7 @@ private void testBooleanStats(StatsHelper helper) { stats.updateStats(max); org.apache.parquet.format.Statistics formatStats = - helper.toParquetStatistics(stats); + helper.toParquetStatistics(Types.optional(PrimitiveTypeName.BOOLEAN).named(""), stats); Assert.assertEquals("Min should match", min, BytesUtils.bytesToBool(formatStats.getMin())); @@ -586,11 +593,12 @@ public void testIgnoreStatsWithSignedSortOrder() { stats.updateStats(Binary.fromString("z")); stats.incrementNumNulls(); + PrimitiveType binaryType = Types.required(PrimitiveTypeName.BINARY) + .as(OriginalType.UTF8).named("b"); Statistics convertedStats = converter.fromParquetStatistics( Version.FULL_VERSION, - StatsHelper.V1.toParquetStatistics(stats), - Types.required(PrimitiveTypeName.BINARY) - .as(OriginalType.UTF8).named("b")); + StatsHelper.V1.toParquetStatistics(binaryType, stats), + binaryType); Assert.assertTrue("Stats should be empty: " + convertedStats, convertedStats.isEmpty()); } @@ -614,11 +622,11 @@ private void testStillUseStatsWithSignedSortOrderIfSingleValue(StatsHelper helpe stats.updateStats(Binary.fromString("A")); stats.incrementNumNulls(); + PrimitiveType binaryType = Types.required(PrimitiveTypeName.BINARY).as(OriginalType.UTF8).named("b"); Statistics convertedStats = converter.fromParquetStatistics( Version.FULL_VERSION, - ParquetMetadataConverter.toParquetStatistics(stats), - Types.required(PrimitiveTypeName.BINARY) - .as(OriginalType.UTF8).named("b")); + ParquetMetadataConverter.toParquetStatistics(stats, binaryType), + binaryType); Assert.assertFalse("Stats should not be empty: " + convertedStats, convertedStats.isEmpty()); Assert.assertArrayEquals("min == max: " + convertedStats, convertedStats.getMaxBytes(), convertedStats.getMinBytes()); @@ -647,50 +655,72 @@ private void testUseStatsWithSignedSortOrder(StatsHelper helper) { stats.updateStats(Binary.fromString("z")); 
stats.incrementNumNulls(); + PrimitiveType binaryType = Types.required(PrimitiveTypeName.BINARY) + .as(OriginalType.UTF8).named("b"); Statistics convertedStats = converter.fromParquetStatistics( Version.FULL_VERSION, - helper.toParquetStatistics(stats), - Types.required(PrimitiveTypeName.BINARY) - .as(OriginalType.UTF8).named("b")); + helper.toParquetStatistics(binaryType, stats), + binaryType); Assert.assertFalse("Stats should not be empty", convertedStats.isEmpty()); Assert.assertEquals("Should have 3 nulls", 3, convertedStats.getNumNulls()); - Assert.assertEquals("Should have correct min (unsigned sort)", - Binary.fromString("A"), convertedStats.genericGetMin()); - Assert.assertEquals("Should have correct max (unsigned sort)", - Binary.fromString("z"), convertedStats.genericGetMax()); + if (helper == StatsHelper.V1) { + assertFalse("Min-max should be null for V1 stats", convertedStats.hasNonNullValue()); + } else { + Assert.assertEquals("Should have correct min (unsigned sort)", + Binary.fromString("A"), convertedStats.genericGetMin()); + Assert.assertEquals("Should have correct max (unsigned sort)", + Binary.fromString("z"), convertedStats.genericGetMax()); + } + } + + @Test + public void testSkippedV2Stats() { + testSkippedV2Stats( + Types.optional(PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY).length(12).as(OriginalType.INTERVAL).named(""), + new BigInteger("12345678"), + new BigInteger("12345679")); + testSkippedV2Stats(Types.optional(PrimitiveTypeName.INT96).named(""), + new BigInteger("-75687987"), + new BigInteger("45367657")); + } + + private void testSkippedV2Stats(PrimitiveType type, Object min, Object max) { + Statistics stats = createStats(type, min, max); + org.apache.parquet.format.Statistics statistics = ParquetMetadataConverter.toParquetStatistics(stats, type); + assertFalse(statistics.isSetMin()); + assertFalse(statistics.isSetMax()); + assertFalse(statistics.isSetMin_value()); + assertFalse(statistics.isSetMax_value()); } @Test public void 
testV2OnlyStats() { - testV2OnlyStats(createStats(Types.optional(PrimitiveTypeName.INT32).as(OriginalType.UINT_8).named(""), + testV2OnlyStats(Types.optional(PrimitiveTypeName.INT32).as(OriginalType.UINT_8).named(""), 0x7F, - 0x80)); - testV2OnlyStats(createStats(Types.optional(PrimitiveTypeName.INT32).as(OriginalType.UINT_16).named(""), + 0x80); + testV2OnlyStats(Types.optional(PrimitiveTypeName.INT32).as(OriginalType.UINT_16).named(""), 0x7FFF, - 0x8000)); - testV2OnlyStats(createStats(Types.optional(PrimitiveTypeName.INT32).as(OriginalType.UINT_32).named(""), + 0x8000); + testV2OnlyStats(Types.optional(PrimitiveTypeName.INT32).as(OriginalType.UINT_32).named(""), 0x7FFFFFFF, - 0x80000000)); - testV2OnlyStats(createStats(Types.optional(PrimitiveTypeName.INT64).as(OriginalType.UINT_64).named(""), + 0x80000000); + testV2OnlyStats(Types.optional(PrimitiveTypeName.INT64).as(OriginalType.UINT_64).named(""), 0x7FFFFFFFFFFFFFFFL, - 0x8000000000000000L)); + 0x8000000000000000L); + testV2OnlyStats(Types.optional(PrimitiveTypeName.BINARY).as(OriginalType.DECIMAL).precision(6).named(""), + new BigInteger("-765875"), + new BigInteger("876856")); testV2OnlyStats( - createStats(Types.optional(PrimitiveTypeName.BINARY).as(OriginalType.DECIMAL).precision(6).named(""), - new BigInteger("-765875"), - new BigInteger("876856"))); - testV2OnlyStats(createStats( Types.optional(PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY).length(14).as(OriginalType.DECIMAL).precision(7) .named(""), new BigInteger("-6769643"), - new BigInteger("9864675"))); - testV2OnlyStats(createStats(Types.optional(PrimitiveTypeName.INT96).named(""), - new BigInteger("-75687987"), - new BigInteger("45367657"))); + new BigInteger("9864675")); } - private void testV2OnlyStats(Statistics stats) { - org.apache.parquet.format.Statistics statistics = ParquetMetadataConverter.toParquetStatistics(stats); + private void testV2OnlyStats(PrimitiveType type, Object min, Object max) { + Statistics stats = createStats(type, min, max); + 
org.apache.parquet.format.Statistics statistics = ParquetMetadataConverter.toParquetStatistics(stats, type); assertFalse(statistics.isSetMin()); assertFalse(statistics.isSetMax()); assertEquals(ByteBuffer.wrap(stats.getMinBytes()), statistics.min_value); @@ -699,39 +729,54 @@ private void testV2OnlyStats(Statistics stats) { @Test public void testV2StatsEqualMinMax() { - testV2StatsEqualMinMax(createStats(Types.optional(PrimitiveTypeName.INT32).as(OriginalType.UINT_8).named(""), + testV2StatsEqualMinMax(Types.optional(PrimitiveTypeName.INT32).as(OriginalType.UINT_8).named(""), 93, - 93)); - testV2StatsEqualMinMax(createStats(Types.optional(PrimitiveTypeName.INT32).as(OriginalType.UINT_16).named(""), + 93); + testV2StatsEqualMinMax(Types.optional(PrimitiveTypeName.INT32).as(OriginalType.UINT_16).named(""), -5892, - -5892)); - testV2StatsEqualMinMax(createStats(Types.optional(PrimitiveTypeName.INT32).as(OriginalType.UINT_32).named(""), + -5892); + testV2StatsEqualMinMax(Types.optional(PrimitiveTypeName.INT32).as(OriginalType.UINT_32).named(""), 234998934, - 234998934)); - testV2StatsEqualMinMax(createStats(Types.optional(PrimitiveTypeName.INT64).as(OriginalType.UINT_64).named(""), + 234998934); + testV2StatsEqualMinMax(Types.optional(PrimitiveTypeName.INT64).as(OriginalType.UINT_64).named(""), -2389943895984985L, - -2389943895984985L)); - testV2StatsEqualMinMax(createStats(Types.optional(PrimitiveTypeName.BINARY).as(OriginalType.DECIMAL).precision(6).named(""), - new BigInteger("823749"), - new BigInteger("823749"))); - testV2StatsEqualMinMax(createStats( - Types.optional(PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY).length(14).as(OriginalType.DECIMAL).precision(7).named(""), + -2389943895984985L); + testV2StatsEqualMinMax(Types.optional(PrimitiveTypeName.BINARY).as(OriginalType.DECIMAL).precision(6).named(""), + new BigInteger("823749"), + new BigInteger("823749")); + testV2StatsEqualMinMax( + 
Types.optional(PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY).length(14).as(OriginalType.DECIMAL).precision(7) + .named(""), new BigInteger("-8752832"), - new BigInteger("-8752832"))); - testV2StatsEqualMinMax(createStats(Types.optional(PrimitiveTypeName.INT96).named(""), + new BigInteger("-8752832")); + testV2StatsEqualMinMax(Types.optional(PrimitiveTypeName.INT96).named(""), new BigInteger("81032984"), - new BigInteger("81032984"))); + new BigInteger("81032984")); } - private void testV2StatsEqualMinMax(Statistics stats) { - org.apache.parquet.format.Statistics statistics = ParquetMetadataConverter.toParquetStatistics(stats); + private void testV2StatsEqualMinMax(PrimitiveType type, Object min, Object max) { + Statistics stats = createStats(type, min, max); + org.apache.parquet.format.Statistics statistics = ParquetMetadataConverter.toParquetStatistics(stats, type); assertEquals(ByteBuffer.wrap(stats.getMinBytes()), statistics.min); assertEquals(ByteBuffer.wrap(stats.getMaxBytes()), statistics.max); assertEquals(ByteBuffer.wrap(stats.getMinBytes()), statistics.min_value); assertEquals(ByteBuffer.wrap(stats.getMaxBytes()), statistics.max_value); } - private static Statistics createStats(PrimitiveType type, int min, int max) { + private static Statistics createStats(PrimitiveType type, T min, T max) { + Class c = min.getClass(); + if (c == Integer.class) { + return createStatsTyped(type, (Integer) min, (Integer) max); + } else if (c == Long.class) { + return createStatsTyped(type, (Long) min, (Long) max); + } else if (c == BigInteger.class) { + return createStatsTyped(type, (BigInteger) min, (BigInteger) max); + } + fail("Not implemented"); + return null; + } + + private static Statistics createStatsTyped(PrimitiveType type, int min, int max) { Statistics stats = Statistics.createStats(type); stats.updateStats(max); stats.updateStats(min); @@ -740,7 +785,7 @@ private static Statistics createStats(PrimitiveType type, int min, int max) { return stats; } - private static 
Statistics createStats(PrimitiveType type, long min, long max) { + private static Statistics createStatsTyped(PrimitiveType type, long min, long max) { Statistics stats = Statistics.createStats(type); stats.updateStats(max); stats.updateStats(min); @@ -749,7 +794,7 @@ private static Statistics createStats(PrimitiveType type, long min, long max) return stats; } - private static Statistics createStats(PrimitiveType type, BigInteger min, BigInteger max) { + private static Statistics createStatsTyped(PrimitiveType type, BigInteger min, BigInteger max) { Statistics stats = Statistics.createStats(type); Binary minBinary = Binary.fromConstantByteArray(min.toByteArray()); Binary maxBinary = Binary.fromConstantByteArray(max.toByteArray()); @@ -764,8 +809,8 @@ private enum StatsHelper { // Only min and max are filled (min_value and max_value are not) V1() { @Override - public org.apache.parquet.format.Statistics toParquetStatistics(Statistics stats) { - org.apache.parquet.format.Statistics statistics = ParquetMetadataConverter.toParquetStatistics(stats); + public org.apache.parquet.format.Statistics toParquetStatistics(PrimitiveType type, Statistics stats) { + org.apache.parquet.format.Statistics statistics = ParquetMetadataConverter.toParquetStatistics(stats, type); statistics.unsetMin_value(); statistics.unsetMax_value(); return statistics; @@ -774,11 +819,11 @@ public org.apache.parquet.format.Statistics toParquetStatistics(Statistics st // min_value and max_value are filled (min and max might be filled as well) V2() { @Override - public org.apache.parquet.format.Statistics toParquetStatistics(Statistics stats) { - return ParquetMetadataConverter.toParquetStatistics(stats); + public org.apache.parquet.format.Statistics toParquetStatistics(PrimitiveType type, Statistics stats) { + return ParquetMetadataConverter.toParquetStatistics(stats, type); } }; - public abstract org.apache.parquet.format.Statistics toParquetStatistics(Statistics stats); + public abstract 
org.apache.parquet.format.Statistics toParquetStatistics(PrimitiveType type, Statistics stats); } } diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetFileWriter.java b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetFileWriter.java index 6915c86ec3..cc17153759 100644 --- a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetFileWriter.java +++ b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetFileWriter.java @@ -47,6 +47,7 @@ import org.apache.parquet.schema.MessageType; import org.apache.parquet.schema.MessageTypeParser; import org.apache.parquet.schema.PrimitiveType; +import org.apache.parquet.schema.Types; import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName; import java.io.File; @@ -431,8 +432,8 @@ public void testConvertToThriftStatistics() throws Exception { } final String createdBy = "parquet-mr version 1.8.0 (build d4d5a07ec9bd262ca1e93c309f1d7d4a74ebda4c)"; - Statistics thriftStats = - org.apache.parquet.format.converter.ParquetMetadataConverter.toParquetStatistics(parquetMRstats); + Statistics thriftStats = org.apache.parquet.format.converter.ParquetMetadataConverter + .toParquetStatistics(parquetMRstats, Types.optional(PrimitiveTypeName.INT64).named("")); LongStatistics convertedBackStats = (LongStatistics) org.apache.parquet.format.converter.ParquetMetadataConverter.fromParquetStatistics( createdBy, thriftStats, PrimitiveTypeName.INT64); From 95199e5e026ec08e6f64b4f9e95e32ebbde86193 Mon Sep 17 00:00:00 2001 From: Gabor Szadovszky Date: Wed, 29 Nov 2017 10:31:48 +0100 Subject: [PATCH 10/17] PARQUET-1025: Use lexicographical comparison for Binary.compareTo Also rename SIGNED_BINARY_COMPARATOR to a more descriptive name Also added comments for haxa representation of values at unsigned comparison testing --- .../org/apache/parquet/io/api/Binary.java | 112 +----------------- .../parquet/schema/PrimitiveComparator.java | 6 +- .../apache/parquet/schema/PrimitiveType.java 
| 6 +- .../schema/TestPrimitiveComparator.java | 54 ++++----- 4 files changed, 38 insertions(+), 140 deletions(-) diff --git a/parquet-column/src/main/java/org/apache/parquet/io/api/Binary.java b/parquet-column/src/main/java/org/apache/parquet/io/api/Binary.java index 46fa336b36..2681fb67ae 100644 --- a/parquet-column/src/main/java/org/apache/parquet/io/api/Binary.java +++ b/parquet-column/src/main/java/org/apache/parquet/io/api/Binary.java @@ -33,6 +33,7 @@ import org.apache.parquet.io.ParquetDecodingException; import org.apache.parquet.io.ParquetEncodingException; +import org.apache.parquet.schema.PrimitiveComparator; import static org.apache.parquet.bytes.BytesUtils.UTF8; @@ -79,10 +80,6 @@ private Binary() { } @Deprecated abstract public int compareTo(Binary other); - abstract int compareTo(byte[] bytes, int offset, int length); - - abstract int compareTo(ByteBuffer bytes, int offset, int length); - abstract public ByteBuffer toByteBuffer(); @Override @@ -195,17 +192,7 @@ boolean equals(ByteBuffer bytes, int otherOffset, int otherLength) { @Override public int compareTo(Binary other) { - return other.compareTo(value, offset, length); - } - - @Override - int compareTo(byte[] other, int otherOffset, int otherLength) { - return Binary.compareTwoByteArrays(value, offset, length, other, otherOffset, otherLength); - } - - @Override - int compareTo(ByteBuffer bytes, int otherOffset, int otherLength) { - return Binary.compareByteArrayToByteBuffer(value, offset, length, bytes, otherOffset, otherLength); + return PrimitiveComparator.LEXICOGRAPHICAL_BINARY_COMPARATOR.compare(this, other); } @Override @@ -351,20 +338,10 @@ boolean equals(ByteBuffer bytes, int otherOffset, int otherLength) { @Override public int compareTo(Binary other) { - return other.compareTo(value, 0, value.length); - } - - @Override - int compareTo(byte[] other, int otherOffset, int otherLength) { - return Binary.compareTwoByteArrays(value, 0, value.length, other, otherOffset, otherLength); + return 
PrimitiveComparator.LEXICOGRAPHICAL_BINARY_COMPARATOR.compare(this, other); } - @Override - int compareTo(ByteBuffer bytes, int otherOffset, int otherLength) { - return Binary.compareByteArrayToByteBuffer(value, 0, value.length, bytes, otherOffset, otherLength); - } - - @Override + @Override public ByteBuffer toByteBuffer() { return ByteBuffer.wrap(value); } @@ -511,26 +488,7 @@ boolean equals(ByteBuffer otherBytes, int otherOffset, int otherLength) { @Override public int compareTo(Binary other) { - if (value.hasArray()) { - return other.compareTo(value.array(), value.arrayOffset() + offset, length); - } else { - return other.compareTo(value, offset, length); - } - } - - @Override - int compareTo(byte[] other, int otherOffset, int otherLength) { - if (value.hasArray()) { - return Binary.compareTwoByteArrays(value.array(), value.arrayOffset() + offset, length, - other, otherOffset, otherLength); - } { - return Binary.compareByteBufferToByteArray(value, offset, length, other, otherOffset, otherLength); - } - } - - @Override - int compareTo(ByteBuffer bytes, int otherOffset, int otherLength) { - return Binary.compareTwoByteBuffers(value, offset, length, bytes, otherOffset, otherLength); + return PrimitiveComparator.LEXICOGRAPHICAL_BINARY_COMPARATOR.compare(this, other); } @Override @@ -671,64 +629,4 @@ private static final boolean equals(byte[] array1, int offset1, int length1, byt } return true; } - - private static final int compareByteBufferToByteArray(ByteBuffer buf, int offset1, int length1, - byte[] array, int offset2, int length2) { - return -1 * Binary.compareByteArrayToByteBuffer(array, offset1, length1, buf, offset2, length2); - } - - private static final int compareByteArrayToByteBuffer(byte[] array1, int offset1, int length1, - ByteBuffer buf, int offset2, int length2) { - if (array1 == null && buf == null) return 0; - int min_length = (length1 < length2) ? 
length1 : length2; - for (int i = 0; i < min_length; i++) { - if (array1[i + offset1] < buf.get(i + offset2)) { - return 1; - } - if (array1[i + offset1] > buf.get(i + offset2)) { - return -1; - } - } - // check remainder - if (length1 == length2) { return 0; } - else if (length1 < length2) { return 1;} - else { return -1; } - } - - private static final int compareTwoByteBuffers(ByteBuffer buf1, int offset1, int length1, - ByteBuffer buf2, int offset2, int length2) { - if (buf1 == null && buf2 == null) return 0; - int min_length = (length1 < length2) ? length1 : length2; - for (int i = 0; i < min_length; i++) { - if (buf1.get(i + offset1) < buf2.get(i + offset2)) { - return 1; - } - if (buf1.get(i + offset1) > buf2.get(i + offset2)) { - return -1; - } - } - // check remainder - if (length1 == length2) { return 0; } - else if (length1 < length2) { return 1;} - else { return -1; } - } - - private static final int compareTwoByteArrays(byte[] array1, int offset1, int length1, - byte[] array2, int offset2, int length2) { - if (array1 == null && array2 == null) return 0; - if (array1 == array2 && offset1 == offset2 && length1 == length2) return 0; - int min_length = (length1 < length2) ? 
length1 : length2; - for (int i = 0; i < min_length; i++) { - if (array1[i + offset1] < array2[i + offset2]) { - return 1; - } - if (array1[i + offset1] > array2[i + offset2]) { - return -1; - } - } - // check remainder - if (length1 == length2) { return 0; } - else if (length1 < length2) { return 1;} - else { return -1; } - } } diff --git a/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveComparator.java b/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveComparator.java index 33f119773a..5a56dcc370 100644 --- a/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveComparator.java +++ b/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveComparator.java @@ -178,7 +178,7 @@ final int toUnsigned(byte b) { } } - static final PrimitiveComparator LEXICOGRAPHICAL_BINARY_COMPARATOR = new BinaryComparator() { + public static final PrimitiveComparator LEXICOGRAPHICAL_BINARY_COMPARATOR = new BinaryComparator() { @Override int compare(ByteBuffer b1, ByteBuffer b2) { int l1 = b1.remaining(); @@ -207,7 +207,7 @@ public String toString() { } }; - static final PrimitiveComparator SIGNED_BINARY_COMPARATOR = new BinaryComparator() { + static final PrimitiveComparator BINARY_AS_SIGNED_INTEGER_COMPARATOR = new BinaryComparator() { private static final int NEGATIVE_PREFIX = 0xFF; private static final int POSITIVE_PREFIX = 0; @@ -241,7 +241,7 @@ int compare(ByteBuffer b1, ByteBuffer b2) { @Override public String toString() { - return "SIGNED_BINARY_COMPARATOR"; + return "BINARY_AS_SIGNED_INTEGER_COMPARATOR"; } }; } diff --git a/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveType.java b/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveType.java index e14acacb56..a5eb6247de 100644 --- a/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveType.java +++ b/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveType.java @@ -212,7 +212,7 @@ PrimitiveComparator comparator(OriginalType 
logicalType) { } switch (logicalType) { case DECIMAL: - return PrimitiveComparator.SIGNED_BINARY_COMPARATOR; + return PrimitiveComparator.BINARY_AS_SIGNED_INTEGER_COMPARATOR; case UTF8: case ENUM: case JSON: @@ -303,7 +303,7 @@ public T convert(PrimitiveTypeNameConverter conve @Override PrimitiveComparator comparator(OriginalType logicalType) { - return PrimitiveComparator.SIGNED_BINARY_COMPARATOR; + return PrimitiveComparator.BINARY_AS_SIGNED_INTEGER_COMPARATOR; } }, FIXED_LEN_BYTE_ARRAY("getBinary", Binary.class) { @@ -336,7 +336,7 @@ PrimitiveComparator comparator(OriginalType logicalType) { } switch (logicalType) { case DECIMAL: - return PrimitiveComparator.SIGNED_BINARY_COMPARATOR; + return PrimitiveComparator.BINARY_AS_SIGNED_INTEGER_COMPARATOR; case INTERVAL: return PrimitiveComparator.LEXICOGRAPHICAL_BINARY_COMPARATOR; default: diff --git a/parquet-column/src/test/java/org/apache/parquet/schema/TestPrimitiveComparator.java b/parquet-column/src/test/java/org/apache/parquet/schema/TestPrimitiveComparator.java index 212ce22409..9c88978c90 100644 --- a/parquet-column/src/test/java/org/apache/parquet/schema/TestPrimitiveComparator.java +++ b/parquet-column/src/test/java/org/apache/parquet/schema/TestPrimitiveComparator.java @@ -28,7 +28,7 @@ import static org.apache.parquet.schema.PrimitiveComparator.DOUBLE_COMPARATOR; import static org.apache.parquet.schema.PrimitiveComparator.FLOAT_COMPARATOR; import static org.apache.parquet.schema.PrimitiveComparator.LEXICOGRAPHICAL_BINARY_COMPARATOR; -import static org.apache.parquet.schema.PrimitiveComparator.SIGNED_BINARY_COMPARATOR; +import static org.apache.parquet.schema.PrimitiveComparator.BINARY_AS_SIGNED_INTEGER_COMPARATOR; import static org.apache.parquet.schema.PrimitiveComparator.SIGNED_INT32_COMPARATOR; import static org.apache.parquet.schema.PrimitiveComparator.SIGNED_INT64_COMPARATOR; import static org.apache.parquet.schema.PrimitiveComparator.UNSIGNED_INT32_COMPARATOR; @@ -70,13 +70,13 @@ public void 
testSignedInt32Comparator() { @Test public void testUnsignedInt32Comparator() { testInt32Comparator(UNSIGNED_INT32_COMPARATOR, - 0, - 1, - 12345, - Integer.MAX_VALUE, - Integer.MIN_VALUE, - -12345, - -1); + 0, // 0x00000000 + 1, // 0x00000001 + 12345, // 0x00003039 + Integer.MAX_VALUE, // 0x7FFFFFFF + Integer.MIN_VALUE, // 0x80000000 + -12345, // 0xFFFFCFC7 + -1); // 0xFFFFFFFF } private void testInt32Comparator(PrimitiveComparator comparator, int... values) { @@ -108,13 +108,13 @@ public void testSignedInt64Comparator() { @Test public void testUnsignedInt64Comparator() { testInt64Comparator(UNSIGNED_INT64_COMPARATOR, - 0, - 1, - 12345678901L, - Long.MAX_VALUE, - Long.MIN_VALUE, - -12345678901L, - -1); + 0, // 0x0000000000000000 + 1, // 0x0000000000000001 + 12345678901L, // 0x00000002DFDC1C35 + Long.MAX_VALUE, // 0x7FFFFFFFFFFFFFFF + Long.MIN_VALUE, // 0x8000000000000000 + -12345678901L, // 0xFFFFFFFD2023E3CB + -1); // 0xFFFFFFFFFFFFFFFF } private void testInt64Comparator(PrimitiveComparator comparator, long... 
values) { @@ -186,22 +186,22 @@ public void testDoubleComparator() { @Test public void testLexicographicalBinaryComparator() { testObjectComparator(LEXICOGRAPHICAL_BINARY_COMPARATOR, - Binary.fromConstantByteArray(new byte[0]), - Binary.fromConstantByteArray(new byte[]{127, 127, 0, 127}, 2, 1), - Binary.fromCharSequence("aaa"), - Binary.fromString("aaaa"), - Binary.fromReusedByteArray("aaab".getBytes()), - Binary.fromReusedByteArray("azzza".getBytes(), 1, 3), - Binary.fromReusedByteBuffer(ByteBuffer.wrap("zzzzzz".getBytes())), - Binary.fromReusedByteBuffer(ByteBuffer.wrap("aazzzzzzaa".getBytes(), 2, 7)), - Binary.fromConstantByteBuffer(ByteBuffer.wrap(new byte[]{-128, -128, -128})), - Binary.fromConstantByteBuffer(ByteBuffer.wrap(new byte[]{-128, -128, -1}, 1, 2)) + Binary.fromConstantByteArray(new byte[0]), // || + Binary.fromConstantByteArray(new byte[]{127, 127, 0, 127}, 2, 1), // |00| + Binary.fromCharSequence("aaa"), // |61|61|61| + Binary.fromString("aaaa"), // |61|61|61|61| + Binary.fromReusedByteArray("aaab".getBytes()), // |61|61|61|62| + Binary.fromReusedByteArray("azzza".getBytes(), 1, 3), // |7A|7A|7A| + Binary.fromReusedByteBuffer(ByteBuffer.wrap("zzzzzz".getBytes())), // |7A|7A|7A|7A|7A|7A| + Binary.fromReusedByteBuffer(ByteBuffer.wrap("aazzzzzzaa".getBytes(), 2, 7)), // |7A|7A|7A|7A|7A|7A|61| + Binary.fromConstantByteBuffer(ByteBuffer.wrap(new byte[]{-128, -128, -128})), // |80|80|80| + Binary.fromConstantByteBuffer(ByteBuffer.wrap(new byte[]{-128, -128, -1}, 1, 2)) // |80|FF| ); } @Test - public void testSignedBinaryComparator() { - testObjectComparator(SIGNED_BINARY_COMPARATOR, + public void testBinaryAsSignedIntegerComparator() { + testObjectComparator(BINARY_AS_SIGNED_INTEGER_COMPARATOR, Binary.fromConstantByteArray(new BigInteger("-9999999999999999999999999999999999999999").toByteArray()), Binary.fromReusedByteArray(new BigInteger("-9999999999999999999999999999999999999998").toByteArray()), 
Binary.fromConstantByteArray(BigInteger.valueOf(Long.MIN_VALUE).subtract(BigInteger.ONE).toByteArray()), From 70e56a7599b417da08b659a0ab6b790adb48dbb8 Mon Sep 17 00:00:00 2001 From: Gabor Szadovszky Date: Wed, 29 Nov 2017 16:52:22 +0100 Subject: [PATCH 11/17] PARQUET-1025: Add explicit list of types to not to read/write statistics --- .../converter/ParquetMetadataConverter.java | 38 +++++++++++++++++-- 1 file changed, 35 insertions(+), 3 deletions(-) diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java index 4b963070f6..99a41f24ad 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java @@ -364,9 +364,41 @@ public static Statistics toParquetStatistics( private static boolean isMinMaxStatsSupported(PrimitiveType type) { // Have to handle null type to support deprecated methods - return type != null - && type.getPrimitiveTypeName() != PrimitiveTypeName.INT96 - && type.getOriginalType() != OriginalType.INTERVAL; + if (type == null || type.getPrimitiveTypeName() == PrimitiveTypeName.INT96) { + return false; + } + + OriginalType origType = type.getOriginalType(); + if (origType == null) { + // All default primitive types excluding INT96 are supported by statistics + return true; + } + + // Explicitly listing all the supported logical types to avoid writing statistics for new types accidentally + switch (origType) { + case INT_8: + case INT_16: + case INT_32: + case INT_64: + case UINT_8: + case UINT_16: + case UINT_32: + case UINT_64: + case UTF8: + case DECIMAL: + case DATE: + case TIME_MILLIS: + case TIME_MICROS: + case TIMESTAMP_MILLIS: + case TIMESTAMP_MICROS: + case ENUM: + case JSON: + case BSON: + return true; + case INTERVAL: + default: + return false; + } } /** 
From bc86e8a63053fc7a36e535e6755801aeceace7bf Mon Sep 17 00:00:00 2001 From: Gabor Szadovszky Date: Wed, 6 Dec 2017 14:40:41 +0100 Subject: [PATCH 12/17] PARQUET-1025: Updates according to rdblue's comments --- .../parquet/column/ColumnDescriptor.java | 9 +- .../parquet/column/impl/ColumnWriterV1.java | 2 +- .../parquet/column/impl/ColumnWriterV2.java | 2 +- .../column/statistics/BinaryStatistics.java | 11 ++- .../column/statistics/BooleanStatistics.java | 15 ++-- .../column/statistics/DoubleStatistics.java | 15 ++-- .../column/statistics/FloatStatistics.java | 15 ++-- .../column/statistics/IntStatistics.java | 15 ++-- .../column/statistics/LongStatistics.java | 15 ++-- .../parquet/column/statistics/Statistics.java | 74 +++++++++++------ ...allyUpdatedFilterPredicateBuilderBase.java | 2 +- .../org/apache/parquet/io/api/Binary.java | 12 +-- .../parquet/schema/PrimitiveComparator.java | 81 +++++++++++++----- .../apache/parquet/schema/PrimitiveType.java | 14 ++-- .../java/org/apache/parquet/schema/Type.java | 9 -- .../column/statistics/TestStatistics.java | 48 +++++------ .../schema/TestPrimitiveComparator.java | 83 ++++++++++++------- .../statisticslevel/StatisticsFilter.java | 12 +-- .../converter/ParquetMetadataConverter.java | 74 +++-------------- .../hadoop/ColumnChunkPageWriteStore.java | 6 +- .../parquet/hadoop/ParquetFileWriter.java | 6 +- .../TestParquetMetadataConverter.java | 43 ++++------ .../parquet/hadoop/TestParquetFileWriter.java | 3 +- .../parquet/statistics/TestStatistics.java | 36 ++++---- 24 files changed, 304 insertions(+), 298 deletions(-) diff --git a/parquet-column/src/main/java/org/apache/parquet/column/ColumnDescriptor.java b/parquet-column/src/main/java/org/apache/parquet/column/ColumnDescriptor.java index 8c9ba46989..8bfe82a7f5 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/ColumnDescriptor.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/ColumnDescriptor.java @@ -18,7 +18,6 @@ */ package 
org.apache.parquet.column; -import org.apache.parquet.example.data.simple.Primitive; import org.apache.parquet.schema.PrimitiveType; import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName; import org.apache.parquet.schema.Type; @@ -102,22 +101,26 @@ public int getMaxDefinitionLevel() { /** * @return the type of that column + * @deprecated will removed in 2.0.0. Use {@link #getPrimitiveType()} instead. */ + @Deprecated public PrimitiveTypeName getType() { return type.getPrimitiveTypeName(); } /** * @return the size of the type + * @deprecated will removed in 2.0.0. Use {@link #getPrimitiveType()} instead. **/ + @Deprecated public int getTypeLength() { return type.getTypeLength(); } /** - * Returns the full type object of the column + * @return the primitive type object of the column */ - public PrimitiveType getFullType() { + public PrimitiveType getPrimitiveType() { return type; } diff --git a/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterV1.java b/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterV1.java index 80bd1ed5ee..e274c112b5 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterV1.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterV1.java @@ -80,7 +80,7 @@ private void log(Object value, int r, int d) { } private void resetStatistics() { - this.statistics = Statistics.createStats(this.path.getFullType()); + this.statistics = Statistics.createStats(this.path.getPrimitiveType()); } /** diff --git a/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterV2.java b/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterV2.java index a68fb6c3d1..b50d663b6c 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterV2.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterV2.java @@ -77,7 +77,7 @@ private void log(Object value, int r, int d) { } private void 
resetStatistics() { - this.statistics = Statistics.createStats(path.getFullType()); + this.statistics = Statistics.createStats(path.getPrimitiveType()); } private void definitionLevel(int definitionLevel) { diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/BinaryStatistics.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/BinaryStatistics.java index d03c42ee26..b198d76682 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/BinaryStatistics.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/BinaryStatistics.java @@ -20,27 +20,26 @@ import org.apache.parquet.io.api.Binary; import org.apache.parquet.schema.PrimitiveType; -import org.apache.parquet.schema.Type; import org.apache.parquet.schema.Types; public class BinaryStatistics extends Statistics { // A fake type object to be used to generate the proper comparator - private static final Type DEFAULT_TYPE = Types.optional(PrimitiveType.PrimitiveTypeName.BINARY).named(""); + private static final PrimitiveType DEFAULT_FAKE_TYPE = Types.optional(PrimitiveType.PrimitiveTypeName.BINARY).named(""); private Binary max; private Binary min; public BinaryStatistics() { - this(DEFAULT_TYPE); + this(DEFAULT_FAKE_TYPE); } - BinaryStatistics(Type type) { - super(type.comparator()); + BinaryStatistics(PrimitiveType type) { + super(type); } private BinaryStatistics(BinaryStatistics other) { - super(other.comparator()); + super(other.type()); if (other.hasNonNullValue()) { initializeStats(other.min, other.max); } diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/BooleanStatistics.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/BooleanStatistics.java index 7e3670073e..a7111bc23c 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/BooleanStatistics.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/BooleanStatistics.java @@ 
-20,27 +20,26 @@ import org.apache.parquet.bytes.BytesUtils; import org.apache.parquet.schema.PrimitiveType; -import org.apache.parquet.schema.Type; import org.apache.parquet.schema.Types; public class BooleanStatistics extends Statistics { // A fake type object to be used to generate the proper comparator - private static final Type DEFAULT_TYPE = Types.optional(PrimitiveType.PrimitiveTypeName.BOOLEAN).named(""); + private static final PrimitiveType DEFAULT_FAKE_TYPE = Types.optional(PrimitiveType.PrimitiveTypeName.BOOLEAN).named(""); private boolean max; private boolean min; public BooleanStatistics() { - this(DEFAULT_TYPE); + this(DEFAULT_FAKE_TYPE); } - BooleanStatistics(Type type) { - super(type.comparator()); + BooleanStatistics(PrimitiveType type) { + super(type); } private BooleanStatistics(BooleanStatistics other) { - super(other.comparator()); + super(other.type()); if (other.hasNonNullValue()) { initializeStats(other.min, other.max); } @@ -109,11 +108,11 @@ public Boolean genericGetMax() { return max; } - public int compareToMin(boolean value) { + public int compareMinToValue(boolean value) { return comparator().compare(min, value); } - public int compareToMax(boolean value) { + public int compareMaxToValue(boolean value) { return comparator().compare(max, value); } diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/DoubleStatistics.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/DoubleStatistics.java index 54f857d8c9..daf44197fa 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/DoubleStatistics.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/DoubleStatistics.java @@ -20,27 +20,26 @@ import org.apache.parquet.bytes.BytesUtils; import org.apache.parquet.schema.PrimitiveType; -import org.apache.parquet.schema.Type; import org.apache.parquet.schema.Types; public class DoubleStatistics extends Statistics { // A fake type object to be used to 
generate the proper comparator - private static final Type DEFAULT_TYPE = Types.optional(PrimitiveType.PrimitiveTypeName.DOUBLE).named(""); + private static final PrimitiveType DEFAULT_FAKE_TYPE = Types.optional(PrimitiveType.PrimitiveTypeName.DOUBLE).named(""); private double max; private double min; public DoubleStatistics() { - this(DEFAULT_TYPE); + this(DEFAULT_FAKE_TYPE); } - DoubleStatistics(Type type) { - super(type.comparator()); + DoubleStatistics(PrimitiveType type) { + super(type); } private DoubleStatistics(DoubleStatistics other) { - super(other.comparator()); + super(other.type()); if (other.hasNonNullValue()) { initializeStats(other.min, other.max); } @@ -114,11 +113,11 @@ public Double genericGetMax() { return max; } - public int compareToMin(double value) { + public int compareMinToValue(double value) { return comparator().compare(min, value); } - public int compareToMax(double value) { + public int compareMaxToValue(double value) { return comparator().compare(max, value); } diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/FloatStatistics.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/FloatStatistics.java index 80458d3291..3d39f5fd46 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/FloatStatistics.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/FloatStatistics.java @@ -20,28 +20,27 @@ import org.apache.parquet.bytes.BytesUtils; import org.apache.parquet.schema.PrimitiveType; -import org.apache.parquet.schema.Type; import org.apache.parquet.schema.Types; public class FloatStatistics extends Statistics { // A fake type object to be used to generate the proper comparator - private static final Type DEFAULT_TYPE = Types.optional(PrimitiveType.PrimitiveTypeName.FLOAT).named(""); + private static final PrimitiveType DEFAULT_FAKE_TYPE = Types.optional(PrimitiveType.PrimitiveTypeName.FLOAT).named(""); private float max; private float min; 
public FloatStatistics() { // Creating a fake primitive type to have the proper comparator - this(DEFAULT_TYPE); + this(DEFAULT_FAKE_TYPE); } - FloatStatistics(Type type) { - super(type.comparator()); + FloatStatistics(PrimitiveType type) { + super(type); } private FloatStatistics(FloatStatistics other) { - super(other.comparator()); + super(other.type()); if (other.hasNonNullValue()) { initializeStats(other.min, other.max); } @@ -115,11 +114,11 @@ public Float genericGetMax() { return max; } - public int compareToMin(float value) { + public int compareMinToValue(float value) { return comparator().compare(min, value); } - public int compareToMax(float value) { + public int compareMaxToValue(float value) { return comparator().compare(max, value); } diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/IntStatistics.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/IntStatistics.java index 929c35330a..31e9bf2d45 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/IntStatistics.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/IntStatistics.java @@ -20,27 +20,26 @@ import org.apache.parquet.bytes.BytesUtils; import org.apache.parquet.schema.PrimitiveType; -import org.apache.parquet.schema.Type; import org.apache.parquet.schema.Types; public class IntStatistics extends Statistics { // A fake type object to be used to generate the proper comparator - private static final Type DEFAULT_TYPE = Types.optional(PrimitiveType.PrimitiveTypeName.INT32).named(""); + private static final PrimitiveType DEFAULT_FAKE_TYPE = Types.optional(PrimitiveType.PrimitiveTypeName.INT32).named(""); private int max; private int min; public IntStatistics() { - this(DEFAULT_TYPE); + this(DEFAULT_FAKE_TYPE); } - IntStatistics(Type type) { - super(type.comparator()); + IntStatistics(PrimitiveType type) { + super(type); } private IntStatistics(IntStatistics other) { - super(other.comparator()); + 
super(other.type()); if (other.hasNonNullValue()) { initializeStats(other.min, other.max); } @@ -115,11 +114,11 @@ public Integer genericGetMax() { return max; } - public int compareToMin(int value) { + public int compareMinToValue(int value) { return comparator().compare(min, value); } - public int compareToMax(int value) { + public int compareMaxToValue(int value) { return comparator().compare(max, value); } diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/LongStatistics.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/LongStatistics.java index e823f26289..5858711644 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/LongStatistics.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/LongStatistics.java @@ -20,27 +20,26 @@ import org.apache.parquet.bytes.BytesUtils; import org.apache.parquet.schema.PrimitiveType; -import org.apache.parquet.schema.Type; import org.apache.parquet.schema.Types; public class LongStatistics extends Statistics { // A fake type object to be used to generate the proper comparator - private static final Type DEFAULT_TYPE = Types.optional(PrimitiveType.PrimitiveTypeName.INT64).named(""); + private static final PrimitiveType DEFAULT_FAKE_TYPE = Types.optional(PrimitiveType.PrimitiveTypeName.INT64).named(""); private long max; private long min; public LongStatistics() { - this(DEFAULT_TYPE); + this(DEFAULT_FAKE_TYPE); } - LongStatistics(Type type) { - super(type.comparator()); + LongStatistics(PrimitiveType type) { + super(type); } private LongStatistics(LongStatistics other) { - super(other.comparator()); + super(other.type()); if (other.hasNonNullValue()) { initializeStats(other.min, other.max); } @@ -115,11 +114,11 @@ public Long genericGetMax() { return max; } - public int compareToMin(long value) { + public int compareMinToValue(long value) { return comparator().compare(min, value); } - public int compareToMax(long value) { + public 
int compareMaxToValue(long value) { return comparator().compare(max, value); } diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java index ba6b0166a3..fde10a049f 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java @@ -18,15 +18,16 @@ */ package org.apache.parquet.column.statistics; +import java.util.Arrays; +import java.util.Objects; + import org.apache.parquet.column.UnknownColumnTypeException; import org.apache.parquet.io.api.Binary; import org.apache.parquet.schema.PrimitiveComparator; +import org.apache.parquet.schema.PrimitiveType; import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName; import org.apache.parquet.schema.Type; -import java.util.Arrays; -import java.util.Objects; - /** * Statistics class to keep track of statistics in parquet pages and column chunks @@ -35,14 +36,16 @@ */ public abstract class Statistics> { + private final PrimitiveType type; private final PrimitiveComparator comparator; private boolean hasNonNullValue; private long num_nulls; - Statistics(PrimitiveComparator comparator) { + Statistics(PrimitiveType type) { + this.type = type; + this.comparator = type.comparator(); hasNonNullValue = false; num_nulls = 0; - this.comparator = comparator; } /** @@ -86,24 +89,24 @@ public static Statistics getStatsBasedOnType(PrimitiveTypeName type) { * @return instance of a typed statistics class */ public static Statistics createStats(Type type) { - PrimitiveTypeName primitive = type.asPrimitiveType().getPrimitiveTypeName(); - switch (primitive) { + PrimitiveType primitive = type.asPrimitiveType(); + switch (primitive.getPrimitiveTypeName()) { case INT32: - return new IntStatistics(type); + return new IntStatistics(primitive); case INT64: - return new LongStatistics(type); + return new 
LongStatistics(primitive); case FLOAT: - return new FloatStatistics(type); + return new FloatStatistics(primitive); case DOUBLE: - return new DoubleStatistics(type); + return new DoubleStatistics(primitive); case BOOLEAN: - return new BooleanStatistics(type); + return new BooleanStatistics(primitive); case BINARY: case INT96: case FIXED_LEN_BYTE_ARRAY: - return new BinaryStatistics(type); + return new BinaryStatistics(primitive); default: - throw new UnknownColumnTypeException(primitive); + throw new UnknownColumnTypeException(primitive.getPrimitiveTypeName()); } } @@ -192,7 +195,8 @@ public void mergeStatistics(Statistics stats) { // Merge stats only if they have the same type and comparator (the sorting order // is the same) - if (this.getClass() == stats.getClass() && Objects.equals(comparator(), stats.comparator())) { + if (this.getClass() == stats.getClass() && type.getPrimitiveTypeName() == stats.type.getPrimitiveTypeName() + && type.getOriginalType() == stats.type.getOriginalType()) { incrementNumNulls(stats.getNumNulls()); if (stats.hasNonNullValue()) { mergeStatisticsMinMax(stats); @@ -220,7 +224,7 @@ public void mergeStatistics(Statistics stats) { /** * Returns the min value in the statistics. The java natural order of the returned type defined by {@link * T#compareTo(Object)} might not be the proper one. For example, UINT_32 requires unsigned comparison instead of the - * natural signed one. Use {@link #compareToMin(Comparable)} or the comparator returned by {@link #comparator()} to + * natural signed one. Use {@link #compareMinToValue(Comparable)} or the comparator returned by {@link #comparator()} to * always get the proper ordering. */ abstract public T genericGetMin(); @@ -228,34 +232,44 @@ public void mergeStatistics(Statistics stats) { /** * Returns the max value in the statistics. The java natural order of the returned type defined by {@link * T#compareTo(Object)} might not be the proper one. 
For example, UINT_32 requires unsigned comparison instead of the - * natural signed one. Use {@link #compareToMax(Comparable)} or the comparator returned by {@link #comparator()} to + * natural signed one. Use {@link #compareMaxToValue(Comparable)} or the comparator returned by {@link #comparator()} to * always get the proper ordering. */ abstract public T genericGetMax(); /** - * Returns the comparator to be used to compare two generic values in the proper way (for example, unsigned comparison - * for UINT_32). + * Returns the {@link PrimitiveComparator} implementation to be used to compare two generic values in the proper way + * (for example, unsigned comparison for UINT_32). */ public final PrimitiveComparator comparator() { return comparator; } /** - * Compares the specified value to min in the proper way. + * Compares min to the specified value in the proper way. It does the same as invoking + * {@code comparator().compare(genericGetMin(), value)}. The corresponding statistics implementations overload this + * method so the one with the primitive argument shall be used to avoid boxing/unboxing. * - * @see Comparable#compareTo(Object) + * @param value + * the value which {@code min} is to be compared to + * @return a negative integer, zero, or a positive integer as {@code min} is less than, equal to, or greater than + * {@code value}. */ - public final int compareToMin(T value) { + public final int compareMinToValue(T value) { return comparator.compare(genericGetMin(), value); } /** - * Compares the specified value to max in the proper way. + * Compares max to the specified value in the proper way. It does the same as invoking + * {@code comparator().compare(genericGetMax(), value)}. The corresponding statistics implementations overload this + * method so the one with the primitive argument shall be used to avoid boxing/unboxing. 
* - * @see Comparable#compareTo(Object) + * @param value + * the value which {@code max} is to be compared to + * @return a negative integer, zero, or a positive integer as {@code max} is less than, equal to, or greater than + * {@code value}. */ - public final int compareToMax(T value) { + public final int compareMaxToValue(T value) { return comparator.compare(genericGetMax(), value); } @@ -363,9 +377,15 @@ protected void markAsNotEmpty() { } /** - * Returns a new independent statistics instance of this class. All the values - * are copied. + * @return a new independent statistics instance of this class. */ public abstract Statistics copy(); + + /** + * @return the primitive type object which this statistics is created for + */ + public PrimitiveType type() { + return type; + } } diff --git a/parquet-column/src/main/java/org/apache/parquet/filter2/recordlevel/IncrementallyUpdatedFilterPredicateBuilderBase.java b/parquet-column/src/main/java/org/apache/parquet/filter2/recordlevel/IncrementallyUpdatedFilterPredicateBuilderBase.java index 0c5160bad2..c1f759c377 100644 --- a/parquet-column/src/main/java/org/apache/parquet/filter2/recordlevel/IncrementallyUpdatedFilterPredicateBuilderBase.java +++ b/parquet-column/src/main/java/org/apache/parquet/filter2/recordlevel/IncrementallyUpdatedFilterPredicateBuilderBase.java @@ -67,7 +67,7 @@ public IncrementallyUpdatedFilterPredicateBuilderBase(List le for (PrimitiveColumnIO leaf : leaves) { ColumnDescriptor descriptor = leaf.getColumnDescriptor(); ColumnPath path = ColumnPath.get(descriptor.getPath()); - PrimitiveComparator comparator = descriptor.getFullType().comparator(); + PrimitiveComparator comparator = descriptor.getPrimitiveType().comparator(); comparatorsByColumn.put(path, comparator); } } diff --git a/parquet-column/src/main/java/org/apache/parquet/io/api/Binary.java b/parquet-column/src/main/java/org/apache/parquet/io/api/Binary.java index 2681fb67ae..9f5f0f2f7b 100644 --- 
a/parquet-column/src/main/java/org/apache/parquet/io/api/Binary.java +++ b/parquet-column/src/main/java/org/apache/parquet/io/api/Binary.java @@ -73,9 +73,9 @@ private Binary() { } abstract boolean equals(Binary other); /** - * @deprecated The comparison logic depends on the related logical type therefore this one might not be correct. The - * {@link java.util.Comparator} implementation for the related type available at {@link Type#comparator()} shall be - * used instead. + * @deprecated will be removed in 2.0.0. The comparison logic depends on the related logical type therefore this one + * might not be correct. The {@link java.util.Comparator} implementation for the related type available at + * {@link Type#comparator()} shall be used instead. */ @Deprecated abstract public int compareTo(Binary other); @@ -192,7 +192,7 @@ boolean equals(ByteBuffer bytes, int otherOffset, int otherLength) { @Override public int compareTo(Binary other) { - return PrimitiveComparator.LEXICOGRAPHICAL_BINARY_COMPARATOR.compare(this, other); + return PrimitiveComparator.UNSIGNED_LEXICOGRAPHICAL_BINARY_COMPARATOR.compare(this, other); } @Override @@ -338,7 +338,7 @@ boolean equals(ByteBuffer bytes, int otherOffset, int otherLength) { @Override public int compareTo(Binary other) { - return PrimitiveComparator.LEXICOGRAPHICAL_BINARY_COMPARATOR.compare(this, other); + return PrimitiveComparator.UNSIGNED_LEXICOGRAPHICAL_BINARY_COMPARATOR.compare(this, other); } @Override @@ -488,7 +488,7 @@ boolean equals(ByteBuffer otherBytes, int otherOffset, int otherLength) { @Override public int compareTo(Binary other) { - return PrimitiveComparator.LEXICOGRAPHICAL_BINARY_COMPARATOR.compare(this, other); + return PrimitiveComparator.UNSIGNED_LEXICOGRAPHICAL_BINARY_COMPARATOR.compare(this, other); } @Override diff --git a/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveComparator.java b/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveComparator.java index 
5a56dcc370..019fba6493 100644 --- a/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveComparator.java +++ b/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveComparator.java @@ -31,28 +31,38 @@ public abstract class PrimitiveComparator implements Comparator { public int compare(boolean b1, boolean b2) { - throw new UnsupportedOperationException("compare(boolean, boolean) was called on a non-boolean comparator"); + throw new UnsupportedOperationException("compare(boolean, boolean) was called on a non-boolean comparator: " + toString()); } public int compare(int i1, int i2) { - throw new UnsupportedOperationException("compare(int, int) was called on a non-int comparator"); + throw new UnsupportedOperationException("compare(int, int) was called on a non-int comparator: " + toString()); } public int compare(long l1, long l2) { - throw new UnsupportedOperationException("compare(long, long) was called on a non-long comparator"); + throw new UnsupportedOperationException("compare(long, long) was called on a non-long comparator: " + toString()); } public int compare(float f1, float f2) { - throw new UnsupportedOperationException("compare(float, float) was called on a non-float comparator"); + throw new UnsupportedOperationException("compare(float, float) was called on a non-float comparator: " + toString()); } public int compare(double d1, double d2) { - throw new UnsupportedOperationException("compare(double, double) was called on a non-double comparator"); + throw new UnsupportedOperationException("compare(double, double) was called on a non-double comparator: " + toString()); } + @Override + public final int compare(T o1, T o2) { + if (o1 == null) { + return o2 == null ? 0 : -1; + } + return o2 == null ? 
1 : compareNotNulls(o1, o2); + } + + abstract int compareNotNulls(T o1, T o2); + static final PrimitiveComparator BOOLEAN_COMPARATOR = new PrimitiveComparator() { @Override - public int compare(Boolean o1, Boolean o2) { + int compareNotNulls(Boolean o1, Boolean o2) { return compare(o1.booleanValue(), o2.booleanValue()); } @@ -69,7 +79,7 @@ public String toString() { private static abstract class IntComparator extends PrimitiveComparator { @Override - public final int compare(Integer o1, Integer o2) { + int compareNotNulls(Integer o1, Integer o2) { return compare(o1.intValue(), o2.intValue()); } } @@ -101,7 +111,7 @@ public String toString() { private static abstract class LongComparator extends PrimitiveComparator { @Override - public final int compare(Long o1, Long o2) { + int compareNotNulls(Long o1, Long o2) { return compare(o1.longValue(), o2.longValue()); } } @@ -133,7 +143,7 @@ public String toString() { static final PrimitiveComparator FLOAT_COMPARATOR = new PrimitiveComparator() { @Override - public int compare(Float o1, Float o2) { + int compareNotNulls(Float o1, Float o2) { return compare(o1.floatValue(), o2.floatValue()); } @@ -150,7 +160,7 @@ public String toString() { static final PrimitiveComparator DOUBLE_COMPARATOR = new PrimitiveComparator() { @Override - public int compare(Double o1, Double o2) { + int compareNotNulls(Double o1, Double o2) { return compare(o1.doubleValue(), o2.doubleValue()); } @@ -167,7 +177,7 @@ public String toString() { private static abstract class BinaryComparator extends PrimitiveComparator { @Override - public final int compare(Binary o1, Binary o2) { + int compareNotNulls(Binary o1, Binary o2) { return compare(o1.toByteBuffer(), o2.toByteBuffer()); } @@ -178,7 +188,7 @@ final int toUnsigned(byte b) { } } - public static final PrimitiveComparator LEXICOGRAPHICAL_BINARY_COMPARATOR = new BinaryComparator() { + public static final PrimitiveComparator UNSIGNED_LEXICOGRAPHICAL_BINARY_COMPARATOR = new BinaryComparator() { 
@Override int compare(ByteBuffer b1, ByteBuffer b2) { int l1 = b1.remaining(); @@ -203,10 +213,15 @@ private int unsignedCompare(byte b1, byte b2) { @Override public String toString() { - return "LEXICOGRAPHICAL_BINARY_COMPARATOR"; + return "UNSIGNED_LEXICOGRAPHICAL_BINARY_COMPARATOR"; } }; + /* + * This comparator is for comparing two signed decimal values represented in twos-complement binary. In case of the + * binary length of one value is shorter than the other it will be padded by the related prefix (0xFF for negative, + * 0x00 for positive values). + */ static final PrimitiveComparator BINARY_AS_SIGNED_INTEGER_COMPARATOR = new BinaryComparator() { private static final int NEGATIVE_PREFIX = 0xFF; private static final int POSITIVE_PREFIX = 0; @@ -224,14 +239,38 @@ int compare(ByteBuffer b1, ByteBuffer b2) { return isNegative1 ? -1 : 1; } - int maxL = Math.max(l1, l2); - int iDiff1 = maxL - l1; - int iDiff2 = maxL - l2; - int prefix = isNegative1 ? NEGATIVE_PREFIX : POSITIVE_PREFIX; - for (int i = 0; i < maxL; ++i) { - int value1 = i < iDiff1 ? prefix : toUnsigned(b1.get(p1 + i - iDiff1)); - int value2 = i < iDiff2 ? prefix : toUnsigned(b2.get(p2 + i - iDiff2)); - int result = value1 - value2; + int result = 0; + + // Compare the beginning of the longer buffer with the proper padding + int diff = l1 - l2; + if (diff < 0) { + result = -compareWithPadding(-diff, b2, p2, isNegative1 ? NEGATIVE_PREFIX : POSITIVE_PREFIX); + p2 += -diff; + } else if (diff > 0) { + result = compareWithPadding(diff, b1, p1, isNegative2 ?
NEGATIVE_PREFIX : POSITIVE_PREFIX); + p1 += diff; + } + + // The beginning of the longer buffer equals to the padding or the lengths are equal + if (result == 0) { + result = compare(l1, b1, p1, b2, p2); + } + return result; + } + + private int compareWithPadding(int length, ByteBuffer b, int p, int paddingByte) { + for (int i = p, n = p + length; i < n; ++i) { + int result = toUnsigned(b.get(i)) - paddingByte; + if (result != 0) { + return result; + } + } + return 0; + } + + private int compare(int length, ByteBuffer b1, int p1, ByteBuffer b2, int p2) { + for (int i = 0; i < length; ++i) { + int result = toUnsigned(b1.get(p1 + i)) - toUnsigned(b2.get(p2 + i)); if (result != 0) { return result; } diff --git a/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveType.java b/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveType.java index a5eb6247de..39d90ffe76 100644 --- a/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveType.java +++ b/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveType.java @@ -208,7 +208,7 @@ public T convert(PrimitiveTypeNameConverter conve @Override PrimitiveComparator comparator(OriginalType logicalType) { if (logicalType == null) { - return PrimitiveComparator.LEXICOGRAPHICAL_BINARY_COMPARATOR; + return PrimitiveComparator.UNSIGNED_LEXICOGRAPHICAL_BINARY_COMPARATOR; } switch (logicalType) { case DECIMAL: @@ -217,7 +217,7 @@ PrimitiveComparator comparator(OriginalType logicalType) { case ENUM: case JSON: case BSON: - return PrimitiveComparator.LEXICOGRAPHICAL_BINARY_COMPARATOR; + return PrimitiveComparator.UNSIGNED_LEXICOGRAPHICAL_BINARY_COMPARATOR; default: throw new ShouldNeverHappenException( "No comparator logic implemented for BINARY logical type: " + logicalType); @@ -332,13 +332,13 @@ public T convert(PrimitiveTypeNameConverter conve @Override PrimitiveComparator comparator(OriginalType logicalType) { if (logicalType == null) { - return 
PrimitiveComparator.LEXICOGRAPHICAL_BINARY_COMPARATOR; + return PrimitiveComparator.UNSIGNED_LEXICOGRAPHICAL_BINARY_COMPARATOR; } switch (logicalType) { case DECIMAL: return PrimitiveComparator.BINARY_AS_SIGNED_INTEGER_COMPARATOR; case INTERVAL: - return PrimitiveComparator.LEXICOGRAPHICAL_BINARY_COMPARATOR; + return PrimitiveComparator.UNSIGNED_LEXICOGRAPHICAL_BINARY_COMPARATOR; default: throw new ShouldNeverHappenException( "No comparator logic implemented for FIXED_LEN_BYTE_ARRAY logical type: " + logicalType); @@ -649,7 +649,11 @@ protected Type union(Type toMerge, boolean strict) { return builder.as(getOriginalType()).named(getName()); } - @Override + /** + * Returns the {@link Type} specific comparator for properly comparing values. The natural ordering of the values + * might not be proper in certain cases (e.g. {@code UINT_32} requires unsigned comparison of {@code int} values while + * the natural ordering is signed.) + */ @SuppressWarnings("unchecked") public PrimitiveComparator comparator() { return (PrimitiveComparator) getPrimitiveTypeName().comparator(getOriginalType()); diff --git a/parquet-column/src/main/java/org/apache/parquet/schema/Type.java b/parquet-column/src/main/java/org/apache/parquet/schema/Type.java index a5f7c59e70..782151a7e4 100644 --- a/parquet-column/src/main/java/org/apache/parquet/schema/Type.java +++ b/parquet-column/src/main/java/org/apache/parquet/schema/Type.java @@ -323,13 +323,4 @@ void checkContains(Type subType) { * @return the converted tree */ abstract T convert(List path, TypeConverter converter); - - /** - * Returns the {@link Type} specific comparator for properly comparing values. The natural ordering of the values - * might not proper in certain cases (e.g. {@code UINT_32} requires unsigned comparison of {@code int} values while - * the natural ordering is signed.)
- */ - public PrimitiveComparator comparator() { - throw new UnsupportedOperationException("No comparator is implemented for type: " + this); - } } diff --git a/parquet-column/src/test/java/org/apache/parquet/column/statistics/TestStatistics.java b/parquet-column/src/test/java/org/apache/parquet/column/statistics/TestStatistics.java index 721e407b14..476fbb3376 100644 --- a/parquet-column/src/test/java/org/apache/parquet/column/statistics/TestStatistics.java +++ b/parquet-column/src/test/java/org/apache/parquet/column/statistics/TestStatistics.java @@ -74,12 +74,12 @@ public void testIntMinMax() { assertEquals(statsNeg.getMax(), 54); assertEquals(statsNeg.getMin(), -66); - assertEquals(-1, statsNeg.compareToMax(55)); - assertEquals(0, statsNeg.compareToMax(54)); - assertEquals(1, statsNeg.compareToMax(5)); - assertEquals(-1, statsNeg.compareToMin(0)); - assertEquals(0, statsNeg.compareToMin(-66)); - assertEquals(1, statsNeg.compareToMin(-67)); + assertTrue(statsNeg.compareMaxToValue(55) < 0); + assertTrue(statsNeg.compareMaxToValue(54) == 0); + assertTrue(statsNeg.compareMaxToValue(5) > 0); + assertTrue(statsNeg.compareMinToValue(0) < 0); + assertTrue(statsNeg.compareMinToValue(-66) == 0); + assertTrue(statsNeg.compareMinToValue(-67) > 0); // Test converting to and from byte[] byte[] intMaxBytes = statsNeg.getMaxBytes(); @@ -142,12 +142,12 @@ public void testLongMinMax() { assertEquals(statsNeg.getMax(), 993); assertEquals(statsNeg.getMin(), -9914); - assertEquals(-1, statsNeg.compareToMax(994)); - assertEquals(0, statsNeg.compareToMax(993)); - assertEquals(1, statsNeg.compareToMax(-1000)); - assertEquals(-1, statsNeg.compareToMin(10000)); - assertEquals(0, statsNeg.compareToMin(-9914)); - assertEquals(1, statsNeg.compareToMin(-9915)); + assertTrue(statsNeg.compareMaxToValue(994) < 0); + assertTrue(statsNeg.compareMaxToValue(993) == 0); + assertTrue(statsNeg.compareMaxToValue(-1000) > 0); + assertTrue(statsNeg.compareMinToValue(10000) < 0); + 
assertTrue(statsNeg.compareMinToValue(-9914) == 0); + assertTrue(statsNeg.compareMinToValue(-9915) > 0); // Test converting to and from byte[] byte[] longMaxBytes = statsNeg.getMaxBytes(); @@ -210,12 +210,12 @@ public void testFloatMinMax() { assertEquals(statsNeg.getMax(), 0.65f, 1e-10); assertEquals(statsNeg.getMin(), -412.99f, 1e-10); - assertEquals(-1, statsNeg.compareToMax(1)); - assertEquals(0, statsNeg.compareToMax(0.65F)); - assertEquals(1, statsNeg.compareToMax(0.649F)); - assertEquals(-1, statsNeg.compareToMin(-412.98F)); - assertEquals(0, statsNeg.compareToMin(-412.99F)); - assertEquals(1, statsNeg.compareToMin(-450)); + assertTrue(statsNeg.compareMaxToValue(1) < 0); + assertTrue(statsNeg.compareMaxToValue(0.65F) == 0); + assertTrue(statsNeg.compareMaxToValue(0.649F) > 0); + assertTrue(statsNeg.compareMinToValue(-412.98F) < 0); + assertTrue(statsNeg.compareMinToValue(-412.99F) == 0); + assertTrue(statsNeg.compareMinToValue(-450) > 0); // Test converting to and from byte[] byte[] floatMaxBytes = statsNeg.getMaxBytes(); @@ -278,12 +278,12 @@ public void testDoubleMinMax() { assertEquals(statsNeg.getMax(), 23.0d, 1e-10); assertEquals(statsNeg.getMin(), -944.5d, 1e-10); - assertEquals(-1, statsNeg.compareToMax(23.0001D)); - assertEquals(0, statsNeg.compareToMax(23D)); - assertEquals(1, statsNeg.compareToMax(0D)); - assertEquals(-1, statsNeg.compareToMin(-400D)); - assertEquals(0, statsNeg.compareToMin(-944.5D)); - assertEquals(1, statsNeg.compareToMin(-944.500001D)); + assertTrue(statsNeg.compareMaxToValue(23.0001D) < 0); + assertTrue(statsNeg.compareMaxToValue(23D) == 0); + assertTrue(statsNeg.compareMaxToValue(0D) > 0); + assertTrue(statsNeg.compareMinToValue(-400D) < 0); + assertTrue(statsNeg.compareMinToValue(-944.5D) == 0); + assertTrue(statsNeg.compareMinToValue(-944.500001D) > 0); // Test converting to and from byte[] byte[] doubleMaxBytes = statsNeg.getMaxBytes(); diff --git 
a/parquet-column/src/test/java/org/apache/parquet/schema/TestPrimitiveComparator.java b/parquet-column/src/test/java/org/apache/parquet/schema/TestPrimitiveComparator.java index 9c88978c90..6f2e276c62 100644 --- a/parquet-column/src/test/java/org/apache/parquet/schema/TestPrimitiveComparator.java +++ b/parquet-column/src/test/java/org/apache/parquet/schema/TestPrimitiveComparator.java @@ -27,7 +27,7 @@ import static org.apache.parquet.schema.PrimitiveComparator.BOOLEAN_COMPARATOR; import static org.apache.parquet.schema.PrimitiveComparator.DOUBLE_COMPARATOR; import static org.apache.parquet.schema.PrimitiveComparator.FLOAT_COMPARATOR; -import static org.apache.parquet.schema.PrimitiveComparator.LEXICOGRAPHICAL_BINARY_COMPARATOR; +import static org.apache.parquet.schema.PrimitiveComparator.UNSIGNED_LEXICOGRAPHICAL_BINARY_COMPARATOR; import static org.apache.parquet.schema.PrimitiveComparator.BINARY_AS_SIGNED_INTEGER_COMPARATOR; import static org.apache.parquet.schema.PrimitiveComparator.SIGNED_INT32_COMPARATOR; import static org.apache.parquet.schema.PrimitiveComparator.SIGNED_INT64_COMPARATOR; @@ -36,19 +36,26 @@ import static org.junit.Assert.assertEquals; import static org.junit.Assert.fail; +/* + * This test verifies all the PrimitiveComparator implementations. The logic of all tests is the same: list the + * elements to be tested in ascending order and then compare every element to each other (including the element + * itself) and expect the related value based on the defined order.
+ */ public class TestPrimitiveComparator { @Test public void testBooleanComparator() { - boolean[] values = {false, true}; + Boolean[] values = {null, false, true}; for (int i = 0; i < values.length; ++i) { for (int j = 0; j < values.length; ++j) { - boolean vi = values[i]; - boolean vj = values[j]; + Boolean vi = values[i]; + Boolean vj = values[j]; int exp = i - j; assertSignumEquals(vi, vj, exp, BOOLEAN_COMPARATOR.compare(vi, vj)); - assertSignumEquals(vi, vj, exp, BOOLEAN_COMPARATOR.compare(Boolean.valueOf(vi), Boolean.valueOf(vj))); + if (vi != null && vj != null) { + assertSignumEquals(vi, vj, exp, BOOLEAN_COMPARATOR.compare(vi.booleanValue(), vj.booleanValue())); + } } } @@ -58,6 +65,7 @@ public void testBooleanComparator() { @Test public void testSignedInt32Comparator() { testInt32Comparator(SIGNED_INT32_COMPARATOR, + null, Integer.MIN_VALUE, -12345, -1, @@ -70,6 +78,7 @@ public void testSignedInt32Comparator() { @Test public void testUnsignedInt32Comparator() { testInt32Comparator(UNSIGNED_INT32_COMPARATOR, + null, 0, // 0x00000000 1, // 0x00000001 12345, // 0x00003039 @@ -79,14 +88,16 @@ public void testUnsignedInt32Comparator() { -1); // 0xFFFFFFFF } - private void testInt32Comparator(PrimitiveComparator comparator, int... values) { + private void testInt32Comparator(PrimitiveComparator comparator, Integer... values) { for (int i = 0; i < values.length; ++i) { for (int j = 0; j < values.length; ++j) { - int vi = values[i]; - int vj = values[j]; + Integer vi = values[i]; + Integer vj = values[j]; int exp = i - j; assertSignumEquals(vi, vj, exp, comparator.compare(vi, vj)); - assertSignumEquals(vi, vj, exp, comparator.compare(Integer.valueOf(vi), Integer.valueOf(vj))); + if (vi != null && vj != null) { + assertSignumEquals(vi, vj, exp, comparator.compare(vi.intValue(), vj.intValue())); + } } } @@ -96,11 +107,12 @@ private void testInt32Comparator(PrimitiveComparator comparator, int... 
@Test public void testSignedInt64Comparator() { testInt64Comparator(SIGNED_INT64_COMPARATOR, + null, Long.MIN_VALUE, -12345678901L, - -1, - 0, - 1, + -1L, + 0L, + 1L, 12345678901L, Long.MAX_VALUE); } @@ -108,23 +120,26 @@ public void testSignedInt64Comparator() { @Test public void testUnsignedInt64Comparator() { testInt64Comparator(UNSIGNED_INT64_COMPARATOR, - 0, // 0x0000000000000000 - 1, // 0x0000000000000001 + null, + 0L, // 0x0000000000000000 + 1L, // 0x0000000000000001 12345678901L, // 0x00000002DFDC1C35 Long.MAX_VALUE, // 0x7FFFFFFFFFFFFFFF Long.MIN_VALUE, // 0x8000000000000000 -12345678901L, // 0xFFFFFFFD2023E3CB - -1); // 0xFFFFFFFFFFFFFFFF + -1L); // 0xFFFFFFFFFFFFFFFF } - private void testInt64Comparator(PrimitiveComparator comparator, long... values) { + private void testInt64Comparator(PrimitiveComparator comparator, Long... values) { for (int i = 0; i < values.length; ++i) { for (int j = 0; j < values.length; ++j) { - long vi = values[i]; - long vj = values[j]; + Long vi = values[i]; + Long vj = values[j]; int exp = i - j; assertSignumEquals(vi, vj, exp, comparator.compare(vi, vj)); - assertSignumEquals(vi, vj, exp, comparator.compare(Long.valueOf(vi), Long.valueOf(vj))); + if (vi != null && vj != null) { + assertSignumEquals(vi, vj, exp, comparator.compare(vi.longValue(), vj.longValue())); + } } } @@ -133,12 +148,13 @@ private void testInt64Comparator(PrimitiveComparator comparator, long... 
v @Test public void testFloatComparator() { - float[] values = { + Float[] values = { + null, Float.NEGATIVE_INFINITY, -Float.MAX_VALUE, -1234.5678F, -Float.MIN_VALUE, - 0, + 0.0F, Float.MIN_VALUE, 1234.5678F, Float.MAX_VALUE, @@ -146,11 +162,13 @@ public void testFloatComparator() { for (int i = 0; i < values.length; ++i) { for (int j = 0; j < values.length; ++j) { - float vi = values[i]; - float vj = values[j]; + Float vi = values[i]; + Float vj = values[j]; int exp = i - j; assertSignumEquals(vi, vj, exp, FLOAT_COMPARATOR.compare(vi, vj)); - assertSignumEquals(vi, vj, exp, FLOAT_COMPARATOR.compare(Float.valueOf(vi), Float.valueOf(vj))); + if (vi != null && vj != null) { + assertSignumEquals(vi, vj, exp, FLOAT_COMPARATOR.compare(vi.floatValue(), vj.floatValue())); + } } } @@ -159,12 +177,13 @@ public void testFloatComparator() { @Test public void testDoubleComparator() { - double[] values = { + Double[] values = { + null, Double.NEGATIVE_INFINITY, -Double.MAX_VALUE, -123456.7890123456789, -Double.MIN_VALUE, - 0, + 0.0, Double.MIN_VALUE, 123456.7890123456789, Double.MAX_VALUE, @@ -172,11 +191,13 @@ public void testDoubleComparator() { for (int i = 0; i < values.length; ++i) { for (int j = 0; j < values.length; ++j) { - double vi = values[i]; - double vj = values[j]; + Double vi = values[i]; + Double vj = values[j]; int exp = i - j; assertSignumEquals(vi, vj, exp, DOUBLE_COMPARATOR.compare(vi, vj)); - assertSignumEquals(vi, vj, exp, DOUBLE_COMPARATOR.compare(Double.valueOf(vi), Double.valueOf(vj))); + if (vi != null && vj != null) { + assertSignumEquals(vi, vj, exp, DOUBLE_COMPARATOR.compare(vi.doubleValue(), vj.doubleValue())); + } } } @@ -185,7 +206,8 @@ public void testDoubleComparator() { @Test public void testLexicographicalBinaryComparator() { - testObjectComparator(LEXICOGRAPHICAL_BINARY_COMPARATOR, + testObjectComparator(UNSIGNED_LEXICOGRAPHICAL_BINARY_COMPARATOR, + null, Binary.fromConstantByteArray(new byte[0]), // || Binary.fromConstantByteArray(new 
byte[]{127, 127, 0, 127}, 2, 1), // |00| Binary.fromCharSequence("aaa"), // |61|61|61| @@ -202,6 +224,7 @@ public void testLexicographicalBinaryComparator() { @Test public void testBinaryAsSignedIntegerComparator() { testObjectComparator(BINARY_AS_SIGNED_INTEGER_COMPARATOR, + null, Binary.fromConstantByteArray(new BigInteger("-9999999999999999999999999999999999999999").toByteArray()), Binary.fromReusedByteArray(new BigInteger("-9999999999999999999999999999999999999998").toByteArray()), Binary.fromConstantByteArray(BigInteger.valueOf(Long.MIN_VALUE).subtract(BigInteger.ONE).toByteArray()), diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/filter2/statisticslevel/StatisticsFilter.java b/parquet-hadoop/src/main/java/org/apache/parquet/filter2/statisticslevel/StatisticsFilter.java index b75c47b635..f168a6004c 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/filter2/statisticslevel/StatisticsFilter.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/filter2/statisticslevel/StatisticsFilter.java @@ -134,7 +134,7 @@ public > Boolean visit(Eq eq) { } // drop if value < min || value > max - return stats.compareToMin(value) > 0 || stats.compareToMax(value) < 0; + return stats.compareMinToValue(value) > 0 || stats.compareMaxToValue(value) < 0; } @Override @@ -173,7 +173,7 @@ public > Boolean visit(NotEq notEq) { } // drop if this is a column where min = max = value - return stats.compareToMin(value) == 0 && stats.compareToMax(value) == 0; + return stats.compareMinToValue(value) == 0 && stats.compareMaxToValue(value) == 0; } @Override @@ -204,7 +204,7 @@ public > Boolean visit(Lt lt) { T value = lt.getValue(); // drop if value <= min - return stats.compareToMin(value) >= 0; + return stats.compareMinToValue(value) >= 0; } @Override @@ -235,7 +235,7 @@ public > Boolean visit(LtEq ltEq) { T value = ltEq.getValue(); // drop if value < min - return stats.compareToMin(value) > 0; + return stats.compareMinToValue(value) > 0; } @Override @@ -266,7 +266,7 
@@ public > Boolean visit(Gt gt) { T value = gt.getValue(); // drop if value >= max - return stats.compareToMax(value) <= 0; + return stats.compareMaxToValue(value) <= 0; } @Override @@ -297,7 +297,7 @@ public > Boolean visit(GtEq gtEq) { T value = gtEq.getValue(); // drop if value > max - return stats.compareToMax(value) < 0; + return stats.compareMaxToValue(value) < 0; } @Override diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java index 99a41f24ad..a83528df56 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java @@ -219,8 +219,7 @@ private void addRowGroup(ParquetMetadata parquetMetadata, List rowGrou columnMetaData.getFirstDataPageOffset()); columnChunk.meta_data.dictionary_page_offset = columnMetaData.getDictionaryPageOffset(); if (!columnMetaData.getStatistics().isEmpty()) { - columnChunk.meta_data - .setStatistics(toParquetStatistics(columnMetaData.getStatistics(), columnMetaData.getFullType())); + columnChunk.meta_data.setStatistics(toParquetStatistics(columnMetaData.getStatistics())); } if (columnMetaData.getEncodingStats() != null) { columnChunk.meta_data.setEncoding_stats(convertEncodingStats(columnMetaData.getEncodingStats())); @@ -326,14 +325,8 @@ dataPageType, getEncoding(encoding), return formatStats; } - @Deprecated public static Statistics toParquetStatistics( org.apache.parquet.column.statistics.Statistics statistics) { - return toParquetStatistics(statistics, null); - } - - public static Statistics toParquetStatistics( - org.apache.parquet.column.statistics.Statistics statistics, PrimitiveType type) { Statistics stats = new Statistics(); // Don't write stats larger than the max size rather than truncating. 
The // rationale is that some engines may use the minimum value in the page as @@ -348,12 +341,12 @@ public static Statistics toParquetStatistics( // Fill the former min-max statistics only if the comparison logic is // signed so the logic of V1 and V2 stats are the same (which is // trivially true for equal min-max values) - if (sortOrder(type) == SortOrder.SIGNED || Arrays.equals(min, max)) { + if (sortOrder(statistics.type()) == SortOrder.SIGNED || Arrays.equals(min, max)) { stats.setMin(min); stats.setMax(max); } - if (isMinMaxStatsSupported(type) || Arrays.equals(min, max)) { + if (isMinMaxStatsSupported(statistics.type()) || Arrays.equals(min, max)) { stats.setMin_value(min); stats.setMax_value(max); } @@ -363,8 +356,7 @@ public static Statistics toParquetStatistics( } private static boolean isMinMaxStatsSupported(PrimitiveType type) { - // Have to handle null type to support deprecated methods - if (type == null || type.getPrimitiveTypeName() == PrimitiveTypeName.INT96) { + if (type.getPrimitiveTypeName() == PrimitiveTypeName.INT96) { return false; } @@ -447,7 +439,7 @@ public static org.apache.parquet.column.statistics.Statistics fromParquetStatist // aggregated using a signed byte-wise ordering, which isn't valid for all the // types (e.g. strings, decimals etc.). 
if (!CorruptStatistics.shouldIgnoreStatistics(createdBy, type.getPrimitiveTypeName()) && - (sortOrdersMatch || maxEqualsMin)) { + (sortOrdersMatch || maxEqualsMin)) { if (isSet) { stats.setMinMaxFromBytes(statistics.min.array(), statistics.max.array()); } @@ -531,11 +523,6 @@ private static SortOrder defaultSortOrder(PrimitiveTypeName primitive) { * @return the "correct" sort order of the type that applications assume */ private static SortOrder sortOrder(PrimitiveType primitive) { - // Have to handle null type to support deprecated methods - if (primitive == null) { - return SortOrder.UNKNOWN; - } - OriginalType annotation = primitive.getOriginalType(); if (annotation != null) { switch (annotation) { @@ -1040,11 +1027,9 @@ public void writeDataPageHeader( new org.apache.parquet.column.statistics.BooleanStatistics(), rlEncoding, dlEncoding, - valuesEncoding, - null), to); + valuesEncoding), to); } - @Deprecated public void writeDataPageHeader( int uncompressedSize, int compressedSize, @@ -1056,23 +1041,7 @@ public void writeDataPageHeader( OutputStream to) throws IOException { writePageHeader( newDataPageHeader(uncompressedSize, compressedSize, valueCount, statistics, - rlEncoding, dlEncoding, valuesEncoding, null), - to); - } - - public void writeDataPageHeader( - int uncompressedSize, - int compressedSize, - int valueCount, - org.apache.parquet.column.statistics.Statistics statistics, - org.apache.parquet.column.Encoding rlEncoding, - org.apache.parquet.column.Encoding dlEncoding, - org.apache.parquet.column.Encoding valuesEncoding, - OutputStream to, - PrimitiveType type) throws IOException { - writePageHeader( - newDataPageHeader(uncompressedSize, compressedSize, valueCount, statistics, - rlEncoding, dlEncoding, valuesEncoding, type), + rlEncoding, dlEncoding, valuesEncoding), to); } @@ -1082,8 +1051,7 @@ private PageHeader newDataPageHeader( org.apache.parquet.column.statistics.Statistics statistics, org.apache.parquet.column.Encoding rlEncoding, 
org.apache.parquet.column.Encoding dlEncoding, - org.apache.parquet.column.Encoding valuesEncoding, - PrimitiveType type) { + org.apache.parquet.column.Encoding valuesEncoding) { PageHeader pageHeader = new PageHeader(PageType.DATA_PAGE, uncompressedSize, compressedSize); // TODO: pageHeader.crc = ...; pageHeader.setData_page_header(new DataPageHeader( @@ -1092,13 +1060,11 @@ private PageHeader newDataPageHeader( getEncoding(dlEncoding), getEncoding(rlEncoding))); if (!statistics.isEmpty()) { - pageHeader.getData_page_header().setStatistics( - toParquetStatistics(statistics, type)); + pageHeader.getData_page_header().setStatistics(toParquetStatistics(statistics)); } return pageHeader; } - @Deprecated public void writeDataPageV2Header( int uncompressedSize, int compressedSize, int valueCount, int nullCount, int rowCount, @@ -1112,23 +1078,7 @@ public void writeDataPageV2Header( valueCount, nullCount, rowCount, statistics, dataEncoding, - rlByteLength, dlByteLength, null), to); - } - - public void writeDataPageV2Header( - int uncompressedSize, int compressedSize, - int valueCount, int nullCount, int rowCount, - org.apache.parquet.column.statistics.Statistics statistics, - org.apache.parquet.column.Encoding dataEncoding, - int rlByteLength, int dlByteLength, - OutputStream to, PrimitiveType type) throws IOException { - writePageHeader( - newDataPageV2Header( - uncompressedSize, compressedSize, - valueCount, nullCount, rowCount, - statistics, - dataEncoding, - rlByteLength, dlByteLength, type), to); + rlByteLength, dlByteLength), to); } private PageHeader newDataPageV2Header( @@ -1136,7 +1086,7 @@ private PageHeader newDataPageV2Header( int valueCount, int nullCount, int rowCount, org.apache.parquet.column.statistics.Statistics statistics, org.apache.parquet.column.Encoding dataEncoding, - int rlByteLength, int dlByteLength, PrimitiveType type) { + int rlByteLength, int dlByteLength) { // TODO: pageHeader.crc = ...; DataPageHeaderV2 dataPageHeaderV2 = new 
DataPageHeaderV2( valueCount, nullCount, rowCount, @@ -1144,7 +1094,7 @@ private PageHeader newDataPageV2Header( dlByteLength, rlByteLength); if (!statistics.isEmpty()) { dataPageHeaderV2.setStatistics( - toParquetStatistics(statistics, type)); + toParquetStatistics(statistics)); } PageHeader pageHeader = new PageHeader(PageType.DATA_PAGE_V2, uncompressedSize, compressedSize); pageHeader.setData_page_header_v2(dataPageHeaderV2); diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ColumnChunkPageWriteStore.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ColumnChunkPageWriteStore.java index 56fad46718..8eeab5c55f 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ColumnChunkPageWriteStore.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ColumnChunkPageWriteStore.java @@ -108,8 +108,7 @@ public void writePage(BytesInput bytes, rlEncoding, dlEncoding, valuesEncoding, - tempOutputStream, - path.getFullType()); + tempOutputStream); this.uncompressedLength += uncompressedSize; this.compressedLength += compressedSize; this.totalValueCount += valueCount; @@ -154,8 +153,7 @@ public void writePageV2( dataEncoding, rlByteLength, dlByteLength, - tempOutputStream, - path.getFullType()); + tempOutputStream); this.uncompressedLength += uncompressedSize; this.compressedLength += compressedSize; this.totalValueCount += valueCount; diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileWriter.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileWriter.java index 0128a9c663..f440b433c2 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileWriter.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileWriter.java @@ -66,7 +66,6 @@ import org.apache.parquet.io.PositionOutputStream; import org.apache.parquet.schema.MessageType; import org.apache.parquet.schema.PrimitiveType; -import 
org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName; import org.apache.parquet.schema.TypeUtil; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -318,7 +317,7 @@ public void startColumn(ColumnDescriptor descriptor, encodingStatsBuilder.clear(); currentEncodings = new HashSet(); currentChunkPath = ColumnPath.get(descriptor.getPath()); - currentChunkType = descriptor.getFullType(); + currentChunkType = descriptor.getPrimitiveType(); currentChunkCodec = compressionCodecName; currentChunkValueCount = valueCount; currentChunkFirstDataPage = out.getPos(); @@ -419,8 +418,7 @@ public void writeDataPage( rlEncoding, dlEncoding, valuesEncoding, - out, - currentChunkType); + out); long headerSize = out.getPos() - beforeHeader; this.uncompressedLength += uncompressedPageSize + headerSize; this.compressedLength += compressedPageSize + headerSize; diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java b/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java index 235cf4ed4f..6808b7bc22 100644 --- a/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java +++ b/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java @@ -397,9 +397,7 @@ private void testBinaryStats(StatsHelper helper) { Assert.assertTrue("Should be smaller than min + max size + 1", stats.isSmallerThan(totalLen + 1)); - PrimitiveType type = Types.optional(PrimitiveTypeName.BINARY).named(""); - org.apache.parquet.format.Statistics formatStats = - helper.toParquetStatistics(type, stats); + org.apache.parquet.format.Statistics formatStats = helper.toParquetStatistics(stats); assertFalse("Min should not be set", formatStats.isSetMin()); assertFalse("Max should not be set", formatStats.isSetMax()); @@ -413,7 +411,7 @@ private void testBinaryStats(StatsHelper helper) { // convert to empty stats because the values are too 
large stats.setMinMaxFromBytes(max, max); - formatStats = helper.toParquetStatistics(type, stats); + formatStats = helper.toParquetStatistics(stats); Assert.assertFalse("Min should not be set", formatStats.isSetMin()); Assert.assertFalse("Max should not be set", formatStats.isSetMax()); @@ -448,8 +446,7 @@ private void testIntegerStats(StatsHelper helper) { stats.updateStats(min); stats.updateStats(max); - org.apache.parquet.format.Statistics formatStats = - helper.toParquetStatistics(Types.optional(PrimitiveTypeName.INT32).named(""), stats); + org.apache.parquet.format.Statistics formatStats = helper.toParquetStatistics(stats); Assert.assertEquals("Min should match", min, BytesUtils.bytesToInt(formatStats.getMin())); @@ -478,8 +475,7 @@ private void testLongStats(StatsHelper helper) { stats.updateStats(min); stats.updateStats(max); - org.apache.parquet.format.Statistics formatStats = - helper.toParquetStatistics(Types.optional(PrimitiveTypeName.INT64).named(""), stats); + org.apache.parquet.format.Statistics formatStats = helper.toParquetStatistics(stats); Assert.assertEquals("Min should match", min, BytesUtils.bytesToLong(formatStats.getMin())); @@ -508,8 +504,7 @@ private void testFloatStats(StatsHelper helper) { stats.updateStats(min); stats.updateStats(max); - org.apache.parquet.format.Statistics formatStats = - helper.toParquetStatistics(Types.optional(PrimitiveTypeName.FLOAT).named(""), stats); + org.apache.parquet.format.Statistics formatStats = helper.toParquetStatistics(stats); Assert.assertEquals("Min should match", min, Float.intBitsToFloat(BytesUtils.bytesToInt(formatStats.getMin())), @@ -540,8 +535,7 @@ private void testDoubleStats(StatsHelper helper) { stats.updateStats(min); stats.updateStats(max); - org.apache.parquet.format.Statistics formatStats = - helper.toParquetStatistics(Types.optional(PrimitiveTypeName.DOUBLE).named(""), stats); + org.apache.parquet.format.Statistics formatStats = helper.toParquetStatistics(stats); Assert.assertEquals("Min 
should match", min, Double.longBitsToDouble(BytesUtils.bytesToLong(formatStats.getMin())), @@ -572,8 +566,7 @@ private void testBooleanStats(StatsHelper helper) { stats.updateStats(min); stats.updateStats(max); - org.apache.parquet.format.Statistics formatStats = - helper.toParquetStatistics(Types.optional(PrimitiveTypeName.BOOLEAN).named(""), stats); + org.apache.parquet.format.Statistics formatStats = helper.toParquetStatistics(stats); Assert.assertEquals("Min should match", min, BytesUtils.bytesToBool(formatStats.getMin())); @@ -597,7 +590,7 @@ public void testIgnoreStatsWithSignedSortOrder() { .as(OriginalType.UTF8).named("b"); Statistics convertedStats = converter.fromParquetStatistics( Version.FULL_VERSION, - StatsHelper.V1.toParquetStatistics(binaryType, stats), + StatsHelper.V1.toParquetStatistics(stats), binaryType); Assert.assertTrue("Stats should be empty: " + convertedStats, convertedStats.isEmpty()); @@ -625,7 +618,7 @@ private void testStillUseStatsWithSignedSortOrderIfSingleValue(StatsHelper helpe PrimitiveType binaryType = Types.required(PrimitiveTypeName.BINARY).as(OriginalType.UTF8).named("b"); Statistics convertedStats = converter.fromParquetStatistics( Version.FULL_VERSION, - ParquetMetadataConverter.toParquetStatistics(stats, binaryType), + ParquetMetadataConverter.toParquetStatistics(stats), binaryType); Assert.assertFalse("Stats should not be empty: " + convertedStats, convertedStats.isEmpty()); @@ -659,7 +652,7 @@ private void testUseStatsWithSignedSortOrder(StatsHelper helper) { .as(OriginalType.UTF8).named("b"); Statistics convertedStats = converter.fromParquetStatistics( Version.FULL_VERSION, - helper.toParquetStatistics(binaryType, stats), + helper.toParquetStatistics(stats), binaryType); Assert.assertFalse("Stats should not be empty", convertedStats.isEmpty()); @@ -687,7 +680,7 @@ public void testSkippedV2Stats() { private void testSkippedV2Stats(PrimitiveType type, Object min, Object max) { Statistics stats = createStats(type, min, 
max); - org.apache.parquet.format.Statistics statistics = ParquetMetadataConverter.toParquetStatistics(stats, type); + org.apache.parquet.format.Statistics statistics = ParquetMetadataConverter.toParquetStatistics(stats); assertFalse(statistics.isSetMin()); assertFalse(statistics.isSetMax()); assertFalse(statistics.isSetMin_value()); @@ -720,7 +713,7 @@ public void testV2OnlyStats() { private void testV2OnlyStats(PrimitiveType type, Object min, Object max) { Statistics stats = createStats(type, min, max); - org.apache.parquet.format.Statistics statistics = ParquetMetadataConverter.toParquetStatistics(stats, type); + org.apache.parquet.format.Statistics statistics = ParquetMetadataConverter.toParquetStatistics(stats); assertFalse(statistics.isSetMin()); assertFalse(statistics.isSetMax()); assertEquals(ByteBuffer.wrap(stats.getMinBytes()), statistics.min_value); @@ -756,7 +749,7 @@ public void testV2StatsEqualMinMax() { private void testV2StatsEqualMinMax(PrimitiveType type, Object min, Object max) { Statistics stats = createStats(type, min, max); - org.apache.parquet.format.Statistics statistics = ParquetMetadataConverter.toParquetStatistics(stats, type); + org.apache.parquet.format.Statistics statistics = ParquetMetadataConverter.toParquetStatistics(stats); assertEquals(ByteBuffer.wrap(stats.getMinBytes()), statistics.min); assertEquals(ByteBuffer.wrap(stats.getMaxBytes()), statistics.max); assertEquals(ByteBuffer.wrap(stats.getMinBytes()), statistics.min_value); @@ -809,8 +802,8 @@ private enum StatsHelper { // Only min and max are filled (min_value and max_value are not) V1() { @Override - public org.apache.parquet.format.Statistics toParquetStatistics(PrimitiveType type, Statistics stats) { - org.apache.parquet.format.Statistics statistics = ParquetMetadataConverter.toParquetStatistics(stats, type); + public org.apache.parquet.format.Statistics toParquetStatistics(Statistics stats) { + org.apache.parquet.format.Statistics statistics = 
ParquetMetadataConverter.toParquetStatistics(stats); statistics.unsetMin_value(); statistics.unsetMax_value(); return statistics; @@ -819,11 +812,11 @@ public org.apache.parquet.format.Statistics toParquetStatistics(PrimitiveType ty // min_value and max_value are filled (min and max might be filled as well) V2() { @Override - public org.apache.parquet.format.Statistics toParquetStatistics(PrimitiveType type, Statistics stats) { - return ParquetMetadataConverter.toParquetStatistics(stats, type); + public org.apache.parquet.format.Statistics toParquetStatistics(Statistics stats) { + return ParquetMetadataConverter.toParquetStatistics(stats); } }; - public abstract org.apache.parquet.format.Statistics toParquetStatistics(PrimitiveType type, Statistics stats); + public abstract org.apache.parquet.format.Statistics toParquetStatistics(Statistics stats); } } diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetFileWriter.java b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetFileWriter.java index cc17153759..2f52692683 100644 --- a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetFileWriter.java +++ b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetFileWriter.java @@ -47,7 +47,6 @@ import org.apache.parquet.schema.MessageType; import org.apache.parquet.schema.MessageTypeParser; import org.apache.parquet.schema.PrimitiveType; -import org.apache.parquet.schema.Types; import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName; import java.io.File; @@ -433,7 +432,7 @@ public void testConvertToThriftStatistics() throws Exception { final String createdBy = "parquet-mr version 1.8.0 (build d4d5a07ec9bd262ca1e93c309f1d7d4a74ebda4c)"; Statistics thriftStats = org.apache.parquet.format.converter.ParquetMetadataConverter - .toParquetStatistics(parquetMRstats, Types.optional(PrimitiveTypeName.INT64).named("")); + .toParquetStatistics(parquetMRstats); LongStatistics convertedBackStats = 
(LongStatistics) org.apache.parquet.format.converter.ParquetMetadataConverter.fromParquetStatistics( createdBy, thriftStats, PrimitiveTypeName.INT64); diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/statistics/TestStatistics.java b/parquet-hadoop/src/test/java/org/apache/parquet/statistics/TestStatistics.java index 047ec30283..bf216423cb 100644 --- a/parquet-hadoop/src/test/java/org/apache/parquet/statistics/TestStatistics.java +++ b/parquet-hadoop/src/test/java/org/apache/parquet/statistics/TestStatistics.java @@ -19,26 +19,6 @@ package org.apache.parquet.statistics; -import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY; -import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.DOUBLE; -import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY; -import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.FLOAT; -import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT32; -import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT64; -import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT96; -import static org.apache.parquet.schema.Type.Repetition.OPTIONAL; -import static org.apache.parquet.schema.Type.Repetition.REQUIRED; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; - -import java.io.File; -import java.io.IOException; -import java.math.BigInteger; -import java.util.Arrays; -import java.util.Comparator; -import java.util.List; -import java.util.Random; - import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.parquet.column.ColumnDescriptor; @@ -74,6 +54,20 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; +import java.io.File; +import java.io.IOException; +import java.math.BigInteger; +import java.util.Arrays; +import java.util.Comparator; +import java.util.List; +import java.util.Random; + +import static 
org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.*; +import static org.apache.parquet.schema.Type.Repetition.OPTIONAL; +import static org.apache.parquet.schema.Type.Repetition.REQUIRED; +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.assertEquals; + public class TestStatistics { private static final int MEGABYTE = 1 << 20; private static final long RANDOM_SEED = 1441990701846L; //System.currentTimeMillis(); @@ -299,7 +293,7 @@ private void validateStatsForPage(DataPage page, DictionaryPage dict, ColumnDesc Statistics stats = getStatisticsFromPageHeader(page); assertEquals("Statistics does not use the proper comparator", - desc.getFullType().comparator().getClass(), + desc.getPrimitiveType().comparator().getClass(), stats.comparator().getClass()); if (stats.isEmpty()) { From a2ae97ce580a04c172e7037eb686680ef9a37b6f Mon Sep 17 00:00:00 2001 From: Gabor Szadovszky Date: Thu, 7 Dec 2017 10:29:10 +0100 Subject: [PATCH 13/17] PARQUET-1025: Unified formatting/comments/deprecation --- .../parquet/column/ColumnDescriptor.java | 4 +- .../column/statistics/BinaryStatistics.java | 4 + .../column/statistics/BooleanStatistics.java | 4 + .../column/statistics/DoubleStatistics.java | 4 + .../column/statistics/FloatStatistics.java | 4 + .../column/statistics/IntStatistics.java | 4 + .../column/statistics/LongStatistics.java | 4 + .../parquet/column/statistics/Statistics.java | 72 ++++---- .../parquet/filter2/predicate/Statistics.java | 3 + .../parquet/schema/PrimitiveComparator.java | 9 +- .../java/org/apache/parquet/schema/Type.java | 1 + .../org/apache/parquet/io/api/TestBinary.java | 7 +- .../schema/TestPrimitiveComparator.java | 170 +++++++++--------- .../dictionarylevel/DictionaryFilter.java | 10 +- .../converter/ParquetMetadataConverter.java | 92 +++++----- .../hadoop/ColumnChunkPageWriteStore.java | 4 +- .../parquet/hadoop/ParquetFileWriter.java | 6 +- .../hadoop/metadata/ColumnChunkMetaData.java | 37 ++-- 
.../metadata/ColumnChunkProperties.java | 17 +- .../TestParquetMetadataConverter.java | 12 +- .../parquet/hadoop/TestParquetFileWriter.java | 4 +- 21 files changed, 261 insertions(+), 211 deletions(-) diff --git a/parquet-column/src/main/java/org/apache/parquet/column/ColumnDescriptor.java b/parquet-column/src/main/java/org/apache/parquet/column/ColumnDescriptor.java index 8bfe82a7f5..5f30cd0901 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/ColumnDescriptor.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/ColumnDescriptor.java @@ -18,12 +18,12 @@ */ package org.apache.parquet.column; +import java.util.Arrays; + import org.apache.parquet.schema.PrimitiveType; import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName; import org.apache.parquet.schema.Type; -import java.util.Arrays; - /** * Describes a column's type as well as its position in its containing schema. * diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/BinaryStatistics.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/BinaryStatistics.java index b198d76682..84be75a7b9 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/BinaryStatistics.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/BinaryStatistics.java @@ -30,6 +30,10 @@ public class BinaryStatistics extends Statistics { private Binary max; private Binary min; + /** + * @deprecated will be removed in 2.0.0. 
Use {@link Statistics#createStats(org.apache.parquet.schema.Type)} instead + */ + @Deprecated public BinaryStatistics() { this(DEFAULT_FAKE_TYPE); } diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/BooleanStatistics.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/BooleanStatistics.java index a7111bc23c..12b2f8ed14 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/BooleanStatistics.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/BooleanStatistics.java @@ -30,6 +30,10 @@ public class BooleanStatistics extends Statistics { private boolean max; private boolean min; + /** + * @deprecated will be removed in 2.0.0. Use {@link Statistics#createStats(org.apache.parquet.schema.Type)} instead + */ + @Deprecated public BooleanStatistics() { this(DEFAULT_FAKE_TYPE); } diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/DoubleStatistics.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/DoubleStatistics.java index daf44197fa..090ef684ad 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/DoubleStatistics.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/DoubleStatistics.java @@ -30,6 +30,10 @@ public class DoubleStatistics extends Statistics { private double max; private double min; + /** + * @deprecated will be removed in 2.0.0. 
Use {@link Statistics#createStats(org.apache.parquet.schema.Type)} instead + */ + @Deprecated public DoubleStatistics() { this(DEFAULT_FAKE_TYPE); } diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/FloatStatistics.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/FloatStatistics.java index 3d39f5fd46..b72947c79b 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/FloatStatistics.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/FloatStatistics.java @@ -30,6 +30,10 @@ public class FloatStatistics extends Statistics { private float max; private float min; + /** + * @deprecated will be removed in 2.0.0. Use {@link Statistics#createStats(org.apache.parquet.schema.Type)} instead + */ + @Deprecated public FloatStatistics() { // Creating a fake primitive type to have the proper comparator this(DEFAULT_FAKE_TYPE); diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/IntStatistics.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/IntStatistics.java index 31e9bf2d45..aa52018529 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/IntStatistics.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/IntStatistics.java @@ -30,6 +30,10 @@ public class IntStatistics extends Statistics { private int max; private int min; + /** + * @deprecated will be removed in 2.0.0. 
Use {@link Statistics#createStats(org.apache.parquet.schema.Type)} instead + */ + @Deprecated public IntStatistics() { this(DEFAULT_FAKE_TYPE); } diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/LongStatistics.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/LongStatistics.java index 5858711644..90fac05ec4 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/LongStatistics.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/LongStatistics.java @@ -30,6 +30,10 @@ public class LongStatistics extends Statistics { private long max; private long min; + /** + * @deprecated will be removed in 2.0.0. Use {@link Statistics#createStats(org.apache.parquet.schema.Type)} instead + */ + @Deprecated public LongStatistics() { this(DEFAULT_FAKE_TYPE); } diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java index fde10a049f..69733ee480 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java @@ -50,33 +50,31 @@ public abstract class Statistics> { /** * Returns the typed statistics object based on the passed type parameter - * - * @param type - * PrimitiveTypeName type of the column + * @param type PrimitiveTypeName type of the column * @return instance of a typed statistics class * @deprecated Use {@link #createStats(Type)} instead */ @Deprecated public static Statistics getStatsBasedOnType(PrimitiveTypeName type) { switch(type) { - case INT32: - return new IntStatistics(); - case INT64: - return new LongStatistics(); - case FLOAT: - return new FloatStatistics(); - case DOUBLE: - return new DoubleStatistics(); - case BOOLEAN: - return new BooleanStatistics(); - case BINARY: - return new BinaryStatistics(); - case INT96: - return new 
BinaryStatistics(); - case FIXED_LEN_BYTE_ARRAY: - return new BinaryStatistics(); - default: - throw new UnknownColumnTypeException(type); + case INT32: + return new IntStatistics(); + case INT64: + return new LongStatistics(); + case FLOAT: + return new FloatStatistics(); + case DOUBLE: + return new DoubleStatistics(); + case BOOLEAN: + return new BooleanStatistics(); + case BINARY: + return new BinaryStatistics(); + case INT96: + return new BinaryStatistics(); + case FIXED_LEN_BYTE_ARRAY: + return new BinaryStatistics(); + default: + throw new UnknownColumnTypeException(type); } } @@ -91,22 +89,22 @@ public static Statistics getStatsBasedOnType(PrimitiveTypeName type) { public static Statistics createStats(Type type) { PrimitiveType primitive = type.asPrimitiveType(); switch (primitive.getPrimitiveTypeName()) { - case INT32: - return new IntStatistics(primitive); - case INT64: - return new LongStatistics(primitive); - case FLOAT: - return new FloatStatistics(primitive); - case DOUBLE: - return new DoubleStatistics(primitive); - case BOOLEAN: - return new BooleanStatistics(primitive); - case BINARY: - case INT96: - case FIXED_LEN_BYTE_ARRAY: - return new BinaryStatistics(primitive); - default: - throw new UnknownColumnTypeException(primitive.getPrimitiveTypeName()); + case INT32: + return new IntStatistics(primitive); + case INT64: + return new LongStatistics(primitive); + case FLOAT: + return new FloatStatistics(primitive); + case DOUBLE: + return new DoubleStatistics(primitive); + case BOOLEAN: + return new BooleanStatistics(primitive); + case BINARY: + case INT96: + case FIXED_LEN_BYTE_ARRAY: + return new BinaryStatistics(primitive); + default: + throw new UnknownColumnTypeException(primitive.getPrimitiveTypeName()); } } diff --git a/parquet-column/src/main/java/org/apache/parquet/filter2/predicate/Statistics.java b/parquet-column/src/main/java/org/apache/parquet/filter2/predicate/Statistics.java index 5b83183c95..8df0250638 100644 --- 
a/parquet-column/src/main/java/org/apache/parquet/filter2/predicate/Statistics.java +++ b/parquet-column/src/main/java/org/apache/parquet/filter2/predicate/Statistics.java @@ -31,6 +31,9 @@ public class Statistics { private final Comparator comparator; // Intended for use only within Parquet itself. + /** + * @deprecated will be removed in 2.0.0. Use {@link #Statistics(Object, Object, Comparator)} instead + */ @Deprecated public Statistics(T min, T max) { this.min = checkNotNull(min, "min"); diff --git a/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveComparator.java b/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveComparator.java index 019fba6493..711a3e8103 100644 --- a/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveComparator.java +++ b/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveComparator.java @@ -31,7 +31,8 @@ public abstract class PrimitiveComparator implements Comparator { public int compare(boolean b1, boolean b2) { - throw new UnsupportedOperationException("compare(boolean, boolean) was called on a non-boolean comparator: " + toString()); + throw new UnsupportedOperationException( + "compare(boolean, boolean) was called on a non-boolean comparator: " + toString()); } public int compare(int i1, int i2) { @@ -43,11 +44,13 @@ public int compare(long l1, long l2) { } public int compare(float f1, float f2) { - throw new UnsupportedOperationException("compare(float, float) was called on a non-float comparator: " + toString()); + throw new UnsupportedOperationException( + "compare(float, float) was called on a non-float comparator: " + toString()); } public int compare(double d1, double d2) { - throw new UnsupportedOperationException("compare(double, double) was called on a non-double comparator: " + toString()); + throw new UnsupportedOperationException( + "compare(double, double) was called on a non-double comparator: " + toString()); } @Override diff --git 
a/parquet-column/src/main/java/org/apache/parquet/schema/Type.java b/parquet-column/src/main/java/org/apache/parquet/schema/Type.java index 782151a7e4..176b9a6e27 100644 --- a/parquet-column/src/main/java/org/apache/parquet/schema/Type.java +++ b/parquet-column/src/main/java/org/apache/parquet/schema/Type.java @@ -323,4 +323,5 @@ void checkContains(Type subType) { * @return the converted tree */ abstract T convert(List path, TypeConverter converter); + } diff --git a/parquet-column/src/test/java/org/apache/parquet/io/api/TestBinary.java b/parquet-column/src/test/java/org/apache/parquet/io/api/TestBinary.java index 0045c900c0..081559719c 100644 --- a/parquet-column/src/test/java/org/apache/parquet/io/api/TestBinary.java +++ b/parquet-column/src/test/java/org/apache/parquet/io/api/TestBinary.java @@ -18,9 +18,6 @@ */ package org.apache.parquet.io.api; -import org.apache.parquet.io.api.TestBinary.BinaryFactory.BinaryAndOriginal; -import org.junit.Test; - import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.IOException; @@ -29,11 +26,13 @@ import java.nio.ByteBuffer; import java.util.Arrays; +import org.apache.parquet.io.api.TestBinary.BinaryFactory.BinaryAndOriginal; +import org.junit.Test; + import static org.junit.Assert.assertArrayEquals; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertSame; import static org.junit.Assert.assertTrue; -import static org.junit.Assert.fail; public class TestBinary { diff --git a/parquet-column/src/test/java/org/apache/parquet/schema/TestPrimitiveComparator.java b/parquet-column/src/test/java/org/apache/parquet/schema/TestPrimitiveComparator.java index 6f2e276c62..0df3c9798a 100644 --- a/parquet-column/src/test/java/org/apache/parquet/schema/TestPrimitiveComparator.java +++ b/parquet-column/src/test/java/org/apache/parquet/schema/TestPrimitiveComparator.java @@ -45,7 +45,7 @@ public class TestPrimitiveComparator { @Test public void testBooleanComparator() { - 
Boolean[] values = {null, false, true}; + Boolean[] values = { null, false, true }; for (int i = 0; i < values.length; ++i) { for (int j = 0; j < values.length; ++j) { @@ -65,27 +65,27 @@ public void testBooleanComparator() { @Test public void testSignedInt32Comparator() { testInt32Comparator(SIGNED_INT32_COMPARATOR, - null, - Integer.MIN_VALUE, - -12345, - -1, - 0, - 1, - 12345, - Integer.MAX_VALUE); + null, + Integer.MIN_VALUE, + -12345, + -1, + 0, + 1, + 12345, + Integer.MAX_VALUE); } @Test public void testUnsignedInt32Comparator() { testInt32Comparator(UNSIGNED_INT32_COMPARATOR, - null, - 0, // 0x00000000 - 1, // 0x00000001 - 12345, // 0x00003039 - Integer.MAX_VALUE, // 0x7FFFFFFF - Integer.MIN_VALUE, // 0x80000000 - -12345, // 0xFFFFCFC7 - -1); // 0xFFFFFFFF + null, + 0, // 0x00000000 + 1, // 0x00000001 + 12345, // 0x00003039 + Integer.MAX_VALUE, // 0x7FFFFFFF + Integer.MIN_VALUE, // 0x80000000 + -12345, // 0xFFFFCFC7 + -1); // 0xFFFFFFFF } private void testInt32Comparator(PrimitiveComparator comparator, Integer... 
values) { @@ -107,27 +107,27 @@ private void testInt32Comparator(PrimitiveComparator comparator, Intege @Test public void testSignedInt64Comparator() { testInt64Comparator(SIGNED_INT64_COMPARATOR, - null, - Long.MIN_VALUE, - -12345678901L, - -1L, - 0L, - 1L, - 12345678901L, - Long.MAX_VALUE); + null, + Long.MIN_VALUE, + -12345678901L, + -1L, + 0L, + 1L, + 12345678901L, + Long.MAX_VALUE); } @Test public void testUnsignedInt64Comparator() { testInt64Comparator(UNSIGNED_INT64_COMPARATOR, - null, - 0L, // 0x0000000000000000 - 1L, // 0x0000000000000001 - 12345678901L, // 0x00000002DFDC1C35 - Long.MAX_VALUE, // 0x7FFFFFFFFFFFFFFF - Long.MIN_VALUE, // 0x8000000000000000 - -12345678901L, // 0xFFFFFFFD2023E3CB - -1L); // 0xFFFFFFFFFFFFFFFF + null, + 0L, // 0x0000000000000000 + 1L, // 0x0000000000000001 + 12345678901L, // 0x00000002DFDC1C35 + Long.MAX_VALUE, // 0x7FFFFFFFFFFFFFFF + Long.MIN_VALUE, // 0x8000000000000000 + -12345678901L, // 0xFFFFFFFD2023E3CB + -1L); // 0xFFFFFFFFFFFFFFFF } private void testInt64Comparator(PrimitiveComparator comparator, Long... values) { @@ -149,16 +149,16 @@ private void testInt64Comparator(PrimitiveComparator comparator, Long... 
v @Test public void testFloatComparator() { Float[] values = { - null, - Float.NEGATIVE_INFINITY, - -Float.MAX_VALUE, - -1234.5678F, - -Float.MIN_VALUE, - 0.0F, - Float.MIN_VALUE, - 1234.5678F, - Float.MAX_VALUE, - Float.POSITIVE_INFINITY}; + null, + Float.NEGATIVE_INFINITY, + -Float.MAX_VALUE, + -1234.5678F, + -Float.MIN_VALUE, + 0.0F, + Float.MIN_VALUE, + 1234.5678F, + Float.MAX_VALUE, + Float.POSITIVE_INFINITY }; for (int i = 0; i < values.length; ++i) { for (int j = 0; j < values.length; ++j) { @@ -178,16 +178,16 @@ public void testFloatComparator() { @Test public void testDoubleComparator() { Double[] values = { - null, - Double.NEGATIVE_INFINITY, - -Double.MAX_VALUE, - -123456.7890123456789, - -Double.MIN_VALUE, - 0.0, - Double.MIN_VALUE, - 123456.7890123456789, - Double.MAX_VALUE, - Double.POSITIVE_INFINITY}; + null, + Double.NEGATIVE_INFINITY, + -Double.MAX_VALUE, + -123456.7890123456789, + -Double.MIN_VALUE, + 0.0, + Double.MIN_VALUE, + 123456.7890123456789, + Double.MAX_VALUE, + Double.POSITIVE_INFINITY }; for (int i = 0; i < values.length; ++i) { for (int j = 0; j < values.length; ++j) { @@ -207,44 +207,44 @@ public void testDoubleComparator() { @Test public void testLexicographicalBinaryComparator() { testObjectComparator(UNSIGNED_LEXICOGRAPHICAL_BINARY_COMPARATOR, - null, - Binary.fromConstantByteArray(new byte[0]), // || - Binary.fromConstantByteArray(new byte[]{127, 127, 0, 127}, 2, 1), // |00| - Binary.fromCharSequence("aaa"), // |61|61|61| - Binary.fromString("aaaa"), // |61|61|61|61| - Binary.fromReusedByteArray("aaab".getBytes()), // |61|61|61|62| - Binary.fromReusedByteArray("azzza".getBytes(), 1, 3), // |7A|7A|7A| - Binary.fromReusedByteBuffer(ByteBuffer.wrap("zzzzzz".getBytes())), // |7A|7A|7A|7A|7A|7A| - Binary.fromReusedByteBuffer(ByteBuffer.wrap("aazzzzzzaa".getBytes(), 2, 7)), // |7A|7A|7A|7A|7A|7A|61| - Binary.fromConstantByteBuffer(ByteBuffer.wrap(new byte[]{-128, -128, -128})), // |80|80|80| - 
Binary.fromConstantByteBuffer(ByteBuffer.wrap(new byte[]{-128, -128, -1}, 1, 2)) // |80|FF| + null, + Binary.fromConstantByteArray(new byte[0]), // || + Binary.fromConstantByteArray(new byte[] { 127, 127, 0, 127 }, 2, 1), // |00| + Binary.fromCharSequence("aaa"), // |61|61|61| + Binary.fromString("aaaa"), // |61|61|61|61| + Binary.fromReusedByteArray("aaab".getBytes()), // |61|61|61|62| + Binary.fromReusedByteArray("azzza".getBytes(), 1, 3), // |7A|7A|7A| + Binary.fromReusedByteBuffer(ByteBuffer.wrap("zzzzzz".getBytes())), // |7A|7A|7A|7A|7A|7A| + Binary.fromReusedByteBuffer(ByteBuffer.wrap("aazzzzzzaa".getBytes(), 2, 7)), // |7A|7A|7A|7A|7A|7A|61| + Binary.fromConstantByteBuffer(ByteBuffer.wrap(new byte[] { -128, -128, -128 })), // |80|80|80| + Binary.fromConstantByteBuffer(ByteBuffer.wrap(new byte[] { -128, -128, -1 }, 1, 2)) // |80|FF| ); } @Test public void testBinaryAsSignedIntegerComparator() { testObjectComparator(BINARY_AS_SIGNED_INTEGER_COMPARATOR, - null, - Binary.fromConstantByteArray(new BigInteger("-9999999999999999999999999999999999999999").toByteArray()), - Binary.fromReusedByteArray(new BigInteger("-9999999999999999999999999999999999999998").toByteArray()), - Binary.fromConstantByteArray(BigInteger.valueOf(Long.MIN_VALUE).subtract(BigInteger.ONE).toByteArray()), - Binary.fromConstantByteArray(BigInteger.valueOf(Long.MIN_VALUE).toByteArray()), - Binary.fromConstantByteArray(BigInteger.valueOf(Long.MIN_VALUE).add(BigInteger.ONE).toByteArray()), - Binary.fromReusedByteArray(new BigInteger("-1").toByteArray()), - Binary.fromConstantByteBuffer(ByteBuffer.wrap(new BigInteger("0").toByteArray())), - Binary.fromReusedByteBuffer(ByteBuffer.wrap(new BigInteger("1").toByteArray())), - Binary.fromConstantByteBuffer( - ByteBuffer.wrap(BigInteger.valueOf(Long.MAX_VALUE).subtract(BigInteger.ONE).toByteArray())), - Binary.fromConstantByteBuffer(ByteBuffer.wrap(BigInteger.valueOf(Long.MAX_VALUE).toByteArray())), - Binary - 
.fromConstantByteBuffer(ByteBuffer.wrap(BigInteger.valueOf(Long.MAX_VALUE).add(BigInteger.ONE).toByteArray())), - Binary.fromConstantByteBuffer( - ByteBuffer.wrap(new BigInteger("999999999999999999999999999999999999999").toByteArray())), - Binary.fromReusedByteBuffer( - ByteBuffer.wrap(new BigInteger("9999999999999999999999999999999999999998").toByteArray())), - Binary.fromConstantByteBuffer( - ByteBuffer.wrap(new BigInteger("9999999999999999999999999999999999999999").toByteArray())) - ); + null, + Binary.fromConstantByteArray(new BigInteger("-9999999999999999999999999999999999999999").toByteArray()), + Binary.fromReusedByteArray(new BigInteger("-9999999999999999999999999999999999999998").toByteArray()), + Binary.fromConstantByteArray(BigInteger.valueOf(Long.MIN_VALUE).subtract(BigInteger.ONE).toByteArray()), + Binary.fromConstantByteArray(BigInteger.valueOf(Long.MIN_VALUE).toByteArray()), + Binary.fromConstantByteArray(BigInteger.valueOf(Long.MIN_VALUE).add(BigInteger.ONE).toByteArray()), + Binary.fromReusedByteArray(new BigInteger("-1").toByteArray()), + Binary.fromConstantByteBuffer(ByteBuffer.wrap(new BigInteger("0").toByteArray())), + Binary.fromReusedByteBuffer(ByteBuffer.wrap(new BigInteger("1").toByteArray())), + Binary.fromConstantByteBuffer( + ByteBuffer.wrap(BigInteger.valueOf(Long.MAX_VALUE).subtract(BigInteger.ONE).toByteArray())), + Binary.fromConstantByteBuffer(ByteBuffer.wrap(BigInteger.valueOf(Long.MAX_VALUE).toByteArray())), + Binary + .fromConstantByteBuffer( + ByteBuffer.wrap(BigInteger.valueOf(Long.MAX_VALUE).add(BigInteger.ONE).toByteArray())), + Binary.fromConstantByteBuffer( + ByteBuffer.wrap(new BigInteger("999999999999999999999999999999999999999").toByteArray())), + Binary.fromReusedByteBuffer( + ByteBuffer.wrap(new BigInteger("9999999999999999999999999999999999999998").toByteArray())), + Binary.fromConstantByteBuffer( + ByteBuffer.wrap(new BigInteger("9999999999999999999999999999999999999999").toByteArray()))); } private void 
testObjectComparator(PrimitiveComparator comparator, T... values) { diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/filter2/dictionarylevel/DictionaryFilter.java b/parquet-hadoop/src/main/java/org/apache/parquet/filter2/dictionarylevel/DictionaryFilter.java index e410b3fd13..eaba2c1cb8 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/filter2/dictionarylevel/DictionaryFilter.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/filter2/dictionarylevel/DictionaryFilter.java @@ -76,7 +76,7 @@ private ColumnChunkMetaData getColumnChunk(ColumnPath columnPath) { @SuppressWarnings("unchecked") private > Set expandDictionary(ColumnChunkMetaData meta) throws IOException { - ColumnDescriptor col = new ColumnDescriptor(meta.getPath().toArray(), meta.getFullType(), -1, -1); + ColumnDescriptor col = new ColumnDescriptor(meta.getPath().toArray(), meta.getPrimitiveType(), -1, -1); DictionaryPage page = dictionaries.readDictionaryPage(col); // the chunk may not be dictionary-encoded @@ -213,7 +213,7 @@ public > Boolean visit(Lt lt) { return BLOCK_MIGHT_MATCH; } - Comparator comparator = meta.getFullType().comparator(); + Comparator comparator = meta.getPrimitiveType().comparator(); for (T entry : dictSet) { if (comparator.compare(value, entry) > 0) { return BLOCK_MIGHT_MATCH; @@ -255,7 +255,7 @@ public > Boolean visit(LtEq ltEq) { return BLOCK_MIGHT_MATCH; } - Comparator comparator = meta.getFullType().comparator(); + Comparator comparator = meta.getPrimitiveType().comparator(); for (T entry : dictSet) { if (comparator.compare(value, entry) >= 0) { return BLOCK_MIGHT_MATCH; @@ -295,7 +295,7 @@ public > Boolean visit(Gt gt) { return BLOCK_MIGHT_MATCH; } - Comparator comparator = meta.getFullType().comparator(); + Comparator comparator = meta.getPrimitiveType().comparator(); for (T entry : dictSet) { if (comparator.compare(value, entry) < 0) { return BLOCK_MIGHT_MATCH; @@ -337,7 +337,7 @@ public > Boolean visit(GtEq gtEq) { return BLOCK_MIGHT_MATCH; } - 
Comparator comparator = meta.getFullType().comparator(); + Comparator comparator = meta.getPrimitiveType().comparator(); for (T entry : dictSet) { if (comparator.compare(value, entry) <= 0) { return BLOCK_MIGHT_MATCH; diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java index a83528df56..d89e32d5b9 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java @@ -326,33 +326,33 @@ dataPageType, getEncoding(encoding), } public static Statistics toParquetStatistics( - org.apache.parquet.column.statistics.Statistics statistics) { - Statistics stats = new Statistics(); + org.apache.parquet.column.statistics.Statistics stats) { + Statistics formatStats = new Statistics(); // Don't write stats larger than the max size rather than truncating. The // rationale is that some engines may use the minimum value in the page as // the true minimum for aggregations and there is no way to mark that a // value has been truncated and is a lower bound and not in the page. 
- if (!statistics.isEmpty() && statistics.isSmallerThan(MAX_STATS_SIZE)) { - stats.setNull_count(statistics.getNumNulls()); - if (statistics.hasNonNullValue()) { - byte[] min = statistics.getMinBytes(); - byte[] max = statistics.getMaxBytes(); + if (!stats.isEmpty() && stats.isSmallerThan(MAX_STATS_SIZE)) { + formatStats.setNull_count(stats.getNumNulls()); + if (stats.hasNonNullValue()) { + byte[] min = stats.getMinBytes(); + byte[] max = stats.getMaxBytes(); // Fill the former min-max statistics only if the comparison logic is // signed so the logic of V1 and V2 stats are the same (which is // trivially true for equal min-max values) - if (sortOrder(statistics.type()) == SortOrder.SIGNED || Arrays.equals(min, max)) { - stats.setMin(min); - stats.setMax(max); + if (sortOrder(stats.type()) == SortOrder.SIGNED || Arrays.equals(min, max)) { + formatStats.setMin(min); + formatStats.setMax(max); } - if (isMinMaxStatsSupported(statistics.type()) || Arrays.equals(min, max)) { - stats.setMin_value(min); - stats.setMax_value(max); + if (isMinMaxStatsSupported(stats.type()) || Arrays.equals(min, max)) { + formatStats.setMin_value(min); + formatStats.setMax_value(max); } } } - return stats; + return formatStats; } private static boolean isMinMaxStatsSupported(PrimitiveType type) { @@ -368,28 +368,28 @@ private static boolean isMinMaxStatsSupported(PrimitiveType type) { // Explicitly listing all the supported logical types to avoid writing statistics for new types accidentally switch (origType) { - case INT_8: - case INT_16: - case INT_32: - case INT_64: - case UINT_8: - case UINT_16: - case UINT_32: - case UINT_64: - case UTF8: - case DECIMAL: - case DATE: - case TIME_MILLIS: - case TIME_MICROS: - case TIMESTAMP_MILLIS: - case TIMESTAMP_MICROS: - case ENUM: - case JSON: - case BSON: - return true; - case INTERVAL: - default: - return false; + case INT_8: + case INT_16: + case INT_32: + case INT_64: + case UINT_8: + case UINT_16: + case UINT_32: + case UINT_64: + case UTF8: + 
case DECIMAL: + case DATE: + case TIME_MILLIS: + case TIME_MICROS: + case TIMESTAMP_MILLIS: + case TIMESTAMP_MICROS: + case ENUM: + case JSON: + case BSON: + return true; + case INTERVAL: + default: + return false; } } @@ -414,24 +414,24 @@ public static org.apache.parquet.column.statistics.Statistics fromParquetStatist // Visible for testing static org.apache.parquet.column.statistics.Statistics fromParquetStatisticsInternal - (String createdBy, Statistics statistics, PrimitiveType type, SortOrder typeSortOrder) { + (String createdBy, Statistics formatStats, PrimitiveType type, SortOrder typeSortOrder) { // create stats object based on the column type org.apache.parquet.column.statistics.Statistics stats = org.apache.parquet.column.statistics.Statistics.createStats(type); - if (statistics != null) { + if (formatStats != null) { // Use the new V2 min-max statistics over the former one if it is filled - if (statistics.isSetMin_value() && statistics.isSetMax_value()) { - byte[] min = statistics.min_value.array(); - byte[] max = statistics.max_value.array(); + if (formatStats.isSetMin_value() && formatStats.isSetMax_value()) { + byte[] min = formatStats.min_value.array(); + byte[] max = formatStats.max_value.array(); // Ordering of INT96 and INTERVAL types is not clear; // we only support statistics for them if min and max are equal if (isMinMaxStatsSupported(type) || Arrays.equals(min, max)) { stats.setMinMaxFromBytes(min, max); } - stats.setNumNulls(statistics.null_count); + stats.setNumNulls(formatStats.null_count); } else { - boolean isSet = statistics.isSetMax() && statistics.isSetMin(); - boolean maxEqualsMin = isSet ? Arrays.equals(statistics.getMin(), statistics.getMax()) : false; + boolean isSet = formatStats.isSetMax() && formatStats.isSetMin(); + boolean maxEqualsMin = isSet ? 
Arrays.equals(formatStats.getMin(), formatStats.getMax()) : false; boolean sortOrdersMatch = SortOrder.SIGNED == typeSortOrder; // NOTE: See docs in CorruptStatistics for explanation of why this check is needed // The sort order is checked to avoid returning min/max stats that are not @@ -441,9 +441,9 @@ public static org.apache.parquet.column.statistics.Statistics fromParquetStatist if (!CorruptStatistics.shouldIgnoreStatistics(createdBy, type.getPrimitiveTypeName()) && (sortOrdersMatch || maxEqualsMin)) { if (isSet) { - stats.setMinMaxFromBytes(statistics.min.array(), statistics.max.array()); + stats.setMinMaxFromBytes(formatStats.min.array(), formatStats.max.array()); } - stats.setNumNulls(statistics.null_count); + stats.setNumNulls(formatStats.null_count); } } } diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ColumnChunkPageWriteStore.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ColumnChunkPageWriteStore.java index 8eeab5c55f..82c288fe43 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ColumnChunkPageWriteStore.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ColumnChunkPageWriteStore.java @@ -114,7 +114,7 @@ public void writePage(BytesInput bytes, this.totalValueCount += valueCount; this.pageCount += 1; - // Cloning the statistics if it is not initialized yet so we have the correct typed one + // Copying the statistics if it is not initialized yet so we have the correct typed one if (totalStatistics == null) { totalStatistics = statistics.copy(); } else { @@ -159,7 +159,7 @@ public void writePageV2( this.totalValueCount += valueCount; this.pageCount += 1; - // Cloning the statistics if it is not initialized yet so we have the correct typed one + // Copying the statistics if it is not initialized yet so we have the correct typed one if (totalStatistics == null) { totalStatistics = statistics.copy(); } else { diff --git 
a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileWriter.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileWriter.java index f440b433c2..285c2db1a4 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileWriter.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileWriter.java @@ -323,7 +323,7 @@ public void startColumn(ColumnDescriptor descriptor, currentChunkFirstDataPage = out.getPos(); compressedLength = 0; uncompressedLength = 0; - // The statistics will be cloned from the first one added at writeDataPage(s) so we have the correct typed one + // The statistics will be copied from the first one added at writeDataPage(s) so we have the correct typed one currentStatistics = null; } @@ -425,7 +425,7 @@ public void writeDataPage( LOG.debug("{}: write data page content {}", out.getPos(), compressedPageSize); bytes.writeAllTo(out); - // Cloning the statistics if it is not initialized yet so we have the correct typed one + // Copying the statistics if it is not initialized yet so we have the correct typed one if (currentStatistics == null) { currentStatistics = statistics.copy(); } else { @@ -605,7 +605,7 @@ public void appendRowGroup(SeekableInputStream from, BlockMetaData rowGroup, currentBlock.addColumn(ColumnChunkMetaData.get( chunk.getPath(), - chunk.getFullType(), + chunk.getPrimitiveType(), chunk.getCodec(), chunk.getEncodingStats(), chunk.getEncodings(), diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/metadata/ColumnChunkMetaData.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/metadata/ColumnChunkMetaData.java index 4b428e50c0..e50fef71fa 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/metadata/ColumnChunkMetaData.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/metadata/ColumnChunkMetaData.java @@ -67,6 +67,11 @@ public static ColumnChunkMetaData get( valueCount, totalSize, totalUncompressedSize); } + /** 
+ * @deprecated will be removed in 2.0.0. Use + * {@link #get(ColumnPath, PrimitiveType, CompressionCodecName, EncodingStats, Set, Statistics, long, long, long, long, long)} + * instead. + */ @Deprecated public static ColumnChunkMetaData get( ColumnPath path, @@ -85,17 +90,17 @@ public static ColumnChunkMetaData get( } public static ColumnChunkMetaData get( - ColumnPath path, - PrimitiveType type, - CompressionCodecName codec, - EncodingStats encodingStats, - Set encodings, - Statistics statistics, - long firstDataPage, - long dictionaryPageOffset, - long valueCount, - long totalSize, - long totalUncompressedSize) { + ColumnPath path, + PrimitiveType type, + CompressionCodecName codec, + EncodingStats encodingStats, + Set encodings, + Statistics statistics, + long firstDataPage, + long dictionaryPageOffset, + long valueCount, + long totalSize, + long totalUncompressedSize) { // to save space we store those always positive longs in ints when they fit. if (positiveLongFitsInAnInt(firstDataPage) && positiveLongFitsInAnInt(dictionaryPageOffset) @@ -168,23 +173,27 @@ public CompressionCodecName getCodec() { /** * * @return column identifier + * @deprecated will be removed in 2.0.0. Use {@link #getPrimitiveType()} instead. */ + @Deprecated public ColumnPath getPath() { return properties.getPath(); } /** * @return type of the column + * @deprecated will be removed in 2.0.0. Use {@link #getPrimitiveType()} instead. 
*/ + @Deprecated public PrimitiveTypeName getType() { return properties.getType(); } /** - * @return the full type object of the column + * @return the primitive type object of the column */ - public PrimitiveType getFullType() { - return properties.getFullType(); + public PrimitiveType getPrimitiveType() { + return properties.getPrimitiveType(); } /** diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/metadata/ColumnChunkProperties.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/metadata/ColumnChunkProperties.java index dbd3c369fa..233cf94b1a 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/metadata/ColumnChunkProperties.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/metadata/ColumnChunkProperties.java @@ -30,12 +30,17 @@ public class ColumnChunkProperties { private static Canonicalizer properties = new Canonicalizer(); + /** + * @deprecated will be removed in 2.0.0. Use {@link #get(ColumnPath, PrimitiveType, CompressionCodecName, Set)} + * instead. + */ @Deprecated public static ColumnChunkProperties get(ColumnPath path, PrimitiveTypeName type, CompressionCodecName codec, Set encodings) { return get(path, new PrimitiveType(Type.Repetition.OPTIONAL, type, ""), codec, encodings); } - public static ColumnChunkProperties get(ColumnPath path, PrimitiveType type, CompressionCodecName codec, Set encodings) { + public static ColumnChunkProperties get(ColumnPath path, PrimitiveType type, CompressionCodecName codec, + Set encodings) { return properties.canonicalize(new ColumnChunkProperties(codec, path, type, encodings)); } @@ -63,11 +68,19 @@ public ColumnPath getPath() { return path; } + /** + * @return the primitive type name for the column + * @deprecated will be removed in 2.0.0. Use {@link #getPrimitiveType()} instead. 
+ */ + @Deprecated public PrimitiveTypeName getType() { return type.getPrimitiveTypeName(); } - public PrimitiveType getFullType() { + /** + * @return the primitive type object for the column + */ + public PrimitiveType getPrimitiveType() { return type; } diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java b/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java index 6808b7bc22..774582480a 100644 --- a/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java +++ b/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java @@ -489,12 +489,12 @@ private void testLongStats(StatsHelper helper) { public void testFloatStatsV1() { testFloatStats(StatsHelper.V1); } - + @Test public void testFloatStatsV2() { testFloatStats(StatsHelper.V2); } - + private void testFloatStats(StatsHelper helper) { // make fake stats and verify the size check FloatStatistics stats = new FloatStatistics(); @@ -520,12 +520,12 @@ private void testFloatStats(StatsHelper helper) { public void testDoubleStatsV1() { testDoubleStats(StatsHelper.V1); } - + @Test public void testDoubleStatsV2() { testDoubleStats(StatsHelper.V2); } - + private void testDoubleStats(StatsHelper helper) { // make fake stats and verify the size check DoubleStatistics stats = new DoubleStatistics(); @@ -551,12 +551,12 @@ private void testDoubleStats(StatsHelper helper) { public void testBooleanStatsV1() { testBooleanStats(StatsHelper.V1); } - + @Test public void testBooleanStatsV2() { testBooleanStats(StatsHelper.V2); } - + private void testBooleanStats(StatsHelper helper) { // make fake stats and verify the size check BooleanStatistics stats = new BooleanStatistics(); diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetFileWriter.java 
b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetFileWriter.java index 2f52692683..6915c86ec3 100644 --- a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetFileWriter.java +++ b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetFileWriter.java @@ -431,8 +431,8 @@ public void testConvertToThriftStatistics() throws Exception { } final String createdBy = "parquet-mr version 1.8.0 (build d4d5a07ec9bd262ca1e93c309f1d7d4a74ebda4c)"; - Statistics thriftStats = org.apache.parquet.format.converter.ParquetMetadataConverter - .toParquetStatistics(parquetMRstats); + Statistics thriftStats = + org.apache.parquet.format.converter.ParquetMetadataConverter.toParquetStatistics(parquetMRstats); LongStatistics convertedBackStats = (LongStatistics) org.apache.parquet.format.converter.ParquetMetadataConverter.fromParquetStatistics( createdBy, thriftStats, PrimitiveTypeName.INT64); From 524750be0f4ec1da6b4b42829844cee3edbe14fe Mon Sep 17 00:00:00 2001 From: Gabor Szadovszky Date: Thu, 7 Dec 2017 17:28:49 +0100 Subject: [PATCH 14/17] PARQUET-1025: Some updates for zi's findings --- .../parquet/schema/PrimitiveComparator.java | 22 +++++++++---------- .../schema/TestPrimitiveComparator.java | 4 +++- 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveComparator.java b/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveComparator.java index 711a3e8103..9997d37b36 100644 --- a/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveComparator.java +++ b/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveComparator.java @@ -222,12 +222,12 @@ public String toString() { /* * This comparator is for comparing two signed decimal values represented in twos-complement binary. In case of the - * binary length of one value is shorted than the other it will be padded by the related prefix (0xFF for negative, - * 0x00 for positive values). 
+ * binary length of one value is shorter than the other it will be padded by the corresponding prefix (0xFF for + * negative, 0x00 for positive values). */ static final PrimitiveComparator BINARY_AS_SIGNED_INTEGER_COMPARATOR = new BinaryComparator() { - private static final int NEGATIVE_PREFIX = 0xFF; - private static final int POSITIVE_PREFIX = 0; + private static final int NEGATIVE_PADDING = 0xFF; + private static final int POSITIVE_PADDING = 0; @Override int compare(ByteBuffer b1, ByteBuffer b2) { @@ -245,13 +245,13 @@ int compare(ByteBuffer b1, ByteBuffer b2) { int result = 0; // Compare the beginning of the longer buffer with the proper padding - int diff = l1 - l2; - if (diff < 0) { - result = -compareWithPadding(-diff, b2, p2, isNegative1 ? NEGATIVE_PREFIX : POSITIVE_PREFIX); - p2 += -diff; - } else if (diff > 0) { - result = compareWithPadding(diff, b1, p1, isNegative2 ? NEGATIVE_PREFIX : POSITIVE_PREFIX); - p1 += diff; + int lengthDiff = l1 - l2; + if (lengthDiff < 0) { + result = -compareWithPadding(-lengthDiff, b2, p2, isNegative1 ? NEGATIVE_PADDING : POSITIVE_PADDING); + p2 += -lengthDiff; + } else if (lengthDiff > 0) { + result = compareWithPadding(lengthDiff, b1, p1, isNegative2 ? 
NEGATIVE_PADDING : POSITIVE_PADDING); + p1 += lengthDiff; } // The beginning of the longer buffer equals to the padding or the lengths are equal diff --git a/parquet-column/src/test/java/org/apache/parquet/schema/TestPrimitiveComparator.java b/parquet-column/src/test/java/org/apache/parquet/schema/TestPrimitiveComparator.java index 0df3c9798a..c8a423ad23 100644 --- a/parquet-column/src/test/java/org/apache/parquet/schema/TestPrimitiveComparator.java +++ b/parquet-column/src/test/java/org/apache/parquet/schema/TestPrimitiveComparator.java @@ -230,9 +230,11 @@ public void testBinaryAsSignedIntegerComparator() { Binary.fromConstantByteArray(BigInteger.valueOf(Long.MIN_VALUE).subtract(BigInteger.ONE).toByteArray()), Binary.fromConstantByteArray(BigInteger.valueOf(Long.MIN_VALUE).toByteArray()), Binary.fromConstantByteArray(BigInteger.valueOf(Long.MIN_VALUE).add(BigInteger.ONE).toByteArray()), + Binary.fromReusedByteArray(new byte[] { (byte) 0xFF, (byte) 0xFF, (byte) 0xFF, -2 }, 1, 3), Binary.fromReusedByteArray(new BigInteger("-1").toByteArray()), Binary.fromConstantByteBuffer(ByteBuffer.wrap(new BigInteger("0").toByteArray())), - Binary.fromReusedByteBuffer(ByteBuffer.wrap(new BigInteger("1").toByteArray())), + Binary.fromReusedByteBuffer(ByteBuffer.wrap(new byte[] { 0, 0, 0, 1 })), + Binary.fromConstantByteBuffer(ByteBuffer.wrap(new byte[] { 0, 0, 0, 2 }), 2, 2), Binary.fromConstantByteBuffer( ByteBuffer.wrap(BigInteger.valueOf(Long.MAX_VALUE).subtract(BigInteger.ONE).toByteArray())), Binary.fromConstantByteBuffer(ByteBuffer.wrap(BigInteger.valueOf(Long.MAX_VALUE).toByteArray())), From dc838f273f1ab317055d7ae98c36186e031722a1 Mon Sep 17 00:00:00 2001 From: Gabor Szadovszky Date: Wed, 13 Dec 2017 12:13:28 +0100 Subject: [PATCH 15/17] PARQUET-1025: Implement ColumnOrder; other updates for rdblue's findings --- .../column/statistics/BinaryStatistics.java | 3 +- .../column/statistics/BooleanStatistics.java | 3 +- .../column/statistics/DoubleStatistics.java | 3 +- 
.../column/statistics/FloatStatistics.java | 3 +- .../column/statistics/IntStatistics.java | 3 +- .../column/statistics/LongStatistics.java | 3 +- .../parquet/column/statistics/Statistics.java | 86 ++++++++-------- .../apache/parquet/schema/ColumnOrder.java | 97 +++++++++++++++++++ .../parquet/schema/PrimitiveComparator.java | 11 ++- .../apache/parquet/schema/PrimitiveType.java | 76 ++++++++++++++- .../java/org/apache/parquet/schema/Types.java | 20 +++- .../parquet/schema/TestMessageType.java | 45 +++++++++ .../schema/TestPrimitiveComparator.java | 60 ++++++------ .../parquet/schema/TestTypeBuilders.java | 47 +++++++++ .../converter/ParquetMetadataConverter.java | 91 ++++++++--------- .../hadoop/metadata/ColumnChunkMetaData.java | 4 +- .../TestParquetMetadataConverter.java | 44 ++++++++- .../parquet/hadoop/TestParquetFileWriter.java | 8 +- .../org/apache/parquet/hadoop/TestUtils.java | 21 ++++ .../thrift/TestThriftToParquetFileWriter.java | 23 +++-- 20 files changed, 501 insertions(+), 150 deletions(-) create mode 100644 parquet-column/src/main/java/org/apache/parquet/schema/ColumnOrder.java diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/BinaryStatistics.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/BinaryStatistics.java index 84be75a7b9..a68285bc1c 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/BinaryStatistics.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/BinaryStatistics.java @@ -25,7 +25,8 @@ public class BinaryStatistics extends Statistics { // A fake type object to be used to generate the proper comparator - private static final PrimitiveType DEFAULT_FAKE_TYPE = Types.optional(PrimitiveType.PrimitiveTypeName.BINARY).named(""); + private static final PrimitiveType DEFAULT_FAKE_TYPE = Types.optional(PrimitiveType.PrimitiveTypeName.BINARY) + .named("fake_binary_type"); private Binary max; private Binary min; diff --git 
a/parquet-column/src/main/java/org/apache/parquet/column/statistics/BooleanStatistics.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/BooleanStatistics.java index 12b2f8ed14..0e77b61e1b 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/BooleanStatistics.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/BooleanStatistics.java @@ -25,7 +25,8 @@ public class BooleanStatistics extends Statistics { // A fake type object to be used to generate the proper comparator - private static final PrimitiveType DEFAULT_FAKE_TYPE = Types.optional(PrimitiveType.PrimitiveTypeName.BOOLEAN).named(""); + private static final PrimitiveType DEFAULT_FAKE_TYPE = Types.optional(PrimitiveType.PrimitiveTypeName.BOOLEAN) + .named("fake_boolean_type"); private boolean max; private boolean min; diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/DoubleStatistics.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/DoubleStatistics.java index 090ef684ad..0dd067b717 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/DoubleStatistics.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/DoubleStatistics.java @@ -25,7 +25,8 @@ public class DoubleStatistics extends Statistics { // A fake type object to be used to generate the proper comparator - private static final PrimitiveType DEFAULT_FAKE_TYPE = Types.optional(PrimitiveType.PrimitiveTypeName.DOUBLE).named(""); + private static final PrimitiveType DEFAULT_FAKE_TYPE = Types.optional(PrimitiveType.PrimitiveTypeName.DOUBLE) + .named("fake_double_type"); private double max; private double min; diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/FloatStatistics.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/FloatStatistics.java index b72947c79b..36836c6ff7 100644 --- 
a/parquet-column/src/main/java/org/apache/parquet/column/statistics/FloatStatistics.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/FloatStatistics.java @@ -25,7 +25,8 @@ public class FloatStatistics extends Statistics { // A fake type object to be used to generate the proper comparator - private static final PrimitiveType DEFAULT_FAKE_TYPE = Types.optional(PrimitiveType.PrimitiveTypeName.FLOAT).named(""); + private static final PrimitiveType DEFAULT_FAKE_TYPE = Types.optional(PrimitiveType.PrimitiveTypeName.FLOAT) + .named("fake_float_type"); private float max; private float min; diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/IntStatistics.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/IntStatistics.java index aa52018529..5df7f0a7c6 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/IntStatistics.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/IntStatistics.java @@ -25,7 +25,8 @@ public class IntStatistics extends Statistics { // A fake type object to be used to generate the proper comparator - private static final PrimitiveType DEFAULT_FAKE_TYPE = Types.optional(PrimitiveType.PrimitiveTypeName.INT32).named(""); + private static final PrimitiveType DEFAULT_FAKE_TYPE = Types.optional(PrimitiveType.PrimitiveTypeName.INT32) + .named("fake_int32_type"); private int max; private int min; diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/LongStatistics.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/LongStatistics.java index 90fac05ec4..fd6d19cfda 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/LongStatistics.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/LongStatistics.java @@ -25,7 +25,8 @@ public class LongStatistics extends Statistics { // A fake type object to be used to generate the proper comparator - private 
static final PrimitiveType DEFAULT_FAKE_TYPE = Types.optional(PrimitiveType.PrimitiveTypeName.INT64).named(""); + private static final PrimitiveType DEFAULT_FAKE_TYPE = Types.optional(PrimitiveType.PrimitiveTypeName.INT64) + .named("fake_int64_type"); private long max; private long min; diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java index 69733ee480..6eb23819ef 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java @@ -56,25 +56,25 @@ public abstract class Statistics> { */ @Deprecated public static Statistics getStatsBasedOnType(PrimitiveTypeName type) { - switch(type) { - case INT32: - return new IntStatistics(); - case INT64: - return new LongStatistics(); - case FLOAT: - return new FloatStatistics(); - case DOUBLE: - return new DoubleStatistics(); - case BOOLEAN: - return new BooleanStatistics(); - case BINARY: - return new BinaryStatistics(); - case INT96: - return new BinaryStatistics(); - case FIXED_LEN_BYTE_ARRAY: - return new BinaryStatistics(); - default: - throw new UnknownColumnTypeException(type); + switch (type) { + case INT32: + return new IntStatistics(); + case INT64: + return new LongStatistics(); + case FLOAT: + return new FloatStatistics(); + case DOUBLE: + return new DoubleStatistics(); + case BOOLEAN: + return new BooleanStatistics(); + case BINARY: + return new BinaryStatistics(); + case INT96: + return new BinaryStatistics(); + case FIXED_LEN_BYTE_ARRAY: + return new BinaryStatistics(); + default: + throw new UnknownColumnTypeException(type); } } @@ -89,22 +89,22 @@ public static Statistics getStatsBasedOnType(PrimitiveTypeName type) { public static Statistics createStats(Type type) { PrimitiveType primitive = type.asPrimitiveType(); switch (primitive.getPrimitiveTypeName()) { - case 
INT32: - return new IntStatistics(primitive); - case INT64: - return new LongStatistics(primitive); - case FLOAT: - return new FloatStatistics(primitive); - case DOUBLE: - return new DoubleStatistics(primitive); - case BOOLEAN: - return new BooleanStatistics(primitive); - case BINARY: - case INT96: - case FIXED_LEN_BYTE_ARRAY: - return new BinaryStatistics(primitive); - default: - throw new UnknownColumnTypeException(primitive.getPrimitiveTypeName()); + case INT32: + return new IntStatistics(primitive); + case INT64: + return new LongStatistics(primitive); + case FLOAT: + return new FloatStatistics(primitive); + case DOUBLE: + return new DoubleStatistics(primitive); + case BOOLEAN: + return new BooleanStatistics(primitive); + case BINARY: + case INT96: + case FIXED_LEN_BYTE_ARRAY: + return new BinaryStatistics(primitive); + default: + throw new UnknownColumnTypeException(primitive.getPrimitiveTypeName()); } } @@ -168,9 +168,10 @@ public boolean equals(Object other) { if (!(other instanceof Statistics)) return false; Statistics stats = (Statistics) other; - return Arrays.equals(stats.getMaxBytes(), this.getMaxBytes()) && - Arrays.equals(stats.getMinBytes(), this.getMinBytes()) && - stats.getNumNulls() == this.getNumNulls(); + return type.equals(stats.type) && + Arrays.equals(stats.getMaxBytes(), this.getMaxBytes()) && + Arrays.equals(stats.getMinBytes(), this.getMinBytes()) && + stats.getNumNulls() == this.getNumNulls(); } /** @@ -179,7 +180,8 @@ public boolean equals(Object other) { */ @Override public int hashCode() { - return 31 * Arrays.hashCode(getMaxBytes()) + 17 * Arrays.hashCode(getMinBytes()) + Long.valueOf(this.getNumNulls()).hashCode(); + return 31 * type.hashCode() + 31 * Arrays.hashCode(getMaxBytes()) + 17 * Arrays.hashCode(getMinBytes()) + + Long.valueOf(this.getNumNulls()).hashCode(); } /** @@ -191,10 +193,8 @@ public int hashCode() { public void mergeStatistics(Statistics stats) { if (stats.isEmpty()) return; - // Merge stats only if they have the 
same type and comparator (the sorting order - // is the same) - if (this.getClass() == stats.getClass() && type.getPrimitiveTypeName() == stats.type.getPrimitiveTypeName() - && type.getOriginalType() == stats.type.getOriginalType()) { + // Merge stats only if they have the same type + if (type.equals(stats.type)) { incrementNumNulls(stats.getNumNulls()); if (stats.hasNonNullValue()) { mergeStatisticsMinMax(stats); diff --git a/parquet-column/src/main/java/org/apache/parquet/schema/ColumnOrder.java b/parquet-column/src/main/java/org/apache/parquet/schema/ColumnOrder.java new file mode 100644 index 0000000000..144a93a06a --- /dev/null +++ b/parquet-column/src/main/java/org/apache/parquet/schema/ColumnOrder.java @@ -0,0 +1,97 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.schema; + +import org.apache.parquet.Preconditions; + +/** + * Class representing the column order with all the related parameters. + */ +public class ColumnOrder { + /** + * The enum type of the column order. + */ + public enum ColumnOrderName { + /** + * Representing the case when the defined column order is undefined (e.g. 
the file is written by a later API and the + * current one does not support the related column order). No statistics will be written/read in this case. + */ + UNDEFINED, + /** + * Type defined order meaning that the comparison order of the elements are based on its type. + */ + TYPE_DEFINED_ORDER + } + + private static final ColumnOrder UNDEFINED_COLUMN_ORDER = new ColumnOrder(ColumnOrderName.UNDEFINED); + private static final ColumnOrder TYPE_DEFINED_COLUMN_ORDER = new ColumnOrder(ColumnOrderName.TYPE_DEFINED_ORDER); + + /** + * @return a {@link ColumnOrder} instance representing an undefined order + * @see ColumnOrderName#UNDEFINED + */ + public static ColumnOrder undefined() { + return UNDEFINED_COLUMN_ORDER; + } + + /** + * @return a {@link ColumnOrder} instance representing a type defined order + * @see ColumnOrderName#TYPE_DEFINED_ORDER + */ + public static ColumnOrder typeDefined() { + return TYPE_DEFINED_COLUMN_ORDER; + } + + private final ColumnOrderName columnOrderName; + + private ColumnOrder(ColumnOrderName columnOrderName) { + this.columnOrderName = Preconditions.checkNotNull(columnOrderName, "columnOrderName"); + } + + public ColumnOrderName getColumnOrderName() { + return columnOrderName; + } + + /** + * {@inheritDoc} + */ + @Override + public boolean equals(Object obj) { + if (obj instanceof ColumnOrder) { + return columnOrderName == ((ColumnOrder) obj).columnOrderName; + } + return false; + } + + /** + * {@inheritDoc} + */ + @Override + public int hashCode() { + return columnOrderName.hashCode(); + } + + /** + * {@inheritDoc} + */ + @Override + public String toString() { + return columnOrderName.toString(); + } +} diff --git a/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveComparator.java b/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveComparator.java index 9997d37b36..085a67a26d 100644 --- a/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveComparator.java +++ 
b/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveComparator.java @@ -245,11 +245,12 @@ int compare(ByteBuffer b1, ByteBuffer b2) { int result = 0; // Compare the beginning of the longer buffer with the proper padding - int lengthDiff = l1 - l2; - if (lengthDiff < 0) { - result = -compareWithPadding(-lengthDiff, b2, p2, isNegative1 ? NEGATIVE_PADDING : POSITIVE_PADDING); - p2 += -lengthDiff; - } else if (lengthDiff > 0) { + if (l1 < l2) { + int lengthDiff = l2 - l1; + result = -compareWithPadding(lengthDiff, b2, p2, isNegative1 ? NEGATIVE_PADDING : POSITIVE_PADDING); + p2 += lengthDiff; + } else if (l1 > l2) { + int lengthDiff = l1 - l2; result = compareWithPadding(lengthDiff, b1, p1, isNegative2 ? NEGATIVE_PADDING : POSITIVE_PADDING); p1 += lengthDiff; } diff --git a/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveType.java b/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveType.java index 39d90ffe76..2d7491f610 100644 --- a/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveType.java +++ b/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveType.java @@ -22,12 +22,14 @@ import java.util.List; import java.util.Locale; +import org.apache.parquet.Preconditions; import org.apache.parquet.ShouldNeverHappenException; import org.apache.parquet.column.ColumnReader; import org.apache.parquet.io.InvalidRecordException; import org.apache.parquet.io.api.Binary; import org.apache.parquet.io.api.PrimitiveConverter; import org.apache.parquet.io.api.RecordConsumer; +import org.apache.parquet.schema.ColumnOrder.ColumnOrderName; /** @@ -381,6 +383,7 @@ abstract public void addValueToPrimitiveConverter( private final PrimitiveTypeName primitive; private final int length; private final DecimalMetadata decimalMeta; + private final ColumnOrder columnOrder; /** * @param repetition OPTIONAL, REPEATED, REQUIRED @@ -438,10 +441,61 @@ public PrimitiveType(Repetition repetition, PrimitiveTypeName primitive, public 
PrimitiveType(Repetition repetition, PrimitiveTypeName primitive, int length, String name, OriginalType originalType, DecimalMetadata decimalMeta, ID id) { + this(repetition, primitive, length, name, originalType, decimalMeta, id, null); + } + + PrimitiveType(Repetition repetition, PrimitiveTypeName primitive, + int length, String name, OriginalType originalType, + DecimalMetadata decimalMeta, ID id, ColumnOrder columnOrder) { super(name, repetition, originalType, id); this.primitive = primitive; this.length = length; this.decimalMeta = decimalMeta; + + if (columnOrder == null) { + columnOrder = primitive == PrimitiveTypeName.INT96 || originalType == OriginalType.INTERVAL + ? ColumnOrder.undefined() + : ColumnOrder.typeDefined(); + } + this.columnOrder = requireValidColumnOrder(columnOrder); + } + + private ColumnOrder requireValidColumnOrder(ColumnOrder columnOrder) { + if (primitive == PrimitiveTypeName.INT96) { + Preconditions.checkArgument(columnOrder.getColumnOrderName() == ColumnOrderName.UNDEFINED, + "The column order {} is not supported by INT96", columnOrder); + } + if (getOriginalType() != null) { + // Explicitly listing all the logical types to avoid having unsupported column orders new types accidentally + switch (getOriginalType()) { + case INT_8: + case INT_16: + case INT_32: + case INT_64: + case UINT_8: + case UINT_16: + case UINT_32: + case UINT_64: + case UTF8: + case DECIMAL: + case DATE: + case TIME_MILLIS: + case TIME_MICROS: + case TIMESTAMP_MILLIS: + case TIMESTAMP_MICROS: + case ENUM: + case JSON: + case BSON: + // Currently any available column order is valid + break; + case INTERVAL: + default: + Preconditions.checkArgument(columnOrder.getColumnOrderName() == ColumnOrderName.UNDEFINED, + "The column order {} is not supported by {} ({})", columnOrder, primitive, getOriginalType()); + break; + } + } + return columnOrder; } /** @@ -450,7 +504,8 @@ public PrimitiveType(Repetition repetition, PrimitiveTypeName primitive, */ @Override public 
PrimitiveType withId(int id) { - return new PrimitiveType(getRepetition(), primitive, length, getName(), getOriginalType(), decimalMeta, new ID(id)); + return new PrimitiveType(getRepetition(), primitive, length, getName(), getOriginalType(), decimalMeta, new ID(id), + columnOrder); } /** @@ -542,6 +597,7 @@ protected boolean equals(Type other) { return super.equals(other) && primitive == otherPrimitive.getPrimitiveTypeName() && length == otherPrimitive.length + && columnOrder.equals(otherPrimitive.columnOrder) && eqOrBothNull(decimalMeta, otherPrimitive.decimalMeta); } @@ -553,6 +609,7 @@ public int hashCode() { int hash = super.hashCode(); hash = hash * 31 + primitive.hashCode(); hash = hash * 31 + length; + hash = hash * 31 + columnOrder.hashCode(); if (decimalMeta != null) { hash = hash * 31 + decimalMeta.hashCode(); } @@ -620,6 +677,11 @@ private void reportSchemaMergeError(Type toMerge) { throw new IncompatibleSchemaModificationException("can not merge type " + toMerge + " into " + this); } + private void reportSchemaMergeErrorWithColumnOrder(Type toMerge) { + throw new IncompatibleSchemaModificationException("can not merge type " + toMerge + " with column order " + + toMerge.asPrimitiveType().columnOrder() + " into " + this + " with column order " + columnOrder()); + } + @Override protected Type union(Type toMerge, boolean strict) { if (!toMerge.isPrimitive()) { @@ -638,6 +700,11 @@ protected Type union(Type toMerge, boolean strict) { if (primitive == PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY && length != toMergeLength) { reportSchemaMergeError(toMerge); } + + // Can't merge primitive fields with different column orders + if (!columnOrder().equals(toMerge.asPrimitiveType().columnOrder())) { + reportSchemaMergeErrorWithColumnOrder(toMerge); + } } Types.PrimitiveBuilder builder = Types.primitive(primitive, toMerge.getRepetition()); @@ -658,4 +725,11 @@ protected Type union(Type toMerge, boolean strict) { public PrimitiveComparator comparator() { return 
(PrimitiveComparator) getPrimitiveTypeName().comparator(getOriginalType()); } + + /** + * @return the column order for this type + */ + public ColumnOrder columnOrder() { + return columnOrder; + } } diff --git a/parquet-column/src/main/java/org/apache/parquet/schema/Types.java b/parquet-column/src/main/java/org/apache/parquet/schema/Types.java index e81daaea9a..0422a9d431 100644 --- a/parquet-column/src/main/java/org/apache/parquet/schema/Types.java +++ b/parquet-column/src/main/java/org/apache/parquet/schema/Types.java @@ -23,6 +23,7 @@ import java.util.List; import org.apache.parquet.Preconditions; +import org.apache.parquet.schema.ColumnOrder.ColumnOrderName; import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName; import org.apache.parquet.schema.Type.ID; import org.slf4j.Logger; @@ -316,6 +317,7 @@ public P named(String name) { private int length = NOT_SET; private int precision = NOT_SET; private int scale = NOT_SET; + private ColumnOrder columnOrder; private BasePrimitiveBuilder(P parent, PrimitiveTypeName type) { super(parent); @@ -374,6 +376,22 @@ public THIS scale(int scale) { return self(); } + /** + * Adds the column order for the primitive type. + *

+ * In case of not set the default column order is {@link ColumnOrderName#TYPE_DEFINED_ORDER} except the type + * {@link PrimitiveTypeName#INT96} and the types annotated by {@link OriginalType#INTERVAL} where the default column + * order is {@link ColumnOrderName#UNDEFINED}. + * + * @param columnOrder + * the column order for the primitive type + * @return this builder for method chaining + */ + public THIS columnOrder(ColumnOrder columnOrder) { + this.columnOrder = columnOrder; + return self(); + } + @Override protected PrimitiveType build(String name) { if (PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY == primitiveType) { @@ -457,7 +475,7 @@ protected PrimitiveType build(String name) { } } - return new PrimitiveType(repetition, primitiveType, length, name, originalType, meta, id); + return new PrimitiveType(repetition, primitiveType, length, name, originalType, meta, id, columnOrder); } private static long maxPrecision(int numBytes) { diff --git a/parquet-column/src/test/java/org/apache/parquet/schema/TestMessageType.java b/parquet-column/src/test/java/org/apache/parquet/schema/TestMessageType.java index 4add1740ce..05619385bc 100644 --- a/parquet-column/src/test/java/org/apache/parquet/schema/TestMessageType.java +++ b/parquet-column/src/test/java/org/apache/parquet/schema/TestMessageType.java @@ -21,9 +21,12 @@ import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY; import static org.apache.parquet.schema.OriginalType.LIST; import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; import static org.junit.Assert.fail; + import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY; import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT32; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT96; import static org.apache.parquet.schema.Type.Repetition.OPTIONAL; import static org.apache.parquet.schema.Type.Repetition.REPEATED; import static 
org.apache.parquet.schema.Type.Repetition.REQUIRED; @@ -188,6 +191,48 @@ public void testMergeSchemaWithOriginalType() throws Exception { t5.union(t6)); } + @Test + public void testMergeSchemaWithColumnOrder() { + MessageType m1 = Types.buildMessage().addFields( + Types.requiredList().element( + Types.optional(BINARY).columnOrder(ColumnOrder.undefined()).named("a") + ).named("g"), + Types.optional(INT96).named("b") + ).named("root"); + MessageType m2 = Types.buildMessage().addFields( + Types.requiredList().element( + Types.optional(BINARY).columnOrder(ColumnOrder.undefined()).named("a") + ).named("g"), + Types.optional(BINARY).named("c") + ).named("root"); + MessageType m3 = Types.buildMessage().addFields( + Types.requiredList().element( + Types.optional(BINARY).named("a") + ).named("g") + ).named("root"); + + assertEquals( + Types.buildMessage().addFields( + Types.requiredList().element( + Types.optional(BINARY).named("a") + ).named("g"), + Types.optional(INT96).named("b"), + Types.optional(BINARY).named("c") + ).named("root"), + m1.union(m2)); + try { + m1.union(m3); + fail("An IncompatibleSchemaModificationException should have been thrown"); + } catch (Exception e) { + assertTrue( + "The thrown exception should have been IncompatibleSchemaModificationException but was " + e.getClass(), + e instanceof IncompatibleSchemaModificationException); + assertEquals( + "can not merge type optional binary a with column order TYPE_DEFINED_ORDER into optional binary a with column order UNDEFINED", + e.getMessage()); + } + } + @Test public void testIDs() throws Exception { MessageType schema = new MessageType("test", diff --git a/parquet-column/src/test/java/org/apache/parquet/schema/TestPrimitiveComparator.java b/parquet-column/src/test/java/org/apache/parquet/schema/TestPrimitiveComparator.java index c8a423ad23..3f9d6431b5 100644 --- a/parquet-column/src/test/java/org/apache/parquet/schema/TestPrimitiveComparator.java +++ 
b/parquet-column/src/test/java/org/apache/parquet/schema/TestPrimitiveComparator.java @@ -45,12 +45,12 @@ public class TestPrimitiveComparator { @Test public void testBooleanComparator() { - Boolean[] values = { null, false, true }; + Boolean[] valuesInAscendingOrder = { null, false, true }; - for (int i = 0; i < values.length; ++i) { - for (int j = 0; j < values.length; ++j) { - Boolean vi = values[i]; - Boolean vj = values[j]; + for (int i = 0; i < valuesInAscendingOrder.length; ++i) { + for (int j = 0; j < valuesInAscendingOrder.length; ++j) { + Boolean vi = valuesInAscendingOrder[i]; + Boolean vj = valuesInAscendingOrder[j]; int exp = i - j; assertSignumEquals(vi, vj, exp, BOOLEAN_COMPARATOR.compare(vi, vj)); if (vi != null && vj != null) { @@ -88,11 +88,11 @@ public void testUnsignedInt32Comparator() { -1); // 0xFFFFFFFF } - private void testInt32Comparator(PrimitiveComparator comparator, Integer... values) { - for (int i = 0; i < values.length; ++i) { - for (int j = 0; j < values.length; ++j) { - Integer vi = values[i]; - Integer vj = values[j]; + private void testInt32Comparator(PrimitiveComparator comparator, Integer... valuesInAscendingOrder) { + for (int i = 0; i < valuesInAscendingOrder.length; ++i) { + for (int j = 0; j < valuesInAscendingOrder.length; ++j) { + Integer vi = valuesInAscendingOrder[i]; + Integer vj = valuesInAscendingOrder[j]; int exp = i - j; assertSignumEquals(vi, vj, exp, comparator.compare(vi, vj)); if (vi != null && vj != null) { @@ -130,11 +130,11 @@ public void testUnsignedInt64Comparator() { -1L); // 0xFFFFFFFFFFFFFFFF } - private void testInt64Comparator(PrimitiveComparator comparator, Long... values) { - for (int i = 0; i < values.length; ++i) { - for (int j = 0; j < values.length; ++j) { - Long vi = values[i]; - Long vj = values[j]; + private void testInt64Comparator(PrimitiveComparator comparator, Long... 
valuesInAscendingOrder) { + for (int i = 0; i < valuesInAscendingOrder.length; ++i) { + for (int j = 0; j < valuesInAscendingOrder.length; ++j) { + Long vi = valuesInAscendingOrder[i]; + Long vj = valuesInAscendingOrder[j]; int exp = i - j; assertSignumEquals(vi, vj, exp, comparator.compare(vi, vj)); if (vi != null && vj != null) { @@ -148,7 +148,7 @@ private void testInt64Comparator(PrimitiveComparator comparator, Long... v @Test public void testFloatComparator() { - Float[] values = { + Float[] valuesInAscendingOrder = { null, Float.NEGATIVE_INFINITY, -Float.MAX_VALUE, @@ -160,10 +160,10 @@ public void testFloatComparator() { Float.MAX_VALUE, Float.POSITIVE_INFINITY }; - for (int i = 0; i < values.length; ++i) { - for (int j = 0; j < values.length; ++j) { - Float vi = values[i]; - Float vj = values[j]; + for (int i = 0; i < valuesInAscendingOrder.length; ++i) { + for (int j = 0; j < valuesInAscendingOrder.length; ++j) { + Float vi = valuesInAscendingOrder[i]; + Float vj = valuesInAscendingOrder[j]; int exp = i - j; assertSignumEquals(vi, vj, exp, FLOAT_COMPARATOR.compare(vi, vj)); if (vi != null && vj != null) { @@ -177,7 +177,7 @@ public void testFloatComparator() { @Test public void testDoubleComparator() { - Double[] values = { + Double[] valuesInAscendingOrder = { null, Double.NEGATIVE_INFINITY, -Double.MAX_VALUE, @@ -189,10 +189,10 @@ public void testDoubleComparator() { Double.MAX_VALUE, Double.POSITIVE_INFINITY }; - for (int i = 0; i < values.length; ++i) { - for (int j = 0; j < values.length; ++j) { - Double vi = values[i]; - Double vj = values[j]; + for (int i = 0; i < valuesInAscendingOrder.length; ++i) { + for (int j = 0; j < valuesInAscendingOrder.length; ++j) { + Double vi = valuesInAscendingOrder[i]; + Double vj = valuesInAscendingOrder[j]; int exp = i - j; assertSignumEquals(vi, vj, exp, DOUBLE_COMPARATOR.compare(vi, vj)); if (vi != null && vj != null) { @@ -249,11 +249,11 @@ public void testBinaryAsSignedIntegerComparator() { ByteBuffer.wrap(new 
BigInteger("9999999999999999999999999999999999999999").toByteArray()))); } - private void testObjectComparator(PrimitiveComparator comparator, T... values) { - for (int i = 0; i < values.length; ++i) { - for (int j = 0; j < values.length; ++j) { - T vi = values[i]; - T vj = values[j]; + private void testObjectComparator(PrimitiveComparator comparator, T... valuesInAscendingOrder) { + for (int i = 0; i < valuesInAscendingOrder.length; ++i) { + for (int j = 0; j < valuesInAscendingOrder.length; ++j) { + T vi = valuesInAscendingOrder[i]; + T vj = valuesInAscendingOrder[j]; int exp = i - j; assertSignumEquals(vi, vj, exp, comparator.compare(vi, vj)); } diff --git a/parquet-column/src/test/java/org/apache/parquet/schema/TestTypeBuilders.java b/parquet-column/src/test/java/org/apache/parquet/schema/TestTypeBuilders.java index 0c39ef2ba1..0b1f41a59c 100644 --- a/parquet-column/src/test/java/org/apache/parquet/schema/TestTypeBuilders.java +++ b/parquet-column/src/test/java/org/apache/parquet/schema/TestTypeBuilders.java @@ -24,6 +24,7 @@ import org.junit.Assert; import org.junit.Test; import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName; +import org.apache.parquet.schema.Type.Repetition; import static org.apache.parquet.schema.OriginalType.*; import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.*; @@ -1348,6 +1349,52 @@ public void testOptionalMapWithinList() { Assert.assertEquals(expected, actual); } + @Test + public void testTypeConstructionWithUndefinedColumnOrder() { + PrimitiveTypeName[] types = new PrimitiveTypeName[] { + BOOLEAN, INT32, INT64, INT96, FLOAT, DOUBLE, BINARY, FIXED_LEN_BYTE_ARRAY + }; + for (PrimitiveTypeName type : types) { + String name = type.toString() + "_"; + int len = type == FIXED_LEN_BYTE_ARRAY ? 
42 : 0; + PrimitiveType expected = new PrimitiveType(Repetition.OPTIONAL, type, len, name, null, null, null, + ColumnOrder.undefined()); + PrimitiveType built = Types.optional(type).length(len).columnOrder(ColumnOrder.undefined()).named(name); + Assert.assertEquals(expected, built); + } + } + + @Test + public void testTypeConstructionWithTypeDefinedColumnOrder() { + PrimitiveTypeName[] types = new PrimitiveTypeName[] { + BOOLEAN, INT32, INT64, FLOAT, DOUBLE, BINARY, FIXED_LEN_BYTE_ARRAY + }; + for (PrimitiveTypeName type : types) { + String name = type.toString() + "_"; + int len = type == FIXED_LEN_BYTE_ARRAY ? 42 : 0; + PrimitiveType expected = new PrimitiveType(Repetition.OPTIONAL, type, len, name, null, null, null, + ColumnOrder.typeDefined()); + PrimitiveType built = Types.optional(type).length(len).columnOrder(ColumnOrder.typeDefined()).named(name); + Assert.assertEquals(expected, built); + } + } + + @Test + public void testTypeConstructionWithUnsupportedColumnOrder() { + assertThrows(null, IllegalArgumentException.class, new Callable() { + @Override + public PrimitiveType call() { + return Types.optional(INT96).columnOrder(ColumnOrder.typeDefined()).named("int96_unsupported"); + } + }); + assertThrows(null, IllegalArgumentException.class, new Callable() { + @Override + public PrimitiveType call() { + return Types.optional(PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY).length(12).as(INTERVAL) + .columnOrder(ColumnOrder.typeDefined()).named("interval_unsupported"); + } + }); + } /** * A convenience method to avoid a large number of @Test(expected=...) 
tests diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java index d89e32d5b9..aca137b7f6 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java @@ -44,6 +44,7 @@ import org.apache.parquet.hadoop.metadata.ColumnPath; import org.apache.parquet.format.ColumnChunk; import org.apache.parquet.format.ColumnMetaData; +import org.apache.parquet.format.ColumnOrder; import org.apache.parquet.format.ConvertedType; import org.apache.parquet.format.DataPageHeader; import org.apache.parquet.format.DataPageHeaderV2; @@ -58,12 +59,14 @@ import org.apache.parquet.format.SchemaElement; import org.apache.parquet.format.Statistics; import org.apache.parquet.format.Type; +import org.apache.parquet.format.TypeDefinedOrder; import org.apache.parquet.hadoop.metadata.BlockMetaData; import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; import org.apache.parquet.hadoop.metadata.CompressionCodecName; import org.apache.parquet.column.EncodingStats; import org.apache.parquet.hadoop.metadata.ParquetMetadata; import org.apache.parquet.io.ParquetDecodingException; +import org.apache.parquet.schema.ColumnOrder.ColumnOrderName; import org.apache.parquet.schema.GroupType; import org.apache.parquet.schema.MessageType; import org.apache.parquet.schema.OriginalType; @@ -135,9 +138,24 @@ public FileMetaData toParquetMetadata(int currentVersion, ParquetMetadata parque } fileMetaData.setCreated_by(parquetMetadata.getFileMetaData().getCreatedBy()); + + fileMetaData.setColumn_orders(getColumnOrders(parquetMetadata.getFileMetaData().getSchema())); + return fileMetaData; } + private List getColumnOrders(MessageType schema) { + List columnOrders = new ArrayList<>(); + // Currently, only TypeDefinedOrder 
is supported, so we create a column order for each column with + // TypeDefinedOrder even if some types (e.g. INT96) have undefined column orders. + for (int i = 0, n = schema.getPaths().size(); i < n; ++i) { + ColumnOrder columnOrder = new ColumnOrder(); + columnOrder.setTYPE_ORDER(new TypeDefinedOrder()); + columnOrders.add(columnOrder); + } + return columnOrders; + } + // Visible for testing List toParquetSchema(MessageType schema) { List result = new ArrayList(); @@ -356,41 +374,7 @@ public static Statistics toParquetStatistics( } private static boolean isMinMaxStatsSupported(PrimitiveType type) { - if (type.getPrimitiveTypeName() == PrimitiveTypeName.INT96) { - return false; - } - - OriginalType origType = type.getOriginalType(); - if (origType == null) { - // All default primitive types excluding INT96 are supported by statistics - return true; - } - - // Explicitly listing all the supported logical types to avoid writing statistics for new types accidentally - switch (origType) { - case INT_8: - case INT_16: - case INT_32: - case INT_64: - case UINT_8: - case UINT_16: - case UINT_32: - case UINT_64: - case UTF8: - case DECIMAL: - case DATE: - case TIME_MILLIS: - case TIME_MICROS: - case TIMESTAMP_MILLIS: - case TIMESTAMP_MICROS: - case ENUM: - case JSON: - case BSON: - return true; - case INTERVAL: - default: - return false; - } + return type.columnOrder().getColumnOrderName() == ColumnOrderName.TYPE_DEFINED_ORDER; } /** @@ -408,8 +392,8 @@ public static org.apache.parquet.column.statistics.Statistics fromParquetStatist @Deprecated public static org.apache.parquet.column.statistics.Statistics fromParquetStatistics (String createdBy, Statistics statistics, PrimitiveTypeName type) { - return fromParquetStatisticsInternal(createdBy, statistics, new PrimitiveType(Repetition.OPTIONAL, type, ""), - defaultSortOrder(type)); + return fromParquetStatisticsInternal(createdBy, statistics, + new PrimitiveType(Repetition.OPTIONAL, type, "fake_type"), 
defaultSortOrder(type)); } // Visible for testing @@ -423,8 +407,6 @@ public static org.apache.parquet.column.statistics.Statistics fromParquetStatist if (formatStats.isSetMin_value() && formatStats.isSetMax_value()) { byte[] min = formatStats.min_value.array(); byte[] max = formatStats.max_value.array(); - // Ordering of INT96 and INTERVAL types is not clear; - // we only support statistics for them if min and max are equal if (isMinMaxStatsSupported(type) || Arrays.equals(min, max)) { stats.setMinMaxFromBytes(min, max); } @@ -893,7 +875,7 @@ public FileMetaData visit(RangeMetadataFilter filter) throws IOException { } public ParquetMetadata fromParquetMetadata(FileMetaData parquetMetadata) throws IOException { - MessageType messageType = fromParquetSchema(parquetMetadata.getSchema()); + MessageType messageType = fromParquetSchema(parquetMetadata.getSchema(), parquetMetadata.getColumn_orders()); List blocks = new ArrayList(); List row_groups = parquetMetadata.getRow_groups(); if (row_groups != null) { @@ -952,20 +934,22 @@ private static ColumnPath getPath(ColumnMetaData metaData) { } // Visible for testing - MessageType fromParquetSchema(List schema) { + MessageType fromParquetSchema(List schema, List columnOrders) { Iterator iterator = schema.iterator(); SchemaElement root = iterator.next(); Types.MessageTypeBuilder builder = Types.buildMessage(); if (root.isSetField_id()) { builder.id(root.field_id); } - buildChildren(builder, iterator, root.getNum_children()); + buildChildren(builder, iterator, root.getNum_children(), columnOrders, 0); return builder.named(root.name); } private void buildChildren(Types.GroupBuilder builder, Iterator schema, - int childrenCount) { + int childrenCount, + List columnOrders, + int columnCount) { for (int i = 0; i < childrenCount; i++) { SchemaElement schemaElement = schema.next(); @@ -984,11 +968,21 @@ private void buildChildren(Types.GroupBuilder builder, if (schemaElement.isSetScale()) { 
primitiveBuilder.scale(schemaElement.scale); } + if (columnOrders != null) { + org.apache.parquet.schema.ColumnOrder columnOrder = fromParquetColumnOrder(columnOrders.get(columnCount)); + // As per parquet format 2.4.0 no UNDEFINED order is supported. So, set undefined column order for the types + // where ordering is not supported. + if (columnOrder.getColumnOrderName() == ColumnOrderName.TYPE_DEFINED_ORDER + && (schemaElement.type == Type.INT96 || schemaElement.converted_type == ConvertedType.INTERVAL)) { + columnOrder = org.apache.parquet.schema.ColumnOrder.undefined(); + } + primitiveBuilder.columnOrder(columnOrder); + } childBuilder = primitiveBuilder; } else { childBuilder = builder.group(fromParquetRepetition(schemaElement.repetition_type)); - buildChildren((Types.GroupBuilder) childBuilder, schema, schemaElement.num_children); + buildChildren((Types.GroupBuilder) childBuilder, schema, schemaElement.num_children, columnOrders, columnCount); } if (schemaElement.isSetConverted_type()) { @@ -999,6 +993,7 @@ private void buildChildren(Types.GroupBuilder builder, } childBuilder.named(schemaElement.name); + ++columnCount; } } @@ -1012,6 +1007,14 @@ Repetition fromParquetRepetition(FieldRepetitionType repetition) { return Repetition.valueOf(repetition.name()); } + private static org.apache.parquet.schema.ColumnOrder fromParquetColumnOrder(ColumnOrder columnOrder) { + if (columnOrder.isSetTYPE_ORDER()) { + return org.apache.parquet.schema.ColumnOrder.typeDefined(); + } + // The column order is not yet supported by this API + return org.apache.parquet.schema.ColumnOrder.undefined(); + } + @Deprecated public void writeDataPageHeader( int uncompressedSize, diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/metadata/ColumnChunkMetaData.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/metadata/ColumnChunkMetaData.java index e50fef71fa..e1986986f7 100644 --- 
a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/metadata/ColumnChunkMetaData.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/metadata/ColumnChunkMetaData.java @@ -85,8 +85,8 @@ public static ColumnChunkMetaData get( long valueCount, long totalSize, long totalUncompressedSize) { - return get(path, Types.optional(type).named(""), codec, encodingStats, encodings, statistics, - firstDataPage, dictionaryPageOffset, valueCount, totalSize, totalUncompressedSize); + return get(path, Types.optional(type).named("fake_type"), codec, encodingStats, encodings, statistics, + firstDataPage, dictionaryPageOffset, valueCount, totalSize, totalUncompressedSize); } public static ColumnChunkMetaData get( diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java b/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java index 774582480a..ee92d4625b 100644 --- a/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java +++ b/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java @@ -24,6 +24,7 @@ import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertSame; +import static org.junit.Assert.assertTrue; import static org.junit.Assert.fail; import static org.apache.parquet.format.CompressionCodec.UNCOMPRESSED; import static org.apache.parquet.format.Type.INT32; @@ -51,6 +52,7 @@ import org.apache.hadoop.conf.Configuration; import org.apache.parquet.Version; import org.apache.parquet.bytes.BytesUtils; +import org.apache.parquet.column.ColumnDescriptor; import org.apache.parquet.column.statistics.BinaryStatistics; import org.apache.parquet.column.statistics.BooleanStatistics; import org.apache.parquet.column.statistics.DoubleStatistics; @@ -67,7 +69,6 @@ import org.apache.parquet.schema.PrimitiveType; import 
org.junit.Assert; import org.junit.Test; - import org.apache.parquet.example.Paper; import org.apache.parquet.format.ColumnChunk; import org.apache.parquet.format.ColumnMetaData; @@ -79,6 +80,7 @@ import org.apache.parquet.format.RowGroup; import org.apache.parquet.format.SchemaElement; import org.apache.parquet.format.Type; +import org.apache.parquet.schema.ColumnOrder; import org.apache.parquet.schema.MessageType; import org.apache.parquet.schema.OriginalType; import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName; @@ -105,7 +107,7 @@ public void testPageHeader() throws IOException { public void testSchemaConverter() { ParquetMetadataConverter parquetMetadataConverter = new ParquetMetadataConverter(); List parquetSchema = parquetMetadataConverter.toParquetSchema(Paper.schema); - MessageType schema = parquetMetadataConverter.fromParquetSchema(parquetSchema); + MessageType schema = parquetMetadataConverter.fromParquetSchema(parquetSchema, null); assertEquals(Paper.schema, schema); } @@ -819,4 +821,42 @@ public org.apache.parquet.format.Statistics toParquetStatistics(Statistics st public abstract org.apache.parquet.format.Statistics toParquetStatistics(Statistics stats); } + @Test + public void testColumnOrders() throws IOException { + MessageType schema = parseMessageType("message test {" + + " optional binary binary_col;" // Normal column with type defined column order -> typeDefined + + " optional group map_col (MAP) {" + + " repeated group map (MAP_KEY_VALUE) {" + + " required binary key (UTF8);" // Key to be hacked to have unknown column order -> undefined + + " optional group list_col (LIST) {" + + " repeated group list {" + + " optional int96 array_element;" // INT96 element with type defined column order -> undefined + + " }" + + " }" + + " }" + + " }" + + "}"); + org.apache.parquet.hadoop.metadata.FileMetaData fileMetaData = new org.apache.parquet.hadoop.metadata.FileMetaData( + schema, new HashMap(), null); + ParquetMetadata metadata = new 
ParquetMetadata(fileMetaData, new ArrayList()); + ParquetMetadataConverter converter = new ParquetMetadataConverter(); + FileMetaData formatMetadata = converter.toParquetMetadata(1, metadata); + + List columnOrders = formatMetadata.getColumn_orders(); + assertEquals(3, columnOrders.size()); + for (org.apache.parquet.format.ColumnOrder columnOrder : columnOrders) { + assertTrue(columnOrder.isSetTYPE_ORDER()); + } + + // Simulate that thrift got a union type that is not in the generated code + // (when the file contains a not-yet-supported column order) + columnOrders.get(1).clear(); + + MessageType resultSchema = converter.fromParquetMetadata(formatMetadata).getFileMetaData().getSchema(); + List columns = resultSchema.getColumns(); + assertEquals(3, columns.size()); + assertEquals(ColumnOrder.typeDefined(), columns.get(0).getPrimitiveType().columnOrder()); + assertEquals(ColumnOrder.undefined(), columns.get(1).getPrimitiveType().columnOrder()); + assertEquals(ColumnOrder.undefined(), columns.get(2).getPrimitiveType().columnOrder()); + } } diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetFileWriter.java b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetFileWriter.java index 6915c86ec3..4243e9bd18 100644 --- a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetFileWriter.java +++ b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetFileWriter.java @@ -524,12 +524,12 @@ public void testWriteReadStatistics() throws Exception { String str = new String(bsout.getMaxBytes()); String str2 = new String(bsout.getMinBytes()); - assertTrue(((BinaryStatistics)readFooter.getBlocks().get(0).getColumns().get(0).getStatistics()).equals(bs1)); - assertTrue(((LongStatistics)readFooter.getBlocks().get(0).getColumns().get(1).getStatistics()).equals(ls1)); + TestUtils.assertStatsValuesEqual(bs1, readFooter.getBlocks().get(0).getColumns().get(0).getStatistics()); + TestUtils.assertStatsValuesEqual(ls1, 
readFooter.getBlocks().get(0).getColumns().get(1).getStatistics()); } { // assert stats are correct for the second block - assertTrue(((BinaryStatistics)readFooter.getBlocks().get(1).getColumns().get(0).getStatistics()).equals(bs2)); - assertTrue(((LongStatistics)readFooter.getBlocks().get(1).getColumns().get(1).getStatistics()).equals(ls2)); + TestUtils.assertStatsValuesEqual(bs2, readFooter.getBlocks().get(1).getColumns().get(0).getStatistics()); + TestUtils.assertStatsValuesEqual(ls2, readFooter.getBlocks().get(1).getColumns().get(1).getStatistics()); } } diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestUtils.java b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestUtils.java index e53ac785a0..59b4b62140 100644 --- a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestUtils.java +++ b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestUtils.java @@ -24,6 +24,8 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; +import org.apache.parquet.column.statistics.Statistics; +import org.hamcrest.CoreMatchers; import org.junit.Assert; public class TestUtils { @@ -61,4 +63,23 @@ public static void assertThrows( } } } + + public static void assertStatsValuesEqual(Statistics stats1, Statistics stats2) { + assertStatsValuesEqual(null, stats1, stats2); + } + + // To be used to assert that the values (min, max, num-of-nulls) are equal. It might be used in cases when creating + // Statistics object for the proper Type would require too much work/code duplications etc. 
+ public static void assertStatsValuesEqual(String message, Statistics expected, Statistics actual) { + if (expected == actual) { + return; + } + if (expected == null || actual == null) { + Assert.assertEquals(expected, actual); + } + Assert.assertThat(actual, CoreMatchers.instanceOf(expected.getClass())); + Assert.assertArrayEquals(message, expected.getMaxBytes(), actual.getMaxBytes()); + Assert.assertArrayEquals(message, expected.getMinBytes(), actual.getMinBytes()); + Assert.assertEquals(message, expected.getNumNulls(), actual.getNumNulls()); + } } diff --git a/parquet-thrift/src/test/java/org/apache/parquet/hadoop/thrift/TestThriftToParquetFileWriter.java b/parquet-thrift/src/test/java/org/apache/parquet/hadoop/thrift/TestThriftToParquetFileWriter.java index 0439686dce..66b804ccb8 100644 --- a/parquet-thrift/src/test/java/org/apache/parquet/hadoop/thrift/TestThriftToParquetFileWriter.java +++ b/parquet-thrift/src/test/java/org/apache/parquet/hadoop/thrift/TestThriftToParquetFileWriter.java @@ -19,8 +19,6 @@ package org.apache.parquet.hadoop.thrift; import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; - import java.io.ByteArrayOutputStream; import java.io.IOException; import java.util.Arrays; @@ -53,6 +51,7 @@ import org.apache.parquet.example.data.Group; import org.apache.parquet.hadoop.ParquetFileReader; import org.apache.parquet.hadoop.ParquetReader; +import org.apache.parquet.hadoop.TestUtils; import org.apache.parquet.hadoop.example.GroupReadSupport; import org.apache.parquet.hadoop.metadata.ParquetMetadata; @@ -122,21 +121,21 @@ public void testWriteStatistics() throws Exception { for(ColumnChunkMetaData cmd: bmd.getColumns()) { switch(cmd.getType()) { case INT32: - assertTrue(intStatsSmall.equals((IntStatistics)cmd.getStatistics())); + TestUtils.assertStatsValuesEqual(intStatsSmall, cmd.getStatistics()); break; case INT64: - assertTrue(longStatsSmall.equals((LongStatistics)cmd.getStatistics())); + 
TestUtils.assertStatsValuesEqual(longStatsSmall, cmd.getStatistics()); break; case DOUBLE: - assertTrue(doubleStatsSmall.equals((DoubleStatistics)cmd.getStatistics())); + TestUtils.assertStatsValuesEqual(doubleStatsSmall, cmd.getStatistics()); break; case BOOLEAN: - assertTrue(boolStats.equals((BooleanStatistics)cmd.getStatistics())); + TestUtils.assertStatsValuesEqual(boolStats, cmd.getStatistics()); break; case BINARY: // there is also info_string that has no statistics if(cmd.getPath().toString() == "[test_string]") - assertTrue(binaryStatsSmall.equals((BinaryStatistics)cmd.getStatistics())); + TestUtils.assertStatsValuesEqual(binaryStatsSmall, cmd.getStatistics()); break; } } @@ -171,21 +170,21 @@ public void testWriteStatistics() throws Exception { case INT32: // testing the correct limits of an int32, there are also byte and short, tested earlier if(cmd.getPath().toString() == "[test_i32]") - assertTrue(intStatsLarge.equals((IntStatistics)cmd.getStatistics())); + TestUtils.assertStatsValuesEqual(intStatsLarge, cmd.getStatistics()); break; case INT64: - assertTrue(longStatsLarge.equals((LongStatistics)cmd.getStatistics())); + TestUtils.assertStatsValuesEqual(longStatsLarge, cmd.getStatistics()); break; case DOUBLE: - assertTrue(doubleStatsLarge.equals((DoubleStatistics)cmd.getStatistics())); + TestUtils.assertStatsValuesEqual(doubleStatsLarge, cmd.getStatistics()); break; case BOOLEAN: - assertTrue(boolStats.equals((BooleanStatistics)cmd.getStatistics())); + TestUtils.assertStatsValuesEqual(boolStats, cmd.getStatistics()); break; case BINARY: // there is also info_string that has no statistics if(cmd.getPath().toString() == "[test_string]") - assertTrue(binaryStatsLarge.equals((BinaryStatistics)cmd.getStatistics())); + TestUtils.assertStatsValuesEqual(binaryStatsLarge, cmd.getStatistics()); break; } } From 820df6fb701f52c140eff028c6979bf9fabf7f20 Mon Sep 17 00:00:00 2001 From: Gabor Szadovszky Date: Wed, 20 Dec 2017 15:38:42 +0100 Subject: [PATCH 16/17] 
PARQUET-1025: Minor fixes at data generation for TestStatistics --- .../parquet/statistics/RandomValues.java | 33 +++++++++++++++---- .../parquet/statistics/TestStatistics.java | 4 +-- 2 files changed, 28 insertions(+), 9 deletions(-) diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/statistics/RandomValues.java b/parquet-hadoop/src/test/java/org/apache/parquet/statistics/RandomValues.java index 5d95adb8d4..16db5cbf0d 100644 --- a/parquet-hadoop/src/test/java/org/apache/parquet/statistics/RandomValues.java +++ b/parquet-hadoop/src/test/java/org/apache/parquet/statistics/RandomValues.java @@ -37,8 +37,8 @@ public boolean shouldGenerateNull() { return (random.nextInt(10) == 0); } - public int randomInt() { return randomInt(Integer.MAX_VALUE - 1); } - public int randomInt(int maximum) { + public int randomInt() { return random.nextInt(); } + public int randomPositiveInt(int maximum) { // Maximum may be a random number (which may be negative). return random.nextInt(Math.abs(maximum) + 1); } @@ -63,11 +63,11 @@ public BigInteger randomInt96(BigInteger maximum) { } public char randomLetter() { - return ALPHABET.charAt(randomInt() % ALPHABET.length()); + return ALPHABET.charAt(randomPositiveInt(ALPHABET.length() - 1)); } public String randomString(int maxLength) { - return randomFixedLengthString(randomInt(maxLength)); + return randomFixedLengthString(randomPositiveInt(maxLength)); } public String randomFixedLengthString(int length) { @@ -122,7 +122,26 @@ public IntGenerator(long seed, int minimum, int maximum) { @Override public Integer nextValue() { - return (minimum + randomInt(range)); + return (minimum + randomPositiveInt(range)); + } + } + + public static class UIntGenerator extends IntGenerator { + private final int mask; + + public UIntGenerator(long seed, byte minimum, byte maximum) { + super(seed, minimum, maximum); + mask = 0xFF; + } + + public UIntGenerator(long seed, short minimum, short maximum) { + super(seed, minimum, maximum); + mask = 0xFFFF; 
+ } + + @Override + public Integer nextValue() { + return super.nextValue() & mask; } } @@ -249,7 +268,7 @@ public StringGenerator(long seed) { @Override public String nextValue() { - int stringLength = randomInt(15) + 1; + int stringLength = randomPositiveInt(15) + 1; return randomString(stringLength); } @@ -268,7 +287,7 @@ public BinaryGenerator(long seed) { @Override public Binary nextValue() { // use a random length, but ensure it is at least a few bytes - int length = 5 + randomInt(buffer.length - 5); + int length = 5 + randomPositiveInt(buffer.length - 5); for (int index = 0; index < length; index++) { buffer[index] = (byte) randomInt(); } diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/statistics/TestStatistics.java b/parquet-hadoop/src/test/java/org/apache/parquet/statistics/TestStatistics.java index bf216423cb..5a5d6d4f25 100644 --- a/parquet-hadoop/src/test/java/org/apache/parquet/statistics/TestStatistics.java +++ b/parquet-hadoop/src/test/java/org/apache/parquet/statistics/TestStatistics.java @@ -356,9 +356,9 @@ public DataContext(long seed, File path, int blockSize, int pageSize, boolean en new RandomValues.UnconstrainedFloatGenerator(random.nextLong()), new RandomValues.UnconstrainedDoubleGenerator(random.nextLong()), new RandomValues.IntGenerator(random.nextLong(), Byte.MIN_VALUE, Byte.MAX_VALUE), - new RandomValues.IntGenerator(random.nextLong(), Byte.MIN_VALUE, Byte.MAX_VALUE), - new RandomValues.IntGenerator(random.nextLong(), Short.MIN_VALUE, Short.MAX_VALUE), + new RandomValues.UIntGenerator(random.nextLong(), Byte.MIN_VALUE, Byte.MAX_VALUE), new RandomValues.IntGenerator(random.nextLong(), Short.MIN_VALUE, Short.MAX_VALUE), + new RandomValues.UIntGenerator(random.nextLong(), Short.MIN_VALUE, Short.MAX_VALUE), new RandomValues.UnconstrainedIntGenerator(random.nextLong()), new RandomValues.UnconstrainedIntGenerator(random.nextLong()), new RandomValues.UnconstrainedLongGenerator(random.nextLong()), From 
2a63fcf13b0229512fcc894b5b00e73605ed99b6 Mon Sep 17 00:00:00 2001 From: Gabor Szadovszky Date: Thu, 21 Dec 2017 10:06:05 +0100 Subject: [PATCH 17/17] PARQUET-1025: Use constant instead of creating new TypeDefinedOrder instances --- .../parquet/format/converter/ParquetMetadataConverter.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java index aca137b7f6..ef59760640 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java @@ -82,6 +82,7 @@ // TODO: Lets split it up: https://issues.apache.org/jira/browse/PARQUET-310 public class ParquetMetadataConverter { + private static final TypeDefinedOrder TYPE_DEFINED_ORDER = new TypeDefinedOrder(); public static final MetadataFilter NO_FILTER = new NoFilter(); public static final MetadataFilter SKIP_ROW_GROUPS = new SkipMetadataFilter(); public static final long MAX_STATS_SIZE = 4096; // limit stats to 4k @@ -150,7 +151,7 @@ private List getColumnOrders(MessageType schema) { // TypeDefinedOrder even if some types (e.g. INT96) have undefined column orders. for (int i = 0, n = schema.getPaths().size(); i < n; ++i) { ColumnOrder columnOrder = new ColumnOrder(); - columnOrder.setTYPE_ORDER(new TypeDefinedOrder()); + columnOrder.setTYPE_ORDER(TYPE_DEFINED_ORDER); columnOrders.add(columnOrder); } return columnOrders;