-
Notifications
You must be signed in to change notification settings - Fork 4k
Open
Description
Describe the bug, including details regarding any error messages, version, and platform.
#34112 fixes reading from stats.min_value and stats.max_value where applicable. However, there are cases where statistics are invalid or corrupted. Please check parquet-mr for reference: https://github.com/apache/parquet-mr/blob/5290bd5e0ee5dc30db0576e2bfc6eea335c465cf/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java#L797
if (formatStats != null) {
// Use the new V2 min-max statistics over the former one if it is filled
if (formatStats.isSetMin_value() && formatStats.isSetMax_value()) {
byte[] min = formatStats.min_value.array();
byte[] max = formatStats.max_value.array();
if (isMinMaxStatsSupported(type) || Arrays.equals(min, max)) {
statsBuilder.withMin(min);
statsBuilder.withMax(max);
}
} else {
boolean isSet = formatStats.isSetMax() && formatStats.isSetMin();
boolean maxEqualsMin = isSet ? Arrays.equals(formatStats.getMin(), formatStats.getMax()) : false;
boolean sortOrdersMatch = SortOrder.SIGNED == typeSortOrder;
// NOTE: See docs in CorruptStatistics for explanation of why this check is needed
// The sort order is checked to avoid returning min/max stats that are not
// valid with the type's sort order. In previous releases, all stats were
// aggregated using a signed byte-wise ordering, which isn't valid for all the
// types (e.g. strings, decimals etc.).
if (!CorruptStatistics.shouldIgnoreStatistics(createdBy, type.getPrimitiveTypeName()) &&
(sortOrdersMatch || maxEqualsMin)) {
if (isSet) {
statsBuilder.withMin(formatStats.min.array());
statsBuilder.withMax(formatStats.max.array());
}
}
}
Component(s)
C++, Parquet