From 7e929c31f5768644620f47df839d6fde9e326776 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Tue, 6 Dec 2016 04:25:39 +0000 Subject: [PATCH 1/2] PARQUET-791: Add missing column support for UserDefinedPredicate. --- .../statisticslevel/StatisticsFilter.java | 10 +++++-- .../statisticslevel/TestStatisticsFilter.java | 28 +++++++++++++++++++ 2 files changed, 36 insertions(+), 2 deletions(-) diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/filter2/statisticslevel/StatisticsFilter.java b/parquet-hadoop/src/main/java/org/apache/parquet/filter2/statisticslevel/StatisticsFilter.java index b37297aaaf..05adc37001 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/filter2/statisticslevel/StatisticsFilter.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/filter2/statisticslevel/StatisticsFilter.java @@ -327,19 +327,25 @@ public Boolean visit(Not not) { private , U extends UserDefinedPredicate> Boolean visit(UserDefined ud, boolean inverted) { Column filterColumn = ud.getColumn(); ColumnChunkMetaData columnChunk = getColumnChunk(filterColumn.getColumnPath()); + + if (columnChunk == null) { + // the column isn't in this file so all values are null. + return BLOCK_MIGHT_MATCH; + } + U udp = ud.getUserDefinedPredicate(); Statistics stats = columnChunk.getStatistics(); if (stats.isEmpty()) { // we have no statistics available, we cannot drop any chunks - return false; + return BLOCK_MIGHT_MATCH; } if (isAllNulls(columnChunk)) { // there is no min max, there is nothing // else we can say about this chunk, we // cannot drop it. - return false; + return BLOCK_MIGHT_MATCH; } org.apache.parquet.filter2.predicate.Statistics udpStats = diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/filter2/statisticslevel/TestStatisticsFilter.java b/parquet-hadoop/src/test/java/org/apache/parquet/filter2/statisticslevel/TestStatisticsFilter.java index b47ed694a8..4d0d55a83b 100644 --- a/parquet-hadoop/src/test/java/org/apache/parquet/filter2/statisticslevel/TestStatisticsFilter.java +++ b/parquet-hadoop/src/test/java/org/apache/parquet/filter2/statisticslevel/TestStatisticsFilter.java @@ -83,6 +83,7 @@ private static ColumnChunkMetaData getDoubleColumnMeta(DoubleStatistics stats, l private static final IntColumn intColumn = intColumn("int.column"); private static final DoubleColumn doubleColumn = doubleColumn("double.column"); private static final BinaryColumn missingColumn = binaryColumn("missing"); + private static final IntColumn missingColumn2 = intColumn("missing.int"); private static final IntStatistics intStats = new IntStatistics(); private static final IntStatistics nullIntStats = new IntStatistics(); @@ -288,6 +289,9 @@ public void testUdp() { FilterPredicate pred = userDefined(intColumn, SevensAndEightsUdp.class); FilterPredicate invPred = LogicalInverseRewriter.rewrite(not(userDefined(intColumn, SevensAndEightsUdp.class))); + FilterPredicate predForMissingColumn = userDefined(missingColumn2, SevensAndEightsUdp.class); + FilterPredicate invPredForMissingColumn = LogicalInverseRewriter.rewrite(not(userDefined(missingColumn2, SevensAndEightsUdp.class))); + IntStatistics seven = new IntStatistics(); seven.setMinMax(7, 7); @@ -320,6 +324,30 @@ public void testUdp() { assertFalse(canDrop(invPred, Arrays.asList( getIntColumnMeta(neither, 177L), getDoubleColumnMeta(doubleStats, 177L)))); + + assertFalse(canDrop(predForMissingColumn, Arrays.asList( + getIntColumnMeta(seven, 177L), + getDoubleColumnMeta(doubleStats, 177L)))); + + assertFalse(canDrop(predForMissingColumn, Arrays.asList( + getIntColumnMeta(eight, 177L), + getDoubleColumnMeta(doubleStats, 177L)))); + + assertFalse(canDrop(predForMissingColumn, Arrays.asList( + getIntColumnMeta(neither, 177L), + getDoubleColumnMeta(doubleStats, 177L)))); + + assertFalse(canDrop(invPredForMissingColumn, Arrays.asList( + getIntColumnMeta(seven, 177L), + getDoubleColumnMeta(doubleStats, 177L)))); + + assertFalse(canDrop(invPredForMissingColumn, Arrays.asList( + getIntColumnMeta(eight, 177L), + getDoubleColumnMeta(doubleStats, 177L)))); + + assertFalse(canDrop(invPredForMissingColumn, Arrays.asList( + getIntColumnMeta(neither, 177L), + getDoubleColumnMeta(doubleStats, 177L)))); } @Test From d6be37d59e63444d1acaafd277146c3e231ed2dd Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Thu, 8 Dec 2016 03:11:51 +0000 Subject: [PATCH 2/2] Address comment. --- .../statisticslevel/StatisticsFilter.java | 19 ++++-- .../statisticslevel/TestStatisticsFilter.java | 62 ++++++++++++++++--- 2 files changed, 66 insertions(+), 15 deletions(-) diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/filter2/statisticslevel/StatisticsFilter.java b/parquet-hadoop/src/main/java/org/apache/parquet/filter2/statisticslevel/StatisticsFilter.java index 05adc37001..ac7132e74e 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/filter2/statisticslevel/StatisticsFilter.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/filter2/statisticslevel/StatisticsFilter.java @@ -327,13 +327,18 @@ public Boolean visit(Not not) { private , U extends UserDefinedPredicate> Boolean visit(UserDefined ud, boolean inverted) { Column filterColumn = ud.getColumn(); ColumnChunkMetaData columnChunk = getColumnChunk(filterColumn.getColumnPath()); + U udp = ud.getUserDefinedPredicate(); if (columnChunk == null) { // the column isn't in this file so all values are null. - return BLOCK_MIGHT_MATCH; + // lets run the udp with null value to see if it keeps null or not. + if (inverted) { + return udp.keep(null); + } else { + return !udp.keep(null); + } } - U udp = ud.getUserDefinedPredicate(); Statistics stats = columnChunk.getStatistics(); if (stats.isEmpty()) { @@ -342,10 +347,12 @@ private , U extends UserDefinedPredicate> Boolean vis } if (isAllNulls(columnChunk)) { - // there is no min max, there is nothing - // else we can say about this chunk, we - // cannot drop it. - return BLOCK_MIGHT_MATCH; + // lets run the udp with null value to see if it keeps null or not. + if (inverted) { + return udp.keep(null); + } else { + return !udp.keep(null); + } } org.apache.parquet.filter2.predicate.Statistics udpStats = diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/filter2/statisticslevel/TestStatisticsFilter.java b/parquet-hadoop/src/test/java/org/apache/parquet/filter2/statisticslevel/TestStatisticsFilter.java index 4d0d55a83b..d8b4407914 100644 --- a/parquet-hadoop/src/test/java/org/apache/parquet/filter2/statisticslevel/TestStatisticsFilter.java +++ b/parquet-hadoop/src/test/java/org/apache/parquet/filter2/statisticslevel/TestStatisticsFilter.java @@ -270,7 +270,10 @@ public static class SevensAndEightsUdp extends UserDefinedPredicate { @Override public boolean keep(Integer value) { - throw new RuntimeException("this method should not be called"); + if (value == null) { + return true; + } + throw new RuntimeException("this method should not be called with value != null"); } @Override @@ -284,13 +287,26 @@ public boolean inverseCanDrop(Statistics statistics) { } } + public static class DropNullUdp extends SevensAndEightsUdp { + @Override + public boolean keep(Integer value) { + if (value == null) { + return false; + } + throw new RuntimeException("this method should not be called with value != null"); + } + } + @Test public void testUdp() { FilterPredicate pred = userDefined(intColumn, SevensAndEightsUdp.class); FilterPredicate invPred = LogicalInverseRewriter.rewrite(not(userDefined(intColumn, SevensAndEightsUdp.class))); - FilterPredicate predForMissingColumn = userDefined(missingColumn2, SevensAndEightsUdp.class); - FilterPredicate invPredForMissingColumn = LogicalInverseRewriter.rewrite(not(userDefined(missingColumn2, SevensAndEightsUdp.class))); + FilterPredicate udpDropMissingColumn = userDefined(missingColumn2, DropNullUdp.class); + FilterPredicate invUdpDropMissingColumn = LogicalInverseRewriter.rewrite(not(userDefined(missingColumn2, DropNullUdp.class))); + + FilterPredicate udpKeepMissingColumn = userDefined(missingColumn2, SevensAndEightsUdp.class); + FilterPredicate invUdpKeepMissingColumn = LogicalInverseRewriter.rewrite(not(userDefined(missingColumn2, SevensAndEightsUdp.class))); IntStatistics seven = new IntStatistics(); seven.setMinMax(7, 7); @@ -325,27 +341,55 @@ public void testUdp() { getIntColumnMeta(neither, 177L), getDoubleColumnMeta(doubleStats, 177L)))); - assertFalse(canDrop(predForMissingColumn, Arrays.asList( + // udpDropMissingColumn drops null column. + assertTrue(canDrop(udpDropMissingColumn, Arrays.asList( + getIntColumnMeta(seven, 177L), + getDoubleColumnMeta(doubleStats, 177L)))); + + assertTrue(canDrop(udpDropMissingColumn, Arrays.asList( + getIntColumnMeta(eight, 177L), + getDoubleColumnMeta(doubleStats, 177L)))); + + assertTrue(canDrop(udpDropMissingColumn, Arrays.asList( + getIntColumnMeta(neither, 177L), + getDoubleColumnMeta(doubleStats, 177L)))); + + // invUdpDropMissingColumn (i.e., not(udpDropMissingColumn)) keeps null column. + assertFalse(canDrop(invUdpDropMissingColumn, Arrays.asList( + getIntColumnMeta(seven, 177L), + getDoubleColumnMeta(doubleStats, 177L)))); + + assertFalse(canDrop(invUdpDropMissingColumn, Arrays.asList( + getIntColumnMeta(eight, 177L), + getDoubleColumnMeta(doubleStats, 177L)))); + + assertFalse(canDrop(invUdpDropMissingColumn, Arrays.asList( + getIntColumnMeta(neither, 177L), + getDoubleColumnMeta(doubleStats, 177L)))); + + // udpKeepMissingColumn keeps null column. + assertFalse(canDrop(udpKeepMissingColumn, Arrays.asList( getIntColumnMeta(seven, 177L), getDoubleColumnMeta(doubleStats, 177L)))); - assertFalse(canDrop(predForMissingColumn, Arrays.asList( + assertFalse(canDrop(udpKeepMissingColumn, Arrays.asList( getIntColumnMeta(eight, 177L), getDoubleColumnMeta(doubleStats, 177L)))); - assertFalse(canDrop(predForMissingColumn, Arrays.asList( + assertFalse(canDrop(udpKeepMissingColumn, Arrays.asList( getIntColumnMeta(neither, 177L), getDoubleColumnMeta(doubleStats, 177L)))); - assertFalse(canDrop(invPredForMissingColumn, Arrays.asList( + // invUdpKeepMissingColumn (i.e., not(udpKeepMissingColumn)) drops null column. + assertTrue(canDrop(invUdpKeepMissingColumn, Arrays.asList( getIntColumnMeta(seven, 177L), getDoubleColumnMeta(doubleStats, 177L)))); - assertFalse(canDrop(invPredForMissingColumn, Arrays.asList( + assertTrue(canDrop(invUdpKeepMissingColumn, Arrays.asList( getIntColumnMeta(eight, 177L), getDoubleColumnMeta(doubleStats, 177L)))); - assertFalse(canDrop(invPredForMissingColumn, Arrays.asList( + assertTrue(canDrop(invUdpKeepMissingColumn, Arrays.asList( getIntColumnMeta(neither, 177L), getDoubleColumnMeta(doubleStats, 177L)))); }