From d26788a758b604a0b40b7702a6128c69a1ccfd01 Mon Sep 17 00:00:00 2001 From: Vinitha Gankidi Date: Tue, 30 Jul 2019 19:18:59 -0700 Subject: [PATCH] Fix truncateStringMax in UnicodeUtil. Index to codePointAt should be the offset calculated by code points --- .../java/org/apache/iceberg/util/UnicodeUtil.java | 6 +++--- .../iceberg/parquet/TestParquetMetricsTruncation.java | 11 +++++++++++ 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/api/src/main/java/org/apache/iceberg/util/UnicodeUtil.java b/api/src/main/java/org/apache/iceberg/util/UnicodeUtil.java index 1eaed21df6d2..f76ec73429d7 100644 --- a/api/src/main/java/org/apache/iceberg/util/UnicodeUtil.java +++ b/api/src/main/java/org/apache/iceberg/util/UnicodeUtil.java @@ -79,11 +79,11 @@ public static Literal truncateStringMax(Literal inpu // Try incrementing the code points from the end for (int i = length - 1; i >= 0; i--) { - int nextCodePoint = truncatedStringBuffer.codePointAt(i) + 1; + // Get the offset in the truncated string buffer where the number of unicode characters = i + int offsetByCodePoint = truncatedStringBuffer.offsetByCodePoints(0, i); + int nextCodePoint = truncatedStringBuffer.codePointAt(offsetByCodePoint) + 1; // No overflow if (nextCodePoint != 0 && Character.isValidCodePoint(nextCodePoint)) { - // Get the offset in the truncated string buffer where the number of unicode characters = i - int offsetByCodePoint = truncatedStringBuffer.offsetByCodePoints(0, i); truncatedStringBuffer.setLength(offsetByCodePoint); // Append next code point to the truncated substring truncatedStringBuffer.appendCodePoint(nextCodePoint); diff --git a/parquet/src/test/java/org/apache/iceberg/parquet/TestParquetMetricsTruncation.java b/parquet/src/test/java/org/apache/iceberg/parquet/TestParquetMetricsTruncation.java index a887e19cff53..d38609b8fefa 100644 --- a/parquet/src/test/java/org/apache/iceberg/parquet/TestParquetMetricsTruncation.java +++ b/parquet/src/test/java/org/apache/iceberg/parquet/TestParquetMetricsTruncation.java @@ -138,6 +138,9 @@ public void testTruncateStringMax() throws IOException { String test6 = "\uD800\uDFFF\uD800\uDFFF"; // Increment the previous character String test6_2_expected = "\uD801\uDC00"; + String test7 = "\uD83D\uDE02\uD83D\uDE02\uD83D\uDE02"; + String test7_2_expected = "\uD83D\uDE02\uD83D\uDE03"; + String test7_1_expected = "\uD83D\uDE03"; Comparator cmp = Literal.of(test1).comparator(); Assert.assertTrue("Truncated upper bound should be greater than or equal to the actual upper bound", @@ -175,5 +178,13 @@ public void testTruncateStringMax() throws IOException { Assert.assertTrue("Test 4 byte UTF-8 character increment. Output must have one character with " + "the first character incremented", cmp.compare( truncateStringMax(Literal.of(test6), 1).value(), test6_2_expected) == 0); + Assert.assertTrue("Truncated upper bound should be greater than or equal to the actual upper bound", + cmp.compare(truncateStringMax(Literal.of(test7), 2).value(), test7) >= 0); + Assert.assertTrue("Test input with multiple 4 byte UTF-8 character where the second unicode " + + "character should be incremented", cmp.compare( + truncateStringMax(Literal.of(test7), 2).value(), test7_2_expected) == 0); + Assert.assertTrue("Test input with multiple 4 byte UTF-8 character where the first unicode " + + "character should be incremented", cmp.compare( + truncateStringMax(Literal.of(test7), 1).value(), test7_1_expected) == 0); } }