From d26788a758b604a0b40b7702a6128c69a1ccfd01 Mon Sep 17 00:00:00 2001
From: Vinitha Gankidi <vgankidi@netflix.com>
Date: Tue, 30 Jul 2019 19:18:59 -0700
Subject: [PATCH] Fix truncateStringMax in UnicodeUtil: the index passed to
 codePointAt must be the char offset computed from the code point count

---
 .../java/org/apache/iceberg/util/UnicodeUtil.java     |  6 +++---
 .../iceberg/parquet/TestParquetMetricsTruncation.java | 11 +++++++++++
 2 files changed, 14 insertions(+), 3 deletions(-)
diff --git a/api/src/main/java/org/apache/iceberg/util/UnicodeUtil.java b/api/src/main/java/org/apache/iceberg/util/UnicodeUtil.java
index 1eaed21df6d2..f76ec73429d7 100644
--- a/api/src/main/java/org/apache/iceberg/util/UnicodeUtil.java
+++ b/api/src/main/java/org/apache/iceberg/util/UnicodeUtil.java
@@ -79,11 +79,11 @@ public static Literal<CharSequence> truncateStringMax(Literal<CharSequence> inpu
 
     // Try incrementing the code points from the end
     for (int i = length - 1; i >= 0; i--) {
-      int nextCodePoint = truncatedStringBuffer.codePointAt(i) + 1;
+      // Get the offset in the truncated string buffer where the number of unicode characters = i
+      int offsetByCodePoint = truncatedStringBuffer.offsetByCodePoints(0, i);
+      int nextCodePoint = truncatedStringBuffer.codePointAt(offsetByCodePoint) + 1;
       // No overflow
       if (nextCodePoint != 0 && Character.isValidCodePoint(nextCodePoint)) {
-        // Get the offset in the truncated string buffer where the number of unicode characters = i
-        int offsetByCodePoint = truncatedStringBuffer.offsetByCodePoints(0, i);
         truncatedStringBuffer.setLength(offsetByCodePoint);
         // Append next code point to the truncated substring
         truncatedStringBuffer.appendCodePoint(nextCodePoint);
diff --git a/parquet/src/test/java/org/apache/iceberg/parquet/TestParquetMetricsTruncation.java b/parquet/src/test/java/org/apache/iceberg/parquet/TestParquetMetricsTruncation.java
index a887e19cff53..d38609b8fefa 100644
--- a/parquet/src/test/java/org/apache/iceberg/parquet/TestParquetMetricsTruncation.java
+++ b/parquet/src/test/java/org/apache/iceberg/parquet/TestParquetMetricsTruncation.java
@@ -138,6 +138,9 @@ public void testTruncateStringMax() throws IOException {
     String test6 = "\uD800\uDFFF\uD800\uDFFF";
     // Increment the previous character
     String test6_2_expected = "\uD801\uDC00";
+    String test7 = "\uD83D\uDE02\uD83D\uDE02\uD83D\uDE02";
+    String test7_2_expected = "\uD83D\uDE02\uD83D\uDE03";
+    String test7_1_expected = "\uD83D\uDE03";
 
     Comparator<CharSequence> cmp = Literal.of(test1).comparator();
     Assert.assertTrue("Truncated upper bound should be greater than or equal to the actual upper bound",
@@ -175,5 +178,13 @@ public void testTruncateStringMax() throws IOException {
     Assert.assertTrue("Test 4 byte UTF-8 character increment. Output must have one character with " +
         "the first character incremented", cmp.compare(
         truncateStringMax(Literal.of(test6), 1).value(), test6_2_expected) == 0);
+    Assert.assertTrue("Truncated upper bound should be greater than or equal to the actual upper bound",
+        cmp.compare(truncateStringMax(Literal.of(test7), 2).value(), test7) >= 0);
+    Assert.assertTrue("Test input with multiple 4 byte UTF-8 character where the second unicode " +
+        "character should be incremented", cmp.compare(
+            truncateStringMax(Literal.of(test7), 2).value(), test7_2_expected) == 0);
+    Assert.assertTrue("Test input with multiple 4 byte UTF-8 character where the first unicode " +
+        "character should be incremented", cmp.compare(
+            truncateStringMax(Literal.of(test7), 1).value(), test7_1_expected) == 0);
   }
 }