Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions api/src/main/java/org/apache/iceberg/util/UnicodeUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -79,11 +79,11 @@ public static Literal<CharSequence> truncateStringMax(Literal<CharSequence> inpu

// Try incrementing the code points from the end
for (int i = length - 1; i >= 0; i--) {
- int nextCodePoint = truncatedStringBuffer.codePointAt(i) + 1;
+ // Get the offset in the truncated string buffer where the number of unicode characters = i
+ int offsetByCodePoint = truncatedStringBuffer.offsetByCodePoints(0, i);
+ int nextCodePoint = truncatedStringBuffer.codePointAt(offsetByCodePoint) + 1;
// No overflow
if (nextCodePoint != 0 && Character.isValidCodePoint(nextCodePoint)) {
- // Get the offset in the truncated string buffer where the number of unicode characters = i
- int offsetByCodePoint = truncatedStringBuffer.offsetByCodePoints(0, i);
truncatedStringBuffer.setLength(offsetByCodePoint);
// Append next code point to the truncated substring
truncatedStringBuffer.appendCodePoint(nextCodePoint);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,9 @@ public void testTruncateStringMax() throws IOException {
String test6 = "\uD800\uDFFF\uD800\uDFFF";
// Increment the previous character
String test6_2_expected = "\uD801\uDC00";
+ String test7 = "\uD83D\uDE02\uD83D\uDE02\uD83D\uDE02";
+ String test7_2_expected = "\uD83D\uDE02\uD83D\uDE03";
+ String test7_1_expected = "\uD83D\uDE03";

Comparator<CharSequence> cmp = Literal.of(test1).comparator();
Assert.assertTrue("Truncated upper bound should be greater than or equal to the actual upper bound",
Expand Down Expand Up @@ -175,5 +178,13 @@ public void testTruncateStringMax() throws IOException {
Assert.assertTrue("Test 4 byte UTF-8 character increment. Output must have one character with " +
"the first character incremented", cmp.compare(
truncateStringMax(Literal.of(test6), 1).value(), test6_2_expected) == 0);
+ Assert.assertTrue("Truncated upper bound should be greater than or equal to the actual upper bound",
+ cmp.compare(truncateStringMax(Literal.of(test7), 2).value(), test7) >= 0);
+ Assert.assertTrue("Test input with multiple 4 byte UTF-8 character where the second unicode " +
+ "character should be incremented", cmp.compare(
+ truncateStringMax(Literal.of(test7), 2).value(), test7_2_expected) == 0);
+ Assert.assertTrue("Test input with multiple 4 byte UTF-8 character where the first unicode " +
+ "character should be incremented", cmp.compare(
+ truncateStringMax(Literal.of(test7), 1).value(), test7_1_expected) == 0);
}
}