From e2a28a82e73a051e5c0a9e8b196186fa42a71897 Mon Sep 17 00:00:00 2001
From: Gian Merlino <gianmerlino@gmail.com>
Date: Wed, 27 Mar 2024 10:53:36 -0700
Subject: [PATCH] JSONFlattenerMaker: Speed up charsetFix.

JSON parsing has this function "charsetFix" that fixes up strings
so they can round-trip through UTF-8 encoding without loss of
fidelity. It was originally introduced to fix a bug where strings
could be sorted, encoded, then decoded, and the resulting decoded
strings could end up no longer in sorted order (due to character
swaps during the encode operation).

The code has been in place for some time, and only applies to JSON.
I am not sure if it needs to apply to other formats; it's certainly
more difficult to get broken strings from other formats. It's easy
in JSON because you can write a JSON string like "foo\uD900".

At any rate, this patch does not revisit whether charsetFix should
be applied to all formats. It merely optimizes it for the JSON case.
The function works by using CharsetEncoder.canEncode, which is
a relatively slow method (just as expensive as actually encoding).
This patch adds a short-circuit to skip canEncode if all chars in
a string are in the basic multilingual plane (i.e. if no chars are
surrogates).
---
 .../common/parsers/JSONFlattenerMaker.java    | 44 ++++++++++++++++---
 .../parsers/JSONFlattenerMakerTest.java       | 15 +++++++
 2 files changed, 54 insertions(+), 5 deletions(-)

diff --git a/processing/src/main/java/org/apache/druid/java/util/common/parsers/JSONFlattenerMaker.java b/processing/src/main/java/org/apache/druid/java/util/common/parsers/JSONFlattenerMaker.java
index 5df98199080f..84490b19fce0 100644
--- a/processing/src/main/java/org/apache/druid/java/util/common/parsers/JSONFlattenerMaker.java
+++ b/processing/src/main/java/org/apache/druid/java/util/common/parsers/JSONFlattenerMaker.java
@@ -203,19 +203,53 @@ public static Object convertJsonNode(JsonNode val, CharsetEncoder enc)
     return null;
   }
 
+  /**
+   * Fix up a string so it can round-trip through UTF-8 encoding without loss of fidelity. This comes up when a string
+   * has surrogates (e.g. \uD900) that appear in invalid positions, such as alone rather than as part of a
+   * surrogate pair.
+   *
+   * This operation is useful because when a string cannot round-trip properly, it can cause it to sort differently
+   * relative to other strings after an encode/decode round-trip. This causes incorrect ordering in cases where strings
+   * are sorted, then encoded, then decoded again.
+   *
+   * @param s   string, or null
+   * @param enc UTF-8 encoder
+   *
+   * @return the original string, or a fixed version that can round-trip properly
+   */
   @Nullable
-  private static String charsetFix(String s, CharsetEncoder enc)
+  static String charsetFix(@Nullable String s, CharsetEncoder enc)
   {
-    if (s != null && !enc.canEncode(s)) {
-      // Some whacky characters are in this string (e.g. \uD900). These are problematic because they are decodeable
-      // by new String(...) but will not encode into the same character. This dance here will replace these
-      // characters with something more sane.
+    if (s != null && !isBmp(s) && !enc.canEncode(s)) {
+      // Note: the check isBmp isn't necessary for correct behavior, but it improves performance in the common case
+      // where all characters are in BMP. It short-circuits "enc.canEncode", which is a slow operation. The short
+      // circuit is valid because if every char in a string is in BMP, it is definitely encodable as UTF-8.
+
+      // This dance here will replace the original string with one that can be round-tripped.
       return StringUtils.fromUtf8(StringUtils.toUtf8(s));
     } else {
       return s;
     }
   }
 
+  /**
+   * Returns whether every character in a string is in BMP (basic multilingual plane).
+   */
+  private static boolean isBmp(String s)
+  {
+    for (int i = 0; i < s.length(); i++) {
+      final char c = s.charAt(i);
+
+      // All 16-bit code units are either valid code points, or surrogates. So if this char is not a surrogate,
+      // it must represent a code point in BMP.
+      if (Character.isSurrogate(c)) {
+        return false;
+      }
+    }
+
+    return true;
+  }
+
   private static boolean isFlatList(JsonNode list)
   {
     for (JsonNode obj : list) {
diff --git a/processing/src/test/java/org/apache/druid/java/util/common/parsers/JSONFlattenerMakerTest.java b/processing/src/test/java/org/apache/druid/java/util/common/parsers/JSONFlattenerMakerTest.java
index 5583081db10e..b471c24b40d1 100644
--- a/processing/src/test/java/org/apache/druid/java/util/common/parsers/JSONFlattenerMakerTest.java
+++ b/processing/src/test/java/org/apache/druid/java/util/common/parsers/JSONFlattenerMakerTest.java
@@ -31,6 +31,8 @@
 import org.junit.Test;
 
 import java.math.BigInteger;
+import java.nio.charset.CharsetEncoder;
+import java.nio.charset.StandardCharsets;
 import java.util.List;
 import java.util.Map;
 
@@ -199,4 +201,17 @@ public void testDiscovery() throws JsonProcessingException
         ImmutableSet.copyOf(FLATTENER_MAKER_NESTED.discoverRootFields(node))
     );
   }
+
+  @Test
+  public void testCharsetFix()
+  {
+    final CharsetEncoder encoder = StandardCharsets.UTF_8.newEncoder();
+    Assert.assertEquals("hello", JSONFlattenerMaker.charsetFix("hello", encoder));
+    Assert.assertEquals("Apache® Druid", JSONFlattenerMaker.charsetFix("Apache® Druid", encoder));
+    Assert.assertEquals("hello?", JSONFlattenerMaker.charsetFix("hello\uD900", encoder));
+    Assert.assertEquals("hello?", JSONFlattenerMaker.charsetFix("hello\uD83D", encoder));
+    Assert.assertEquals("hello?", JSONFlattenerMaker.charsetFix("hello\uDCAF", encoder));
+    Assert.assertEquals("hello💯", JSONFlattenerMaker.charsetFix("hello\uD83D\uDCAF", encoder));
+    Assert.assertEquals("héllö", JSONFlattenerMaker.charsetFix("héllö", encoder));
+  }
 }