From b1420899e4eaa04bce10ef4b8831fc96b56636d0 Mon Sep 17 00:00:00 2001
From: "Piotr P. Karwasz" <piotr@github.copernik.eu>
Date: Sun, 23 Mar 2025 11:37:29 +0100
Subject: [PATCH 1/3] feat: Improve benchmark (#222)

Fixes a bug in the benchmark initialization and adds a `toLowerCase` benchmark.
---
 .../packageurl/utils/StringUtilBenchmark.java | 37 ++++++++++++-------
 1 file changed, 23 insertions(+), 14 deletions(-)

diff --git a/src/test/java/com/github/packageurl/utils/StringUtilBenchmark.java b/src/test/java/com/github/packageurl/utils/StringUtilBenchmark.java
index e811abe..e0bc455 100644
--- a/src/test/java/com/github/packageurl/utils/StringUtilBenchmark.java
+++ b/src/test/java/com/github/packageurl/utils/StringUtilBenchmark.java
@@ -21,7 +21,9 @@
  */
 package com.github.packageurl.utils;
 
+import com.github.packageurl.PackageURL;
 import java.nio.charset.StandardCharsets;
+import java.util.Locale;
 import java.util.Random;
 import java.util.concurrent.TimeUnit;
 import org.openjdk.jmh.annotations.Benchmark;
@@ -30,7 +32,6 @@
 import org.openjdk.jmh.annotations.OutputTimeUnit;
 import org.openjdk.jmh.annotations.Param;
 import org.openjdk.jmh.annotations.Scope;
-import org.openjdk.jmh.annotations.Setup;
 import org.openjdk.jmh.annotations.State;
 import org.openjdk.jmh.infra.Blackhole;
 
@@ -62,14 +63,8 @@ public class StringUtilBenchmark {
     @Param({"0", "0.1", "0.5"})
     private double nonAsciiProb;
 
-    private String[] decodedData = createDecodedData();
-    private String[] encodedData = encodeData(decodedData);
-
-    @Setup
-    public void setup() {
-        decodedData = createDecodedData();
-        encodedData = encodeData(encodedData);
-    }
+    private final String[] decodedData = createDecodedData();
+    private final String[] encodedData = encodeData(decodedData);
 
     private String[] createDecodedData() {
         Random random = new Random();
@@ -92,6 +87,9 @@ private static String[] encodeData(String[] decodedData) {
         String[] encodedData = new String[decodedData.length];
         for (int i = 0; i < decodedData.length; i++) {
             encodedData[i] = StringUtil.percentEncode(decodedData[i]);
+            if (!StringUtil.percentDecode(encodedData[i]).equals(decodedData[i])) {
+                throw new RuntimeException("Invalid implementation of `percentEncode` and `percentDecode`.");
+            }
         }
         return encodedData;
     }
@@ -100,17 +98,28 @@ private static String[] encodeData(String[] decodedData) {
     public void baseline(Blackhole blackhole) {
         for (int i = 0; i < DATA_COUNT; i++) {
             byte[] buffer = decodedData[i].getBytes(StandardCharsets.UTF_8);
-            // Change the String a little bit
+            // Prevent JIT compiler from assuming the buffer was not modified
             for (int idx = 0; idx < buffer.length; idx++) {
-                byte b = buffer[idx];
-                if ('a' <= b && b <= 'z') {
-                    buffer[idx] = (byte) (b & 0x20);
-                }
+                buffer[idx] ^= 0x20;
             }
             blackhole.consume(new String(buffer, StandardCharsets.UTF_8));
         }
     }
 
+    @Benchmark
+    public void toLowerCaseJre(Blackhole blackhole) {
+        for (int i = 0; i < DATA_COUNT; i++) {
+            blackhole.consume(decodedData[i].toLowerCase(Locale.ROOT));
+        }
+    }
+
+    @Benchmark
+    public void toLowerCase(Blackhole blackhole) {
+        for (int i = 0; i < DATA_COUNT; i++) {
+            blackhole.consume(StringUtil.toLowerCase(decodedData[i]));
+        }
+    }
+
     @Benchmark
     public void percentDecode(final Blackhole blackhole) {
         for (int i = 0; i < DATA_COUNT; i++) {

From 377b411845d8380c247b970b5021f3791d9433ed Mon Sep 17 00:00:00 2001
From: "Piotr P. Karwasz" <piotr@github.copernik.eu>
Date: Sun, 23 Mar 2025 19:08:13 +0100
Subject: [PATCH 2/3] fix: Benchmark initialization

The benchmark data **must** be initialized in a `@Setup` method: JMH injects `@Param` values only after the state object has been constructed, so field initializers always see `nonAsciiProb` as `0.0`.
---
 .../packageurl/utils/StringUtilBenchmark.java   | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/src/test/java/com/github/packageurl/utils/StringUtilBenchmark.java b/src/test/java/com/github/packageurl/utils/StringUtilBenchmark.java
index e0bc455..51230b1 100644
--- a/src/test/java/com/github/packageurl/utils/StringUtilBenchmark.java
+++ b/src/test/java/com/github/packageurl/utils/StringUtilBenchmark.java
@@ -21,7 +21,6 @@
  */
 package com.github.packageurl.utils;
 
-import com.github.packageurl.PackageURL;
 import java.nio.charset.StandardCharsets;
 import java.util.Locale;
 import java.util.Random;
@@ -32,6 +31,7 @@
 import org.openjdk.jmh.annotations.OutputTimeUnit;
 import org.openjdk.jmh.annotations.Param;
 import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.Setup;
 import org.openjdk.jmh.annotations.State;
 import org.openjdk.jmh.infra.Blackhole;
 
@@ -63,8 +63,14 @@ public class StringUtilBenchmark {
     @Param({"0", "0.1", "0.5"})
     private double nonAsciiProb;
 
-    private final String[] decodedData = createDecodedData();
-    private final String[] encodedData = encodeData(decodedData);
+    private String[] decodedData;
+    private String[] encodedData;
+
+    @Setup
+    public void setup() {
+        decodedData = createDecodedData();
+        encodedData = encodeData(decodedData);
+    }
 
     private String[] createDecodedData() {
         Random random = new Random();
@@ -88,7 +94,10 @@ private static String[] encodeData(String[] decodedData) {
         for (int i = 0; i < decodedData.length; i++) {
             encodedData[i] = StringUtil.percentEncode(decodedData[i]);
             if (!StringUtil.percentDecode(encodedData[i]).equals(decodedData[i])) {
-                throw new RuntimeException("Invalid implementation of `percentEncode` and `percentDecode`.");
+                throw new RuntimeException(
+                        "Invalid implementation of `percentEncode` and `percentDecode`.\nOriginal data: "
+                                + decodedData[i] + "\nEncoded and decoded data: "
+                                + StringUtil.percentDecode(encodedData[i]));
             }
         }
         return encodedData;

From d7066f136ed7c8d95aa782d975849e26418763cb Mon Sep 17 00:00:00 2001
From: "Piotr P. Karwasz" <piotr@github.copernik.eu>
Date: Sun, 23 Mar 2025 19:25:28 +0100
Subject: [PATCH 3/3] fix: Improve encoding/decoding performance for ASCII
 strings

Since strings that don't require **any** percent encoding are in practice the rule, the encoding/decoding code should be optimized for this case.
---
 .../github/packageurl/utils/StringUtil.java   | 137 ++++++++++--------
 1 file changed, 74 insertions(+), 63 deletions(-)

diff --git a/src/main/java/com/github/packageurl/utils/StringUtil.java b/src/main/java/com/github/packageurl/utils/StringUtil.java
index 39f2bdd..7f04e0c 100644
--- a/src/main/java/com/github/packageurl/utils/StringUtil.java
+++ b/src/main/java/com/github/packageurl/utils/StringUtil.java
@@ -21,10 +21,10 @@
  */
 package com.github.packageurl.utils;
 
+import static java.lang.Byte.toUnsignedInt;
+
 import com.github.packageurl.ValidationException;
-import java.nio.ByteBuffer;
 import java.nio.charset.StandardCharsets;
-import java.util.stream.IntStream;
 
 /**
  * String utility for validation and encoding.
@@ -35,6 +35,24 @@ public final class StringUtil {
 
     private static final byte PERCENT_CHAR = '%';
 
+    private static final boolean[] UNRESERVED_CHARS = new boolean[128];
+
+    static {
+        for (char c = '0'; c <= '9'; c++) {
+            UNRESERVED_CHARS[c] = true;
+        }
+        for (char c = 'A'; c <= 'Z'; c++) {
+            UNRESERVED_CHARS[c] = true;
+        }
+        for (char c = 'a'; c <= 'z'; c++) {
+            UNRESERVED_CHARS[c] = true;
+        }
+        UNRESERVED_CHARS['-'] = true;
+        UNRESERVED_CHARS['.'] = true;
+        UNRESERVED_CHARS['_'] = true;
+        UNRESERVED_CHARS['~'] = true;
+    }
+
     private StringUtil() {
         throw new AssertionError("Cannot instantiate StringUtil");
     }
@@ -48,10 +66,6 @@ private StringUtil() {
      * @since 2.0.0
      */
     public static String toLowerCase(String s) {
-        if (s == null) {
-            return null;
-        }
-
         int pos = indexOfFirstUpperCaseChar(s);
 
         if (pos == -1) {
@@ -59,10 +73,9 @@ public static String toLowerCase(String s) {
         }
 
         char[] chars = s.toCharArray();
-        int length = chars.length;
 
-        for (int i = pos; i < length; i++) {
-            chars[i] = (char) toLowerCase(chars[i]);
+        for (int length = chars.length; pos < length; pos++) {
+            chars[pos] = (char) toLowerCase(chars[pos]);
         }
 
         return new String(chars);
@@ -77,26 +90,22 @@ public static String toLowerCase(String s) {
      * @since 2.0.0
      */
     public static String percentDecode(final String source) {
-        if (source == null || source.isEmpty()) {
+        if (source.indexOf(PERCENT_CHAR) == -1) {
             return source;
         }
 
         byte[] bytes = source.getBytes(StandardCharsets.UTF_8);
-        int i = indexOfFirstPercentChar(bytes);
-
-        if (i == -1) {
-            return source;
-        }
 
+        int readPos = indexOfFirstPercentChar(bytes);
+        int writePos = readPos;
         int length = bytes.length;
-        int writePos = i;
-        while (i < length) {
-            byte b = bytes[i];
+        while (readPos < length) {
+            byte b = bytes[readPos];
             if (b == PERCENT_CHAR) {
-                bytes[writePos++] = percentDecode(bytes, i++);
-                i += 2;
+                bytes[writePos++] = percentDecode(bytes, readPos++);
+                readPos += 2;
             } else {
-                bytes[writePos++] = bytes[i++];
+                bytes[writePos++] = bytes[readPos++];
             }
         }
 
@@ -112,34 +121,29 @@ public static String percentDecode(final String source) {
      * @since 2.0.0
      */
     public static String percentEncode(final String source) {
-        if (source == null || source.isEmpty()) {
-            return source;
-        }
-        byte[] bytes = source.getBytes(StandardCharsets.UTF_8);
-        int start = indexOfFirstNonAsciiChar(bytes);
-        if (start == -1) {
+        if (!shouldEncode(source)) {
             return source;
         }
-        int length = bytes.length;
-        ByteBuffer buffer = ByteBuffer.allocate(start + ((length - start) * 3));
-        if (start != 0) {
-            buffer.put(bytes, 0, start);
-        }
 
-        for (int i = start; i < length; i++) {
-            byte b = bytes[i];
-            if (shouldEncode(b)) {
-                byte b1 = (byte) Character.toUpperCase(Character.forDigit((b >> 4) & 0xF, 16));
-                byte b2 = (byte) Character.toUpperCase(Character.forDigit(b & 0xF, 16));
-                buffer.put(PERCENT_CHAR);
-                buffer.put(b1);
-                buffer.put(b2);
+        byte[] src = source.getBytes(StandardCharsets.UTF_8);
+        byte[] dest = new byte[3 * src.length];
+
+        int writePos = 0;
+        for (byte b : src) {
+            if (shouldEncode(toUnsignedInt(b))) {
+                dest[writePos++] = PERCENT_CHAR;
+                dest[writePos++] = toHexDigit(b >> 4);
+                dest[writePos++] = toHexDigit(b);
             } else {
-                buffer.put(b);
+                dest[writePos++] = b;
             }
         }
 
-        return new String(buffer.array(), 0, buffer.position(), StandardCharsets.UTF_8);
+        return new String(dest, 0, writePos, StandardCharsets.UTF_8);
+    }
+
+    private static byte toHexDigit(int b) {
+        return (byte) Character.toUpperCase(Character.forDigit(b & 0xF, 16));
     }
 
     /**
@@ -178,14 +182,34 @@ public static boolean isValidCharForKey(int c) {
         return (isAlphaNumeric(c) || c == '.' || c == '_' || c == '-');
     }
 
+    /**
+     * Returns {@code true} if the character is in the unreserved RFC 3986 set.
+     * <p>
+     *     <strong>Warning</strong>: Profiling shows that the performance of {@link #percentEncode} relies heavily on this method.
+     *     Modify with care.
+     * </p>
+     * @param c non-negative integer; a negative value would index {@code UNRESERVED_CHARS} out of bounds.
+     */
     private static boolean isUnreserved(int c) {
-        return (isValidCharForKey(c) || c == '~');
+        return c < 128 && UNRESERVED_CHARS[c];
     }
 
+    /**
+     * @param c non-negative integer
+     */
     private static boolean shouldEncode(int c) {
         return !isUnreserved(c);
     }
 
+    private static boolean shouldEncode(String s) {
+        for (int i = 0, length = s.length(); i < length; i++) {
+            if (shouldEncode(s.charAt(i))) {
+                return true;
+            }
+        }
+        return false;
+    }
+
     private static boolean isAlpha(int c) {
         return (isLowerCase(c) || isUpperCase(c));
     }
@@ -195,7 +219,7 @@ private static boolean isAlphaNumeric(int c) {
     }
 
     private static boolean isUpperCase(int c) {
-        return (c >= 'A' && c <= 'Z');
+        return 'A' <= c && c <= 'Z';
     }
 
     private static boolean isLowerCase(int c) {
@@ -207,34 +231,21 @@ private static int toLowerCase(int c) {
     }
 
     private static int indexOfFirstUpperCaseChar(String s) {
-        int length = s.length();
-
-        for (int i = 0; i < length; i++) {
+        for (int i = 0, length = s.length(); i < length; i++) {
             if (isUpperCase(s.charAt(i))) {
                 return i;
             }
         }
-
         return -1;
     }
 
-    private static int indexOfFirstNonAsciiChar(byte[] bytes) {
-        int length = bytes.length;
-        int start = -1;
-        for (int i = 0; i < length; i++) {
-            if (shouldEncode(bytes[i])) {
-                start = i;
-                break;
+    private static int indexOfFirstPercentChar(final byte[] bytes) {
+        for (int i = 0, length = bytes.length; i < length; i++) {
+            if (bytes[i] == PERCENT_CHAR) {
+                return i;
             }
         }
-        return start;
-    }
-
-    private static int indexOfFirstPercentChar(final byte[] bytes) {
-        return IntStream.range(0, bytes.length)
-                .filter(i -> bytes[i] == PERCENT_CHAR)
-                .findFirst()
-                .orElse(-1);
+        return -1;
     }
 
     private static byte percentDecode(final byte[] bytes, final int start) {