From b1420899e4eaa04bce10ef4b8831fc96b56636d0 Mon Sep 17 00:00:00 2001 From: "Piotr P. Karwasz" Date: Sun, 23 Mar 2025 11:37:29 +0100 Subject: [PATCH 1/3] feat: Improve benchmark (#222) Fixes a bug in the benchmark initialization and adds a `toLowerCase` benchmark. --- .../packageurl/utils/StringUtilBenchmark.java | 37 ++++++++++++------- 1 file changed, 23 insertions(+), 14 deletions(-) diff --git a/src/test/java/com/github/packageurl/utils/StringUtilBenchmark.java b/src/test/java/com/github/packageurl/utils/StringUtilBenchmark.java index e811abe..e0bc455 100644 --- a/src/test/java/com/github/packageurl/utils/StringUtilBenchmark.java +++ b/src/test/java/com/github/packageurl/utils/StringUtilBenchmark.java @@ -21,7 +21,9 @@ */ package com.github.packageurl.utils; +import com.github.packageurl.PackageURL; import java.nio.charset.StandardCharsets; +import java.util.Locale; import java.util.Random; import java.util.concurrent.TimeUnit; import org.openjdk.jmh.annotations.Benchmark; @@ -30,7 +32,6 @@ import org.openjdk.jmh.annotations.OutputTimeUnit; import org.openjdk.jmh.annotations.Param; import org.openjdk.jmh.annotations.Scope; -import org.openjdk.jmh.annotations.Setup; import org.openjdk.jmh.annotations.State; import org.openjdk.jmh.infra.Blackhole; @@ -62,14 +63,8 @@ public class StringUtilBenchmark { @Param({"0", "0.1", "0.5"}) private double nonAsciiProb; - private String[] decodedData = createDecodedData(); - private String[] encodedData = encodeData(decodedData); - - @Setup - public void setup() { - decodedData = createDecodedData(); - encodedData = encodeData(encodedData); - } + private final String[] decodedData = createDecodedData(); + private final String[] encodedData = encodeData(decodedData); private String[] createDecodedData() { Random random = new Random(); @@ -92,6 +87,9 @@ private static String[] encodeData(String[] decodedData) { String[] encodedData = new String[decodedData.length]; for (int i = 0; i < decodedData.length; i++) { encodedData[i] = 
StringUtil.percentEncode(decodedData[i]); + if (!StringUtil.percentDecode(encodedData[i]).equals(decodedData[i])) { + throw new RuntimeException("Invalid implementation of `percentEncode` and `percentDecode`."); + } } return encodedData; } @@ -100,17 +98,28 @@ private static String[] encodeData(String[] decodedData) { public void baseline(Blackhole blackhole) { for (int i = 0; i < DATA_COUNT; i++) { byte[] buffer = decodedData[i].getBytes(StandardCharsets.UTF_8); - // Change the String a little bit + // Prevent JIT compiler from assuming the buffer was not modified for (int idx = 0; idx < buffer.length; idx++) { - byte b = buffer[idx]; - if ('a' <= b && b <= 'z') { - buffer[idx] = (byte) (b & 0x20); - } + buffer[idx] ^= 0x20; } blackhole.consume(new String(buffer, StandardCharsets.UTF_8)); } } + @Benchmark + public void toLowerCaseJre(Blackhole blackhole) { + for (int i = 0; i < DATA_COUNT; i++) { + blackhole.consume(decodedData[i].toLowerCase(Locale.ROOT)); + } + } + + @Benchmark + public void toLowerCase(Blackhole blackhole) { + for (int i = 0; i < DATA_COUNT; i++) { + blackhole.consume(StringUtil.toLowerCase(decodedData[i])); + } + } + @Benchmark public void percentDecode(final Blackhole blackhole) { for (int i = 0; i < DATA_COUNT; i++) { From 377b411845d8380c247b970b5021f3791d9433ed Mon Sep 17 00:00:00 2001 From: "Piotr P. Karwasz" Date: Sun, 23 Mar 2025 19:08:13 +0100 Subject: [PATCH 2/3] fix: Benchmark initialization The benchmark **must** be initialized in a `@Setup` method, otherwise `nonAsciiProb` will always be `0.0`. 
--- .../packageurl/utils/StringUtilBenchmark.java | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/src/test/java/com/github/packageurl/utils/StringUtilBenchmark.java b/src/test/java/com/github/packageurl/utils/StringUtilBenchmark.java index e0bc455..51230b1 100644 --- a/src/test/java/com/github/packageurl/utils/StringUtilBenchmark.java +++ b/src/test/java/com/github/packageurl/utils/StringUtilBenchmark.java @@ -21,7 +21,6 @@ */ package com.github.packageurl.utils; -import com.github.packageurl.PackageURL; import java.nio.charset.StandardCharsets; import java.util.Locale; import java.util.Random; @@ -32,6 +31,7 @@ import org.openjdk.jmh.annotations.OutputTimeUnit; import org.openjdk.jmh.annotations.Param; import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.Setup; import org.openjdk.jmh.annotations.State; import org.openjdk.jmh.infra.Blackhole; @@ -63,8 +63,14 @@ public class StringUtilBenchmark { @Param({"0", "0.1", "0.5"}) private double nonAsciiProb; - private final String[] decodedData = createDecodedData(); - private final String[] encodedData = encodeData(decodedData); + private String[] decodedData; + private String[] encodedData; + + @Setup + public void setup() { + decodedData = createDecodedData(); + encodedData = encodeData(decodedData); + } private String[] createDecodedData() { Random random = new Random(); @@ -88,7 +94,10 @@ private static String[] encodeData(String[] decodedData) { for (int i = 0; i < decodedData.length; i++) { encodedData[i] = StringUtil.percentEncode(decodedData[i]); if (!StringUtil.percentDecode(encodedData[i]).equals(decodedData[i])) { - throw new RuntimeException("Invalid implementation of `percentEncode` and `percentDecode`."); + throw new RuntimeException( + "Invalid implementation of `percentEncode` and `percentDecode`.\nOriginal data: " + + decodedData[i] + "\nEncoded and decoded data: " + + StringUtil.percentDecode(encodedData[i])); } } return encodedData; From 
d7066f136ed7c8d95aa782d975849e26418763cb Mon Sep 17 00:00:00 2001 From: "Piotr P. Karwasz" Date: Sun, 23 Mar 2025 19:25:28 +0100 Subject: [PATCH 3/3] fix: Improve encoding/decoding performance for ASCII strings Since strings that don't require **any** percent encoding are in practice the rule, the encoding/decoding code should be optimized for this case. --- .../github/packageurl/utils/StringUtil.java | 137 ++++++++++-------- 1 file changed, 74 insertions(+), 63 deletions(-) diff --git a/src/main/java/com/github/packageurl/utils/StringUtil.java b/src/main/java/com/github/packageurl/utils/StringUtil.java index 39f2bdd..7f04e0c 100644 --- a/src/main/java/com/github/packageurl/utils/StringUtil.java +++ b/src/main/java/com/github/packageurl/utils/StringUtil.java @@ -21,10 +21,10 @@ */ package com.github.packageurl.utils; +import static java.lang.Byte.toUnsignedInt; + import com.github.packageurl.ValidationException; -import java.nio.ByteBuffer; import java.nio.charset.StandardCharsets; -import java.util.stream.IntStream; /** * String utility for validation and encoding. 
@@ -35,6 +35,24 @@ public final class StringUtil { private static final byte PERCENT_CHAR = '%'; + private static final boolean[] UNRESERVED_CHARS = new boolean[128]; + + static { + for (char c = '0'; c <= '9'; c++) { + UNRESERVED_CHARS[c] = true; + } + for (char c = 'A'; c <= 'Z'; c++) { + UNRESERVED_CHARS[c] = true; + } + for (char c = 'a'; c <= 'z'; c++) { + UNRESERVED_CHARS[c] = true; + } + UNRESERVED_CHARS['-'] = true; + UNRESERVED_CHARS['.'] = true; + UNRESERVED_CHARS['_'] = true; + UNRESERVED_CHARS['~'] = true; + } + private StringUtil() { throw new AssertionError("Cannot instantiate StringUtil"); } @@ -48,10 +66,6 @@ private StringUtil() { * @since 2.0.0 */ public static String toLowerCase(String s) { - if (s == null) { - return null; - } - int pos = indexOfFirstUpperCaseChar(s); if (pos == -1) { @@ -59,10 +73,9 @@ public static String toLowerCase(String s) { } char[] chars = s.toCharArray(); - int length = chars.length; - for (int i = pos; i < length; i++) { - chars[i] = (char) toLowerCase(chars[i]); + for (int length = chars.length; pos < length; pos++) { + chars[pos] = (char) toLowerCase(chars[pos]); } return new String(chars); @@ -77,26 +90,22 @@ public static String toLowerCase(String s) { * @since 2.0.0 */ public static String percentDecode(final String source) { - if (source == null || source.isEmpty()) { + if (source.indexOf(PERCENT_CHAR) == -1) { return source; } byte[] bytes = source.getBytes(StandardCharsets.UTF_8); - int i = indexOfFirstPercentChar(bytes); - - if (i == -1) { - return source; - } + int readPos = indexOfFirstPercentChar(bytes); + int writePos = readPos; int length = bytes.length; - int writePos = i; - while (i < length) { - byte b = bytes[i]; + while (readPos < length) { + byte b = bytes[readPos]; if (b == PERCENT_CHAR) { - bytes[writePos++] = percentDecode(bytes, i++); - i += 2; + bytes[writePos++] = percentDecode(bytes, readPos++); + readPos += 2; } else { - bytes[writePos++] = bytes[i++]; + bytes[writePos++] = bytes[readPos++]; 
} } @@ -112,34 +121,29 @@ public static String percentDecode(final String source) { * @since 2.0.0 */ public static String percentEncode(final String source) { - if (source == null || source.isEmpty()) { - return source; - } - byte[] bytes = source.getBytes(StandardCharsets.UTF_8); - int start = indexOfFirstNonAsciiChar(bytes); - if (start == -1) { + if (!shouldEncode(source)) { return source; } - int length = bytes.length; - ByteBuffer buffer = ByteBuffer.allocate(start + ((length - start) * 3)); - if (start != 0) { - buffer.put(bytes, 0, start); - } - for (int i = start; i < length; i++) { - byte b = bytes[i]; - if (shouldEncode(b)) { - byte b1 = (byte) Character.toUpperCase(Character.forDigit((b >> 4) & 0xF, 16)); - byte b2 = (byte) Character.toUpperCase(Character.forDigit(b & 0xF, 16)); - buffer.put(PERCENT_CHAR); - buffer.put(b1); - buffer.put(b2); + byte[] src = source.getBytes(StandardCharsets.UTF_8); + byte[] dest = new byte[3 * src.length]; + + int writePos = 0; + for (byte b : src) { + if (shouldEncode(toUnsignedInt(b))) { + dest[writePos++] = PERCENT_CHAR; + dest[writePos++] = toHexDigit(b >> 4); + dest[writePos++] = toHexDigit(b); } else { - buffer.put(b); + dest[writePos++] = b; } } - return new String(buffer.array(), 0, buffer.position(), StandardCharsets.UTF_8); + return new String(dest, 0, writePos, StandardCharsets.UTF_8); + } + + private static byte toHexDigit(int b) { + return (byte) Character.toUpperCase(Character.forDigit(b & 0xF, 16)); } /** @@ -178,14 +182,34 @@ public static boolean isValidCharForKey(int c) { return (isAlphaNumeric(c) || c == '.' || c == '_' || c == '-'); } + /** + * Returns {@code true} if the character is in the unreserved RFC 3986 set. + *

<p> + * Warning: Profiling shows that the performance of {@link #percentEncode} relies heavily on this method. + * Modify with care. + * </p>

+ * @param c non-negative integer. + */ private static boolean isUnreserved(int c) { - return (isValidCharForKey(c) || c == '~'); + return c < 128 && UNRESERVED_CHARS[c]; } + /** + * @param c non-negative integer + */ private static boolean shouldEncode(int c) { return !isUnreserved(c); } + private static boolean shouldEncode(String s) { + for (int i = 0, length = s.length(); i < length; i++) { + if (shouldEncode(s.charAt(i))) { + return true; + } + } + return false; + } + private static boolean isAlpha(int c) { return (isLowerCase(c) || isUpperCase(c)); } @@ -195,7 +219,7 @@ private static boolean isAlphaNumeric(int c) { } private static boolean isUpperCase(int c) { - return (c >= 'A' && c <= 'Z'); + return 'A' <= c && c <= 'Z'; } private static boolean isLowerCase(int c) { @@ -207,34 +231,21 @@ private static int toLowerCase(int c) { } private static int indexOfFirstUpperCaseChar(String s) { - int length = s.length(); - - for (int i = 0; i < length; i++) { + for (int i = 0, length = s.length(); i < length; i++) { if (isUpperCase(s.charAt(i))) { return i; } } - return -1; } - private static int indexOfFirstNonAsciiChar(byte[] bytes) { - int length = bytes.length; - int start = -1; - for (int i = 0; i < length; i++) { - if (shouldEncode(bytes[i])) { - start = i; - break; + private static int indexOfFirstPercentChar(final byte[] bytes) { + for (int i = 0, length = bytes.length; i < length; i++) { + if (bytes[i] == PERCENT_CHAR) { + return i; } } - return start; - } - - private static int indexOfFirstPercentChar(final byte[] bytes) { - return IntStream.range(0, bytes.length) - .filter(i -> bytes[i] == PERCENT_CHAR) - .findFirst() - .orElse(-1); + return -1; } private static byte percentDecode(final byte[] bytes, final int start) {