amazon-ion · zslayton · Jun 23, 2021 · Jun 23, 2021 · Jun 23, 2021 · Jun 23, 2021
diff --git a/src/com/amazon/ion/impl/bin/IonRawBinaryWriter.java b/src/com/amazon/ion/impl/bin/IonRawBinaryWriter.java
@@ -46,15 +46,13 @@
 import com.amazon.ion.SymbolTable;
 import com.amazon.ion.SymbolToken;
 import com.amazon.ion.Timestamp;
+import com.amazon.ion.impl.bin.utf8.Utf8StringEncoder;
+import com.amazon.ion.impl.bin.utf8.Utf8StringEncoderPool;
+
 import java.io.IOException;
 import java.io.OutputStream;
 import java.math.BigDecimal;
 import java.math.BigInteger;
-import java.nio.ByteBuffer;
-import java.nio.CharBuffer;
-import java.nio.charset.Charset;
-import java.nio.charset.CharsetEncoder;
-import java.nio.charset.CoderResult;
 import java.util.ArrayList;
 import java.util.Iterator;
 import java.util.List;
@@ -125,14 +123,9 @@ private static byte[] bytes(int... vals) {
 
     private static final byte VARINT_NEG_ZERO   = (byte) 0xC0;
 
-    // See IonRawBinaryWriter#writeString(String) for usage information.
-    static final int SMALL_STRING_SIZE = 4 * 1024;
-
-    // Reusable resources for encoding Strings as UTF-8 bytes
-    final CharsetEncoder utf8Encoder = Charset.forName("UTF-8").newEncoder();
-    final ByteBuffer utf8EncodingBuffer = ByteBuffer.allocate((int) (SMALL_STRING_SIZE * utf8Encoder.maxBytesPerChar()));
-    final char[] charArray = new char[SMALL_STRING_SIZE];
-    final CharBuffer reusableCharBuffer = CharBuffer.wrap(charArray);
+    final Utf8StringEncoder utf8StringEncoder = Utf8StringEncoderPool
+            .getInstance()
+            .getOrCreateUtf8Encoder();
 
     private static final byte[] makeTypedPreallocatedBytes(final int typeDesc, final int length)
     {
@@ -1443,73 +1436,10 @@ public void writeString(final String value) throws IOException
         }
         prepareValue();
 
-        /*
-         This method relies on the standard CharsetEncoder class to encode each String's UTF-16 char[] data into
-         UTF-8 bytes. Strangely, CharsetEncoders cannot operate directly on instances of a String. The CharsetEncoder
-         API requires all inputs and outputs to be specified as instances of java.nio.ByteBuffer and
-         java.nio.CharBuffer, making some number of allocations mandatory. Specifically, for each encoding operation
-         we need to have:
-
-            1. An instance of a UTF-8 CharsetEncoder.
-            2. A CharBuffer representation of the String's data.
-            3. A ByteBuffer into which the CharsetEncoder may write UTF-8 bytes.
-
-         To minimize the overhead involved, the IonRawBinaryWriter will reuse previously initialized resources wherever
-         possible. However, because CharBuffer and ByteBuffer each have a fixed length, we can only reuse them for
-         Strings that are small enough to fit. This creates two kinds of input String to encode: those that are small
-         enough for us to reuse our buffers ("small strings"), and those which are not ("large strings").
-
-         The String#getBytes(Charset) method cannot be used for two reasons:
-
-               1. It always allocates, so we cannot reuse any resources.
-               2. If/when it encounters character data that cannot be encoded as UTF-8, it simply replaces that data
-                 with a substitute character[1]. (Sometimes seen in applications as a '?'.) In order
-                 to surface invalid data to the user, the method must be able to detect these events at encoding time.
-
-            [1] https://en.wikipedia.org/wiki/Substitute_character
-        */
-
-        CharBuffer stringData;
-        ByteBuffer encodingBuffer;
-
-        int length = value.length();
-
-        // While it is possible to encode the Ion string using a fixed-size encodingBuffer, we need to be able to
-        // write the length of the complete UTF-8 string to the output stream before we write the string itself.
-        // For simplicity, we reuse or create an encodingBuffer that is large enough to hold the full string.
-
-        // In order to encode the input String, we need to pass it to CharsetEncoder as an implementation of CharBuffer.
-        // Surprisingly, the intuitive way to achieve this (the CharBuffer#wrap(CharSequence) method) adds a large
-        // amount of CPU overhead to the encoding process. Benchmarking shows that it's substantially faster
-        // to use String#getChars(int, int, char[], int) to copy the String's backing array and then call
-        // CharBuffer#wrap(char[]) on the copy.
-
-        if (length > SMALL_STRING_SIZE) {
-            // Allocate a new buffer for large strings
-            encodingBuffer = ByteBuffer.allocate((int) (value.length() * utf8Encoder.maxBytesPerChar()));
-            char[] chars = new char[value.length()];
-            value.getChars(0, value.length(), chars, 0);
-            stringData = CharBuffer.wrap(chars);
-        } else {
-            // Reuse our existing buffers for small strings
-            encodingBuffer = utf8EncodingBuffer;
-            encodingBuffer.clear();
-            stringData = reusableCharBuffer;
-            value.getChars(0, value.length(), charArray, 0);
-            reusableCharBuffer.rewind();
-            reusableCharBuffer.limit(value.length());
-        }
-
-        // Because encodingBuffer is guaranteed to be large enough to hold the encoded string, we can
-        // perform the encoding in a single call to CharsetEncoder#encode(CharBuffer, ByteBuffer, boolean).
-        CoderResult coderResult = utf8Encoder.encode(stringData, encodingBuffer, true);
-
-        // 'Underflow' is the success state of a CoderResult.
-        if (!coderResult.isUnderflow()) {
-            throw new IllegalArgumentException("Could not encode string as UTF8 bytes: " + value);
-        }
-        encodingBuffer.flip();
-        int utf8Length = encodingBuffer.remaining();
+        // UTF-8 encode the String
+        Utf8StringEncoder.Result encoderResult = utf8StringEncoder.encode(value);
+        int utf8Length = encoderResult.getEncodedLength();
+        byte[] utf8Buffer = encoderResult.getBuffer();
 
         // Write the type and length codes to the output stream.
         long previousPosition = buffer.position();
@@ -1521,7 +1451,7 @@ enough for us to reuse our buffers ("small strings"), and those which are not ("
         }
 
         // Write the encoded UTF-8 bytes to the output stream
-        buffer.writeBytes(encodingBuffer.array(), 0, utf8Length);
+        buffer.writeBytes(utf8Buffer, 0, utf8Length);
 
         long bytesWritten = buffer.position() - previousPosition;
         updateLength(bytesWritten);
@@ -1686,6 +1616,7 @@ public void close() throws IOException
             buffer.close();
             patchBuffer.close();
             allocator.close();
+            utf8StringEncoder.close();
         }
         finally
         {

diff --git a/src/com/amazon/ion/impl/bin/utf8/Utf8StringEncoder.java b/src/com/amazon/ion/impl/bin/utf8/Utf8StringEncoder.java
@@ -0,0 +1,170 @@
+package com.amazon.ion.impl.bin.utf8;
+
+import java.io.Closeable;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
+import java.nio.charset.Charset;
+import java.nio.charset.CharsetEncoder;
+import java.nio.charset.CoderResult;
+
+/**
+ * Encodes {@link String}s to UTF-8. Instances of this class are reusable but are NOT threadsafe.
+ *
+ * Users are strongly encouraged to get instances from {@link Utf8StringEncoderPool#getOrCreateUtf8Encoder()}.
+ * {@link #encode(String)} can be called any number of times. Users are expected to call {@link #close()} exactly
+ * once, when the encoder is no longer needed.
+ */
+public class Utf8StringEncoder implements Closeable {
+    // The longest String (as measured by {@link java.lang.String#length()}) that this instance can encode without
+    // requiring additional allocations.
+    private static final int SMALL_STRING_SIZE = 4 * 1024;
+
+    // Reusable resources for encoding Strings as UTF-8 bytes. The pool is null for instances created without one.
+    final Utf8StringEncoderPool utf8StringEncoderPool;
+    final CharsetEncoder utf8Encoder;
+    final ByteBuffer utf8EncodingBuffer;
+    final char[] charArray;
+    final CharBuffer charBuffer;
+
+    public Utf8StringEncoder(Utf8StringEncoderPool pool) {
+        utf8StringEncoderPool = pool;
+        utf8Encoder = Charset.forName("UTF-8").newEncoder();
+        utf8EncodingBuffer = ByteBuffer.allocate((int) (SMALL_STRING_SIZE * utf8Encoder.maxBytesPerChar()));
+        charArray = new char[SMALL_STRING_SIZE];
+        charBuffer = CharBuffer.wrap(charArray);
+    }
+
+    public Utf8StringEncoder() {
+        // This instance is not associated with a Utf8StringEncoderPool
+        this(null);
+    }
+
+    /**
+     * Encodes the provided String's text to UTF-8. Unlike {@link String#getBytes(Charset)}, this method will not
+     * silently replace characters that cannot be encoded with a substitute character. Instead, it will throw
+     * an {@link IllegalArgumentException}.
+     *
+     * Some resources in the returned {@link Result} may be reused across calls to this method. Consequently,
+     * callers should use the Result and discard it immediately.
+     *
+     * @param text A Java String to encode as UTF8 bytes.
+     * @return  A {@link Result} containing a byte array of UTF-8 bytes and encoded length.
+     * @throws IllegalArgumentException if the String cannot be encoded as UTF-8.
+     */
+    public Result encode(String text) {
+        /*
+         This method relies on the standard CharsetEncoder class to encode each String's UTF-16 char[] data into
+         UTF-8 bytes. Strangely, CharsetEncoders cannot operate directly on instances of a String. The CharsetEncoder
+         API requires all inputs and outputs to be specified as instances of java.nio.ByteBuffer and
+         java.nio.CharBuffer, making some number of allocations mandatory. Specifically, for each encoding operation
+         we need to have:
+
+            1. An instance of a UTF-8 CharsetEncoder.
+            2. A CharBuffer representation of the String's data.
+            3. A ByteBuffer into which the CharsetEncoder may write UTF-8 bytes.
+
+         To minimize the overhead involved, the Utf8StringEncoder will reuse previously initialized resources wherever
+         possible. However, because CharBuffer and ByteBuffer each have a fixed length, we can only reuse them for
+         Strings that are small enough to fit. This creates two kinds of input String to encode: those that are small
+         enough for us to reuse our buffers ("small strings"), and those which are not ("large strings").
+
+         The String#getBytes(Charset) method cannot be used for two reasons:
+
+               1. It always allocates, so we cannot reuse any resources.
+               2. If/when it encounters character data that cannot be encoded as UTF-8, it simply replaces that data
+                 with a substitute character[1]. (Sometimes seen in applications as a '?'.) In order
+                 to surface invalid data to the user, the method must be able to detect these events at encoding time.
+
+            [1] https://en.wikipedia.org/wiki/Substitute_character
+        */
+
+        CharBuffer stringData;
+        ByteBuffer encodingBuffer;
+
+        int length = text.length();
+
+        // While it is technically possible to encode any String using a fixed-size encodingBuffer, we need
+        // to be able to write the length of the complete UTF-8 string to the output stream before we write the string
+        // itself. For simplicity, we reuse or create an encodingBuffer that is large enough to hold the full string.
+
+        // In order to encode the input String, we need to pass it to CharsetEncoder as an implementation of CharBuffer.
+        // Surprisingly, the intuitive way to achieve this (the CharBuffer#wrap(CharSequence) method) adds a large
+        // amount of CPU overhead to the encoding process. Benchmarking shows that it's substantially faster
+        // to use String#getChars(int, int, char[], int) to copy the String's backing array and then call
+        // CharBuffer#wrap(char[]) on the copy.
+
+        if (length > SMALL_STRING_SIZE) {
+            // Allocate a new buffer for large strings
+            encodingBuffer = ByteBuffer.allocate((int) (length * utf8Encoder.maxBytesPerChar()));
+            char[] chars = new char[length];
+            text.getChars(0, length, chars, 0);
+            stringData = CharBuffer.wrap(chars);
+        } else {
+            // Reuse our existing buffers for small strings
+            encodingBuffer = utf8EncodingBuffer;
+            encodingBuffer.clear();
+            stringData = charBuffer;
+            text.getChars(0, length, charArray, 0);
+            charBuffer.rewind();
+            charBuffer.limit(length);
+        }
+
+        // Reset before every operation, as the CharsetEncoder contract requires for an encoder that is reused.
+        // Because encodingBuffer is guaranteed to be large enough, the whole string is then encoded in a single call.
+        utf8Encoder.reset();
+        CoderResult coderResult = utf8Encoder.encode(stringData, encodingBuffer, true);
+        // 'Underflow' is the success state of a CoderResult.
+        if (!coderResult.isUnderflow()) {
+            throw new IllegalArgumentException("Could not encode string as UTF8 bytes: " + text);
+        }
+        encodingBuffer.flip();
+        int utf8Length = encodingBuffer.remaining();
+
+        // In most usages, the JVM should be able to eliminate this allocation via an escape analysis of the caller.
+        return new Result(utf8Length, encodingBuffer.array());
+    }
+
+    /**
+     * Attempts to return this instance to the Utf8StringEncoderPool with which it is associated, if any.
+     *
+     * Call this at most once and do not use this encoder afterwards; every call offers the instance to the pool.
+     */
+    @Override
+    public void close() {
+        if (utf8StringEncoderPool != null) {
+            utf8StringEncoderPool.returnEncoderToPool(this);
+        }
+    }
+
+    /**
+     * Represents the result of a {@link Utf8StringEncoder#encode(String)} operation.
+     */
+    public static class Result {
+        private final byte[] buffer;
+        private final int encodedLength;
+
+        public Result(int encodedLength, byte[] buffer) {
+            this.encodedLength = encodedLength;
+            this.buffer = buffer;
+        }
+
+        /**
+         * Returns a byte array containing the encoded UTF-8 bytes starting at index 0. This byte array is NOT
+         * guaranteed to be the same length as the data it contains. Callers must use {@link #getEncodedLength()}
+         * to determine the number of bytes that should be read from the byte array.
+         *
+         * @return the buffer containing UTF-8 bytes.
+         */
+        public byte[] getBuffer() {
+            return buffer;
+        }
+
+        /**
+         * @return the number of encoded bytes in the array returned by {@link #getBuffer()}.
+         */
+        public int getEncodedLength() {
+            return encodedLength;
+        }
+    }
+}
diff --git a/src/com/amazon/ion/impl/bin/utf8/Utf8StringEncoderPool.java b/src/com/amazon/ion/impl/bin/utf8/Utf8StringEncoderPool.java
@@ -0,0 +1,60 @@
+package com.amazon.ion.impl.bin.utf8;
+
+import java.util.concurrent.ArrayBlockingQueue;
+
+/**
+ * A thread-safe, shared pool from which reusable {@link Utf8StringEncoder}s can be borrowed for UTF-8 encoding.
+ */
+public enum Utf8StringEncoderPool {
+    // Sole constant of this enum; it acts as the singleton pool.
+    INSTANCE;
+
+    // Upper bound on the number of idle encoders kept in the queue; encoders returned beyond it are dropped.
+    private static final int MAX_QUEUE_SIZE = 128;
+
+    // Idle, previously constructed encoders that are available to be handed out again.
+    private final ArrayBlockingQueue<Utf8StringEncoder> idleEncoders;
+
+    // Enum constructors are implicitly private, so no pool other than INSTANCE can be created.
+    Utf8StringEncoderPool() {
+        this.idleEncoders = new ArrayBlockingQueue<Utf8StringEncoder>(MAX_QUEUE_SIZE);
+    }
+
+    /**
+     * @return the shared, threadsafe {@link Utf8StringEncoderPool} singleton.
+     */
+    public static Utf8StringEncoderPool getInstance() {
+        return Utf8StringEncoderPool.INSTANCE;
+    }
+
+    /**
+     * Hands out a {@link Utf8StringEncoder}: an idle instance is removed from the pool and returned when one is
+     * available; otherwise a new instance associated with this pool is constructed.
+     *
+     * @return An instance of {@link Utf8StringEncoder}.
+     */
+    public Utf8StringEncoder getOrCreateUtf8Encoder() {
+        // Non-blocking removal: `poll` yields `null` immediately when no idle encoder is queued.
+        final Utf8StringEncoder pooled = idleEncoders.poll();
+        if (pooled != null) {
+            return pooled;
+        }
+        // Nothing was available for reuse, so build a fresh encoder bound to this pool.
+        return new Utf8StringEncoder(this);
+    }
+
+    /**
+     * Offers the provided {@link Utf8StringEncoder} back to the pool. When the pool is already full, the instance
+     * is discarded instead.
+     *
+     * Callers MUST NOT use an encoder after returning it to the pool.
+     *
+     * @param encoder   A {@link Utf8StringEncoder} to add to the pool.
+     */
+    public void returnEncoderToPool(Utf8StringEncoder encoder) {
+        // Non-blocking insertion: `offer` reports `false` immediately when the queue is at capacity,
+        // in which case the encoder is intentionally dropped without raising any error.
+        idleEncoders.offer(encoder);
+    }
+
+}