diff --git a/src/com/amazon/ion/impl/bin/IonRawBinaryWriter.java b/src/com/amazon/ion/impl/bin/IonRawBinaryWriter.java
index 887ad9be30..4df80f7993 100644
--- a/src/com/amazon/ion/impl/bin/IonRawBinaryWriter.java
+++ b/src/com/amazon/ion/impl/bin/IonRawBinaryWriter.java
@@ -46,15 +46,13 @@
 import com.amazon.ion.SymbolTable;
 import com.amazon.ion.SymbolToken;
 import com.amazon.ion.Timestamp;
+import com.amazon.ion.impl.bin.utf8.Utf8StringEncoder;
+import com.amazon.ion.impl.bin.utf8.Utf8StringEncoderPool;
+
 import java.io.IOException;
 import java.io.OutputStream;
 import java.math.BigDecimal;
 import java.math.BigInteger;
-import java.nio.ByteBuffer;
-import java.nio.CharBuffer;
-import java.nio.charset.Charset;
-import java.nio.charset.CharsetEncoder;
-import java.nio.charset.CoderResult;
 import java.util.ArrayList;
 import java.util.Iterator;
 import java.util.List;
@@ -125,14 +123,9 @@ private static byte[] bytes(int... vals) {
 
     private static final byte VARINT_NEG_ZERO = (byte) 0xC0;
 
-    // See IonRawBinaryWriter#writeString(String) for usage information.
-    static final int SMALL_STRING_SIZE = 4 * 1024;
-
-    // Reusable resources for encoding Strings as UTF-8 bytes
-    final CharsetEncoder utf8Encoder = Charset.forName("UTF-8").newEncoder();
-    final ByteBuffer utf8EncodingBuffer = ByteBuffer.allocate((int) (SMALL_STRING_SIZE * utf8Encoder.maxBytesPerChar()));
-    final char[] charArray = new char[SMALL_STRING_SIZE];
-    final CharBuffer reusableCharBuffer = CharBuffer.wrap(charArray);
+    final Utf8StringEncoder utf8StringEncoder = Utf8StringEncoderPool
+            .getInstance()
+            .getOrCreateUtf8Encoder();
 
     private static final byte[] makeTypedPreallocatedBytes(final int typeDesc, final int length)
     {
@@ -1443,73 +1436,10 @@ public void writeString(final String value) throws IOException
         }
         prepareValue();
 
-        /*
-         This method relies on the standard CharsetEncoder class to encode each String's UTF-16 char[] data into
-         UTF-8 bytes. Strangely, CharsetEncoders cannot operate directly on instances of a String. The CharsetEncoder
-         API requires all inputs and outputs to be specified as instances of java.nio.ByteBuffer and
-         java.nio.CharBuffer, making some number of allocations mandatory. Specifically, for each encoding operation
-         we need to have:
-
-             1. An instance of a UTF-8 CharsetEncoder.
-             2. A CharBuffer representation of the String's data.
-             3. A ByteBuffer into which the CharsetEncoder may write UTF-8 bytes.
-
-         To minimize the overhead involved, the IonRawBinaryWriter will reuse previously initialized resources wherever
-         possible. However, because CharBuffer and ByteBuffer each have a fixed length, we can only reuse them for
-         Strings that are small enough to fit. This creates two kinds of input String to encode: those that are small
-         enough for us to reuse our buffers ("small strings"), and those which are not ("large strings").
-
-         The String#getBytes(Charset) method cannot be used for two reasons:
-
-             1. It always allocates, so we cannot reuse any resources.
-             2. If/when it encounters character data that cannot be encoded as UTF-8, it simply replaces that data
-                with a substitute character[1]. (Sometimes seen in applications as a '?'.) In order
-                to surface invalid data to the user, the method must be able to detect these events at encoding time.
-
-             [1] https://en.wikipedia.org/wiki/Substitute_character
-         */
-
-        CharBuffer stringData;
-        ByteBuffer encodingBuffer;
-
-        int length = value.length();
-
-        // While it is possible to encode the Ion string using a fixed-size encodingBuffer, we need to be able to
-        // write the length of the complete UTF-8 string to the output stream before we write the string itself.
-        // For simplicity, we reuse or create an encodingBuffer that is large enough to hold the full string.
-
-        // In order to encode the input String, we need to pass it to CharsetEncoder as an implementation of CharBuffer.
-        // Surprisingly, the intuitive way to achieve this (the CharBuffer#wrap(CharSequence) method) adds a large
-        // amount of CPU overhead to the encoding process. Benchmarking shows that it's substantially faster
-        // to use String#getChars(int, int, char[], int) to copy the String's backing array and then call
-        // CharBuffer#wrap(char[]) on the copy.
-
-        if (length > SMALL_STRING_SIZE) {
-            // Allocate a new buffer for large strings
-            encodingBuffer = ByteBuffer.allocate((int) (value.length() * utf8Encoder.maxBytesPerChar()));
-            char[] chars = new char[value.length()];
-            value.getChars(0, value.length(), chars, 0);
-            stringData = CharBuffer.wrap(chars);
-        } else {
-            // Reuse our existing buffers for small strings
-            encodingBuffer = utf8EncodingBuffer;
-            encodingBuffer.clear();
-            stringData = reusableCharBuffer;
-            value.getChars(0, value.length(), charArray, 0);
-            reusableCharBuffer.rewind();
-            reusableCharBuffer.limit(value.length());
-        }
-
-        // Because encodingBuffer is guaranteed to be large enough to hold the encoded string, we can
-        // perform the encoding in a single call to CharsetEncoder#encode(CharBuffer, ByteBuffer, boolean).
-        CoderResult coderResult = utf8Encoder.encode(stringData, encodingBuffer, true);
-
-        // 'Underflow' is the success state of a CoderResult.
-        if (!coderResult.isUnderflow()) {
-            throw new IllegalArgumentException("Could not encode string as UTF8 bytes: " + value);
-        }
-        encodingBuffer.flip();
-        int utf8Length = encodingBuffer.remaining();
+        // UTF-8 encode the String
+        Utf8StringEncoder.Result encoderResult = utf8StringEncoder.encode(value);
+        int utf8Length = encoderResult.getEncodedLength();
+        byte[] utf8Buffer = encoderResult.getBuffer();
 
         // Write the type and length codes to the output stream.
         long previousPosition = buffer.position();
@@ -1521,7 +1451,7 @@ enough for us to reuse our buffers ("small strings"), and those which are not ("
         }
 
         // Write the encoded UTF-8 bytes to the output stream
-        buffer.writeBytes(encodingBuffer.array(), 0, utf8Length);
+        buffer.writeBytes(utf8Buffer, 0, utf8Length);
 
         long bytesWritten = buffer.position() - previousPosition;
         updateLength(bytesWritten);
@@ -1686,6 +1616,7 @@ public void close() throws IOException
             buffer.close();
             patchBuffer.close();
             allocator.close();
+            utf8StringEncoder.close();
         }
         finally
         {
diff --git a/src/com/amazon/ion/impl/bin/utf8/Utf8StringEncoder.java b/src/com/amazon/ion/impl/bin/utf8/Utf8StringEncoder.java
new file mode 100644
index 0000000000..d890e1beaf
--- /dev/null
+++ b/src/com/amazon/ion/impl/bin/utf8/Utf8StringEncoder.java
@@ -0,0 +1,170 @@
+package com.amazon.ion.impl.bin.utf8;
+
+import java.io.Closeable;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
+import java.nio.charset.Charset;
+import java.nio.charset.CharsetEncoder;
+import java.nio.charset.CoderResult;
+
+/**
+ * Encodes {@link String}s to UTF-8. Instances of this class are reusable but are NOT threadsafe.
+ *
+ * Users are strongly encouraged to get instances from {@link Utf8StringEncoderPool#getOrCreateUtf8Encoder()}.
+ * {@link #encode(String)} can be called any number of times. Users are expected to call {@link #close()} when
+ * the encoder is no longer needed.
+ */
+public class Utf8StringEncoder implements Closeable {
+    // The longest String (as measured by {@link java.lang.String#length()}) that this instance can encode without
+    // requiring additional allocations.
+    private static final int SMALL_STRING_SIZE = 4 * 1024;
+
+    // Reusable resources for encoding Strings as UTF-8 bytes
+    final Utf8StringEncoderPool utf8StringEncoderPool;
+    final CharsetEncoder utf8Encoder;
+    final ByteBuffer utf8EncodingBuffer;
+    final char[] charArray;
+    final CharBuffer charBuffer;
+
+    public Utf8StringEncoder(Utf8StringEncoderPool pool) {
+        utf8StringEncoderPool = pool;
+        utf8Encoder = Charset.forName("UTF-8").newEncoder();
+        utf8EncodingBuffer = ByteBuffer.allocate((int) (SMALL_STRING_SIZE * utf8Encoder.maxBytesPerChar()));
+        charArray = new char[SMALL_STRING_SIZE];
+        charBuffer = CharBuffer.wrap(charArray);
+    }
+
+    public Utf8StringEncoder() {
+        // This instance is not associated with a Utf8StringEncoderPool
+        this(null);
+    }
+
+    /**
+     * Encodes the provided String's text to UTF-8. Unlike {@link String#getBytes(Charset)}, this method will not
+     * silently replace characters that cannot be encoded with a substitute character. Instead, it will throw
+     * an {@link IllegalArgumentException}.
+     *
+     * Some resources in the returned {@link Result} may be reused across calls to this method. Consequently,
+     * callers should use the Result and discard it immediately.
+     *
+     * @param text A Java String to encode as UTF8 bytes.
+     * @return A {@link Result} containing a byte array of UTF-8 bytes and encoded length.
+     * @throws IllegalArgumentException if the String cannot be encoded as UTF-8.
+     */
+    public Result encode(String text) {
+        /*
+         This method relies on the standard CharsetEncoder class to encode each String's UTF-16 char[] data into
+         UTF-8 bytes. Strangely, CharsetEncoders cannot operate directly on instances of a String. The CharsetEncoder
+         API requires all inputs and outputs to be specified as instances of java.nio.ByteBuffer and
+         java.nio.CharBuffer, making some number of allocations mandatory. Specifically, for each encoding operation
+         we need to have:
+
+             1. An instance of a UTF-8 CharsetEncoder.
+             2. A CharBuffer representation of the String's data.
+             3. A ByteBuffer into which the CharsetEncoder may write UTF-8 bytes.
+
+         To minimize the overhead involved, the Utf8StringEncoder will reuse previously initialized resources wherever
+         possible. However, because CharBuffer and ByteBuffer each have a fixed length, we can only reuse them for
+         Strings that are small enough to fit. This creates two kinds of input String to encode: those that are small
+         enough for us to reuse our buffers ("small strings"), and those which are not ("large strings").
+
+         The String#getBytes(Charset) method cannot be used for two reasons:
+
+             1. It always allocates, so we cannot reuse any resources.
+             2. If/when it encounters character data that cannot be encoded as UTF-8, it simply replaces that data
+                with a substitute character[1]. (Sometimes seen in applications as a '?'.) In order
+                to surface invalid data to the user, the method must be able to detect these events at encoding time.
+
+             [1] https://en.wikipedia.org/wiki/Substitute_character
+         */
+
+        CharBuffer stringData;
+        ByteBuffer encodingBuffer;
+
+        int length = text.length();
+
+        // While it is technically possible to encode any String using a fixed-size encodingBuffer, we need
+        // to be able to write the length of the complete UTF-8 string to the output stream before we write the string
+        // itself. For simplicity, we reuse or create an encodingBuffer that is large enough to hold the full string.
+
+        // In order to encode the input String, we need to pass it to CharsetEncoder as an implementation of CharBuffer.
+        // Surprisingly, the intuitive way to achieve this (the CharBuffer#wrap(CharSequence) method) adds a large
+        // amount of CPU overhead to the encoding process. Benchmarking shows that it's substantially faster
+        // to use String#getChars(int, int, char[], int) to copy the String's backing array and then call
+        // CharBuffer#wrap(char[]) on the copy.
+
+        if (length > SMALL_STRING_SIZE) {
+            // Allocate a new buffer for large strings
+            encodingBuffer = ByteBuffer.allocate((int) (text.length() * utf8Encoder.maxBytesPerChar()));
+            char[] chars = new char[text.length()];
+            text.getChars(0, text.length(), chars, 0);
+            stringData = CharBuffer.wrap(chars);
+        } else {
+            // Reuse our existing buffers for small strings
+            encodingBuffer = utf8EncodingBuffer;
+            encodingBuffer.clear();
+            stringData = charBuffer;
+            text.getChars(0, text.length(), charArray, 0);
+            charBuffer.rewind();
+            charBuffer.limit(text.length());
+        }
+
+        // Because encodingBuffer is guaranteed to be large enough to hold the encoded string, we can
+        // perform the encoding in a single call to CharsetEncoder#encode(CharBuffer, ByteBuffer, boolean).
+        CoderResult coderResult = utf8Encoder.encode(stringData, encodingBuffer, true);
+
+        // 'Underflow' is the success state of a CoderResult.
+        if (!coderResult.isUnderflow()) {
+            throw new IllegalArgumentException("Could not encode string as UTF8 bytes: " + text);
+        }
+        encodingBuffer.flip();
+        int utf8Length = encodingBuffer.remaining();
+
+        // In most usages, the JVM should be able to eliminate this allocation via an escape analysis of the caller.
+        return new Result(utf8Length, encodingBuffer.array());
+    }
+
+    /**
+     * Attempts to return this instance to the Utf8StringEncoderPool with which it is associated, if any.
+     *
+     * Do not continue to use this encoder after calling this method.
+     */
+    @Override
+    public void close() {
+        if (utf8StringEncoderPool != null) {
+            utf8StringEncoderPool.returnEncoderToPool(this);
+        }
+    }
+
+    /**
+     * Represents the result of a {@link Utf8StringEncoder#encode(String)} operation.
+     */
+    public static class Result {
+        final private byte[] buffer;
+        final private int encodedLength;
+
+        public Result(int encodedLength, byte[] buffer) {
+            this.encodedLength = encodedLength;
+            this.buffer = buffer;
+        }
+
+        /**
+         * Returns a byte array containing the encoded UTF-8 bytes starting at index 0. This byte array is NOT
+         * guaranteed to be the same length as the data it contains. Callers must use {@link #getEncodedLength()}
+         * to determine the number of bytes that should be read from the byte array.
+         *
+         * @return the buffer containing UTF-8 bytes.
+         */
+        public byte[] getBuffer() {
+            return buffer;
+        }
+
+        /**
+         * @return the number of encoded bytes in the array returned by {@link #getBuffer()}.
+         */
+        public int getEncodedLength() {
+            return encodedLength;
+        }
+    }
+}
diff --git a/src/com/amazon/ion/impl/bin/utf8/Utf8StringEncoderPool.java b/src/com/amazon/ion/impl/bin/utf8/Utf8StringEncoderPool.java
new file mode 100644
index 0000000000..e926df4a75
--- /dev/null
+++ b/src/com/amazon/ion/impl/bin/utf8/Utf8StringEncoderPool.java
@@ -0,0 +1,60 @@
+package com.amazon.ion.impl.bin.utf8;
+
+import java.util.concurrent.ArrayBlockingQueue;
+
+/**
+ * A thread-safe shared pool of {@link Utf8StringEncoder}s that can be used for UTF-8 encoding.
+ */
+public enum Utf8StringEncoderPool {
+    // The only enum variant; a singleton instance.
+    INSTANCE;
+
+    // The maximum number of Utf8Encoders that can be waiting in the queue before new ones will be discarded.
+    private static final int MAX_QUEUE_SIZE = 128;
+
+    // A queue of previously initialized encoders that can be loaned out.
+    private final ArrayBlockingQueue<Utf8StringEncoder> bufferQueue;
+
+    // Do not allow instantiation; all classes should share the singleton instance.
+    private Utf8StringEncoderPool() {
+        bufferQueue = new ArrayBlockingQueue<Utf8StringEncoder>(MAX_QUEUE_SIZE);
+    }
+
+    /**
+     * @return a threadsafe shared instance of {@link Utf8StringEncoderPool}.
+     */
+    public static Utf8StringEncoderPool getInstance() {
+        return INSTANCE;
+    }
+
+    /**
+     * If the pool is not empty, removes an instance of {@link Utf8StringEncoder} from the pool and returns it;
+     * otherwise, constructs a new instance.
+     *
+     * @return An instance of {@link Utf8StringEncoder}.
+     */
+    public Utf8StringEncoder getOrCreateUtf8Encoder() {
+        // The `poll` method does not block. If the queue is empty it returns `null` immediately.
+        Utf8StringEncoder encoder = bufferQueue.poll();
+        if (encoder == null) {
+            // No buffers were available in the pool. Create a new one.
+            encoder = new Utf8StringEncoder(this);
+        }
+        return encoder;
+    }
+
+    /**
+     * Adds the provided instance of {@link Utf8StringEncoder} to the pool. If the pool is full, the instance will
+     * be discarded.
+     *
+     * Callers MUST NOT use an encoder after returning it to the pool.
+     *
+     * @param encoder A {@link Utf8StringEncoder} to add to the pool.
+     */
+    public void returnEncoderToPool(Utf8StringEncoder encoder) {
+        // The `offer` method does not block. If the queue is full, it returns `false` immediately.
+        // If the provided instance cannot be added to the pool, we discard it silently.
+        bufferQueue.offer(encoder);
+    }
+
+}