-
Notifications
You must be signed in to change notification settings - Fork 125
Adds a pool of UTF8 String encoders #369
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -46,15 +46,13 @@ | |
| import com.amazon.ion.SymbolTable; | ||
| import com.amazon.ion.SymbolToken; | ||
| import com.amazon.ion.Timestamp; | ||
| import com.amazon.ion.impl.bin.utf8.Utf8StringEncoder; | ||
| import com.amazon.ion.impl.bin.utf8.Utf8StringEncoderPool; | ||
|
|
||
| import java.io.IOException; | ||
| import java.io.OutputStream; | ||
| import java.math.BigDecimal; | ||
| import java.math.BigInteger; | ||
| import java.nio.ByteBuffer; | ||
| import java.nio.CharBuffer; | ||
| import java.nio.charset.Charset; | ||
| import java.nio.charset.CharsetEncoder; | ||
| import java.nio.charset.CoderResult; | ||
| import java.util.ArrayList; | ||
| import java.util.Iterator; | ||
| import java.util.List; | ||
|
|
@@ -125,14 +123,9 @@ private static byte[] bytes(int... vals) { | |
|
|
||
| private static final byte VARINT_NEG_ZERO = (byte) 0xC0; | ||
|
|
||
| // See IonRawBinaryWriter#writeString(String) for usage information. | ||
| static final int SMALL_STRING_SIZE = 4 * 1024; | ||
|
|
||
| // Reusable resources for encoding Strings as UTF-8 bytes | ||
| final CharsetEncoder utf8Encoder = Charset.forName("UTF-8").newEncoder(); | ||
| final ByteBuffer utf8EncodingBuffer = ByteBuffer.allocate((int) (SMALL_STRING_SIZE * utf8Encoder.maxBytesPerChar())); | ||
| final char[] charArray = new char[SMALL_STRING_SIZE]; | ||
| final CharBuffer reusableCharBuffer = CharBuffer.wrap(charArray); | ||
| final Utf8StringEncoder utf8StringEncoder = Utf8StringEncoderPool | ||
| .getInstance() | ||
| .getOrCreateUtf8Encoder(); | ||
|
Comment on lines
+126
to
+128
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Rather than allocating several new arrays for each binary writer we construct, we simply pull a reusable `Utf8StringEncoder` from the shared `Utf8StringEncoderPool`. |
||
|
|
||
| private static final byte[] makeTypedPreallocatedBytes(final int typeDesc, final int length) | ||
| { | ||
|
|
@@ -1443,73 +1436,10 @@ public void writeString(final String value) throws IOException | |
| } | ||
| prepareValue(); | ||
|
|
||
| /* | ||
| This method relies on the standard CharsetEncoder class to encode each String's UTF-16 char[] data into | ||
| UTF-8 bytes. Strangely, CharsetEncoders cannot operate directly on instances of a String. The CharsetEncoder | ||
| API requires all inputs and outputs to be specified as instances of java.nio.ByteBuffer and | ||
| java.nio.CharBuffer, making some number of allocations mandatory. Specifically, for each encoding operation | ||
| we need to have: | ||
|
|
||
| 1. An instance of a UTF-8 CharsetEncoder. | ||
| 2. A CharBuffer representation of the String's data. | ||
| 3. A ByteBuffer into which the CharsetEncoder may write UTF-8 bytes. | ||
|
|
||
| To minimize the overhead involved, the IonRawBinaryWriter will reuse previously initialized resources wherever | ||
| possible. However, because CharBuffer and ByteBuffer each have a fixed length, we can only reuse them for | ||
| Strings that are small enough to fit. This creates two kinds of input String to encode: those that are small | ||
| enough for us to reuse our buffers ("small strings"), and those which are not ("large strings"). | ||
|
|
||
| The String#getBytes(Charset) method cannot be used for two reasons: | ||
|
|
||
| 1. It always allocates, so we cannot reuse any resources. | ||
| 2. If/when it encounters character data that cannot be encoded as UTF-8, it simply replaces that data | ||
| with a substitute character[1]. (Sometimes seen in applications as a '?'.) In order | ||
| to surface invalid data to the user, the method must be able to detect these events at encoding time. | ||
|
|
||
| [1] https://en.wikipedia.org/wiki/Substitute_character | ||
| */ | ||
|
|
||
| CharBuffer stringData; | ||
| ByteBuffer encodingBuffer; | ||
|
|
||
| int length = value.length(); | ||
|
|
||
| // While it is possible to encode the Ion string using a fixed-size encodingBuffer, we need to be able to | ||
| // write the length of the complete UTF-8 string to the output stream before we write the string itself. | ||
| // For simplicity, we reuse or create an encodingBuffer that is large enough to hold the full string. | ||
|
|
||
| // In order to encode the input String, we need to pass it to CharsetEncoder as an implementation of CharBuffer. | ||
| // Surprisingly, the intuitive way to achieve this (the CharBuffer#wrap(CharSequence) method) adds a large | ||
| // amount of CPU overhead to the encoding process. Benchmarking shows that it's substantially faster | ||
| // to use String#getChars(int, int, char[], int) to copy the String's backing array and then call | ||
| // CharBuffer#wrap(char[]) on the copy. | ||
|
|
||
| if (length > SMALL_STRING_SIZE) { | ||
| // Allocate a new buffer for large strings | ||
| encodingBuffer = ByteBuffer.allocate((int) (value.length() * utf8Encoder.maxBytesPerChar())); | ||
| char[] chars = new char[value.length()]; | ||
| value.getChars(0, value.length(), chars, 0); | ||
| stringData = CharBuffer.wrap(chars); | ||
| } else { | ||
| // Reuse our existing buffers for small strings | ||
| encodingBuffer = utf8EncodingBuffer; | ||
| encodingBuffer.clear(); | ||
| stringData = reusableCharBuffer; | ||
| value.getChars(0, value.length(), charArray, 0); | ||
| reusableCharBuffer.rewind(); | ||
| reusableCharBuffer.limit(value.length()); | ||
| } | ||
|
|
||
| // Because encodingBuffer is guaranteed to be large enough to hold the encoded string, we can | ||
| // perform the encoding in a single call to CharsetEncoder#encode(CharBuffer, ByteBuffer, boolean). | ||
| CoderResult coderResult = utf8Encoder.encode(stringData, encodingBuffer, true); | ||
|
|
||
| // 'Underflow' is the success state of a CoderResult. | ||
| if (!coderResult.isUnderflow()) { | ||
| throw new IllegalArgumentException("Could not encode string as UTF8 bytes: " + value); | ||
| } | ||
| encodingBuffer.flip(); | ||
| int utf8Length = encodingBuffer.remaining(); | ||
| // UTF-8 encode the String | ||
| Utf8StringEncoder.Result encoderResult = utf8StringEncoder.encode(value); | ||
| int utf8Length = encoderResult.getEncodedLength(); | ||
| byte[] utf8Buffer = encoderResult.getBuffer(); | ||
|
|
||
| // Write the type and length codes to the output stream. | ||
| long previousPosition = buffer.position(); | ||
|
|
@@ -1521,7 +1451,7 @@ enough for us to reuse our buffers ("small strings"), and those which are not (" | |
| } | ||
|
|
||
| // Write the encoded UTF-8 bytes to the output stream | ||
| buffer.writeBytes(encodingBuffer.array(), 0, utf8Length); | ||
| buffer.writeBytes(utf8Buffer, 0, utf8Length); | ||
|
|
||
| long bytesWritten = buffer.position() - previousPosition; | ||
| updateLength(bytesWritten); | ||
|
|
@@ -1686,6 +1616,7 @@ public void close() throws IOException | |
| buffer.close(); | ||
| patchBuffer.close(); | ||
| allocator.close(); | ||
| utf8StringEncoder.close(); | ||
| } | ||
| finally | ||
| { | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,170 @@ | ||
| package com.amazon.ion.impl.bin.utf8; | ||
|
|
||
| import java.io.Closeable; | ||
| import java.io.IOException; | ||
| import java.nio.ByteBuffer; | ||
| import java.nio.CharBuffer; | ||
| import java.nio.charset.Charset; | ||
| import java.nio.charset.CharsetEncoder; | ||
| import java.nio.charset.CoderResult; | ||
|
|
||
| /** | ||
| * Encodes {@link String}s to UTF-8. Instances of this class are reusable but are NOT threadsafe. | ||
| * | ||
| * Users are strongly encouraged to get instances from {@link Utf8StringEncoderPool#getOrCreateUtf8Encoder()}. | ||
| * {@link #encode(String)} can be called any number of times. Users are expected to call {@link #close()} when | ||
| * the encoder is no longer needed. | ||
| */ | ||
| public class Utf8StringEncoder implements Closeable { | ||
| // The longest String (as measured by {@link java.lang.String#length()}) that this instance can encode without | ||
| // requiring additional allocations. | ||
| private static final int SMALL_STRING_SIZE = 4 * 1024; | ||
|
|
||
| // Reusable resources for encoding Strings as UTF-8 bytes | ||
| final Utf8StringEncoderPool utf8StringEncoderPool; | ||
| final CharsetEncoder utf8Encoder; | ||
| final ByteBuffer utf8EncodingBuffer; | ||
| final char[] charArray; | ||
| final CharBuffer charBuffer; | ||
|
|
||
| public Utf8StringEncoder(Utf8StringEncoderPool pool) { | ||
| utf8StringEncoderPool = pool; | ||
| utf8Encoder = Charset.forName("UTF-8").newEncoder(); | ||
| utf8EncodingBuffer = ByteBuffer.allocate((int) (SMALL_STRING_SIZE * utf8Encoder.maxBytesPerChar())); | ||
| charArray = new char[SMALL_STRING_SIZE]; | ||
| charBuffer = CharBuffer.wrap(charArray); | ||
| } | ||
|
|
||
| public Utf8StringEncoder() { | ||
| // This instance is not associated with a Utf8StringEncoderPool | ||
| this(null); | ||
| } | ||
|
|
||
| /** | ||
| * Encodes the provided String's text to UTF-8. Unlike {@link String#getBytes(Charset)}, this method will not | ||
| * silently replace characters that cannot be encoded with a substitute character. Instead, it will throw | ||
| * an {@link IllegalArgumentException}. | ||
| * | ||
| * Some resources in the returned {@link Result} may be reused across calls to this method. Consequently, | ||
| * callers should use the Result and discard it immediately. | ||
| * | ||
| * @param text A Java String to encode as UTF8 bytes. | ||
| * @return A {@link Result} containing a byte array of UTF-8 bytes and encoded length. | ||
| * @throws IllegalArgumentException if the String cannot be encoded as UTF-8. | ||
| */ | ||
| public Result encode(String text) { | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The encoding logic in this method was migrated without changes. |
||
| /* | ||
| This method relies on the standard CharsetEncoder class to encode each String's UTF-16 char[] data into | ||
| UTF-8 bytes. Strangely, CharsetEncoders cannot operate directly on instances of a String. The CharsetEncoder | ||
| API requires all inputs and outputs to be specified as instances of java.nio.ByteBuffer and | ||
| java.nio.CharBuffer, making some number of allocations mandatory. Specifically, for each encoding operation | ||
| we need to have: | ||
|
|
||
| 1. An instance of a UTF-8 CharsetEncoder. | ||
| 2. A CharBuffer representation of the String's data. | ||
| 3. A ByteBuffer into which the CharsetEncoder may write UTF-8 bytes. | ||
|
|
||
| To minimize the overhead involved, the Utf8StringEncoder will reuse previously initialized resources wherever | ||
| possible. However, because CharBuffer and ByteBuffer each have a fixed length, we can only reuse them for | ||
| Strings that are small enough to fit. This creates two kinds of input String to encode: those that are small | ||
| enough for us to reuse our buffers ("small strings"), and those which are not ("large strings"). | ||
|
|
||
| The String#getBytes(Charset) method cannot be used for two reasons: | ||
|
|
||
| 1. It always allocates, so we cannot reuse any resources. | ||
| 2. If/when it encounters character data that cannot be encoded as UTF-8, it simply replaces that data | ||
| with a substitute character[1]. (Sometimes seen in applications as a '?'.) In order | ||
| to surface invalid data to the user, the method must be able to detect these events at encoding time. | ||
|
|
||
| [1] https://en.wikipedia.org/wiki/Substitute_character | ||
| */ | ||
|
|
||
| CharBuffer stringData; | ||
| ByteBuffer encodingBuffer; | ||
|
|
||
| int length = text.length(); | ||
|
|
||
| // While it is technically possible to encode any String using a fixed-size encodingBuffer, we need | ||
| // to be able to write the length of the complete UTF-8 string to the output stream before we write the string | ||
| // itself. For simplicity, we reuse or create an encodingBuffer that is large enough to hold the full string. | ||
|
|
||
| // In order to encode the input String, we need to pass it to CharsetEncoder as an implementation of CharBuffer. | ||
| // Surprisingly, the intuitive way to achieve this (the CharBuffer#wrap(CharSequence) method) adds a large | ||
| // amount of CPU overhead to the encoding process. Benchmarking shows that it's substantially faster | ||
| // to use String#getChars(int, int, char[], int) to copy the String's backing array and then call | ||
| // CharBuffer#wrap(char[]) on the copy. | ||
|
|
||
| if (length > SMALL_STRING_SIZE) { | ||
| // Allocate a new buffer for large strings | ||
| encodingBuffer = ByteBuffer.allocate((int) (text.length() * utf8Encoder.maxBytesPerChar())); | ||
| char[] chars = new char[text.length()]; | ||
| text.getChars(0, text.length(), chars, 0); | ||
| stringData = CharBuffer.wrap(chars); | ||
| } else { | ||
| // Reuse our existing buffers for small strings | ||
| encodingBuffer = utf8EncodingBuffer; | ||
| encodingBuffer.clear(); | ||
| stringData = charBuffer; | ||
| text.getChars(0, text.length(), charArray, 0); | ||
| charBuffer.rewind(); | ||
| charBuffer.limit(text.length()); | ||
| } | ||
|
|
||
| // Because encodingBuffer is guaranteed to be large enough to hold the encoded string, we can | ||
| // perform the encoding in a single call to CharsetEncoder#encode(CharBuffer, ByteBuffer, boolean). | ||
| CoderResult coderResult = utf8Encoder.encode(stringData, encodingBuffer, true); | ||
|
|
||
| // 'Underflow' is the success state of a CoderResult. | ||
| if (!coderResult.isUnderflow()) { | ||
| throw new IllegalArgumentException("Could not encode string as UTF8 bytes: " + text); | ||
| } | ||
| encodingBuffer.flip(); | ||
| int utf8Length = encodingBuffer.remaining(); | ||
|
|
||
| // In most usages, the JVM should be able to eliminate this allocation via an escape analysis of the caller. | ||
| return new Result(utf8Length, encodingBuffer.array()); | ||
| } | ||
|
|
||
| /** | ||
| * Attempts to return this instance to the Utf8StringEncoderPool with which it is associated, if any. | ||
| * | ||
| * Do not continue to use this encoder after calling this method. | ||
| */ | ||
| @Override | ||
| public void close() { | ||
| if (utf8StringEncoderPool != null) { | ||
| utf8StringEncoderPool.returnEncoderToPool(this); | ||
| } | ||
| } | ||
|
|
||
| /** | ||
| * Represents the result of a {@link Utf8StringEncoder#encode(String)} operation. | ||
| */ | ||
| public static class Result { | ||
| final private byte[] buffer; | ||
| final private int encodedLength; | ||
|
|
||
| public Result(int encodedLength, byte[] buffer) { | ||
| this.encodedLength = encodedLength; | ||
| this.buffer = buffer; | ||
| } | ||
|
|
||
| /** | ||
| * Returns a byte array containing the encoded UTF-8 bytes starting at index 0. This byte array is NOT | ||
| * guaranteed to be the same length as the data it contains. Callers must use {@link #getEncodedLength()} | ||
| * to determine the number of bytes that should be read from the byte array. | ||
| * | ||
| * @return the buffer containing UTF-8 bytes. | ||
| */ | ||
| public byte[] getBuffer() { | ||
| return buffer; | ||
| } | ||
|
|
||
| /** | ||
| * @return the number of encoded bytes in the array returned by {@link #getBuffer()}. | ||
| */ | ||
| public int getEncodedLength() { | ||
| return encodedLength; | ||
| } | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,60 @@ | ||
| package com.amazon.ion.impl.bin.utf8; | ||
|
|
||
| import java.util.concurrent.ArrayBlockingQueue; | ||
|
|
||
| /** | ||
| * A thread-safe shared pool of {@link Utf8StringEncoder}s that can be used for UTF8 encoding and decoding. | ||
| */ | ||
| public enum Utf8StringEncoderPool { | ||
| // The only enum variant; a singleton instance. | ||
| INSTANCE; | ||
|
|
||
| // The maximum number of Utf8Encoders that can be waiting in the queue before new ones will be discarded. | ||
| private static final int MAX_QUEUE_SIZE = 128; | ||
|
|
||
| // A queue of previously initialized encoders that can be loaned out. | ||
| private final ArrayBlockingQueue<Utf8StringEncoder> bufferQueue; | ||
|
|
||
| // Do not allow instantiation; all classes should share the singleton instance. | ||
| private Utf8StringEncoderPool() { | ||
| bufferQueue = new ArrayBlockingQueue<Utf8StringEncoder>(MAX_QUEUE_SIZE); | ||
| } | ||
|
|
||
| /** | ||
| * @return a threadsafe shared instance of {@link Utf8StringEncoderPool}. | ||
| */ | ||
| public static Utf8StringEncoderPool getInstance() { | ||
| return INSTANCE; | ||
| } | ||
|
|
||
| /** | ||
| * If the pool is not empty, removes an instance of {@link Utf8StringEncoder} from the pool and returns it; | ||
| * otherwise, constructs a new instance. | ||
| * | ||
| * @return An instance of {@link Utf8StringEncoder}. | ||
| */ | ||
| public Utf8StringEncoder getOrCreateUtf8Encoder() { | ||
| // The `poll` method does not block. If the queue is empty it returns `null` immediately. | ||
| Utf8StringEncoder encoder = bufferQueue.poll(); | ||
| if (encoder == null) { | ||
| // No buffers were available in the pool. Create a new one. | ||
| encoder = new Utf8StringEncoder(this); | ||
| } | ||
| return encoder; | ||
| } | ||
|
|
||
| /** | ||
| * Adds the provided instance of {@link Utf8StringEncoder} to the pool. If the pool is full, the instance will | ||
| * be discarded. | ||
| * | ||
| * Callers MUST NOT use an encoder after returning it to the pool. | ||
| * | ||
| * @param encoder A {@link Utf8StringEncoder} to add to the pool. | ||
| */ | ||
| public void returnEncoderToPool(Utf8StringEncoder encoder) { | ||
| // The `offer` method does not block. If the queue is full, it returns `false` immediately. | ||
| // If the provided instance cannot be added to the pool, we discard it silently. | ||
| bufferQueue.offer(encoder); | ||
| } | ||
|
|
||
| } |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
All of the code removed from this file (`IonRawBinaryWriter.java`) was moved to the new `Utf8StringEncoder` class.