Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
93 changes: 12 additions & 81 deletions src/com/amazon/ion/impl/bin/IonRawBinaryWriter.java
Original file line number Diff line number Diff line change
Expand Up @@ -46,15 +46,13 @@
import com.amazon.ion.SymbolTable;
import com.amazon.ion.SymbolToken;
import com.amazon.ion.Timestamp;
import com.amazon.ion.impl.bin.utf8.Utf8StringEncoder;
import com.amazon.ion.impl.bin.utf8.Utf8StringEncoderPool;

import java.io.IOException;
import java.io.OutputStream;
import java.math.BigDecimal;
import java.math.BigInteger;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CoderResult;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
Expand Down Expand Up @@ -125,14 +123,9 @@ private static byte[] bytes(int... vals) {

private static final byte VARINT_NEG_ZERO = (byte) 0xC0;

// See IonRawBinaryWriter#writeString(String) for usage information.
static final int SMALL_STRING_SIZE = 4 * 1024;
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

All of the code removed from this file (IonRawBinaryWriter.java) was moved to the new Utf8StringEncoder class.


// Reusable resources for encoding Strings as UTF-8 bytes
final CharsetEncoder utf8Encoder = Charset.forName("UTF-8").newEncoder();
final ByteBuffer utf8EncodingBuffer = ByteBuffer.allocate((int) (SMALL_STRING_SIZE * utf8Encoder.maxBytesPerChar()));
final char[] charArray = new char[SMALL_STRING_SIZE];
final CharBuffer reusableCharBuffer = CharBuffer.wrap(charArray);
final Utf8StringEncoder utf8StringEncoder = Utf8StringEncoderPool
.getInstance()
.getOrCreateUtf8Encoder();
Comment on lines +126 to +128
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Rather than allocating several new arrays for each binary writer we construct, we simply pull a Utf8StringEncoder from the pool.


private static final byte[] makeTypedPreallocatedBytes(final int typeDesc, final int length)
{
Expand Down Expand Up @@ -1443,73 +1436,10 @@ public void writeString(final String value) throws IOException
}
prepareValue();

/*
This method relies on the standard CharsetEncoder class to encode each String's UTF-16 char[] data into
UTF-8 bytes. Strangely, CharsetEncoders cannot operate directly on instances of a String. The CharsetEncoder
API requires all inputs and outputs to be specified as instances of java.nio.ByteBuffer and
java.nio.CharBuffer, making some number of allocations mandatory. Specifically, for each encoding operation
we need to have:

1. An instance of a UTF-8 CharsetEncoder.
2. A CharBuffer representation of the String's data.
3. A ByteBuffer into which the CharsetEncoder may write UTF-8 bytes.

To minimize the overhead involved, the IonRawBinaryWriter will reuse previously initialized resources wherever
possible. However, because CharBuffer and ByteBuffer each have a fixed length, we can only reuse them for
Strings that are small enough to fit. This creates two kinds of input String to encode: those that are small
enough for us to reuse our buffers ("small strings"), and those which are not ("large strings").

The String#getBytes(Charset) method cannot be used for two reasons:

1. It always allocates, so we cannot reuse any resources.
2. If/when it encounters character data that cannot be encoded as UTF-8, it simply replaces that data
with a substitute character[1]. (Sometimes seen in applications as a '?'.) In order
to surface invalid data to the user, the method must be able to detect these events at encoding time.

[1] https://en.wikipedia.org/wiki/Substitute_character
*/

CharBuffer stringData;
ByteBuffer encodingBuffer;

int length = value.length();

// While it is possible to encode the Ion string using a fixed-size encodingBuffer, we need to be able to
// write the length of the complete UTF-8 string to the output stream before we write the string itself.
// For simplicity, we reuse or create an encodingBuffer that is large enough to hold the full string.

// In order to encode the input String, we need to pass it to CharsetEncoder as an implementation of CharBuffer.
// Surprisingly, the intuitive way to achieve this (the CharBuffer#wrap(CharSequence) method) adds a large
// amount of CPU overhead to the encoding process. Benchmarking shows that it's substantially faster
// to use String#getChars(int, int, char[], int) to copy the String's backing array and then call
// CharBuffer#wrap(char[]) on the copy.

if (length > SMALL_STRING_SIZE) {
// Allocate a new buffer for large strings
encodingBuffer = ByteBuffer.allocate((int) (value.length() * utf8Encoder.maxBytesPerChar()));
char[] chars = new char[value.length()];
value.getChars(0, value.length(), chars, 0);
stringData = CharBuffer.wrap(chars);
} else {
// Reuse our existing buffers for small strings
encodingBuffer = utf8EncodingBuffer;
encodingBuffer.clear();
stringData = reusableCharBuffer;
value.getChars(0, value.length(), charArray, 0);
reusableCharBuffer.rewind();
reusableCharBuffer.limit(value.length());
}

// Because encodingBuffer is guaranteed to be large enough to hold the encoded string, we can
// perform the encoding in a single call to CharsetEncoder#encode(CharBuffer, ByteBuffer, boolean).
CoderResult coderResult = utf8Encoder.encode(stringData, encodingBuffer, true);

// 'Underflow' is the success state of a CoderResult.
if (!coderResult.isUnderflow()) {
throw new IllegalArgumentException("Could not encode string as UTF8 bytes: " + value);
}
encodingBuffer.flip();
int utf8Length = encodingBuffer.remaining();
// UTF-8 encode the String
Utf8StringEncoder.Result encoderResult = utf8StringEncoder.encode(value);
int utf8Length = encoderResult.getEncodedLength();
byte[] utf8Buffer = encoderResult.getBuffer();

// Write the type and length codes to the output stream.
long previousPosition = buffer.position();
Expand All @@ -1521,7 +1451,7 @@ enough for us to reuse our buffers ("small strings"), and those which are not ("
}

// Write the encoded UTF-8 bytes to the output stream
buffer.writeBytes(encodingBuffer.array(), 0, utf8Length);
buffer.writeBytes(utf8Buffer, 0, utf8Length);

long bytesWritten = buffer.position() - previousPosition;
updateLength(bytesWritten);
Expand Down Expand Up @@ -1686,6 +1616,7 @@ public void close() throws IOException
buffer.close();
patchBuffer.close();
allocator.close();
utf8StringEncoder.close();
}
finally
{
Expand Down
170 changes: 170 additions & 0 deletions src/com/amazon/ion/impl/bin/utf8/Utf8StringEncoder.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,170 @@
package com.amazon.ion.impl.bin.utf8;

import java.io.Closeable;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CoderResult;

/**
* Encodes {@link String}s to UTF-8. Instances of this class are reusable but are NOT threadsafe.
*
* Users are strongly encouraged to get instances from {@link Utf8StringEncoderPool#getOrCreateUtf8Encoder()}.
* {@link #encode(String)} can be called any number of times. Users are expected to call {@link #close()} when
* the encoder is no longer needed.
*/
public class Utf8StringEncoder implements Closeable {
// The longest String (as measured by {@link java.lang.String#length()}) that this instance can encode without
// requiring additional allocations.
private static final int SMALL_STRING_SIZE = 4 * 1024;

// Reusable resources for encoding Strings as UTF-8 bytes
final Utf8StringEncoderPool utf8StringEncoderPool;
final CharsetEncoder utf8Encoder;
final ByteBuffer utf8EncodingBuffer;
final char[] charArray;
final CharBuffer charBuffer;

public Utf8StringEncoder(Utf8StringEncoderPool pool) {
utf8StringEncoderPool = pool;
utf8Encoder = Charset.forName("UTF-8").newEncoder();
utf8EncodingBuffer = ByteBuffer.allocate((int) (SMALL_STRING_SIZE * utf8Encoder.maxBytesPerChar()));
charArray = new char[SMALL_STRING_SIZE];
charBuffer = CharBuffer.wrap(charArray);
}

public Utf8StringEncoder() {
// This instance is not associated with a Utf8StringEncoderPool
this(null);
}

/**
* Encodes the provided String's text to UTF-8. Unlike {@link String#getBytes(Charset)}, this method will not
* silently replace characters that cannot be encoded with a substitute character. Instead, it will throw
* an {@link IllegalArgumentException}.
*
* Some resources in the returned {@link Result} may be reused across calls to this method. Consequently,
* callers should use the Result and discard it immediately.
*
* @param text A Java String to encode as UTF8 bytes.
* @return A {@link Result} containing a byte array of UTF-8 bytes and encoded length.
* @throws IllegalArgumentException if the String cannot be encoded as UTF-8.
*/
public Result encode(String text) {
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The encoding logic in this method was migrated without changes.

/*
This method relies on the standard CharsetEncoder class to encode each String's UTF-16 char[] data into
UTF-8 bytes. Strangely, CharsetEncoders cannot operate directly on instances of a String. The CharsetEncoder
API requires all inputs and outputs to be specified as instances of java.nio.ByteBuffer and
java.nio.CharBuffer, making some number of allocations mandatory. Specifically, for each encoding operation
we need to have:

1. An instance of a UTF-8 CharsetEncoder.
2. A CharBuffer representation of the String's data.
3. A ByteBuffer into which the CharsetEncoder may write UTF-8 bytes.

To minimize the overhead involved, the Utf8StringEncoder will reuse previously initialized resources wherever
possible. However, because CharBuffer and ByteBuffer each have a fixed length, we can only reuse them for
Strings that are small enough to fit. This creates two kinds of input String to encode: those that are small
enough for us to reuse our buffers ("small strings"), and those which are not ("large strings").

The String#getBytes(Charset) method cannot be used for two reasons:

1. It always allocates, so we cannot reuse any resources.
2. If/when it encounters character data that cannot be encoded as UTF-8, it simply replaces that data
with a substitute character[1]. (Sometimes seen in applications as a '?'.) In order
to surface invalid data to the user, the method must be able to detect these events at encoding time.

[1] https://en.wikipedia.org/wiki/Substitute_character
*/

CharBuffer stringData;
ByteBuffer encodingBuffer;

int length = text.length();

// While it is technically possible to encode any String using a fixed-size encodingBuffer, we need
// to be able to write the length of the complete UTF-8 string to the output stream before we write the string
// itself. For simplicity, we reuse or create an encodingBuffer that is large enough to hold the full string.

// In order to encode the input String, we need to pass it to CharsetEncoder as an implementation of CharBuffer.
// Surprisingly, the intuitive way to achieve this (the CharBuffer#wrap(CharSequence) method) adds a large
// amount of CPU overhead to the encoding process. Benchmarking shows that it's substantially faster
// to use String#getChars(int, int, char[], int) to copy the String's backing array and then call
// CharBuffer#wrap(char[]) on the copy.

if (length > SMALL_STRING_SIZE) {
// Allocate a new buffer for large strings
encodingBuffer = ByteBuffer.allocate((int) (text.length() * utf8Encoder.maxBytesPerChar()));
char[] chars = new char[text.length()];
text.getChars(0, text.length(), chars, 0);
stringData = CharBuffer.wrap(chars);
} else {
// Reuse our existing buffers for small strings
encodingBuffer = utf8EncodingBuffer;
encodingBuffer.clear();
stringData = charBuffer;
text.getChars(0, text.length(), charArray, 0);
charBuffer.rewind();
charBuffer.limit(text.length());
}

// Because encodingBuffer is guaranteed to be large enough to hold the encoded string, we can
// perform the encoding in a single call to CharsetEncoder#encode(CharBuffer, ByteBuffer, boolean).
CoderResult coderResult = utf8Encoder.encode(stringData, encodingBuffer, true);

// 'Underflow' is the success state of a CoderResult.
if (!coderResult.isUnderflow()) {
throw new IllegalArgumentException("Could not encode string as UTF8 bytes: " + text);
}
encodingBuffer.flip();
int utf8Length = encodingBuffer.remaining();

// In most usages, the JVM should be able to eliminate this allocation via an escape analysis of the caller.
return new Result(utf8Length, encodingBuffer.array());
}

/**
* Attempts to return this instance to the Utf8StringEncoderPool with which it is associated, if any.
*
* Do not continue to use this encoder after calling this method.
*/
@Override
public void close() {
if (utf8StringEncoderPool != null) {
utf8StringEncoderPool.returnEncoderToPool(this);
}
}

/**
* Represents the result of a {@link Utf8StringEncoder#encode(String)} operation.
*/
public static class Result {
final private byte[] buffer;
final private int encodedLength;

public Result(int encodedLength, byte[] buffer) {
this.encodedLength = encodedLength;
this.buffer = buffer;
}

/**
* Returns a byte array containing the encoded UTF-8 bytes starting at index 0. This byte array is NOT
* guaranteed to be the same length as the data it contains. Callers must use {@link #getEncodedLength()}
* to determine the number of bytes that should be read from the byte array.
*
* @return the buffer containing UTF-8 bytes.
*/
public byte[] getBuffer() {
return buffer;
}

/**
* @return the number of encoded bytes in the array returned by {@link #getBuffer()}.
*/
public int getEncodedLength() {
return encodedLength;
}
}
}
60 changes: 60 additions & 0 deletions src/com/amazon/ion/impl/bin/utf8/Utf8StringEncoderPool.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
package com.amazon.ion.impl.bin.utf8;

import java.util.concurrent.ArrayBlockingQueue;

/**
 * A thread-safe, shared pool from which reusable {@link Utf8StringEncoder} instances can be borrowed and to
 * which they can later be returned.
 */
public enum Utf8StringEncoderPool {
    // Sole constant of this enum; it serves as the shared singleton.
    INSTANCE;

    // Upper bound on the number of idle encoders retained; encoders returned beyond this are dropped.
    private static final int MAX_IDLE_ENCODERS = 128;

    // Idle encoders that are ready to be handed out again.
    private final ArrayBlockingQueue<Utf8StringEncoder> idleEncoders =
        new ArrayBlockingQueue<Utf8StringEncoder>(MAX_IDLE_ENCODERS);

    /**
     * @return the shared {@link Utf8StringEncoderPool} singleton.
     */
    public static Utf8StringEncoderPool getInstance() {
        return INSTANCE;
    }

    /**
     * Hands out an encoder, preferring one that is already sitting idle in the pool. When none is idle, a
     * brand new encoder bound to this pool is created instead.
     *
     * @return An instance of {@link Utf8StringEncoder}.
     */
    public Utf8StringEncoder getOrCreateUtf8Encoder() {
        // `poll` never blocks: an empty queue yields `null` right away.
        final Utf8StringEncoder pooled = idleEncoders.poll();
        return (pooled != null) ? pooled : new Utf8StringEncoder(this);
    }

    /**
     * Puts the given {@link Utf8StringEncoder} back into the pool. When the pool is already at capacity the
     * encoder is silently dropped.
     *
     * Callers MUST NOT use an encoder after returning it to the pool.
     *
     * @param encoder A {@link Utf8StringEncoder} to add to the pool.
     */
    public void returnEncoderToPool(Utf8StringEncoder encoder) {
        // `offer` never blocks: a full queue makes it return `false`, which is deliberately ignored.
        idleEncoders.offer(encoder);
    }

}