From 0e91759914ca2565498d7e9c1fb0066e03561eeb Mon Sep 17 00:00:00 2001 From: Aihua Xu Date: Fri, 6 Dec 2024 17:53:36 -0800 Subject: [PATCH 1/5] Add VariantBuilder --- .../org/apache/iceberg/variants/Variant.java | 45 +- .../iceberg/variants/VariantBuilder.java | 592 ++++++++++++++++++ .../iceberg/variants/VariantConstants.java | 33 + .../variants/VariantSizeLimitException.java | 26 + .../apache/iceberg/variants/VariantUtil.java | 10 +- 5 files changed, 699 insertions(+), 7 deletions(-) create mode 100644 core/src/main/java/org/apache/iceberg/variants/VariantBuilder.java create mode 100644 core/src/main/java/org/apache/iceberg/variants/VariantConstants.java create mode 100644 core/src/main/java/org/apache/iceberg/variants/VariantSizeLimitException.java diff --git a/core/src/main/java/org/apache/iceberg/variants/Variant.java b/core/src/main/java/org/apache/iceberg/variants/Variant.java index b5606fa094b6..02027c2ffd63 100644 --- a/core/src/main/java/org/apache/iceberg/variants/Variant.java +++ b/core/src/main/java/org/apache/iceberg/variants/Variant.java @@ -18,11 +18,44 @@ */ package org.apache.iceberg.variants; -/** A variant metadata and value pair. */ -public interface Variant { - /** Returns the metadata for all values in the variant. */ - VariantMetadata metadata(); +public final class Variant { + private final byte[] value; + private final byte[] metadata; + // The variant value doesn't use the whole `value` binary, but starts from its `pos` index and + // spans a size of `valueSize(value, pos)`. This design avoids frequent copies of the value binary + // when reading a sub-variant in the array/object element. + private final int pos; - /** Returns the variant value. */ - VariantValue value(); + public Variant(byte[] value, byte[] metadata) { + this(value, metadata, 0); + } + + Variant(byte[] value, byte[] metadata, int pos) { + this.value = value; + this.metadata = metadata; + this.pos = pos; + // There is currently only one allowed version. 
+ if (metadata.length < 1 + || (metadata[0] & VariantConstants.VERSION_MASK) != VariantConstants.VERSION) { + throw new IllegalStateException(); + } + // Don't attempt to use a Variant larger than 16 MiB. We'll never produce one, and it risks + // memory instability. + if (metadata.length > VariantConstants.SIZE_LIMIT + || value.length > VariantConstants.SIZE_LIMIT) { + throw new VariantSizeLimitException(); + } + } + + public byte[] getMetadata() { + return metadata; + } + + public byte[] getValue() { + return value; + } + + public int getPos() { + return pos; + } } diff --git a/core/src/main/java/org/apache/iceberg/variants/VariantBuilder.java b/core/src/main/java/org/apache/iceberg/variants/VariantBuilder.java new file mode 100644 index 000000000000..e472420e6382 --- /dev/null +++ b/core/src/main/java/org/apache/iceberg/variants/VariantBuilder.java @@ -0,0 +1,592 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.variants; + +import com.fasterxml.jackson.core.JsonFactory; +import com.fasterxml.jackson.core.JsonParseException; +import com.fasterxml.jackson.core.JsonParser; +import com.fasterxml.jackson.core.JsonToken; +import com.fasterxml.jackson.core.exc.InputCoercionException; +import java.io.IOException; +import java.math.BigDecimal; +import java.math.BigInteger; +import java.nio.charset.StandardCharsets; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; + +public class VariantBuilder { + private static final int MAX_SHORT_STR_SIZE = 0x3F; + + private ByteBufferWrapper buffer = new ByteBufferWrapper(); + + // Store the mapping from a string to a monotonically increasing assigned id + private final Map dictionary = Maps.newHashMap(); + // Store all the strings encoded with UTF8 in `dictionary` in the order of assigned ids. + private final List dictionaryKeys = Lists.newArrayList(); + + /** + * Parses a JSON string and constructs a Variant object. + * + * @param json The JSON string to parse. + * @return The constructed Variant object. + * @throws IOException If an error occurs while reading or parsing the JSON. + */ + public static Variant parseJson(String json) throws IOException { + Preconditions.checkArgument( + json != null && !json.isEmpty(), "Input JSON string cannot be null or empty."); + + try (JsonParser parser = new JsonFactory().createParser(json)) { + parser.nextToken(); + + VariantBuilder builder = new VariantBuilder(); + builder.buildJson(parser); + + return builder.result(); + } + } + + /** + * Builds the variant metadata from `dictionaryKeys` and returns the resulting Variant object. + * + * @return The constructed Variant object. 
+ */ + private Variant result() { + int numKeys = dictionaryKeys.size(); + + // Calculate total size of dictionary strings + long numStringBytes = dictionaryKeys.stream().mapToLong(key -> key.length).sum(); + if (numStringBytes > VariantConstants.SIZE_LIMIT) { + throw new VariantSizeLimitException(); + } + + // Determine the number of bytes required for dictionary size and offset entry + int offsetSize = sizeOf(Math.max((int) numStringBytes, numKeys)); + + // metadata: header byte, dictionary size, offsets and string bytes + long metadataSize = 1 + offsetSize + (numKeys + 1) * offsetSize + numStringBytes; + + // Ensure the metadata size is within limits + if (metadataSize > VariantConstants.SIZE_LIMIT) { + throw new VariantSizeLimitException(); + } + + ByteBufferWrapper metadataBuffer = + new ByteBufferWrapper((int) metadataSize, (int) metadataSize); + + // Write header byte (version + offset size) + metadataBuffer.addByte(VariantUtil.metadataHeader(VariantConstants.VERSION, offsetSize)); + + // Write number of keys + metadataBuffer.writeLittleEndianUnsigned(numKeys, offsetSize); + + // Write offsets + int currentOffset = 0; + for (byte[] key : dictionaryKeys) { + metadataBuffer.writeLittleEndianUnsigned(currentOffset, offsetSize); + currentOffset += key.length; + } + metadataBuffer.writeLittleEndianUnsigned(numStringBytes, offsetSize); + + // Write dictionary strings + dictionaryKeys.stream().forEach(metadataBuffer::addBytes); + + return new Variant(buffer.toByteArray(), metadataBuffer.toByteArray()); + } + + private void buildJson(JsonParser parser) throws IOException { + JsonToken token = parser.currentToken(); + + if (token == null) { + throw new JsonParseException(parser, "Unexpected null token"); + } + + switch (token) { + case START_OBJECT: + appendObject(parser); + break; + case START_ARRAY: + appendArray(parser); + break; + case VALUE_STRING: + appendString(parser.getText()); + break; + case VALUE_NUMBER_INT: + appendInteger(parser); + break; + case 
VALUE_NUMBER_FLOAT: + appendFloat(parser); + break; + case VALUE_TRUE: + appendBoolean(true); + break; + case VALUE_FALSE: + appendBoolean(false); + break; + case VALUE_NULL: + appendNull(); + break; + default: + throw new JsonParseException(parser, "Unexpected token " + token); + } + } + + private void appendObject(JsonParser parser) throws IOException { + List fields = Lists.newArrayList(); + int startPos = buffer.pos; + + // Store object keys to dictionary of metadata + while (parser.nextToken() != JsonToken.END_OBJECT) { + String key = parser.currentName(); + parser.nextToken(); // Move to the value + + int id = + dictionary.computeIfAbsent( + key, + k -> { + int newId = dictionary.size(); + dictionaryKeys.add(k.getBytes(StandardCharsets.UTF_8)); + return newId; + }); + + fields.add(new FieldEntry(key, id, buffer.pos - startPos)); + buildJson(parser); + } + + finishWritingObject(startPos, fields); + } + + private void appendArray(JsonParser parser) throws IOException { + List offsets = Lists.newArrayList(); + int start = buffer.pos; + + parser.nextToken(); + while (parser.nextToken() != JsonToken.END_ARRAY) { + offsets.add(buffer.pos - start); + buildJson(parser); + } + + finishWritingArray(start, offsets); + } + + private void appendInteger(JsonParser parser) throws IOException { + try { + appendNumeric(parser.getLongValue()); + } catch (InputCoercionException ignored) { + appendFloat(parser); // Fallback for large integers + } + } + + private void appendString(String str) { + byte[] text = str.getBytes(StandardCharsets.UTF_8); + boolean longStr = text.length > MAX_SHORT_STR_SIZE; + + // Write header + if (longStr) { + buffer.addByte(VariantUtil.primitiveHeader(Variants.Primitives.TYPE_STRING)); + buffer.writeLittleEndianUnsigned(text.length, 4); + } else { + buffer.addByte(VariantUtil.shortStrHeader(text.length)); + } + + // Write string content + buffer.addBytes(text); + } + + public void appendNull() { + 
buffer.addByte(VariantUtil.primitiveHeader(Variants.Primitives.TYPE_NULL)); + } + + public void appendBoolean(boolean value) { + buffer.addByte( + VariantUtil.primitiveHeader( + value ? Variants.Primitives.TYPE_TRUE : Variants.Primitives.TYPE_FALSE)); + } + + /** + * Appends a numeric value to the variant builder, automatically choosing the smallest type (INT8, + * INT16, INT32, or INT64) to store the value efficiently. + * + * @param value The numeric value to append. + */ + public void appendNumeric(long value) { + if (value == (byte) value) { + // INT8: Requires 1 byte for value + 1 byte for header + buffer.addByte(VariantUtil.primitiveHeader(Variants.Primitives.TYPE_INT8)); + buffer.writeLittleEndianUnsigned(value, 1); + } else if (value == (short) value) { + // INT16: Requires 2 bytes for value + 1 byte for header + buffer.addByte(VariantUtil.primitiveHeader(Variants.Primitives.TYPE_INT16)); + buffer.writeLittleEndianUnsigned(value, 2); + } else if (value == (int) value) { + // INT32: Requires 4 bytes for value + 1 byte for header + buffer.addByte(VariantUtil.primitiveHeader(Variants.Primitives.TYPE_INT32)); + buffer.writeLittleEndianUnsigned(value, 4); + } else { + // INT64: Requires 8 bytes for value + 1 byte for header + buffer.addByte(VariantUtil.primitiveHeader(Variants.Primitives.TYPE_INT64)); + buffer.writeLittleEndianUnsigned(value, 8); + } + } + + public void appendDouble(double value) { + buffer.addByte(VariantUtil.primitiveHeader(Variants.Primitives.TYPE_DOUBLE)); + buffer.writeLittleEndianUnsigned(Double.doubleToLongBits(value), 8); + } + + /** + * Appends a decimal value to the variant builder, choosing the smallest decimal type (DECIMAL4, + * DECIMAL8, DECIMAL16) that fits its precision and scale. 
+ */ + public void appendDecimal(BigDecimal value) { + Preconditions.checkArgument( + value.precision() <= VariantConstants.MAX_DECIMAL16_PRECISION, + "Unsupported Decimal precision: %s", + value.precision()); + + BigInteger unscaled = value.unscaledValue(); + if (value.scale() <= VariantConstants.MAX_DECIMAL4_PRECISION + && value.precision() <= VariantConstants.MAX_DECIMAL4_PRECISION) { + buffer.addByte(VariantUtil.primitiveHeader(Variants.Primitives.TYPE_DECIMAL4)); + buffer.addByte((byte) value.scale()); + buffer.writeLittleEndianUnsigned(unscaled.intValueExact(), 4); + } else if (value.scale() <= VariantConstants.MAX_DECIMAL8_PRECISION + && value.precision() <= VariantConstants.MAX_DECIMAL8_PRECISION) { + buffer.addByte(VariantUtil.primitiveHeader(Variants.Primitives.TYPE_DECIMAL8)); + buffer.addByte((byte) value.scale()); + buffer.writeLittleEndianUnsigned(unscaled.longValueExact(), 8); + } else { + buffer.addByte(VariantUtil.primitiveHeader(Variants.Primitives.TYPE_DECIMAL16)); + buffer.addByte((byte) value.scale()); + byte[] bytes = unscaled.toByteArray(); + // TODO call addBytes + for (int i = 0; i < 16; i++) { + byte byteValue = + i < bytes.length ? bytes[bytes.length - 1 - i] : (byte) (bytes[0] < 0 ? -1 : 0); + buffer.addByte(byteValue); + } + } + } + + public void appendDate(int daysSinceEpoch) { + buffer.addByte(VariantUtil.primitiveHeader(Variants.Primitives.TYPE_DATE)); + buffer.writeLittleEndianUnsigned(daysSinceEpoch, 4); + } + + /** Appends a timestamp with timezone (microseconds since epoch) to the variant builder. */ + public void appendTimestampTz(long microsSinceEpoch) { + buffer.addByte(VariantUtil.primitiveHeader(Variants.Primitives.TYPE_TIMESTAMPTZ)); + buffer.writeLittleEndianUnsigned(microsSinceEpoch, 8); + } + + /** Appends a timestamp without timezone (microseconds since epoch) to the variant builder. 
*/ + public void appendTimestampNtz(long microsSinceEpoch) { + buffer.addByte(VariantUtil.primitiveHeader(Variants.Primitives.TYPE_TIMESTAMPNTZ)); + buffer.writeLittleEndianUnsigned(microsSinceEpoch, 8); + } + + public void appendFloat(float value) throws VariantSizeLimitException { + buffer.addByte(VariantUtil.primitiveHeader(Variants.Primitives.TYPE_FLOAT)); + buffer.writeLittleEndianUnsigned(Float.floatToIntBits(value), 4); + } + + public void appendBinary(byte[] value) throws VariantSizeLimitException { + buffer.addByte(VariantUtil.primitiveHeader(Variants.Primitives.TYPE_BINARY)); + buffer.writeLittleEndianUnsigned(value.length, 4); + buffer.addBytes(value); + } + + /** + * Completes writing an object to the buffer. Object fields are already written, and this method + * inserts header including header byte, number of elements, field IDs, and field offsets. + * + * @param startPos The starting position of the object data in the buffer. + * @param fields The list of field entries (key, ID, offset). + */ + private void finishWritingObject(int startPos, List fields) { + int numElements = fields.size(); + + // Sort fields by key and ensure no duplicate keys + Collections.sort(fields); + int maxId = numElements == 0 ? 0 : fields.get(0).id; + for (int i = 1; i < numElements; i++) { + maxId = Math.max(maxId, fields.get(i).id); + if (fields.get(i).key.equals(fields.get(i - 1).key)) { + throw new IllegalStateException("Duplicate key in Variant: " + fields.get(i).key); + } + } + + int dataSize = buffer.pos - startPos; // Total byte size of the object values + boolean isLarge = numElements > 0xFF; // Determine whether to use large format + int sizeBytes = isLarge ? 
4 : 1; // Number of bytes for the object size + int fieldIdSize = sizeOf(maxId); // Number of bytes for each field id + int fieldOffsetSize = sizeOf(dataSize); // Number of bytes for each field offset + int headerSize = + 1 + sizeBytes + numElements * fieldIdSize + (numElements + 1) * fieldOffsetSize; + + // Shift existing data to make room for header + buffer.shift(startPos, headerSize); + + buffer.insertByte( + VariantUtil.objectHeader(isLarge, fieldIdSize, fieldOffsetSize), + startPos); // Insert header byte + buffer.insertLittleEndianUnsigned( + numElements, sizeBytes, startPos + 1); // Insert number of elements + + // Insert field IDs and offsets + int fieldIdStart = startPos + 1 + sizeBytes; + int fieldOffsetStart = fieldIdStart + numElements * fieldIdSize; + for (int i = 0; i < numElements; i++) { + buffer.insertLittleEndianUnsigned( + fields.get(i).id, fieldIdSize, fieldIdStart + i * fieldIdSize); + buffer.insertLittleEndianUnsigned( + fields.get(i).offset, fieldOffsetSize, fieldOffsetStart + i * fieldOffsetSize); + } + + // Insert the offset to the end of the data + buffer.insertLittleEndianUnsigned( + dataSize, fieldOffsetSize, fieldOffsetStart + numElements * fieldOffsetSize); + } + + /** + * Completes writing an array to the buffer. Array values are already written, and this method + * inserts header including the header byte, number of elements, and field offsets. + * + * @param startPos The starting position of the array values in the buffer. + * @param offsets The offsets for each array value. + */ + private void finishWritingArray(int startPos, List offsets) { + int dataSize = buffer.pos - startPos; // Total byte size of the array values + int numElements = offsets.size(); + + boolean isLarge = numElements > 0xFF; // Determine whether to use large format + int sizeBytes = isLarge ? 
4 : 1; // Number of bytes for the array size + int fieldOffsetSize = sizeOf(dataSize); // Number of bytes of each field offset + int headerSize = 1 + sizeBytes + (numElements + 1) * fieldOffsetSize; // header size + int offsetStart = startPos + 1 + sizeBytes; // Start position for offsets + + // Shift existing data to make room for header + buffer.shift(startPos, headerSize); + + buffer.insertByte( + VariantUtil.arrayHeader(isLarge, fieldOffsetSize), startPos); // Insert header byte + buffer.insertLittleEndianUnsigned( + numElements, sizeBytes, startPos + 1); // Insert number of elements + + // Insert field offsets + for (int i = 0; i < numElements; i++) { + buffer.insertLittleEndianUnsigned( + offsets.get(i), fieldOffsetSize, offsetStart + i * fieldOffsetSize); + } + + // Insert the offset to the end of the data + buffer.insertLittleEndianUnsigned( + dataSize, fieldOffsetSize, offsetStart + numElements * fieldOffsetSize); + } + + /** Choose the smallest number of bytes to store the given value. */ + private static int sizeOf(int maxValue) { + if (maxValue <= 0xFF) { + return 1; + } else if (maxValue <= 0xFFFF) { + return 2; + } else if (maxValue <= 0xFFFFFF) { + return 3; + } + + return 4; + } + + private void appendFloat(JsonParser parser) throws IOException { + if (!tryAppendDecimal(parser.getText())) { + appendDouble(parser.getDoubleValue()); + } + } + + /** + * Attempts to parse a JSON number as a decimal and append it. The input must: - Use only decimal + * format (integer with an optional '.'). - Avoid scientific notation. - Fit within the precision + * and scale limits of decimal types. + */ + private boolean tryAppendDecimal(String input) { + // Validate that the input only contains valid decimal characters. + if (!input.matches("-?\\d+(\\.\\d+)?")) { + return false; + } + + // Parse the input string to BigDecimal. + BigDecimal decimalValue = new BigDecimal(input); + + // Check if the decimal value meets precision and scale limits. 
+ if (decimalValue.scale() <= VariantConstants.MAX_DECIMAL16_PRECISION + && decimalValue.precision() <= VariantConstants.MAX_DECIMAL16_PRECISION) { + appendDecimal(decimalValue); + return true; + } + + return false; + } + + // Temporarily store the information of a field. We need to collect all fields in an JSON object, + // sort them by their keys, and build the variant object in sorted order. + public static final class FieldEntry implements Comparable { + private final String key; + private final int id; + private final int offset; + + public FieldEntry(String key, int id, int offset) { + this.key = key; + this.id = id; + this.offset = offset; + } + + FieldEntry withNewOffset(int newOffset) { + return new FieldEntry(key, id, newOffset); + } + + @Override + public int compareTo(FieldEntry other) { + return key.compareTo(other.key); + } + } + + /** An auto-growing byte buffer that doubles its size whenever the capacity is exceeded. */ + private static class ByteBufferWrapper { + private static final int SIZE_LIMIT = 1 << 24; // 16MB size limit + private static final int INITIAL_CAPACITY = 128; // Starting capacity + private byte[] buffer; + private int pos = 0; + private final int sizeLimit; + + ByteBufferWrapper() { + this(INITIAL_CAPACITY, SIZE_LIMIT); + } + + ByteBufferWrapper(int initialCapacity, int sizeLimit) { + if (initialCapacity <= 0) { + throw new IllegalArgumentException("Initial capacity must be positive"); + } + this.buffer = new byte[initialCapacity]; + this.sizeLimit = sizeLimit; + } + + /** + * Ensures the buffer has enough capacity to hold additional bytes. + * + * @param additional The number of additional bytes required. + * @throws VariantSizeLimitException If the required capacity exceeds the size limit. + */ + private void ensureCapacity(int additional) { + int required = pos + additional; + if (required > buffer.length) { + int newCapacity = Integer.highestOneBit(required); + newCapacity = newCapacity < required ? 
newCapacity * 2 : newCapacity; // Double the capacity + if (newCapacity > this.sizeLimit) { + throw new VariantSizeLimitException(); + } + + byte[] newBuffer = new byte[newCapacity]; + System.arraycopy(buffer, 0, newBuffer, 0, pos); + buffer = newBuffer; + } + } + + /** Adds a byte to the buffer, growing the buffer if necessary. */ + public void addByte(byte value) throws VariantSizeLimitException { + ensureCapacity(1); + buffer[pos++] = value; + } + + /** Adds an array of bytes to the buffer, growing the buffer if necessary. */ + public void addBytes(byte[] values) throws VariantSizeLimitException { + ensureCapacity(values.length); + System.arraycopy(values, 0, buffer, pos, values.length); + pos += values.length; + } + + /** + * Writes a numeric value in little-endian order to the buffer, growing the buffer if necessary. + * + * @param value The numeric value to write. + * @param numBytes The number of bytes to write (e.g., 2 for short, 4 for int, 8 for long). + */ + public void writeLittleEndianUnsigned(long value, int numBytes) { + if (numBytes < 1 || numBytes > 8) { + throw new IllegalArgumentException("numBytes must be between 1 and 8"); + } + ensureCapacity(numBytes); + + for (int i = 0; i < numBytes; ++i) { + buffer[pos + i] = (byte) ((value >>> (8 * i)) & 0xFF); + } + pos += numBytes; + } + + /** + * Move the bytes of buffer range [start, pos) by the provided offset position. This is used for + * writing array/object header. + */ + public void shift(int start, int offset) { + Preconditions.checkArgument(offset > 0, "offset must be positive"); + Preconditions.checkArgument(pos >= start, "start must be no greater than pos"); + ensureCapacity(offset); + + if (pos > start) { + System.arraycopy(buffer, start, buffer, start + offset, pos - start); + } + + pos += offset; + } + + /** + * Insert a byte into the buffer of the provided position. Note: this assumes shift() has been + * called to leave space for insert. 
+ */ + public void insertByte(byte value, int insertPos) { + Preconditions.checkArgument(insertPos < pos, "insertPos must be smaller than pos"); + + buffer[insertPos] = value; + } + + /** + * Insert a number into the buffer of the provided position. Note: this assumes shift() has been + * called to leave space for insert. + */ + public void insertLittleEndianUnsigned(long value, int numBytes, int insertPos) { + Preconditions.checkArgument(insertPos < pos, "insertPos must be smaller than pos"); + if (numBytes < 1 || numBytes > 8) { + throw new IllegalArgumentException("numBytes must be between 1 and 8"); + } + + for (int i = 0; i < numBytes; ++i) { + buffer[insertPos + i] = (byte) ((value >>> (8 * i)) & 0xFF); + } + } + + /** Returns the underlying byte array. */ + public byte[] toByteArray() { + return Arrays.copyOf(buffer, pos); + } + } +} diff --git a/core/src/main/java/org/apache/iceberg/variants/VariantConstants.java b/core/src/main/java/org/apache/iceberg/variants/VariantConstants.java new file mode 100644 index 000000000000..8ea93fdc05c1 --- /dev/null +++ b/core/src/main/java/org/apache/iceberg/variants/VariantConstants.java @@ -0,0 +1,33 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.variants; + +public class VariantConstants { + public static final byte VERSION = 1; // Variant version + public static final int SIZE_LIMIT = 1 << 24; // metadata and value size limits + + // The lower 4 bits of the first metadata byte contain the version. + public static final byte VERSION_MASK = 0x0F; + + public static final int MAX_DECIMAL4_PRECISION = 9; + public static final int MAX_DECIMAL8_PRECISION = 18; + public static final int MAX_DECIMAL16_PRECISION = 38; + + private VariantConstants() {} +} diff --git a/core/src/main/java/org/apache/iceberg/variants/VariantSizeLimitException.java b/core/src/main/java/org/apache/iceberg/variants/VariantSizeLimitException.java new file mode 100644 index 000000000000..3570e3c70666 --- /dev/null +++ b/core/src/main/java/org/apache/iceberg/variants/VariantSizeLimitException.java @@ -0,0 +1,26 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.variants; + +public class VariantSizeLimitException extends RuntimeException { + + public VariantSizeLimitException() { + super("Variant size limit exceeded"); + } +} diff --git a/core/src/main/java/org/apache/iceberg/variants/VariantUtil.java b/core/src/main/java/org/apache/iceberg/variants/VariantUtil.java index d6b78fe899e6..e37d0c2e0afb 100644 --- a/core/src/main/java/org/apache/iceberg/variants/VariantUtil.java +++ b/core/src/main/java/org/apache/iceberg/variants/VariantUtil.java @@ -164,8 +164,16 @@ static int sizeOf(int maxValue) { } } + public static byte metadataHeader(int version, int offsetSize) { + return (byte) (version | ((offsetSize - 1) << 6)); + } + static byte primitiveHeader(int primitiveType) { - return (byte) (primitiveType << Variants.Primitives.PRIMITIVE_TYPE_SHIFT); + return (byte) (primitiveType << Variants.Primitives. PRIMITIVE_TYPE_SHIFT); + } + + public static byte shortStrHeader(int size) { + return (byte) (size << 2 | 0b01); } static byte objectHeader(boolean isLarge, int fieldIdSize, int offsetSize) { From 17661f728341522c56aea6d1a8ec660ac72630d8 Mon Sep 17 00:00:00 2001 From: Aihua Xu Date: Sat, 7 Dec 2024 22:41:35 -0800 Subject: [PATCH 2/5] Add tests --- .../iceberg/variants/SerializedArray.java | 5 + .../iceberg/variants/SerializedPrimitive.java | 4 + .../variants/SerializedShortString.java | 6 +- .../org/apache/iceberg/variants/Variant.java | 36 ++-- .../apache/iceberg/variants/VariantArray.java | 2 + .../iceberg/variants/VariantBuilder.java | 24 ++- .../apache/iceberg/variants/VariantUtil.java | 2 +- .../iceberg/variants/TestVariantBuilder.java | 176 ++++++++++++++++++ 8 files changed, 217 insertions(+), 38 deletions(-) create mode 100644 core/src/test/java/org/apache/iceberg/variants/TestVariantBuilder.java diff --git a/core/src/main/java/org/apache/iceberg/variants/SerializedArray.java b/core/src/main/java/org/apache/iceberg/variants/SerializedArray.java index be6649cb0d20..7976fc797e02 100644 
--- a/core/src/main/java/org/apache/iceberg/variants/SerializedArray.java +++ b/core/src/main/java/org/apache/iceberg/variants/SerializedArray.java @@ -28,6 +28,11 @@ class SerializedArray extends Variants.SerializedValue implements VariantArray { private static final int OFFSET_SIZE_SHIFT = 2; private static final int IS_LARGE = 0b10000; + + static SerializedArray from(Variant variant) { + return from(SerializedMetadata.from(variant.getMetadata()), variant.getValue()); + } + @VisibleForTesting static SerializedArray from(VariantMetadata metadata, byte[] bytes) { return from(metadata, ByteBuffer.wrap(bytes).order(ByteOrder.LITTLE_ENDIAN), bytes[0]); diff --git a/core/src/main/java/org/apache/iceberg/variants/SerializedPrimitive.java b/core/src/main/java/org/apache/iceberg/variants/SerializedPrimitive.java index 1a6bd37a4ff3..eee62bcd37a5 100644 --- a/core/src/main/java/org/apache/iceberg/variants/SerializedPrimitive.java +++ b/core/src/main/java/org/apache/iceberg/variants/SerializedPrimitive.java @@ -28,6 +28,10 @@ class SerializedPrimitive extends Variants.SerializedValue implements VariantPri private static final int PRIMITIVE_TYPE_SHIFT = 2; private static final int PRIMITIVE_OFFSET = Variants.HEADER_SIZE; + static SerializedPrimitive from(Variant variant) { + return from(variant.getValue()); + } + static SerializedPrimitive from(byte[] bytes) { return from(ByteBuffer.wrap(bytes).order(ByteOrder.LITTLE_ENDIAN), bytes[0]); } diff --git a/core/src/main/java/org/apache/iceberg/variants/SerializedShortString.java b/core/src/main/java/org/apache/iceberg/variants/SerializedShortString.java index 3004a075def1..8d66ac2093e3 100644 --- a/core/src/main/java/org/apache/iceberg/variants/SerializedShortString.java +++ b/core/src/main/java/org/apache/iceberg/variants/SerializedShortString.java @@ -26,7 +26,11 @@ class SerializedShortString extends Variants.SerializedValue implements VariantP private static final int LENGTH_MASK = 0b11111100; private static final int 
LENGTH_SHIFT = 2; - static SerializedShortString from(byte[] bytes) { + static SerializedShortString from(Variant variant) { + return from(variant.getValue()); + } + + static SerializedShortString from(byte[] bytes) { return from(ByteBuffer.wrap(bytes).order(ByteOrder.LITTLE_ENDIAN), bytes[0]); } diff --git a/core/src/main/java/org/apache/iceberg/variants/Variant.java b/core/src/main/java/org/apache/iceberg/variants/Variant.java index 02027c2ffd63..0a01f02d1541 100644 --- a/core/src/main/java/org/apache/iceberg/variants/Variant.java +++ b/core/src/main/java/org/apache/iceberg/variants/Variant.java @@ -18,33 +18,27 @@ */ package org.apache.iceberg.variants; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; + public final class Variant { private final byte[] value; private final byte[] metadata; - // The variant value doesn't use the whole `value` binary, but starts from its `pos` index and - // spans a size of `valueSize(value, pos)`. This design avoids frequent copies of the value binary - // when reading a sub-variant in the array/object element. - private final int pos; public Variant(byte[] value, byte[] metadata) { - this(value, metadata, 0); - } + Preconditions.checkArgument(metadata != null && metadata.length >= 1, + "Metadata must not be null or empty."); + Preconditions.checkArgument(value != null && value.length >= 1, + "Value must not be null or empty."); - Variant(byte[] value, byte[] metadata, int pos) { - this.value = value; - this.metadata = metadata; - this.pos = pos; - // There is currently only one allowed version. - if (metadata.length < 1 - || (metadata[0] & VariantConstants.VERSION_MASK) != VariantConstants.VERSION) { - throw new IllegalStateException(); - } - // Don't attempt to use a Variant larger than 16 MiB. We'll never produce one, and it risks - // memory instability. 
- if (metadata.length > VariantConstants.SIZE_LIMIT - || value.length > VariantConstants.SIZE_LIMIT) { + Preconditions.checkArgument((metadata[0] & VariantConstants.VERSION_MASK) == VariantConstants.VERSION, + "Unsupported metadata version."); + + if (value.length > VariantConstants.SIZE_LIMIT || metadata.length > VariantConstants.SIZE_LIMIT) { throw new VariantSizeLimitException(); } + + this.value = value; + this.metadata = metadata; } public byte[] getMetadata() { @@ -54,8 +48,4 @@ public byte[] getMetadata() { public byte[] getValue() { return value; } - - public int getPos() { - return pos; - } } diff --git a/core/src/main/java/org/apache/iceberg/variants/VariantArray.java b/core/src/main/java/org/apache/iceberg/variants/VariantArray.java index dd1aa5cf4f10..767b0d847409 100644 --- a/core/src/main/java/org/apache/iceberg/variants/VariantArray.java +++ b/core/src/main/java/org/apache/iceberg/variants/VariantArray.java @@ -20,6 +20,8 @@ /** An variant array value. */ public interface VariantArray extends VariantValue { + int numElements(); + /** Returns the {@link VariantValue} at {@code index} in this array. 
*/ VariantValue get(int index); diff --git a/core/src/main/java/org/apache/iceberg/variants/VariantBuilder.java b/core/src/main/java/org/apache/iceberg/variants/VariantBuilder.java index e472420e6382..bb47acd9621e 100644 --- a/core/src/main/java/org/apache/iceberg/variants/VariantBuilder.java +++ b/core/src/main/java/org/apache/iceberg/variants/VariantBuilder.java @@ -29,7 +29,6 @@ import java.nio.charset.StandardCharsets; import java.util.Arrays; import java.util.Collections; -import java.util.HashMap; import java.util.List; import java.util.Map; import org.apache.iceberg.relocated.com.google.common.base.Preconditions; @@ -63,7 +62,7 @@ public static Variant parseJson(String json) throws IOException { VariantBuilder builder = new VariantBuilder(); builder.buildJson(parser); - return builder.result(); + return builder.build(); } } @@ -72,7 +71,7 @@ public static Variant parseJson(String json) throws IOException { * * @return The constructed Variant object. */ - private Variant result() { + public Variant build() { int numKeys = dictionaryKeys.size(); // Calculate total size of dictionary strings @@ -110,7 +109,7 @@ private Variant result() { metadataBuffer.writeLittleEndianUnsigned(numStringBytes, offsetSize); // Write dictionary strings - dictionaryKeys.stream().forEach(metadataBuffer::addBytes); + dictionaryKeys.forEach(metadataBuffer::addBytes); return new Variant(buffer.toByteArray(), metadataBuffer.toByteArray()); } @@ -174,20 +173,19 @@ private void appendObject(JsonParser parser) throws IOException { buildJson(parser); } - finishWritingObject(startPos, fields); + endObject(startPos, fields); } private void appendArray(JsonParser parser) throws IOException { List offsets = Lists.newArrayList(); int start = buffer.pos; - parser.nextToken(); while (parser.nextToken() != JsonToken.END_ARRAY) { offsets.add(buffer.pos - start); buildJson(parser); } - finishWritingArray(start, offsets); + endArray(start, offsets); } private void appendInteger(JsonParser parser) 
throws IOException { @@ -232,19 +230,19 @@ public void appendBoolean(boolean value) { */ public void appendNumeric(long value) { if (value == (byte) value) { - // INT8: Requires 1 byte for value + 1 byte for header + // INT8: Requires 1 byte for header + 1 byte for value buffer.addByte(VariantUtil.primitiveHeader(Variants.Primitives.TYPE_INT8)); buffer.writeLittleEndianUnsigned(value, 1); } else if (value == (short) value) { - // INT16: Requires 2 bytes for value + 1 byte for header + // INT16: Requires 1 byte for header + 2 bytes for value buffer.addByte(VariantUtil.primitiveHeader(Variants.Primitives.TYPE_INT16)); buffer.writeLittleEndianUnsigned(value, 2); } else if (value == (int) value) { - // INT32: Requires 4 bytes for value + 1 byte for header + // INT32: Requires 1 byte for header + 4 bytes for value buffer.addByte(VariantUtil.primitiveHeader(Variants.Primitives.TYPE_INT32)); buffer.writeLittleEndianUnsigned(value, 4); } else { - // INT64: Requires 8 bytes for value + 1 byte for header + // INT64: Requires 1 byte for header + 8 bytes for value buffer.addByte(VariantUtil.primitiveHeader(Variants.Primitives.TYPE_INT64)); buffer.writeLittleEndianUnsigned(value, 8); } @@ -324,7 +322,7 @@ public void appendBinary(byte[] value) throws VariantSizeLimitException { * @param startPos The starting position of the object data in the buffer. * @param fields The list of field entries (key, ID, offset). */ - private void finishWritingObject(int startPos, List fields) { + private void endObject(int startPos, List fields) { int numElements = fields.size(); // Sort fields by key and ensure no duplicate keys @@ -376,7 +374,7 @@ private void finishWritingObject(int startPos, List fields) { * @param startPos The starting position of the array values in the buffer. * @param offsets The offsets for each array value. 
*/ - private void finishWritingArray(int startPos, List offsets) { + private void endArray(int startPos, List offsets) { int dataSize = buffer.pos - startPos; // Total byte size of the array values int numElements = offsets.size(); diff --git a/core/src/main/java/org/apache/iceberg/variants/VariantUtil.java b/core/src/main/java/org/apache/iceberg/variants/VariantUtil.java index e37d0c2e0afb..85cde9d11ae9 100644 --- a/core/src/main/java/org/apache/iceberg/variants/VariantUtil.java +++ b/core/src/main/java/org/apache/iceberg/variants/VariantUtil.java @@ -169,7 +169,7 @@ public static byte metadataHeader(int version, int offsetSize) { } static byte primitiveHeader(int primitiveType) { - return (byte) (primitiveType << Variants.Primitives. PRIMITIVE_TYPE_SHIFT); + return (byte) (primitiveType << Variants.Primitives.PRIMITIVE_TYPE_SHIFT); } public static byte shortStrHeader(int size) { diff --git a/core/src/test/java/org/apache/iceberg/variants/TestVariantBuilder.java b/core/src/test/java/org/apache/iceberg/variants/TestVariantBuilder.java new file mode 100644 index 000000000000..d77272f0b35d --- /dev/null +++ b/core/src/test/java/org/apache/iceberg/variants/TestVariantBuilder.java @@ -0,0 +1,176 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.variants; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.IOException; +import java.math.BigDecimal; +import java.nio.ByteBuffer; +import java.util.List; +import java.util.stream.Stream; +import net.minidev.json.JSONArray; +import org.apache.iceberg.util.DateTimeUtil; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; + +public class TestVariantBuilder { + @ParameterizedTest + @MethodSource("primitiveInputs") + public void testPrimitive(String input, Variants.PhysicalType expectedType, Object expectedValue) throws IOException { + Variant variant = VariantBuilder.parseJson(input); + + SerializedPrimitive primitive = SerializedPrimitive.from(variant); + + assertThat(primitive.type()).isEqualTo(expectedType); + assertThat(primitive.get()).isEqualTo(expectedValue); + } + + private static Stream primitiveInputs() { + return Stream.of( + Arguments.of("null", Variants.PhysicalType.NULL, null), + Arguments.of("true", Variants.PhysicalType.BOOLEAN_TRUE, true), + Arguments.of("false", Variants.PhysicalType.BOOLEAN_FALSE, false), + Arguments.of("34", Variants.PhysicalType.INT8, (byte)34), + Arguments.of("1234", Variants.PhysicalType.INT16, (short)1234), + Arguments.of("1234567890", Variants.PhysicalType.INT32, 1234567890), + Arguments.of("1234567890987654321", Variants.PhysicalType.INT64, 1234567890987654321L), + Arguments.of("1234e-2", Variants.PhysicalType.DOUBLE, 12.34), + Arguments.of("123456.789", Variants.PhysicalType.DECIMAL4, new BigDecimal("123456.789")), + Arguments.of("123456789.987654321", Variants.PhysicalType.DECIMAL8, new BigDecimal("123456789.987654321")), + Arguments.of("12345678901234567890.987654321", Variants.PhysicalType.DECIMAL16, new 
BigDecimal("12345678901234567890.987654321")), + Arguments.of("\"This test string is used to generate a primitive string type of variant\"", Variants.PhysicalType.STRING, "This test string is used to generate a primitive string type of variant") + + ); + } + + @Test + public void testPrimitiveFloat() { + VariantBuilder builder = new VariantBuilder(); + builder.appendFloat(12.34f); + Variant variant = builder.build(); + SerializedPrimitive primitive = SerializedPrimitive.from(variant); + + assertThat(primitive.type()).isEqualTo(Variants.PhysicalType.FLOAT); + assertThat(primitive.get()).isEqualTo(12.34f); + } + + @Test + public void testPrimitiveDate() { + String dateString = "2017-08-18"; + int daysSinceEpoch = DateTimeUtil.isoDateToDays(dateString); + + VariantBuilder builder = new VariantBuilder(); + builder.appendDate(daysSinceEpoch); + Variant variant = builder.build(); + SerializedPrimitive primitive = SerializedPrimitive.from(variant); + + assertThat(primitive.type()).isEqualTo(Variants.PhysicalType.DATE); + assertThat(DateTimeUtil.daysToIsoDate((int)primitive.get())).isEqualTo(dateString); + } + + @Test + public void testPrimitiveTimestampTZ() { + String tzString = "2017-08-18T14:21:01.919+00:00"; + long microsSinceEpoch = DateTimeUtil.isoTimestamptzToMicros(tzString); + + VariantBuilder builder = new VariantBuilder(); + builder.appendTimestampTz(microsSinceEpoch); + Variant variant = builder.build(); + SerializedPrimitive primitive = SerializedPrimitive.from(variant); + + assertThat(primitive.type()).isEqualTo(Variants.PhysicalType.TIMESTAMPTZ); + assertThat(DateTimeUtil.microsToIsoTimestamptz((long)primitive.get())).isEqualTo(tzString); + } + + @Test + public void testPrimitiveTimestampNTZ() { + String ntzString = "2017-08-18T14:21:01.919"; + long microsSinceEpoch = DateTimeUtil.isoTimestampToMicros(ntzString); + + VariantBuilder builder = new VariantBuilder(); + builder.appendTimestampNtz(microsSinceEpoch); + Variant variant = builder.build(); + 
SerializedPrimitive primitive = SerializedPrimitive.from(variant); + + assertThat(primitive.type()).isEqualTo(Variants.PhysicalType.TIMESTAMPNTZ); + assertThat(DateTimeUtil.microsToIsoTimestamp((long)primitive.get())).isEqualTo(ntzString); + } + + @Test + public void testPrimitiveBinary() { + VariantBuilder builder = new VariantBuilder(); + builder.appendBinary("iceberg".getBytes()); + Variant variant = builder.build(); + SerializedPrimitive primitive = SerializedPrimitive.from(variant); + + assertThat(primitive.type()).isEqualTo(Variants.PhysicalType.BINARY); + assertThat(primitive.get()).isEqualTo(ByteBuffer.wrap("iceberg".getBytes())); + } + + @Test + public void testShortString() throws IOException { + Variant variant = VariantBuilder.parseJson("\"iceberg\""); + SerializedShortString shortString = SerializedShortString.from(variant); + + assertThat(shortString.type()).isEqualTo(Variants.PhysicalType.STRING); + assertThat(shortString.get()).isEqualTo("iceberg"); + } + + @Test + public void testArray() throws IOException { + List input = List.of("Ford", "BMW", "Fiat"); + Variant variant = VariantBuilder.parseJson(JSONArray.toJSONString(input)); + SerializedArray arr = SerializedArray.from(variant); + + assertThat(arr.type()).isEqualTo(Variants.PhysicalType.ARRAY); + for (int i = 0; i < arr.numElements(); i++) { + assertThat(arr.get(i).asPrimitive().get()).isEqualTo(input.get(i)); + } + } + + @Test + public void testEmptyObject() throws IOException { + Variant variant = VariantBuilder.parseJson("{}"); + SerializedObject object = SerializedObject.from(variant); + + assertThat(object.type()).isEqualTo(Variants.PhysicalType.OBJECT); + assertThat(object.numElements()).isEqualTo(0); + } + + @Test + public void testObject() throws IOException { + Variant variant = VariantBuilder.parseJson("{ \"id\": 1234, \"firstName\": \"Joe\", \"lastName\": \"Smith\", \"phones\":[\"123-456-7890\", \"789-123-4560\"] }"); + SerializedObject object = SerializedObject.from(variant); + + 
assertThat(object.type()).isEqualTo(Variants.PhysicalType.OBJECT); + assertThat(object.numElements()).isEqualTo(4); + + assertThat(object.get("id").asPrimitive().get()).isEqualTo((short)1234); + assertThat(object.get("firstName").asPrimitive().get()).isEqualTo("Joe"); + assertThat(object.get("lastName").asPrimitive().get()).isEqualTo("Smith"); + + VariantArray phones = object.get("phones").asArray(); + assertThat(phones.numElements()).isEqualTo(2); + assertThat(phones.get(0).asPrimitive().get()).isEqualTo("123-456-7890"); + assertThat(phones.get(1).asPrimitive().get()).isEqualTo("789-123-4560"); + } +} From 5ba965ae09c866455f05abe8eaa380a205471f00 Mon Sep 17 00:00:00 2001 From: Aihua Xu Date: Fri, 20 Dec 2024 11:46:21 -0800 Subject: [PATCH 3/5] Add array/object build interface --- .../iceberg/variants/SerializedArray.java | 5 - .../iceberg/variants/SerializedPrimitive.java | 4 - .../variants/SerializedShortString.java | 6 +- .../org/apache/iceberg/variants/Variant.java | 35 +- .../apache/iceberg/variants/VariantArray.java | 4 +- .../iceberg/variants/VariantArrayBuilder.java | 120 +++++ .../iceberg/variants/VariantBuilder.java | 508 ++---------------- .../iceberg/variants/VariantBuilderBase.java | 504 +++++++++++++++++ .../apache/iceberg/variants/VariantImpl.java | 73 +++ .../iceberg/variants/VariantObject.java | 4 + .../variants/VariantObjectBuilder.java | 121 +++++ .../variants/VariantPrimitiveBuilder.java | 86 +++ .../iceberg/variants/TestVariantBuilder.java | 176 ------ .../variants/TestVariantBuilderArray.java | 183 +++++++ .../variants/TestVariantBuilderObject.java | 117 ++++ .../variants/TestVariantBuilderPrimitive.java | 246 +++++++++ 16 files changed, 1512 insertions(+), 680 deletions(-) create mode 100644 core/src/main/java/org/apache/iceberg/variants/VariantArrayBuilder.java create mode 100644 core/src/main/java/org/apache/iceberg/variants/VariantBuilderBase.java create mode 100644 core/src/main/java/org/apache/iceberg/variants/VariantImpl.java create 
mode 100644 core/src/main/java/org/apache/iceberg/variants/VariantObjectBuilder.java create mode 100644 core/src/main/java/org/apache/iceberg/variants/VariantPrimitiveBuilder.java delete mode 100644 core/src/test/java/org/apache/iceberg/variants/TestVariantBuilder.java create mode 100644 core/src/test/java/org/apache/iceberg/variants/TestVariantBuilderArray.java create mode 100644 core/src/test/java/org/apache/iceberg/variants/TestVariantBuilderObject.java create mode 100644 core/src/test/java/org/apache/iceberg/variants/TestVariantBuilderPrimitive.java diff --git a/core/src/main/java/org/apache/iceberg/variants/SerializedArray.java b/core/src/main/java/org/apache/iceberg/variants/SerializedArray.java index 7976fc797e02..be6649cb0d20 100644 --- a/core/src/main/java/org/apache/iceberg/variants/SerializedArray.java +++ b/core/src/main/java/org/apache/iceberg/variants/SerializedArray.java @@ -28,11 +28,6 @@ class SerializedArray extends Variants.SerializedValue implements VariantArray { private static final int OFFSET_SIZE_SHIFT = 2; private static final int IS_LARGE = 0b10000; - - static SerializedArray from(Variant variant) { - return from(SerializedMetadata.from(variant.getMetadata()), variant.getValue()); - } - @VisibleForTesting static SerializedArray from(VariantMetadata metadata, byte[] bytes) { return from(metadata, ByteBuffer.wrap(bytes).order(ByteOrder.LITTLE_ENDIAN), bytes[0]); diff --git a/core/src/main/java/org/apache/iceberg/variants/SerializedPrimitive.java b/core/src/main/java/org/apache/iceberg/variants/SerializedPrimitive.java index eee62bcd37a5..1a6bd37a4ff3 100644 --- a/core/src/main/java/org/apache/iceberg/variants/SerializedPrimitive.java +++ b/core/src/main/java/org/apache/iceberg/variants/SerializedPrimitive.java @@ -28,10 +28,6 @@ class SerializedPrimitive extends Variants.SerializedValue implements VariantPri private static final int PRIMITIVE_TYPE_SHIFT = 2; private static final int PRIMITIVE_OFFSET = Variants.HEADER_SIZE; - static 
SerializedPrimitive from(Variant variant) { - return from(variant.getValue()); - } - static SerializedPrimitive from(byte[] bytes) { return from(ByteBuffer.wrap(bytes).order(ByteOrder.LITTLE_ENDIAN), bytes[0]); } diff --git a/core/src/main/java/org/apache/iceberg/variants/SerializedShortString.java b/core/src/main/java/org/apache/iceberg/variants/SerializedShortString.java index 8d66ac2093e3..3004a075def1 100644 --- a/core/src/main/java/org/apache/iceberg/variants/SerializedShortString.java +++ b/core/src/main/java/org/apache/iceberg/variants/SerializedShortString.java @@ -26,11 +26,7 @@ class SerializedShortString extends Variants.SerializedValue implements VariantP private static final int LENGTH_MASK = 0b11111100; private static final int LENGTH_SHIFT = 2; - static SerializedShortString from(Variant variant) { - return from(variant.getValue()); - } - - static SerializedShortString from(byte[] bytes) { + static SerializedShortString from(byte[] bytes) { return from(ByteBuffer.wrap(bytes).order(ByteOrder.LITTLE_ENDIAN), bytes[0]); } diff --git a/core/src/main/java/org/apache/iceberg/variants/Variant.java b/core/src/main/java/org/apache/iceberg/variants/Variant.java index 0a01f02d1541..b5606fa094b6 100644 --- a/core/src/main/java/org/apache/iceberg/variants/Variant.java +++ b/core/src/main/java/org/apache/iceberg/variants/Variant.java @@ -18,34 +18,11 @@ */ package org.apache.iceberg.variants; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +/** A variant metadata and value pair. */ +public interface Variant { + /** Returns the metadata for all values in the variant. 
*/ + VariantMetadata metadata(); -public final class Variant { - private final byte[] value; - private final byte[] metadata; - - public Variant(byte[] value, byte[] metadata) { - Preconditions.checkArgument(metadata != null && metadata.length >= 1, - "Metadata must not be null or empty."); - Preconditions.checkArgument(value != null && value.length >= 1, - "Value must not be null or empty."); - - Preconditions.checkArgument((metadata[0] & VariantConstants.VERSION_MASK) == VariantConstants.VERSION, - "Unsupported metadata version."); - - if (value.length > VariantConstants.SIZE_LIMIT || metadata.length > VariantConstants.SIZE_LIMIT) { - throw new VariantSizeLimitException(); - } - - this.value = value; - this.metadata = metadata; - } - - public byte[] getMetadata() { - return metadata; - } - - public byte[] getValue() { - return value; - } + /** Returns the variant value. */ + VariantValue value(); } diff --git a/core/src/main/java/org/apache/iceberg/variants/VariantArray.java b/core/src/main/java/org/apache/iceberg/variants/VariantArray.java index 767b0d847409..5de38df6e417 100644 --- a/core/src/main/java/org/apache/iceberg/variants/VariantArray.java +++ b/core/src/main/java/org/apache/iceberg/variants/VariantArray.java @@ -20,7 +20,9 @@ /** An variant array value. */ public interface VariantArray extends VariantValue { - int numElements(); + default int numElements() { + throw new UnsupportedOperationException(); + } /** Returns the {@link VariantValue} at {@code index} in this array. */ VariantValue get(int index); diff --git a/core/src/main/java/org/apache/iceberg/variants/VariantArrayBuilder.java b/core/src/main/java/org/apache/iceberg/variants/VariantArrayBuilder.java new file mode 100644 index 000000000000..e07856444e4d --- /dev/null +++ b/core/src/main/java/org/apache/iceberg/variants/VariantArrayBuilder.java @@ -0,0 +1,120 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.variants; + +import java.math.BigDecimal; +import java.time.LocalDate; +import java.time.LocalDateTime; +import java.time.OffsetDateTime; +import java.util.List; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.util.DateTimeUtil; + +public class VariantArrayBuilder extends VariantBuilderBase { + private final List offsets; + + public VariantArrayBuilder(ByteBufferWrapper buffer, Dictionary dict) { + super(buffer, dict); + offsets = Lists.newArrayList(); + } + + public VariantObjectBuilder startObject() { + addOffset(); + return new VariantObjectBuilder(getBuffer(), getDict()); + } + + public VariantArrayBuilder startArray() { + addOffset(); + return new VariantArrayBuilder(getBuffer(), getDict()); + } + + public VariantArrayBuilder writeNull() { + addOffset(); + writeNullInternal(); + return this; + } + + public VariantArrayBuilder writeBoolean(boolean value) { + addOffset(); + writeBooleanInternal(value); + return this; + } + + public VariantArrayBuilder writeNumeric(long value) { + addOffset(); + writeNumericInternal(value); + return this; + } + + public VariantArrayBuilder writeDouble(double value) { + addOffset(); + writeDoubleInternal(value); + return this; + } + + public 
VariantArrayBuilder writeDecimal(BigDecimal value) { + addOffset(); + writeDecimalInternal(value); + return this; + } + + public VariantArrayBuilder writeDate(LocalDate value) { + addOffset(); + writeDateInternal(DateTimeUtil.daysFromDate(value)); + return this; + } + + public VariantArrayBuilder writeTimestampTz(OffsetDateTime value) { + addOffset(); + writeTimestampTzInternal(DateTimeUtil.microsFromTimestamptz(value)); + return this; + } + + public VariantArrayBuilder writeTimestampNtz(LocalDateTime value) { + addOffset(); + writeTimestampNtzInternal(DateTimeUtil.microsFromTimestamp(value)); + return this; + } + + public VariantArrayBuilder writeFloat(float value) { + addOffset(); + writeFloatInternal(value); + return this; + } + + public VariantArrayBuilder writeBinary(byte[] value) { + addOffset(); + writeBinaryInternal(value); + return this; + } + + public VariantArrayBuilder writeString(String str) { + addOffset(); + writeStringInternal(str); + return this; + } + + private void addOffset() { + offsets.add(getBuffer().getPos() - getStartPos()); + } + + public void endArray() { + super.endArray(getStartPos(), offsets); + } +} diff --git a/core/src/main/java/org/apache/iceberg/variants/VariantBuilder.java b/core/src/main/java/org/apache/iceberg/variants/VariantBuilder.java index bb47acd9621e..598981714ba5 100644 --- a/core/src/main/java/org/apache/iceberg/variants/VariantBuilder.java +++ b/core/src/main/java/org/apache/iceberg/variants/VariantBuilder.java @@ -25,25 +25,28 @@ import com.fasterxml.jackson.core.exc.InputCoercionException; import java.io.IOException; import java.math.BigDecimal; -import java.math.BigInteger; -import java.nio.charset.StandardCharsets; -import java.util.Arrays; -import java.util.Collections; import java.util.List; -import java.util.Map; import org.apache.iceberg.relocated.com.google.common.base.Preconditions; import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import 
org.apache.iceberg.relocated.com.google.common.collect.Maps; -public class VariantBuilder { - private static final int MAX_SHORT_STR_SIZE = 0x3F; +/** A builder class to build a primitive/array/object variant. */ +public class VariantBuilder extends VariantBuilderBase { + public VariantBuilder() { + super(new VariantBuilderBase.ByteBufferWrapper(), new VariantBuilderBase.Dictionary()); + } + + public VariantPrimitiveBuilder createPrimitive() { + VariantPrimitiveBuilder primitiveBuilder = new VariantPrimitiveBuilder(getBuffer(), getDict()); + return primitiveBuilder; + } - private ByteBufferWrapper buffer = new ByteBufferWrapper(); + public VariantObjectBuilder startObject() { + return new VariantObjectBuilder(getBuffer(), getDict()); + } - // Store the mapping from a string to a monotonically increasing assigned id - private final Map dictionary = Maps.newHashMap(); - // Store all the strings encoded with UTF8 in `dictionary` in the order of assigned ids. - private final List dictionaryKeys = Lists.newArrayList(); + public VariantArrayBuilder startArray() { + return new VariantArrayBuilder(getBuffer(), getDict()); + } /** * Parses a JSON string and constructs a Variant object. @@ -66,54 +69,6 @@ public static Variant parseJson(String json) throws IOException { } } - /** - * Builds the variant metadata from `dictionaryKeys` and returns the resulting Variant object. - * - * @return The constructed Variant object. 
- */ - public Variant build() { - int numKeys = dictionaryKeys.size(); - - // Calculate total size of dictionary strings - long numStringBytes = dictionaryKeys.stream().mapToLong(key -> key.length).sum(); - if (numStringBytes > VariantConstants.SIZE_LIMIT) { - throw new VariantSizeLimitException(); - } - - // Determine the number of bytes required for dictionary size and offset entry - int offsetSize = sizeOf(Math.max((int) numStringBytes, numKeys)); - - // metadata: header byte, dictionary size, offsets and string bytes - long metadataSize = 1 + offsetSize + (numKeys + 1) * offsetSize + numStringBytes; - - // Ensure the metadata size is within limits - if (metadataSize > VariantConstants.SIZE_LIMIT) { - throw new VariantSizeLimitException(); - } - - ByteBufferWrapper metadataBuffer = - new ByteBufferWrapper((int) metadataSize, (int) metadataSize); - - // Write header byte (version + offset size) - metadataBuffer.addByte(VariantUtil.metadataHeader(VariantConstants.VERSION, offsetSize)); - - // Write number of keys - metadataBuffer.writeLittleEndianUnsigned(numKeys, offsetSize); - - // Write offsets - int currentOffset = 0; - for (byte[] key : dictionaryKeys) { - metadataBuffer.writeLittleEndianUnsigned(currentOffset, offsetSize); - currentOffset += key.length; - } - metadataBuffer.writeLittleEndianUnsigned(numStringBytes, offsetSize); - - // Write dictionary strings - dictionaryKeys.forEach(metadataBuffer::addBytes); - - return new Variant(buffer.toByteArray(), metadataBuffer.toByteArray()); - } - private void buildJson(JsonParser parser) throws IOException { JsonToken token = parser.currentToken(); @@ -123,312 +78,87 @@ private void buildJson(JsonParser parser) throws IOException { switch (token) { case START_OBJECT: - appendObject(parser); + writeObject(parser); break; case START_ARRAY: - appendArray(parser); + writeArray(parser); break; case VALUE_STRING: - appendString(parser.getText()); + writeStringInternal(parser.getText()); break; case VALUE_NUMBER_INT: - 
appendInteger(parser); + writeInteger(parser); break; case VALUE_NUMBER_FLOAT: - appendFloat(parser); + writeFloat(parser); break; case VALUE_TRUE: - appendBoolean(true); + writeBooleanInternal(true); break; case VALUE_FALSE: - appendBoolean(false); + writeBooleanInternal(false); break; case VALUE_NULL: - appendNull(); + writeNullInternal(); break; default: throw new JsonParseException(parser, "Unexpected token " + token); } } - private void appendObject(JsonParser parser) throws IOException { - List fields = Lists.newArrayList(); - int startPos = buffer.pos; + private void writeObject(JsonParser parser) throws IOException { + List fields = Lists.newArrayList(); + int startPos = getBuffer().getPos(); // Store object keys to dictionary of metadata while (parser.nextToken() != JsonToken.END_OBJECT) { String key = parser.currentName(); parser.nextToken(); // Move to the value - int id = - dictionary.computeIfAbsent( - key, - k -> { - int newId = dictionary.size(); - dictionaryKeys.add(k.getBytes(StandardCharsets.UTF_8)); - return newId; - }); - - fields.add(new FieldEntry(key, id, buffer.pos - startPos)); + int id = getDict().add(key); + fields.add(new VariantBuilderBase.FieldEntry(key, id, getBuffer().getPos() - startPos)); buildJson(parser); } endObject(startPos, fields); } - private void appendArray(JsonParser parser) throws IOException { + private void writeArray(JsonParser parser) throws IOException { List offsets = Lists.newArrayList(); - int start = buffer.pos; + int startPos = getBuffer().getPos(); while (parser.nextToken() != JsonToken.END_ARRAY) { - offsets.add(buffer.pos - start); + offsets.add(getBuffer().getPos() - startPos); buildJson(parser); } - endArray(start, offsets); + endArray(startPos, offsets); } - private void appendInteger(JsonParser parser) throws IOException { + private void writeInteger(JsonParser parser) throws IOException { try { - appendNumeric(parser.getLongValue()); + writeNumericInternal(parser.getLongValue()); } catch 
(InputCoercionException ignored) { - appendFloat(parser); // Fallback for large integers + writeFloat(parser); // Fallback for large integers } } - private void appendString(String str) { - byte[] text = str.getBytes(StandardCharsets.UTF_8); - boolean longStr = text.length > MAX_SHORT_STR_SIZE; - - // Write header - if (longStr) { - buffer.addByte(VariantUtil.primitiveHeader(Variants.Primitives.TYPE_STRING)); - buffer.writeLittleEndianUnsigned(text.length, 4); - } else { - buffer.addByte(VariantUtil.shortStrHeader(text.length)); - } - - // Write string content - buffer.addBytes(text); - } - - public void appendNull() { - buffer.addByte(VariantUtil.primitiveHeader(Variants.Primitives.TYPE_NULL)); - } - - public void appendBoolean(boolean value) { - buffer.addByte( - VariantUtil.primitiveHeader( - value ? Variants.Primitives.TYPE_TRUE : Variants.Primitives.TYPE_FALSE)); - } - - /** - * Appends a numeric value to the variant builder, automatically choosing the smallest type (INT8, - * INT16, INT32, or INT64) to store the value efficiently. - * - * @param value The numeric value to append. 
- */ - public void appendNumeric(long value) { - if (value == (byte) value) { - // INT8: Requires 1 byte for header + 1 byte for value - buffer.addByte(VariantUtil.primitiveHeader(Variants.Primitives.TYPE_INT8)); - buffer.writeLittleEndianUnsigned(value, 1); - } else if (value == (short) value) { - // INT16: Requires 1 byte for header + 2 bytes for value - buffer.addByte(VariantUtil.primitiveHeader(Variants.Primitives.TYPE_INT16)); - buffer.writeLittleEndianUnsigned(value, 2); - } else if (value == (int) value) { - // INT32: Requires 1 byte for header + 4 bytes for value - buffer.addByte(VariantUtil.primitiveHeader(Variants.Primitives.TYPE_INT32)); - buffer.writeLittleEndianUnsigned(value, 4); - } else { - // INT64: Requires 1 byte for header + 8 bytes for value - buffer.addByte(VariantUtil.primitiveHeader(Variants.Primitives.TYPE_INT64)); - buffer.writeLittleEndianUnsigned(value, 8); - } - } - - public void appendDouble(double value) { - buffer.addByte(VariantUtil.primitiveHeader(Variants.Primitives.TYPE_DOUBLE)); - buffer.writeLittleEndianUnsigned(Double.doubleToLongBits(value), 8); - } - - /** - * Appends a decimal value to the variant builder, choosing the smallest decimal type (DECIMAL4, - * DECIMAL8, DECIMAL16) that fits its precision and scale. 
- */ - public void appendDecimal(BigDecimal value) { - Preconditions.checkArgument( - value.precision() <= VariantConstants.MAX_DECIMAL16_PRECISION, - "Unsupported Decimal precision: %s", - value.precision()); - - BigInteger unscaled = value.unscaledValue(); - if (value.scale() <= VariantConstants.MAX_DECIMAL4_PRECISION - && value.precision() <= VariantConstants.MAX_DECIMAL4_PRECISION) { - buffer.addByte(VariantUtil.primitiveHeader(Variants.Primitives.TYPE_DECIMAL4)); - buffer.addByte((byte) value.scale()); - buffer.writeLittleEndianUnsigned(unscaled.intValueExact(), 4); - } else if (value.scale() <= VariantConstants.MAX_DECIMAL8_PRECISION - && value.precision() <= VariantConstants.MAX_DECIMAL8_PRECISION) { - buffer.addByte(VariantUtil.primitiveHeader(Variants.Primitives.TYPE_DECIMAL8)); - buffer.addByte((byte) value.scale()); - buffer.writeLittleEndianUnsigned(unscaled.longValueExact(), 8); - } else { - buffer.addByte(VariantUtil.primitiveHeader(Variants.Primitives.TYPE_DECIMAL16)); - buffer.addByte((byte) value.scale()); - byte[] bytes = unscaled.toByteArray(); - // TODO call addBytes - for (int i = 0; i < 16; i++) { - byte byteValue = - i < bytes.length ? bytes[bytes.length - 1 - i] : (byte) (bytes[0] < 0 ? -1 : 0); - buffer.addByte(byteValue); - } - } - } - - public void appendDate(int daysSinceEpoch) { - buffer.addByte(VariantUtil.primitiveHeader(Variants.Primitives.TYPE_DATE)); - buffer.writeLittleEndianUnsigned(daysSinceEpoch, 4); - } - - /** Appends a timestamp with timezone (microseconds since epoch) to the variant builder. */ - public void appendTimestampTz(long microsSinceEpoch) { - buffer.addByte(VariantUtil.primitiveHeader(Variants.Primitives.TYPE_TIMESTAMPTZ)); - buffer.writeLittleEndianUnsigned(microsSinceEpoch, 8); - } - - /** Appends a timestamp without timezone (microseconds since epoch) to the variant builder. 
*/ - public void appendTimestampNtz(long microsSinceEpoch) { - buffer.addByte(VariantUtil.primitiveHeader(Variants.Primitives.TYPE_TIMESTAMPNTZ)); - buffer.writeLittleEndianUnsigned(microsSinceEpoch, 8); - } - - public void appendFloat(float value) throws VariantSizeLimitException { - buffer.addByte(VariantUtil.primitiveHeader(Variants.Primitives.TYPE_FLOAT)); - buffer.writeLittleEndianUnsigned(Float.floatToIntBits(value), 4); - } - - public void appendBinary(byte[] value) throws VariantSizeLimitException { - buffer.addByte(VariantUtil.primitiveHeader(Variants.Primitives.TYPE_BINARY)); - buffer.writeLittleEndianUnsigned(value.length, 4); - buffer.addBytes(value); - } - - /** - * Completes writing an object to the buffer. Object fields are already written, and this method - * inserts header including header byte, number of elements, field IDs, and field offsets. - * - * @param startPos The starting position of the object data in the buffer. - * @param fields The list of field entries (key, ID, offset). - */ - private void endObject(int startPos, List fields) { - int numElements = fields.size(); - - // Sort fields by key and ensure no duplicate keys - Collections.sort(fields); - int maxId = numElements == 0 ? 0 : fields.get(0).id; - for (int i = 1; i < numElements; i++) { - maxId = Math.max(maxId, fields.get(i).id); - if (fields.get(i).key.equals(fields.get(i - 1).key)) { - throw new IllegalStateException("Duplicate key in Variant: " + fields.get(i).key); - } - } - - int dataSize = buffer.pos - startPos; // Total byte size of the object values - boolean isLarge = numElements > 0xFF; // Determine whether to use large format - int sizeBytes = isLarge ? 
4 : 1; // Number of bytes for the object size - int fieldIdSize = sizeOf(maxId); // Number of bytes for each field id - int fieldOffsetSize = sizeOf(dataSize); // Number of bytes for each field offset - int headerSize = - 1 + sizeBytes + numElements * fieldIdSize + (numElements + 1) * fieldOffsetSize; - - // Shift existing data to make room for header - buffer.shift(startPos, headerSize); - - buffer.insertByte( - VariantUtil.objectHeader(isLarge, fieldIdSize, fieldOffsetSize), - startPos); // Insert header byte - buffer.insertLittleEndianUnsigned( - numElements, sizeBytes, startPos + 1); // Insert number of elements - - // Insert field IDs and offsets - int fieldIdStart = startPos + 1 + sizeBytes; - int fieldOffsetStart = fieldIdStart + numElements * fieldIdSize; - for (int i = 0; i < numElements; i++) { - buffer.insertLittleEndianUnsigned( - fields.get(i).id, fieldIdSize, fieldIdStart + i * fieldIdSize); - buffer.insertLittleEndianUnsigned( - fields.get(i).offset, fieldOffsetSize, fieldOffsetStart + i * fieldOffsetSize); + private void writeFloat(JsonParser parser) throws IOException { + if (!tryWriteDecimal(parser.getText())) { + writeDoubleInternal(parser.getDoubleValue()); } - - // Insert the offset to the end of the data - buffer.insertLittleEndianUnsigned( - dataSize, fieldOffsetSize, fieldOffsetStart + numElements * fieldOffsetSize); } /** - * Completes writing an array to the buffer. Array values are already written, and this method - * inserts header including the header byte, number of elements, and field offsets. + * Attempts to parse a JSON number as a decimal and write it. The input must meet the following + * criteria: - Be in a valid decimal format (integer with an optional '.'). - Not in scientific + * notation. - Fit within the precision and scale limits of decimal types. * - * @param startPos The starting position of the array values in the buffer. - * @param offsets The offsets for each array value. 
+ * @param input the input string representing the JSON number + * @return true if the decimal is valid and written successfully; false otherwise */ - private void endArray(int startPos, List offsets) { - int dataSize = buffer.pos - startPos; // Total byte size of the array values - int numElements = offsets.size(); - - boolean isLarge = numElements > 0xFF; // Determine whether to use large format - int sizeBytes = isLarge ? 4 : 1; // Number of bytes for the array size - int fieldOffsetSize = sizeOf(dataSize); // Number of bytes of each field offset - int headerSize = 1 + sizeBytes + (numElements + 1) * fieldOffsetSize; // header size - int offsetStart = startPos + 1 + sizeBytes; // Start position for offsets - - // Shift existing data to make room for header - buffer.shift(startPos, headerSize); - - buffer.insertByte( - VariantUtil.arrayHeader(isLarge, fieldOffsetSize), startPos); // Insert header byte - buffer.insertLittleEndianUnsigned( - numElements, sizeBytes, startPos + 1); // Insert number of elements - - // Insert field offsets - for (int i = 0; i < numElements; i++) { - buffer.insertLittleEndianUnsigned( - offsets.get(i), fieldOffsetSize, offsetStart + i * fieldOffsetSize); - } - - // Insert the offset to the end of the data - buffer.insertLittleEndianUnsigned( - dataSize, fieldOffsetSize, offsetStart + numElements * fieldOffsetSize); - } - - /** Choose the smallest number of bytes to store the given value. */ - private static int sizeOf(int maxValue) { - if (maxValue <= 0xFF) { - return 1; - } else if (maxValue <= 0xFFFF) { - return 2; - } else if (maxValue <= 0xFFFFFF) { - return 3; - } - - return 4; - } - - private void appendFloat(JsonParser parser) throws IOException { - if (!tryAppendDecimal(parser.getText())) { - appendDouble(parser.getDoubleValue()); - } - } - - /** - * Attempts to parse a JSON number as a decimal and append it. The input must: - Use only decimal - * format (integer with an optional '.'). - Avoid scientific notation. 
- Fit within the precision - * and scale limits of decimal types. - */ - private boolean tryAppendDecimal(String input) { - // Validate that the input only contains valid decimal characters. + private boolean tryWriteDecimal(String input) { + // Validate that the input matches a decimal format and is not in scientific notation. if (!input.matches("-?\\d+(\\.\\d+)?")) { return false; } @@ -436,155 +166,13 @@ private boolean tryAppendDecimal(String input) { // Parse the input string to BigDecimal. BigDecimal decimalValue = new BigDecimal(input); - // Check if the decimal value meets precision and scale limits. + // Ensure the decimal value meets precision and scale limits. if (decimalValue.scale() <= VariantConstants.MAX_DECIMAL16_PRECISION && decimalValue.precision() <= VariantConstants.MAX_DECIMAL16_PRECISION) { - appendDecimal(decimalValue); + writeDecimalInternal(decimalValue); return true; } return false; } - - // Temporarily store the information of a field. We need to collect all fields in an JSON object, - // sort them by their keys, and build the variant object in sorted order. - public static final class FieldEntry implements Comparable { - private final String key; - private final int id; - private final int offset; - - public FieldEntry(String key, int id, int offset) { - this.key = key; - this.id = id; - this.offset = offset; - } - - FieldEntry withNewOffset(int newOffset) { - return new FieldEntry(key, id, newOffset); - } - - @Override - public int compareTo(FieldEntry other) { - return key.compareTo(other.key); - } - } - - /** An auto-growing byte buffer that doubles its size whenever the capacity is exceeded. 
*/ - private static class ByteBufferWrapper { - private static final int SIZE_LIMIT = 1 << 24; // 16MB size limit - private static final int INITIAL_CAPACITY = 128; // Starting capacity - private byte[] buffer; - private int pos = 0; - private final int sizeLimit; - - ByteBufferWrapper() { - this(INITIAL_CAPACITY, SIZE_LIMIT); - } - - ByteBufferWrapper(int initialCapacity, int sizeLimit) { - if (initialCapacity <= 0) { - throw new IllegalArgumentException("Initial capacity must be positive"); - } - this.buffer = new byte[initialCapacity]; - this.sizeLimit = sizeLimit; - } - - /** - * Ensures the buffer has enough capacity to hold additional bytes. - * - * @param additional The number of additional bytes required. - * @throws VariantSizeLimitException If the required capacity exceeds the size limit. - */ - private void ensureCapacity(int additional) { - int required = pos + additional; - if (required > buffer.length) { - int newCapacity = Integer.highestOneBit(required); - newCapacity = newCapacity < required ? newCapacity * 2 : newCapacity; // Double the capacity - if (newCapacity > this.sizeLimit) { - throw new VariantSizeLimitException(); - } - - byte[] newBuffer = new byte[newCapacity]; - System.arraycopy(buffer, 0, newBuffer, 0, pos); - buffer = newBuffer; - } - } - - /** Adds a byte to the buffer, growing the buffer if necessary. */ - public void addByte(byte value) throws VariantSizeLimitException { - ensureCapacity(1); - buffer[pos++] = value; - } - - /** Adds an array of bytes to the buffer, growing the buffer if necessary. */ - public void addBytes(byte[] values) throws VariantSizeLimitException { - ensureCapacity(values.length); - System.arraycopy(values, 0, buffer, pos, values.length); - pos += values.length; - } - - /** - * Writes a numeric value in little-endian order to the buffer, growing the buffer if necessary. - * - * @param value The numeric value to write. 
- * @param numBytes The number of bytes to write (e.g., 2 for short, 4 for int, 8 for long). - */ - public void writeLittleEndianUnsigned(long value, int numBytes) { - if (numBytes < 1 || numBytes > 8) { - throw new IllegalArgumentException("numBytes must be between 1 and 8"); - } - ensureCapacity(numBytes); - - for (int i = 0; i < numBytes; ++i) { - buffer[pos + i] = (byte) ((value >>> (8 * i)) & 0xFF); - } - pos += numBytes; - } - - /** - * Move the bytes of buffer range [start, pos) by the provided offset position. This is used for - * writing array/object header. - */ - public void shift(int start, int offset) { - Preconditions.checkArgument(offset > 0, "offset must be positive"); - Preconditions.checkArgument(pos >= start, "start must be no greater than pos"); - ensureCapacity(offset); - - if (pos > start) { - System.arraycopy(buffer, start, buffer, start + offset, pos - start); - } - - pos += offset; - } - - /** - * Insert a byte into the buffer of the provided position. Note: this assumes shift() has been - * called to leave space for insert. - */ - public void insertByte(byte value, int insertPos) { - Preconditions.checkArgument(insertPos < pos, "insertPos must be smaller than pos"); - - buffer[insertPos] = value; - } - - /** - * Insert a number into the buffer of the provided position. Note: this assumes shift() has been - * called to leave space for insert. - */ - public void insertLittleEndianUnsigned(long value, int numBytes, int insertPos) { - Preconditions.checkArgument(insertPos < pos, "insertPos must be smaller than pos"); - if (numBytes < 1 || numBytes > 8) { - throw new IllegalArgumentException("numBytes must be between 1 and 8"); - } - - for (int i = 0; i < numBytes; ++i) { - buffer[insertPos + i] = (byte) ((value >>> (8 * i)) & 0xFF); - } - } - - /** Returns the underlying byte array. 
*/ - public byte[] toByteArray() { - return Arrays.copyOf(buffer, pos); - } - } } diff --git a/core/src/main/java/org/apache/iceberg/variants/VariantBuilderBase.java b/core/src/main/java/org/apache/iceberg/variants/VariantBuilderBase.java new file mode 100644 index 000000000000..4284145f00ff --- /dev/null +++ b/core/src/main/java/org/apache/iceberg/variants/VariantBuilderBase.java @@ -0,0 +1,504 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.variants; + +import java.math.BigDecimal; +import java.math.BigInteger; +import java.nio.charset.StandardCharsets; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import java.util.Map; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; + +abstract class VariantBuilderBase { + protected static final int MAX_SHORT_STR_SIZE = 0x3F; + + private final ByteBufferWrapper buffer; + private final Dictionary dict; + private int startPos; + + VariantBuilderBase(ByteBufferWrapper buffer, Dictionary dict) { + this.buffer = buffer; + this.dict = dict; + startPos = buffer.pos; + } + + /** + * Builds the variant metadata from the key dictionary and returns the resulting Variant object. + * + * @return The constructed Variant object. + */ + public Variant build() { + int numKeys = dict.size(); + + // Calculate total size of dictionary strings + long numStringBytes = dict.totalBytes(); + if (numStringBytes > VariantConstants.SIZE_LIMIT) { + throw new VariantSizeLimitException(); + } + + // Determine the number of bytes required for dictionary size and offset entry + int offsetSize = sizeOf(Math.max((int) numStringBytes, numKeys)); + + // metadata: header byte, dictionary size, offsets and string bytes + long metadataSize = 1 + offsetSize + (numKeys + 1) * offsetSize + numStringBytes; + + // Ensure the metadata size is within limits + if (metadataSize > VariantConstants.SIZE_LIMIT) { + throw new VariantSizeLimitException(); + } + + ByteBufferWrapper metadataBuffer = + new ByteBufferWrapper((int) metadataSize, (int) metadataSize); + + // Write header byte (version + offset size) + metadataBuffer.addByte(VariantUtil.metadataHeader(VariantConstants.VERSION, offsetSize)); + + // Write number of keys + metadataBuffer.writeLittleEndianUnsigned(numKeys, offsetSize); 
+ + // Write offsets + int currentOffset = 0; + for (byte[] key : dict.getKeys()) { + metadataBuffer.writeLittleEndianUnsigned(currentOffset, offsetSize); + currentOffset += key.length; + } + metadataBuffer.writeLittleEndianUnsigned(numStringBytes, offsetSize); + + // Write dictionary strings + dict.getKeys().forEach(metadataBuffer::addBytes); + + return new VariantImpl(metadataBuffer.toByteArray(), buffer.toByteArray()); + } + + protected void writeNullInternal() { + buffer.addByte(VariantUtil.primitiveHeader(Variants.Primitives.TYPE_NULL)); + } + + protected void writeBooleanInternal(boolean value) { + buffer.addByte( + VariantUtil.primitiveHeader( + value ? Variants.Primitives.TYPE_TRUE : Variants.Primitives.TYPE_FALSE)); + } + + /** + * Writes a numeric value to the variant builder, automatically choosing the smallest type (INT8, + * INT16, INT32, or INT64) to store the value efficiently. + * + * @param value The numeric value to append. + */ + protected void writeNumericInternal(long value) { + if (value == (byte) value) { + // INT8: Requires 1 byte for header + 1 byte for value + buffer.addByte(VariantUtil.primitiveHeader(Variants.Primitives.TYPE_INT8)); + buffer.writeLittleEndianUnsigned(value, 1); + } else if (value == (short) value) { + // INT16: Requires 1 byte for header + 2 bytes for value + buffer.addByte(VariantUtil.primitiveHeader(Variants.Primitives.TYPE_INT16)); + buffer.writeLittleEndianUnsigned(value, 2); + } else if (value == (int) value) { + // INT32: Requires 1 byte for header + 4 bytes for value + buffer.addByte(VariantUtil.primitiveHeader(Variants.Primitives.TYPE_INT32)); + buffer.writeLittleEndianUnsigned(value, 4); + } else { + // INT64: Requires 1 byte for header + 8 bytes for value + buffer.addByte(VariantUtil.primitiveHeader(Variants.Primitives.TYPE_INT64)); + buffer.writeLittleEndianUnsigned(value, 8); + } + } + + protected void writeDoubleInternal(double value) { + 
buffer.addByte(VariantUtil.primitiveHeader(Variants.Primitives.TYPE_DOUBLE)); + buffer.writeLittleEndianUnsigned(Double.doubleToLongBits(value), 8); + } + + /** + * Writes a decimal value to the variant builder, choosing the smallest decimal type (DECIMAL4, + * DECIMAL8, DECIMAL16) that fits its precision and scale. + */ + public void writeDecimalInternal(BigDecimal value) { + Preconditions.checkArgument( + value.precision() <= VariantConstants.MAX_DECIMAL16_PRECISION, + "Unsupported Decimal precision: %s", + value.precision()); + + BigInteger unscaled = value.unscaledValue(); + if (value.scale() <= VariantConstants.MAX_DECIMAL4_PRECISION + && value.precision() <= VariantConstants.MAX_DECIMAL4_PRECISION) { + buffer.addByte(VariantUtil.primitiveHeader(Variants.Primitives.TYPE_DECIMAL4)); + buffer.addByte((byte) value.scale()); + buffer.writeLittleEndianUnsigned(unscaled.intValueExact(), 4); + } else if (value.scale() <= VariantConstants.MAX_DECIMAL8_PRECISION + && value.precision() <= VariantConstants.MAX_DECIMAL8_PRECISION) { + buffer.addByte(VariantUtil.primitiveHeader(Variants.Primitives.TYPE_DECIMAL8)); + buffer.addByte((byte) value.scale()); + buffer.writeLittleEndianUnsigned(unscaled.longValueExact(), 8); + } else { + buffer.addByte(VariantUtil.primitiveHeader(Variants.Primitives.TYPE_DECIMAL16)); + buffer.addByte((byte) value.scale()); + byte[] bytes = unscaled.toByteArray(); + for (int i = 0; i < 16; i++) { + byte byteValue = + i < bytes.length ? bytes[bytes.length - 1 - i] : (byte) (bytes[0] < 0 ? -1 : 0); + buffer.addByte(byteValue); + } + } + } + + protected void writeDateInternal(int daysSinceEpoch) { + buffer.addByte(VariantUtil.primitiveHeader(Variants.Primitives.TYPE_DATE)); + buffer.writeLittleEndianUnsigned(daysSinceEpoch, 4); + } + + /** Writes a timestamp with timezone (microseconds since epoch) to the variant builder. 
*/ + protected void writeTimestampTzInternal(long microsSinceEpoch) { + buffer.addByte(VariantUtil.primitiveHeader(Variants.Primitives.TYPE_TIMESTAMPTZ)); + buffer.writeLittleEndianUnsigned(microsSinceEpoch, 8); + } + + /** Writes a timestamp without timezone (microseconds since epoch) to the variant builder. */ + protected void writeTimestampNtzInternal(long microsSinceEpoch) { + buffer.addByte(VariantUtil.primitiveHeader(Variants.Primitives.TYPE_TIMESTAMPNTZ)); + buffer.writeLittleEndianUnsigned(microsSinceEpoch, 8); + } + + protected void writeFloatInternal(float value) throws VariantSizeLimitException { + buffer.addByte(VariantUtil.primitiveHeader(Variants.Primitives.TYPE_FLOAT)); + buffer.writeLittleEndianUnsigned(Float.floatToIntBits(value), 4); + } + + protected void writeBinaryInternal(byte[] value) throws VariantSizeLimitException { + buffer.addByte(VariantUtil.primitiveHeader(Variants.Primitives.TYPE_BINARY)); + buffer.writeLittleEndianUnsigned(value.length, 4); + buffer.addBytes(value); + } + + protected void writeStringInternal(String value) { + byte[] text = value.getBytes(StandardCharsets.UTF_8); + boolean longStr = text.length > MAX_SHORT_STR_SIZE; + + // Write header + if (longStr) { + buffer.addByte(VariantUtil.primitiveHeader(Variants.Primitives.TYPE_STRING)); + buffer.writeLittleEndianUnsigned(text.length, 4); + } else { + buffer.addByte(VariantUtil.shortStrHeader(text.length)); + } + + // Write string content + buffer.addBytes(text); + } + + /** Choose the smallest number of bytes to store the given value. */ + protected static int sizeOf(int maxValue) { + if (maxValue <= 0xFF) { + return 1; + } else if (maxValue <= 0xFFFF) { + return 2; + } else if (maxValue <= 0xFFFFFF) { + return 3; + } + + return 4; + } + + /** + * Completes writing an object to the buffer. Object fields are already written, and this method + * inserts header including header byte, number of elements, field IDs, and field offsets. 
+ * + * @param objStartPos The starting position of the object data in the buffer. + * @param fields The list of field entries (key, ID, offset). + */ + protected void endObject(int objStartPos, List fields) { + int numElements = fields.size(); + + // Sort fields by key and ensure no duplicate keys + Collections.sort(fields); + int maxId = numElements == 0 ? 0 : fields.get(0).id; + for (int i = 1; i < numElements; i++) { + maxId = Math.max(maxId, fields.get(i).id); + if (fields.get(i).key.equals(fields.get(i - 1).key)) { + throw new IllegalStateException("Duplicate key in Variant: " + fields.get(i).key); + } + } + + int dataSize = buffer.pos - objStartPos; // Total byte size of the object values + boolean isLarge = numElements > 0xFF; // Determine whether to use large format + int sizeBytes = isLarge ? 4 : 1; // Number of bytes for the object size + int fieldIdSize = sizeOf(maxId); // Number of bytes for each field id + int fieldOffsetSize = sizeOf(dataSize); // Number of bytes for each field offset + int headerSize = + 1 + sizeBytes + numElements * fieldIdSize + (numElements + 1) * fieldOffsetSize; + + // Shift existing data to make room for header + buffer.shift(objStartPos, headerSize); + + buffer.insertByte( + VariantUtil.objectHeader(isLarge, fieldIdSize, fieldOffsetSize), + objStartPos); // Insert header byte + buffer.insertLittleEndianUnsigned( + numElements, sizeBytes, objStartPos + 1); // Insert number of elements + + // Insert field IDs and offsets + int fieldIdStart = objStartPos + 1 + sizeBytes; + int fieldOffsetStart = fieldIdStart + numElements * fieldIdSize; + for (int i = 0; i < numElements; i++) { + buffer.insertLittleEndianUnsigned( + fields.get(i).id, fieldIdSize, fieldIdStart + i * fieldIdSize); + buffer.insertLittleEndianUnsigned( + fields.get(i).offset, fieldOffsetSize, fieldOffsetStart + i * fieldOffsetSize); + } + + // Insert the offset to the end of the data + buffer.insertLittleEndianUnsigned( + dataSize, fieldOffsetSize, fieldOffsetStart 
+ numElements * fieldOffsetSize); + } + + /** + * Completes writing an array to the buffer. Array values are already written, and this method + * inserts header including the header byte, number of elements, and field offsets. + * + * @param arrStartPos The starting position of the array values in the buffer. + * @param offsets The offsets for each array value. + */ + protected void endArray(int arrStartPos, List offsets) { + int dataSize = buffer.pos - arrStartPos; // Total byte size of the array values + int numElements = offsets.size(); + + boolean isLarge = numElements > 0xFF; // Determine whether to use large format + int sizeBytes = isLarge ? 4 : 1; // Number of bytes for the array size + int fieldOffsetSize = sizeOf(dataSize); // Number of bytes of each field offset + int headerSize = 1 + sizeBytes + (numElements + 1) * fieldOffsetSize; // header size + int offsetStart = arrStartPos + 1 + sizeBytes; // Start position for offsets + + // Shift existing data to make room for header + buffer.shift(arrStartPos, headerSize); + + buffer.insertByte( + VariantUtil.arrayHeader(isLarge, fieldOffsetSize), arrStartPos); // Insert header byte + buffer.insertLittleEndianUnsigned( + numElements, sizeBytes, arrStartPos + 1); // Insert number of elements + + // Insert field offsets + for (int i = 0; i < numElements; i++) { + buffer.insertLittleEndianUnsigned( + offsets.get(i), fieldOffsetSize, offsetStart + i * fieldOffsetSize); + } + + // Insert the offset to the end of the data + buffer.insertLittleEndianUnsigned( + dataSize, fieldOffsetSize, offsetStart + numElements * fieldOffsetSize); + } + + protected ByteBufferWrapper getBuffer() { + return buffer; + } + + protected Dictionary getDict() { + return dict; + } + + protected int getStartPos() { + return startPos; + } + + /** An auto-growing byte buffer that doubles its size whenever the capacity is exceeded. 
*/ + protected static class ByteBufferWrapper { + private static final int INITIAL_CAPACITY = 128; // Starting capacity + private byte[] buffer; + private int pos = 0; + private final int sizeLimit; + + ByteBufferWrapper() { + this(INITIAL_CAPACITY, VariantConstants.SIZE_LIMIT); + } + + ByteBufferWrapper(int initialCapacity, int sizeLimit) { + if (initialCapacity <= 0) { + throw new IllegalArgumentException("Initial capacity must be positive"); + } + this.buffer = new byte[initialCapacity]; + this.sizeLimit = sizeLimit; + } + + /** + * Ensures the buffer has enough capacity to hold additional bytes. + * + * @param additional The number of additional bytes required. + * @throws VariantSizeLimitException If the required capacity exceeds the size limit. + */ + private void ensureCapacity(int additional) { + int required = pos + additional; + if (required > buffer.length) { + int newCapacity = Integer.highestOneBit(required); + newCapacity = newCapacity < required ? newCapacity * 2 : newCapacity; // Double the capacity + if (newCapacity > this.sizeLimit) { + throw new VariantSizeLimitException(); + } + + byte[] newBuffer = new byte[newCapacity]; + System.arraycopy(buffer, 0, newBuffer, 0, pos); + buffer = newBuffer; + } + } + + /** Adds a byte to the buffer, growing the buffer if necessary. */ + void addByte(byte value) throws VariantSizeLimitException { + ensureCapacity(1); + buffer[pos++] = value; + } + + /** Adds an array of bytes to the buffer, growing the buffer if necessary. */ + void addBytes(byte[] values) throws VariantSizeLimitException { + ensureCapacity(values.length); + System.arraycopy(values, 0, buffer, pos, values.length); + pos += values.length; + } + + /** + * Writes a numeric value in little-endian order to the buffer, growing the buffer if necessary. + * + * @param value The numeric value to write. + * @param numBytes The number of bytes to write (e.g., 2 for short, 4 for int, 8 for long). 
+ */ + void writeLittleEndianUnsigned(long value, int numBytes) { + if (numBytes < 1 || numBytes > 8) { + throw new IllegalArgumentException("numBytes must be between 1 and 8"); + } + ensureCapacity(numBytes); + + for (int i = 0; i < numBytes; ++i) { + buffer[pos + i] = (byte) ((value >>> (8 * i)) & 0xFF); + } + pos += numBytes; + } + + /** + * Move the bytes in buffer range [start, pos) forward by the provided offset. This is used for + * writing array/object headers. + */ + void shift(int start, int offset) { + Preconditions.checkArgument(offset > 0, "offset must be positive"); + Preconditions.checkArgument(pos >= start, "start must be no greater than pos"); + ensureCapacity(offset); + + if (pos > start) { + System.arraycopy(buffer, start, buffer, start + offset, pos - start); + } + + pos += offset; + } + + /** + * Insert a byte into the buffer at the provided position. Note: this assumes shift() has been + * called to leave space for the insert. + */ + void insertByte(byte value, int insertPos) { + Preconditions.checkArgument(insertPos < pos, "insertPos must be smaller than pos"); + + buffer[insertPos] = value; + } + + /** + * Insert a number into the buffer at the provided position. Note: this assumes shift() has been + * called to leave space for the insert. + */ + void insertLittleEndianUnsigned(long value, int numBytes, int insertPos) { + Preconditions.checkArgument(insertPos < pos, "insertPos must be smaller than pos"); + if (numBytes < 1 || numBytes > 8) { + throw new IllegalArgumentException("numBytes must be between 1 and 8"); + } + + for (int i = 0; i < numBytes; ++i) { + buffer[insertPos + i] = (byte) ((value >>> (8 * i)) & 0xFF); + } + } + + /** Returns a copy of the bytes written to the buffer so far. 
*/ + byte[] toByteArray() { + return Arrays.copyOf(buffer, pos); + } + + int getPos() { + return pos; + } + } + + /** + * A Variant metadata dictionary implementation which assigns a monotonically increasing id to + * each newly added string. + */ + protected static class Dictionary { + // Store the mapping from a string to a monotonically increasing assigned id + private final Map stringIds = Maps.newHashMap(); + // Store the UTF-8 encoding of every added string, in the order of assigned ids. + private final List utf8Strings = Lists.newArrayList(); + + /** Return the assigned id if the string exists; otherwise, assign the next id and return it. */ + int add(String key) { + return stringIds.computeIfAbsent( + key, + k -> { + int newId = stringIds.size(); + utf8Strings.add(k.getBytes(StandardCharsets.UTF_8)); + return newId; + }); + } + + int size() { + return utf8Strings.size(); + } + + long totalBytes() { + return utf8Strings.stream().mapToLong(key -> key.length).sum(); + } + + List getKeys() { + return utf8Strings; + } + } + + /** + * Temporarily store the information of a field. We need to collect all fields in a JSON object, + * sort them by their keys, and build the variant object in sorted order. 
+ */ + protected static final class FieldEntry implements Comparable { + private final String key; + private final int id; + private final int offset; + + FieldEntry(String key, int id, int offset) { + this.key = key; + this.id = id; + this.offset = offset; + } + + FieldEntry withNewOffset(int newOffset) { + return new FieldEntry(key, id, newOffset); + } + + @Override + public int compareTo(FieldEntry other) { + return key.compareTo(other.key); + } + } +} diff --git a/core/src/main/java/org/apache/iceberg/variants/VariantImpl.java b/core/src/main/java/org/apache/iceberg/variants/VariantImpl.java new file mode 100644 index 000000000000..b88a0464f718 --- /dev/null +++ b/core/src/main/java/org/apache/iceberg/variants/VariantImpl.java @@ -0,0 +1,73 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */
+package org.apache.iceberg.variants;
+
+import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
+
+public final class VariantImpl implements Variant { // fields are set once, in the constructor
+  private final VariantMetadata metadata;
+  private final VariantValue value;
+
+  public VariantImpl(byte[] metadata, byte[] value) {
+    Preconditions.checkArgument(
+        metadata != null && metadata.length >= 1, "Metadata must not be null or empty.");
+    Preconditions.checkArgument(
+        value != null && value.length >= 1, "Value must not be null or empty.");
+
+    Preconditions.checkArgument(
+        (metadata[0] & VariantConstants.VERSION_MASK) == VariantConstants.VERSION,
+        "Unsupported metadata version.");
+
+    if (value.length > VariantConstants.SIZE_LIMIT
+        || metadata.length > VariantConstants.SIZE_LIMIT) {
+      throw new VariantSizeLimitException(); // either buffer is larger than SIZE_LIMIT
+    }
+
+    this.metadata = SerializedMetadata.from(metadata); // wrapped first; ARRAY and OBJECT use it
+
+    int header = value[0]; // the first value byte selects the basic type
+    Variants.BasicType basicType = VariantUtil.basicType(header);
+    switch (basicType) {
+      case PRIMITIVE:
+        this.value = SerializedPrimitive.from(value);
+        break;
+      case ARRAY:
+        this.value = SerializedArray.from((SerializedMetadata) this.metadata, value);
+        break;
+      case OBJECT:
+        this.value = SerializedObject.from((SerializedMetadata) this.metadata, value);
+        break;
+      case SHORT_STRING:
+        this.value = SerializedShortString.from(value);
+        break;
+      default:
+        throw new UnsupportedOperationException("Unsupported basic type: " + basicType);
+    }
+  }
+
+  @Override
+  public VariantMetadata metadata() {
+    return metadata;
+  }
+
+  @Override
+  public VariantValue value() {
+    return value;
+  }
+}
diff --git a/core/src/main/java/org/apache/iceberg/variants/VariantObject.java b/core/src/main/java/org/apache/iceberg/variants/VariantObject.java
index 33696dae41c2..e65d71ede6db 100644
--- a/core/src/main/java/org/apache/iceberg/variants/VariantObject.java
+++ b/core/src/main/java/org/apache/iceberg/variants/VariantObject.java
@@ -20,6 +20,10 @@

 /** An variant object value.
*/
 public interface VariantObject extends VariantValue {
+  default int numElements() { // field count; this default only throws
+    throw new UnsupportedOperationException("numElements is unsupported: " + getClass().getName());
+  }
+
   /** Returns the {@link VariantValue} for the field named {@code name} in this object. */
   VariantValue get(String name);

diff --git a/core/src/main/java/org/apache/iceberg/variants/VariantObjectBuilder.java b/core/src/main/java/org/apache/iceberg/variants/VariantObjectBuilder.java
new file mode 100644
index 000000000000..4960e96f2e50
--- /dev/null
+++ b/core/src/main/java/org/apache/iceberg/variants/VariantObjectBuilder.java
@@ -0,0 +1,121 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg.variants;
+
+import java.math.BigDecimal;
+import java.time.LocalDate;
+import java.time.LocalDateTime;
+import java.time.OffsetDateTime;
+import java.util.List;
+import org.apache.iceberg.relocated.com.google.common.collect.Lists;
+import org.apache.iceberg.util.DateTimeUtil;
+
+public class VariantObjectBuilder extends VariantBuilderBase {
+  private final List<FieldEntry> fields; // one entry per written key, in write order
+
+  VariantObjectBuilder(ByteBufferWrapper buffer, Dictionary dict) {
+    super(buffer, dict);
+    fields = Lists.newArrayList();
+  }
+
+  public VariantObjectBuilder startObject(String key) {
+    writeKey(key);
+    return new VariantObjectBuilder(getBuffer(), getDict());
+  }
+
+  public VariantArrayBuilder startArray(String key) {
+    writeKey(key);
+    return new VariantArrayBuilder(getBuffer(), getDict());
+  }
+
+  private void writeKey(String key) {
+    int id = getDict().add(key);
+    fields.add(new FieldEntry(key, id, getBuffer().getPos() - getStartPos())); // value offset
+  }
+
+  public VariantObjectBuilder writeNull(String key) {
+    writeKey(key);
+    writeNullInternal();
+    return this;
+  }
+
+  public VariantObjectBuilder writeBoolean(String key, boolean value) {
+    writeKey(key);
+    writeBooleanInternal(value);
+    return this;
+  }
+
+  public VariantObjectBuilder writeNumeric(String key, long value) {
+    writeKey(key);
+    writeNumericInternal(value);
+    return this;
+  }
+
+  public VariantObjectBuilder writeDouble(String key, double value) {
+    writeKey(key);
+    writeDoubleInternal(value);
+    return this;
+  }
+
+  public VariantObjectBuilder writeDecimal(String key, BigDecimal value) {
+    writeKey(key);
+    writeDecimalInternal(value);
+    return this;
+  }
+
+  public VariantObjectBuilder writeDate(String key, LocalDate value) {
+    writeKey(key);
+    writeDateInternal(DateTimeUtil.daysFromDate(value));
+    return this;
+  }
+
+  public VariantObjectBuilder writeTimestampTz(String key, OffsetDateTime value) {
+    writeKey(key);
+    writeTimestampTzInternal(DateTimeUtil.microsFromTimestamptz(value));
+    return this;
+  }
+
+  public VariantObjectBuilder writeTimestampNtz(String key, LocalDateTime value) {
+    writeKey(key);
+    writeTimestampNtzInternal(DateTimeUtil.microsFromTimestamp(value));
+    return this;
+  }
+
+  public VariantObjectBuilder writeFloat(String key, float value) {
+    writeKey(key);
+    writeFloatInternal(value);
+    return this;
+  }
+
+  public VariantObjectBuilder writeBinary(String key, byte[] value) {
+    writeKey(key);
+    writeBinaryInternal(value);
+    return this;
+  }
+
+  public VariantObjectBuilder writeString(String key, String value) {
+    writeKey(key);
+    writeStringInternal(value);
+    return this;
+  }
+
+  public void endObject() {
+    super.endObject(getStartPos(), fields);
+  }
+}
diff --git a/core/src/main/java/org/apache/iceberg/variants/VariantPrimitiveBuilder.java b/core/src/main/java/org/apache/iceberg/variants/VariantPrimitiveBuilder.java
new file mode 100644
index 000000000000..634705a09245
--- /dev/null
+++ b/core/src/main/java/org/apache/iceberg/variants/VariantPrimitiveBuilder.java
@@ -0,0 +1,86 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg.variants;
+
+import java.math.BigDecimal;
+import java.time.LocalDate;
+import java.time.LocalDateTime;
+import java.time.OffsetDateTime;
+import org.apache.iceberg.util.DateTimeUtil;
+
+public class VariantPrimitiveBuilder extends VariantBuilderBase { // each write* returns this
+  public VariantPrimitiveBuilder(ByteBufferWrapper buffer, Dictionary dict) {
+    super(buffer, dict);
+  }
+
+  public VariantPrimitiveBuilder writeNull() {
+    writeNullInternal();
+    return this;
+  }
+
+  public VariantPrimitiveBuilder writeBoolean(boolean value) {
+    writeBooleanInternal(value);
+    return this;
+  }
+
+  public VariantPrimitiveBuilder writeNumeric(long value) { // INT8..INT64 by value, per the tests
+    writeNumericInternal(value);
+    return this;
+  }
+
+  public VariantPrimitiveBuilder writeDouble(double value) {
+    writeDoubleInternal(value);
+    return this;
+  }
+
+  public VariantPrimitiveBuilder writeDecimal(BigDecimal value) {
+    writeDecimalInternal(value);
+    return this;
+  }
+
+  public VariantPrimitiveBuilder writeDate(LocalDate value) {
+    writeDateInternal(DateTimeUtil.daysFromDate(value));
+    return this;
+  }
+
+  public VariantPrimitiveBuilder writeTimestampTz(OffsetDateTime value) {
+    writeTimestampTzInternal(DateTimeUtil.microsFromTimestamptz(value));
+    return this;
+  }
+
+  public VariantPrimitiveBuilder writeTimestampNtz(LocalDateTime value) {
+    writeTimestampNtzInternal(DateTimeUtil.microsFromTimestamp(value));
+    return this;
+  }
+
+  public VariantPrimitiveBuilder writeFloat(float value) {
+    writeFloatInternal(value);
+    return this;
+  }
+
+  public VariantPrimitiveBuilder writeBinary(byte[] value) {
+    writeBinaryInternal(value);
+    return this;
+  }
+
+  public VariantPrimitiveBuilder writeString(String value) {
+    writeStringInternal(value);
+    return this;
+  }
+}
diff --git a/core/src/test/java/org/apache/iceberg/variants/TestVariantBuilder.java b/core/src/test/java/org/apache/iceberg/variants/TestVariantBuilder.java
deleted file mode 100644
index d77272f0b35d..000000000000
---
a/core/src/test/java/org/apache/iceberg/variants/TestVariantBuilder.java +++ /dev/null @@ -1,176 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.variants; - -import static org.assertj.core.api.Assertions.assertThat; - -import java.io.IOException; -import java.math.BigDecimal; -import java.nio.ByteBuffer; -import java.util.List; -import java.util.stream.Stream; -import net.minidev.json.JSONArray; -import org.apache.iceberg.util.DateTimeUtil; -import org.junit.jupiter.api.Test; -import org.junit.jupiter.params.ParameterizedTest; -import org.junit.jupiter.params.provider.Arguments; -import org.junit.jupiter.params.provider.MethodSource; - -public class TestVariantBuilder { - @ParameterizedTest - @MethodSource("primitiveInputs") - public void testPrimitive(String input, Variants.PhysicalType expectedType, Object expectedValue) throws IOException { - Variant variant = VariantBuilder.parseJson(input); - - SerializedPrimitive primitive = SerializedPrimitive.from(variant); - - assertThat(primitive.type()).isEqualTo(expectedType); - assertThat(primitive.get()).isEqualTo(expectedValue); - } - - private static Stream primitiveInputs() { - return Stream.of( - Arguments.of("null", 
Variants.PhysicalType.NULL, null), - Arguments.of("true", Variants.PhysicalType.BOOLEAN_TRUE, true), - Arguments.of("false", Variants.PhysicalType.BOOLEAN_FALSE, false), - Arguments.of("34", Variants.PhysicalType.INT8, (byte)34), - Arguments.of("1234", Variants.PhysicalType.INT16, (short)1234), - Arguments.of("1234567890", Variants.PhysicalType.INT32, 1234567890), - Arguments.of("1234567890987654321", Variants.PhysicalType.INT64, 1234567890987654321L), - Arguments.of("1234e-2", Variants.PhysicalType.DOUBLE, 12.34), - Arguments.of("123456.789", Variants.PhysicalType.DECIMAL4, new BigDecimal("123456.789")), - Arguments.of("123456789.987654321", Variants.PhysicalType.DECIMAL8, new BigDecimal("123456789.987654321")), - Arguments.of("12345678901234567890.987654321", Variants.PhysicalType.DECIMAL16, new BigDecimal("12345678901234567890.987654321")), - Arguments.of("\"This test string is used to generate a primitive string type of variant\"", Variants.PhysicalType.STRING, "This test string is used to generate a primitive string type of variant") - - ); - } - - @Test - public void testPrimitiveFloat() { - VariantBuilder builder = new VariantBuilder(); - builder.appendFloat(12.34f); - Variant variant = builder.build(); - SerializedPrimitive primitive = SerializedPrimitive.from(variant); - - assertThat(primitive.type()).isEqualTo(Variants.PhysicalType.FLOAT); - assertThat(primitive.get()).isEqualTo(12.34f); - } - - @Test - public void testPrimitiveDate() { - String dateString = "2017-08-18"; - int daysSinceEpoch = DateTimeUtil.isoDateToDays(dateString); - - VariantBuilder builder = new VariantBuilder(); - builder.appendDate(daysSinceEpoch); - Variant variant = builder.build(); - SerializedPrimitive primitive = SerializedPrimitive.from(variant); - - assertThat(primitive.type()).isEqualTo(Variants.PhysicalType.DATE); - assertThat(DateTimeUtil.daysToIsoDate((int)primitive.get())).isEqualTo(dateString); - } - - @Test - public void testPrimitiveTimestampTZ() { - String tzString = 
"2017-08-18T14:21:01.919+00:00"; - long microsSinceEpoch = DateTimeUtil.isoTimestamptzToMicros(tzString); - - VariantBuilder builder = new VariantBuilder(); - builder.appendTimestampTz(microsSinceEpoch); - Variant variant = builder.build(); - SerializedPrimitive primitive = SerializedPrimitive.from(variant); - - assertThat(primitive.type()).isEqualTo(Variants.PhysicalType.TIMESTAMPTZ); - assertThat(DateTimeUtil.microsToIsoTimestamptz((long)primitive.get())).isEqualTo(tzString); - } - - @Test - public void testPrimitiveTimestampNTZ() { - String ntzString = "2017-08-18T14:21:01.919"; - long microsSinceEpoch = DateTimeUtil.isoTimestampToMicros(ntzString); - - VariantBuilder builder = new VariantBuilder(); - builder.appendTimestampNtz(microsSinceEpoch); - Variant variant = builder.build(); - SerializedPrimitive primitive = SerializedPrimitive.from(variant); - - assertThat(primitive.type()).isEqualTo(Variants.PhysicalType.TIMESTAMPNTZ); - assertThat(DateTimeUtil.microsToIsoTimestamp((long)primitive.get())).isEqualTo(ntzString); - } - - @Test - public void testPrimitiveBinary() { - VariantBuilder builder = new VariantBuilder(); - builder.appendBinary("iceberg".getBytes()); - Variant variant = builder.build(); - SerializedPrimitive primitive = SerializedPrimitive.from(variant); - - assertThat(primitive.type()).isEqualTo(Variants.PhysicalType.BINARY); - assertThat(primitive.get()).isEqualTo(ByteBuffer.wrap("iceberg".getBytes())); - } - - @Test - public void testShortString() throws IOException { - Variant variant = VariantBuilder.parseJson("\"iceberg\""); - SerializedShortString shortString = SerializedShortString.from(variant); - - assertThat(shortString.type()).isEqualTo(Variants.PhysicalType.STRING); - assertThat(shortString.get()).isEqualTo("iceberg"); - } - - @Test - public void testArray() throws IOException { - List input = List.of("Ford", "BMW", "Fiat"); - Variant variant = VariantBuilder.parseJson(JSONArray.toJSONString(input)); - SerializedArray arr = 
SerializedArray.from(variant); - - assertThat(arr.type()).isEqualTo(Variants.PhysicalType.ARRAY); - for (int i = 0; i < arr.numElements(); i++) { - assertThat(arr.get(i).asPrimitive().get()).isEqualTo(input.get(i)); - } - } - - @Test - public void testEmptyObject() throws IOException { - Variant variant = VariantBuilder.parseJson("{}"); - SerializedObject object = SerializedObject.from(variant); - - assertThat(object.type()).isEqualTo(Variants.PhysicalType.OBJECT); - assertThat(object.numElements()).isEqualTo(0); - } - - @Test - public void testObject() throws IOException { - Variant variant = VariantBuilder.parseJson("{ \"id\": 1234, \"firstName\": \"Joe\", \"lastName\": \"Smith\", \"phones\":[\"123-456-7890\", \"789-123-4560\"] }"); - SerializedObject object = SerializedObject.from(variant); - - assertThat(object.type()).isEqualTo(Variants.PhysicalType.OBJECT); - assertThat(object.numElements()).isEqualTo(4); - - assertThat(object.get("id").asPrimitive().get()).isEqualTo((short)1234); - assertThat(object.get("firstName").asPrimitive().get()).isEqualTo("Joe"); - assertThat(object.get("lastName").asPrimitive().get()).isEqualTo("Smith"); - - VariantArray phones = object.get("phones").asArray(); - assertThat(phones.numElements()).isEqualTo(2); - assertThat(phones.get(0).asPrimitive().get()).isEqualTo("123-456-7890"); - assertThat(phones.get(1).asPrimitive().get()).isEqualTo("789-123-4560"); - } -} diff --git a/core/src/test/java/org/apache/iceberg/variants/TestVariantBuilderArray.java b/core/src/test/java/org/apache/iceberg/variants/TestVariantBuilderArray.java new file mode 100644 index 000000000000..20d820594b30 --- /dev/null +++ b/core/src/test/java/org/apache/iceberg/variants/TestVariantBuilderArray.java @@ -0,0 +1,183 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg.variants;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+import java.io.IOException;
+import java.math.BigDecimal;
+import java.nio.ByteBuffer;
+import java.time.LocalDate;
+import java.time.LocalDateTime;
+import java.time.OffsetDateTime;
+import java.util.List;
+import net.minidev.json.JSONArray;
+import org.apache.iceberg.util.DateTimeUtil;
+import org.junit.jupiter.api.Test;
+
+public class TestVariantBuilderArray {
+  @Test
+  public void testSimpleArrayJson() throws IOException {
+    List<String> input = List.of("Ford", "BMW", "Fiat");
+    Variant variant = VariantBuilder.parseJson(JSONArray.toJSONString(input));
+    VariantArray arr = variant.value().asArray();
+
+    assertThat(arr.type()).isEqualTo(Variants.PhysicalType.ARRAY);
+    for (int i = 0; i < arr.numElements(); i++) { // NOTE(review): vacuous if arr is empty
+      assertThat(arr.get(i).asPrimitive().get()).isEqualTo(input.get(i));
+    }
+  }
+
+  @Test
+  public void testArrayJson() throws IOException {
+    String input =
+        "[{\n"
+            + " \"firstName\": \"John\","
+            + " \"lastName\": \"Smith\","
+            + " \"age\": 25,\n"
+            + " \"address\" : {\n"
+            + " \"streetAddress\": \"21 2nd Street\",\n"
+            + " \"city\": \"New York\",\n"
+            + " \"state\": \"NY\",\n"
+            + " \"postalCode\": \"10021\"\n"
+            + " },\n"
+            + " \"phoneNumber\": [\n"
+            + " {\"type\": \"home\", \"number\": \"212 555-1234\"},\n"
+            + " {\"type\": \"fax\", \"number\": \"646 555-4567\"}\n"
+            + " ]\n"
+            + " }]";
+    validateVariant(VariantBuilder.parseJson(input));
+  }
+
+  @Test
+  public void testBuildSimpleArray() {
+    List<String> input = List.of("Ford", "BMW", "Fiat");
+    VariantArrayBuilder builder = new VariantBuilder().startArray();
+    for (String str : input) {
+      builder.writeString(str);
+    }
+    builder.endArray();
+
+    Variant variant = builder.build();
+    VariantArray arr = variant.value().asArray();
+
+    assertThat(arr.type()).isEqualTo(Variants.PhysicalType.ARRAY);
+    assertThat(arr.numElements()).isEqualTo(3);
+    for (int i = 0; i < arr.numElements(); i++) {
+      assertThat(arr.get(i).asPrimitive().get()).isEqualTo(input.get(i));
+    }
+  }
+
+  @Test
+  public void testBuildArray() {
+    VariantArrayBuilder builder = new VariantBuilder().startArray();
+    builder
+        .writeNull()
+        .writeBoolean(true)
+        .writeBoolean(false)
+        .writeNumeric(34)
+        .writeNumeric(1234)
+        .writeNumeric(1234567890)
+        .writeNumeric(1234567890987654321L)
+        .writeDouble(1234e-2)
+        .writeDecimal(new BigDecimal("123456.789"))
+        .writeDecimal(new BigDecimal("123456789.987654321"))
+        .writeDecimal(new BigDecimal("12345678901234567890.987654321"))
+        .writeDate(LocalDate.parse("2017-08-18"))
+        .writeTimestampTz(OffsetDateTime.parse("2017-08-18T14:21:01.919+00:00"))
+        .writeTimestampNtz(LocalDateTime.parse("2017-08-18T14:21:01.919"))
+        .writeFloat(12.34f)
+        .writeBinary("iceberg".getBytes())
+        .writeString("This test string is used to generate a primitive string type of variant")
+        .writeString("iceberg");
+    builder.startArray().writeString("Ford").writeString("BMW").writeString("Fiat").endArray();
+
+    builder
+        .startObject()
+        .writeString("firstName", "John")
+        .writeString("lastName", "Smith")
+        .writeNumeric("age", 25)
+        .endObject();
+    builder.endArray();
+
+    Variant variant = builder.build();
+    VariantArray arr = variant.value().asArray();
+    assertThat(arr.type()).isEqualTo(Variants.PhysicalType.ARRAY);
+    assertThat(arr.numElements()).isEqualTo(20);
+    assertThat(arr.get(0).asPrimitive().get()).isNull();
+    assertThat(arr.get(1).asPrimitive().get()).isEqualTo(true);
+    assertThat(arr.get(2).asPrimitive().get()).isEqualTo(false);
+    assertThat(arr.get(3).asPrimitive().get()).isEqualTo((byte) 34);
+    assertThat(arr.get(4).asPrimitive().get()).isEqualTo((short) 1234);
+    assertThat(arr.get(5).asPrimitive().get()).isEqualTo(1234567890);
+    assertThat(arr.get(6).asPrimitive().get()).isEqualTo(1234567890987654321L);
+    assertThat(arr.get(7).asPrimitive().get()).isEqualTo(12.34);
+    assertThat(arr.get(8).asPrimitive().get()).isEqualTo(new BigDecimal("123456.789"));
+    assertThat(arr.get(9).asPrimitive().get()).isEqualTo(new BigDecimal("123456789.987654321"));
+    assertThat(arr.get(10).asPrimitive().get())
+        .isEqualTo(new BigDecimal("12345678901234567890.987654321"));
+    assertThat(arr.get(11).asPrimitive().get())
+        .isEqualTo(DateTimeUtil.daysFromDate(LocalDate.parse("2017-08-18")));
+    assertThat(arr.get(12).asPrimitive().get())
+        .isEqualTo(
+            DateTimeUtil.microsFromTimestamptz(
+                OffsetDateTime.parse("2017-08-18T14:21:01.919+00:00")));
+    assertThat(arr.get(13).asPrimitive().get())
+        .isEqualTo(
+            DateTimeUtil.microsFromTimestamp(LocalDateTime.parse("2017-08-18T14:21:01.919")));
+    assertThat(arr.get(14).asPrimitive().get()).isEqualTo(12.34f);
+    assertThat(arr.get(15).asPrimitive().get()).isEqualTo(ByteBuffer.wrap("iceberg".getBytes()));
+    assertThat(arr.get(16).asPrimitive().get())
+        .isEqualTo("This test string is used to generate a primitive string type of variant");
+    assertThat(arr.get(17).asPrimitive().get()).isEqualTo("iceberg");
+    assertThat(arr.get(18).type()).isEqualTo(Variants.PhysicalType.ARRAY);
+
+    assertThat(arr.get(19).type()).isEqualTo(Variants.PhysicalType.OBJECT);
+  }
+
+  private void validateVariant(Variant variant) {
+    VariantArray arr = variant.value().asArray();
+    assertThat(arr.numElements()).isEqualTo(1);
+
+    VariantObject object = arr.get(0).asObject();
+    assertThat(object.type()).isEqualTo(Variants.PhysicalType.OBJECT);
+    assertThat(object.numElements()).isEqualTo(5);
+
+    assertThat(object.get("firstName").asPrimitive().get()).isEqualTo("John");
+    assertThat(object.get("lastName").asPrimitive().get()).isEqualTo("Smith");
+    assertThat(object.get("age").asPrimitive().get()).isEqualTo((byte) 25);
+
+    VariantObject address = object.get("address").asObject();
+    assertThat(address.type()).isEqualTo(Variants.PhysicalType.OBJECT);
+    assertThat(address.numElements()).isEqualTo(4);
+    assertThat(address.get("streetAddress").asPrimitive().get()).isEqualTo("21 2nd Street");
+    assertThat(address.get("city").asPrimitive().get()).isEqualTo("New York");
+    assertThat(address.get("state").asPrimitive().get()).isEqualTo("NY");
+    assertThat(address.get("postalCode").asPrimitive().get()).isEqualTo("10021");
+
+    VariantArray phoneNumbers = object.get("phoneNumber").asArray();
+    assertThat(phoneNumbers.numElements()).isEqualTo(2);
+    VariantObject phoneNumber1 = phoneNumbers.get(0).asObject();
+    assertThat(phoneNumber1.get("type").asPrimitive().get()).isEqualTo("home");
+    assertThat(phoneNumber1.get("number").asPrimitive().get()).isEqualTo("212 555-1234");
+    VariantObject phoneNumber2 = phoneNumbers.get(1).asObject();
+    assertThat(phoneNumber2.get("type").asPrimitive().get()).isEqualTo("fax");
+    assertThat(phoneNumber2.get("number").asPrimitive().get()).isEqualTo("646 555-4567");
+  }
+}
diff --git a/core/src/test/java/org/apache/iceberg/variants/TestVariantBuilderObject.java b/core/src/test/java/org/apache/iceberg/variants/TestVariantBuilderObject.java
new file mode 100644
index 000000000000..26d6a4712fc6
--- /dev/null
+++ b/core/src/test/java/org/apache/iceberg/variants/TestVariantBuilderObject.java
@@ -0,0 +1,117 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.
The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg.variants;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+import java.io.IOException;
+import org.junit.jupiter.api.Test;
+
+public class TestVariantBuilderObject {
+  @Test
+  public void testEmptyObjectJson() throws IOException { // "{}" must parse to a zero-field object
+    Variant variant = VariantBuilder.parseJson("{}");
+    VariantObject object = variant.value().asObject();
+
+    assertThat(object.type()).isEqualTo(Variants.PhysicalType.OBJECT);
+    assertThat(object.numElements()).isEqualTo(0);
+  }
+
+  @Test
+  public void testObjectJson() throws IOException {
+    String input =
+        "{\n"
+            + " \"firstName\": \"John\","
+            + " \"lastName\": \"Smith\","
+            + " \"age\": 25,\n"
+            + " \"address\" : {\n"
+            + " \"streetAddress\": \"21 2nd Street\",\n"
+            + " \"city\": \"New York\",\n"
+            + " \"state\": \"NY\",\n"
+            + " \"postalCode\": \"10021\"\n"
+            + " },\n"
+            + " \"phoneNumber\": [\n"
+            + " {\"type\": \"home\", \"number\": \"212 555-1234\"},\n"
+            + " {\"type\": \"fax\", \"number\": \"646 555-4567\"}\n"
+            + " ]\n"
+            + " }";
+
+    validateVariant(VariantBuilder.parseJson(input));
+  }
+
+  @Test
+  public void testBuildObject() { // builds the same document as testObjectJson via the builders
+    VariantObjectBuilder builder =
+        new VariantBuilder()
+            .startObject()
+            .writeString("firstName", "John")
+            .writeString("lastName", "Smith")
+            .writeNumeric("age", 25);
+    builder
+        .startObject("address")
+        .writeString("streetAddress", "21 2nd Street")
+        .writeString("city", "New York")
+        .writeString("state", "NY")
+        .writeString("postalCode", "10021")
+        .endObject();
+    VariantArrayBuilder phoneNumberBuilder = builder.startArray("phoneNumber");
+    phoneNumberBuilder
+        .startObject()
+        .writeString("type", "home")
+        .writeString("number", "212 555-1234")
+        .endObject();
+    phoneNumberBuilder
+        .startObject()
+        .writeString("type", "fax")
+        .writeString("number", "646 555-4567")
+        .endObject();
+    phoneNumberBuilder.endArray();
+    builder.endObject(); // the outer object is closed after its nested builders
+
+    validateVariant(builder.build());
+  }
+
+  private void validateVariant(Variant variant) { // shared by the JSON and builder tests
+    VariantObject object = variant.value().asObject();
+
+    assertThat(object.type()).isEqualTo(Variants.PhysicalType.OBJECT);
+    assertThat(object.numElements()).isEqualTo(5);
+
+    assertThat(object.get("firstName").asPrimitive().get()).isEqualTo("John");
+    assertThat(object.get("lastName").asPrimitive().get()).isEqualTo("Smith");
+    assertThat(object.get("age").asPrimitive().get()).isEqualTo((byte) 25);
+
+    VariantObject address = object.get("address").asObject();
+    assertThat(address.type()).isEqualTo(Variants.PhysicalType.OBJECT);
+    assertThat(address.numElements()).isEqualTo(4);
+    assertThat(address.get("streetAddress").asPrimitive().get()).isEqualTo("21 2nd Street");
+    assertThat(address.get("city").asPrimitive().get()).isEqualTo("New York");
+    assertThat(address.get("state").asPrimitive().get()).isEqualTo("NY");
+    assertThat(address.get("postalCode").asPrimitive().get()).isEqualTo("10021");
+
+    VariantArray phoneNumbers = object.get("phoneNumber").asArray();
+    assertThat(phoneNumbers.numElements()).isEqualTo(2);
+    VariantObject phoneNumber1 = phoneNumbers.get(0).asObject();
+    assertThat(phoneNumber1.get("type").asPrimitive().get()).isEqualTo("home");
+    assertThat(phoneNumber1.get("number").asPrimitive().get()).isEqualTo("212 555-1234");
+    VariantObject phoneNumber2 = phoneNumbers.get(1).asObject();
+    assertThat(phoneNumber2.get("type").asPrimitive().get()).isEqualTo("fax");
+    assertThat(phoneNumber2.get("number").asPrimitive().get()).isEqualTo("646 555-4567");
+  }
+}
diff --git a/core/src/test/java/org/apache/iceberg/variants/TestVariantBuilderPrimitive.java b/core/src/test/java/org/apache/iceberg/variants/TestVariantBuilderPrimitive.java
new file mode 100644
index 000000000000..aeb3148377bf
--- /dev/null
+++ b/core/src/test/java/org/apache/iceberg/variants/TestVariantBuilderPrimitive.java
@@ -0,0 +1,246 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */ +package org.apache.iceberg.variants; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.IOException; +import java.math.BigDecimal; +import java.nio.ByteBuffer; +import java.time.LocalDate; +import java.time.LocalDateTime; +import java.time.OffsetDateTime; +import java.util.stream.Stream; +import org.apache.iceberg.util.DateTimeUtil; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; +import org.junit.jupiter.params.provider.ValueSource; + +public class TestVariantBuilderPrimitive { + private static Stream primitiveInputs() { + return Stream.of( + Arguments.of("null", Variants.PhysicalType.NULL, null), + Arguments.of("true", Variants.PhysicalType.BOOLEAN_TRUE, true), + Arguments.of("false", Variants.PhysicalType.BOOLEAN_FALSE, false), + Arguments.of("34", Variants.PhysicalType.INT8, (byte) 34), + Arguments.of("1234", Variants.PhysicalType.INT16, (short) 1234), + Arguments.of("1234567890", Variants.PhysicalType.INT32, 1234567890), + Arguments.of("1234567890987654321", Variants.PhysicalType.INT64, 1234567890987654321L), + Arguments.of("1234e-2", Variants.PhysicalType.DOUBLE, 12.34), + Arguments.of("123456.789", Variants.PhysicalType.DECIMAL4, new BigDecimal("123456.789")), + Arguments.of( + "123456789.987654321", + Variants.PhysicalType.DECIMAL8, + new BigDecimal("123456789.987654321")), + Arguments.of( + "12345678901234567890.987654321", + Variants.PhysicalType.DECIMAL16, + new BigDecimal("12345678901234567890.987654321")), + Arguments.of( + "\"This test string is used to generate a primitive string type of variant\"", + Variants.PhysicalType.STRING, + "This test string is used to generate a primitive string type of variant")); + } + + @ParameterizedTest + @MethodSource("primitiveInputs") + public void testPrimitiveJson( + String input, Variants.PhysicalType expectedType, Object 
expectedValue) throws IOException { + Variant variant = VariantBuilder.parseJson(input); + VariantPrimitive primitive = variant.value().asPrimitive(); + + assertThat(primitive.type()).isEqualTo(expectedType); + assertThat(primitive.get()).isEqualTo(expectedValue); + } + + @Test + public void testShortStringJson() throws IOException { + Variant variant = VariantBuilder.parseJson("\"iceberg\""); + VariantPrimitive shortString = variant.value().asPrimitive(); + + assertThat(shortString.type()).isEqualTo(Variants.PhysicalType.STRING); + assertThat(shortString.get()).isEqualTo("iceberg"); + } + + @Test + public void testPrimitiveNull() { + VariantPrimitiveBuilder builder = new VariantBuilder().createPrimitive(); + builder.writeNull(); + Variant variant = builder.build(); + VariantPrimitive primitive = variant.value().asPrimitive(); + + assertThat(primitive.type()).isEqualTo(Variants.PhysicalType.NULL); + assertThat(primitive.get()).isEqualTo(null); + } + + @ParameterizedTest + @ValueSource(booleans = {true, false}) + public void testPrimitiveBoolean(boolean value) { + VariantPrimitiveBuilder builder = new VariantBuilder().createPrimitive(); + builder.writeBoolean(value); + Variant variant = builder.build(); + VariantPrimitive primitive = variant.value().asPrimitive(); + + assertThat(primitive.type()) + .isEqualTo( + value ? 
Variants.PhysicalType.BOOLEAN_TRUE : Variants.PhysicalType.BOOLEAN_FALSE); + assertThat(primitive.get()).isEqualTo(value); + } + + private static Stream testPrimitiveNumericInputs() { + return Stream.of( + Arguments.of(34, Variants.PhysicalType.INT8, (byte) 34), + Arguments.of(1234, Variants.PhysicalType.INT16, (short) 1234), + Arguments.of(1234567890, Variants.PhysicalType.INT32, 1234567890), + Arguments.of(1234567890987654321L, Variants.PhysicalType.INT64, 1234567890987654321L)); + } + + @ParameterizedTest + @MethodSource("testPrimitiveNumericInputs") + public void testPrimitiveNumeric(long value, Variants.PhysicalType type, Object expectedValue) { + VariantPrimitiveBuilder builder = new VariantBuilder().createPrimitive(); + builder.writeNumeric(value); + Variant variant = builder.build(); + VariantPrimitive primitive = variant.value().asPrimitive(); + + assertThat(primitive.type()).isEqualTo(type); + assertThat(primitive.get()).isEqualTo(expectedValue); + } + + @Test + public void testPrimitiveDouble() { + VariantPrimitiveBuilder builder = new VariantBuilder().createPrimitive(); + builder.writeDouble(1234e-2); + Variant variant = builder.build(); + VariantPrimitive primitive = variant.value().asPrimitive(); + + assertThat(primitive.type()).isEqualTo(Variants.PhysicalType.DOUBLE); + assertThat(primitive.get()).isEqualTo(12.34); + } + + private static Stream testPrimitiveDecimalInputs() { + return Stream.of( + Arguments.of(new BigDecimal("123456.789"), Variants.PhysicalType.DECIMAL4), + Arguments.of(new BigDecimal("123456789.987654321"), Variants.PhysicalType.DECIMAL8), + Arguments.of( + new BigDecimal("12345678901234567890.987654321"), Variants.PhysicalType.DECIMAL16)); + } + + @ParameterizedTest + @MethodSource("testPrimitiveDecimalInputs") + public void testPrimitiveDecimal(BigDecimal value, Variants.PhysicalType type) { + VariantPrimitiveBuilder builder = new VariantBuilder().createPrimitive(); + builder.writeDecimal(value); + Variant variant = 
builder.build(); + VariantPrimitive primitive = variant.value().asPrimitive(); + + assertThat(primitive.type()).isEqualTo(type); + assertThat(primitive.get()).isEqualTo(value); + } + + @Test + public void testPrimitiveDate() { + String dateString = "2017-08-18"; + LocalDate date = LocalDate.parse(dateString); + + VariantPrimitiveBuilder builder = new VariantBuilder().createPrimitive(); + builder.writeDate(date); + Variant variant = builder.build(); + VariantPrimitive primitive = variant.value().asPrimitive(); + + assertThat(primitive.type()).isEqualTo(Variants.PhysicalType.DATE); + assertThat(primitive.get()).isEqualTo(DateTimeUtil.daysFromDate(date)); + } + + @Test + public void testPrimitiveTimestampTZ() { + String tzString = "2017-08-18T14:21:01.919+00:00"; + OffsetDateTime ts = OffsetDateTime.parse(tzString); + + VariantPrimitiveBuilder builder = new VariantBuilder().createPrimitive(); + builder.writeTimestampTz(ts); + Variant variant = builder.build(); + VariantPrimitive primitive = variant.value().asPrimitive(); + + assertThat(primitive.type()).isEqualTo(Variants.PhysicalType.TIMESTAMPTZ); + assertThat(primitive.get()).isEqualTo(DateTimeUtil.microsFromTimestamptz(ts)); + } + + @Test + public void testPrimitiveTimestampNTZ() { + String ntzString = "2017-08-18T14:21:01.919"; + LocalDateTime ts = LocalDateTime.parse(ntzString); + + VariantPrimitiveBuilder builder = new VariantBuilder().createPrimitive(); + builder.writeTimestampNtz(ts); + Variant variant = builder.build(); + VariantPrimitive primitive = variant.value().asPrimitive(); + + assertThat(primitive.type()).isEqualTo(Variants.PhysicalType.TIMESTAMPNTZ); + assertThat(primitive.get()).isEqualTo(DateTimeUtil.microsFromTimestamp(ts)); + } + + @Test + public void testPrimitiveFloat() { + VariantPrimitiveBuilder builder = new VariantBuilder().createPrimitive(); + builder.writeFloat(12.34f); + Variant variant = builder.build(); + VariantPrimitive primitive = variant.value().asPrimitive(); + + 
assertThat(primitive.type()).isEqualTo(Variants.PhysicalType.FLOAT); + assertThat(primitive.get()).isEqualTo(12.34f); + } + + @Test + public void testPrimitiveBinary() { + VariantPrimitiveBuilder builder = new VariantBuilder().createPrimitive(); + builder.writeBinary("iceberg".getBytes()); + Variant variant = builder.build(); + VariantPrimitive primitive = variant.value().asPrimitive(); + + assertThat(primitive.type()).isEqualTo(Variants.PhysicalType.BINARY); + assertThat(primitive.get()).isEqualTo(ByteBuffer.wrap("iceberg".getBytes())); + } + + @Test + public void testPrimitiveString() { + String value = "This test string is used to generate a primitive string type of variant"; + VariantPrimitiveBuilder builder = new VariantBuilder().createPrimitive(); + builder.writeString(value); + Variant variant = builder.build(); + VariantPrimitive primitive = variant.value().asPrimitive(); + + assertThat(primitive.type()).isEqualTo(Variants.PhysicalType.STRING); + assertThat(primitive.get()).isEqualTo(value); + } + + @Test + public void testPrimitiveShortString() { + String value = "iceberg"; + VariantPrimitiveBuilder builder = new VariantBuilder().createPrimitive(); + builder.writeString(value); + Variant variant = builder.build(); + VariantPrimitive shortString = variant.value().asPrimitive(); + + assertThat(shortString.type()).isEqualTo(Variants.PhysicalType.STRING); + assertThat(shortString.get()).isEqualTo("iceberg"); + } +} From b2290a04b16440c3535175330d3541e303be56a2 Mon Sep 17 00:00:00 2001 From: Aihua Xu Date: Fri, 31 Jan 2025 16:04:50 -0800 Subject: [PATCH 4/5] Refactoring to use VariantUtil --- .../apache/iceberg/variants/VariantArray.java | 4 - .../iceberg/variants/VariantArrayBuilder.java | 12 +- .../iceberg/variants/VariantBuilder.java | 39 ++- .../iceberg/variants/VariantBuilderBase.java | 256 ++++++------------ .../iceberg/variants/VariantConstants.java | 33 --- .../apache/iceberg/variants/VariantImpl.java | 28 +- .../iceberg/variants/VariantObject.java | 4 
- .../variants/VariantObjectBuilder.java | 14 +- .../variants/VariantPrimitiveBuilder.java | 4 +- .../variants/VariantSizeLimitException.java | 26 -- .../apache/iceberg/variants/VariantUtil.java | 21 +- .../org/apache/iceberg/variants/Variants.java | 3 + .../variants/TestVariantBuilderArray.java | 4 +- .../variants/TestVariantBuilderObject.java | 6 +- 14 files changed, 157 insertions(+), 297 deletions(-) delete mode 100644 core/src/main/java/org/apache/iceberg/variants/VariantConstants.java delete mode 100644 core/src/main/java/org/apache/iceberg/variants/VariantSizeLimitException.java diff --git a/core/src/main/java/org/apache/iceberg/variants/VariantArray.java b/core/src/main/java/org/apache/iceberg/variants/VariantArray.java index 5de38df6e417..dd1aa5cf4f10 100644 --- a/core/src/main/java/org/apache/iceberg/variants/VariantArray.java +++ b/core/src/main/java/org/apache/iceberg/variants/VariantArray.java @@ -20,10 +20,6 @@ /** An variant array value. */ public interface VariantArray extends VariantValue { - default int numElements() { - throw new UnsupportedOperationException(); - } - /** Returns the {@link VariantValue} at {@code index} in this array. 
*/ VariantValue get(int index); diff --git a/core/src/main/java/org/apache/iceberg/variants/VariantArrayBuilder.java b/core/src/main/java/org/apache/iceberg/variants/VariantArrayBuilder.java index e07856444e4d..9a8beb1b343d 100644 --- a/core/src/main/java/org/apache/iceberg/variants/VariantArrayBuilder.java +++ b/core/src/main/java/org/apache/iceberg/variants/VariantArrayBuilder.java @@ -29,19 +29,19 @@ public class VariantArrayBuilder extends VariantBuilderBase { private final List offsets; - public VariantArrayBuilder(ByteBufferWrapper buffer, Dictionary dict) { - super(buffer, dict); + public VariantArrayBuilder(ByteBufferWrapper valueBuffer, Dictionary dict) { + super(valueBuffer, dict); offsets = Lists.newArrayList(); } public VariantObjectBuilder startObject() { addOffset(); - return new VariantObjectBuilder(getBuffer(), getDict()); + return new VariantObjectBuilder(valueBuffer, dict); } public VariantArrayBuilder startArray() { addOffset(); - return new VariantArrayBuilder(getBuffer(), getDict()); + return new VariantArrayBuilder(valueBuffer, dict); } public VariantArrayBuilder writeNull() { @@ -111,10 +111,10 @@ public VariantArrayBuilder writeString(String str) { } private void addOffset() { - offsets.add(getBuffer().getPos() - getStartPos()); + offsets.add(valueBuffer.pos() - startPos); } public void endArray() { - super.endArray(getStartPos(), offsets); + super.endArray(startPos, offsets); } } diff --git a/core/src/main/java/org/apache/iceberg/variants/VariantBuilder.java b/core/src/main/java/org/apache/iceberg/variants/VariantBuilder.java index 598981714ba5..6984de7d5970 100644 --- a/core/src/main/java/org/apache/iceberg/variants/VariantBuilder.java +++ b/core/src/main/java/org/apache/iceberg/variants/VariantBuilder.java @@ -32,20 +32,19 @@ /** A builder class to build a primitive/array/object variant. 
*/ public class VariantBuilder extends VariantBuilderBase { public VariantBuilder() { - super(new VariantBuilderBase.ByteBufferWrapper(), new VariantBuilderBase.Dictionary()); + super(new ByteBufferWrapper(), new Dictionary()); } public VariantPrimitiveBuilder createPrimitive() { - VariantPrimitiveBuilder primitiveBuilder = new VariantPrimitiveBuilder(getBuffer(), getDict()); - return primitiveBuilder; + return new VariantPrimitiveBuilder(valueBuffer, dict); } public VariantObjectBuilder startObject() { - return new VariantObjectBuilder(getBuffer(), getDict()); + return new VariantObjectBuilder(valueBuffer, dict); } public VariantArrayBuilder startArray() { - return new VariantArrayBuilder(getBuffer(), getDict()); + return new VariantArrayBuilder(valueBuffer, dict); } /** @@ -63,13 +62,13 @@ public static Variant parseJson(String json) throws IOException { parser.nextToken(); VariantBuilder builder = new VariantBuilder(); - builder.buildJson(parser); + builder.parseJson(parser); return builder.build(); } } - private void buildJson(JsonParser parser) throws IOException { + private void parseJson(JsonParser parser) throws IOException { JsonToken token = parser.currentToken(); if (token == null) { @@ -108,16 +107,16 @@ private void buildJson(JsonParser parser) throws IOException { private void writeObject(JsonParser parser) throws IOException { List fields = Lists.newArrayList(); - int startPos = getBuffer().getPos(); + int startPos = valueBuffer.pos(); // Store object keys to dictionary of metadata while (parser.nextToken() != JsonToken.END_OBJECT) { String key = parser.currentName(); parser.nextToken(); // Move to the value - int id = getDict().add(key); - fields.add(new VariantBuilderBase.FieldEntry(key, id, getBuffer().getPos() - startPos)); - buildJson(parser); + int id = dict.add(key); + fields.add(new VariantBuilderBase.FieldEntry(key, id, valueBuffer.pos() - startPos)); + parseJson(parser); } endObject(startPos, fields); @@ -125,11 +124,11 @@ private void 
writeObject(JsonParser parser) throws IOException { private void writeArray(JsonParser parser) throws IOException { List offsets = Lists.newArrayList(); - int startPos = getBuffer().getPos(); + int startPos = valueBuffer.pos(); while (parser.nextToken() != JsonToken.END_ARRAY) { - offsets.add(getBuffer().getPos() - startPos); - buildJson(parser); + offsets.add(valueBuffer.pos() - startPos); + parseJson(parser); } endArray(startPos, offsets); @@ -150,12 +149,10 @@ private void writeFloat(JsonParser parser) throws IOException { } /** - * Attempts to parse a JSON number as a decimal and write it. The input must meet the following - * criteria: - Be in a valid decimal format (integer with an optional '.'). - Not in scientific - * notation. - Fit within the precision and scale limits of decimal types. + * This function attempts to parse a JSON number and write it as a decimal value. * - * @param input the input string representing the JSON number - * @return true if the decimal is valid and written successfully; false otherwise + * @param input the input string expecting to be in decimal format, not in scientific notation. + * @return true if the decimal is valid and written successfully; false otherwise. */ private boolean tryWriteDecimal(String input) { // Validate that the input matches a decimal format and is not in scientific notation. @@ -167,8 +164,8 @@ private boolean tryWriteDecimal(String input) { BigDecimal decimalValue = new BigDecimal(input); // Ensure the decimal value meets precision and scale limits. 
- if (decimalValue.scale() <= VariantConstants.MAX_DECIMAL16_PRECISION - && decimalValue.precision() <= VariantConstants.MAX_DECIMAL16_PRECISION) { + if (decimalValue.scale() <= MAX_DECIMAL16_PRECISION + && decimalValue.precision() <= MAX_DECIMAL16_PRECISION) { writeDecimalInternal(decimalValue); return true; } diff --git a/core/src/main/java/org/apache/iceberg/variants/VariantBuilderBase.java b/core/src/main/java/org/apache/iceberg/variants/VariantBuilderBase.java index 4284145f00ff..5110e1ecbbdd 100644 --- a/core/src/main/java/org/apache/iceberg/variants/VariantBuilderBase.java +++ b/core/src/main/java/org/apache/iceberg/variants/VariantBuilderBase.java @@ -19,9 +19,9 @@ package org.apache.iceberg.variants; import java.math.BigDecimal; -import java.math.BigInteger; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; import java.nio.charset.StandardCharsets; -import java.util.Arrays; import java.util.Collections; import java.util.List; import java.util.Map; @@ -30,16 +30,18 @@ import org.apache.iceberg.relocated.com.google.common.collect.Maps; abstract class VariantBuilderBase { - protected static final int MAX_SHORT_STR_SIZE = 0x3F; + private static final int MAX_DECIMAL4_PRECISION = 9; + private static final int MAX_DECIMAL8_PRECISION = 18; + protected static final int MAX_DECIMAL16_PRECISION = 38; - private final ByteBufferWrapper buffer; - private final Dictionary dict; - private int startPos; + protected final ByteBufferWrapper valueBuffer; + protected final Dictionary dict; + protected int startPos; - VariantBuilderBase(ByteBufferWrapper buffer, Dictionary dict) { - this.buffer = buffer; + VariantBuilderBase(ByteBufferWrapper valueBuffer, Dictionary dict) { + this.valueBuffer = valueBuffer; this.dict = dict; - startPos = buffer.pos; + startPos = valueBuffer.pos; } /** @@ -52,9 +54,6 @@ public Variant build() { // Calculate total size of dictionary strings long numStringBytes = dict.totalBytes(); - if (numStringBytes > VariantConstants.SIZE_LIMIT) { - 
throw new VariantSizeLimitException(); - } // Determine the number of bytes required for dictionary size and offset entry int offsetSize = sizeOf(Math.max((int) numStringBytes, numKeys)); @@ -62,42 +61,43 @@ public Variant build() { // metadata: header byte, dictionary size, offsets and string bytes long metadataSize = 1 + offsetSize + (numKeys + 1) * offsetSize + numStringBytes; - // Ensure the metadata size is within limits - if (metadataSize > VariantConstants.SIZE_LIMIT) { - throw new VariantSizeLimitException(); - } - - ByteBufferWrapper metadataBuffer = - new ByteBufferWrapper((int) metadataSize, (int) metadataSize); + ByteBuffer metadataBuffer = + ByteBuffer.allocate((int) metadataSize).order(ByteOrder.LITTLE_ENDIAN); // Write header byte (version + offset size) - metadataBuffer.addByte(VariantUtil.metadataHeader(VariantConstants.VERSION, offsetSize)); + VariantUtil.writeByte( + metadataBuffer, VariantUtil.metadataHeader(Variants.VERSION, offsetSize), 0); // Write number of keys - metadataBuffer.writeLittleEndianUnsigned(numKeys, offsetSize); + VariantUtil.writeLittleEndianUnsigned(metadataBuffer, numKeys, 1, offsetSize); // Write offsets - int currentOffset = 0; - for (byte[] key : dict.getKeys()) { - metadataBuffer.writeLittleEndianUnsigned(currentOffset, offsetSize); - currentOffset += key.length; + int offset = 1 + offsetSize; + int dictOffset = 0; + for (byte[] key : dict.keys()) { + VariantUtil.writeLittleEndianUnsigned(metadataBuffer, dictOffset, offset, offsetSize); + dictOffset += key.length; + offset += offsetSize; } - metadataBuffer.writeLittleEndianUnsigned(numStringBytes, offsetSize); + VariantUtil.writeLittleEndianUnsigned(metadataBuffer, numStringBytes, offset, offsetSize); // Write dictionary strings - dict.getKeys().forEach(metadataBuffer::addBytes); + offset += offsetSize; + for (byte[] key : dict.keys()) { + VariantUtil.writeBufferAbsolute(metadataBuffer, offset, ByteBuffer.wrap(key)); + offset += key.length; + } - return new 
VariantImpl(metadataBuffer.toByteArray(), buffer.toByteArray()); + return new VariantImpl(metadataBuffer, valueBuffer.buffer); } protected void writeNullInternal() { - buffer.addByte(VariantUtil.primitiveHeader(Variants.Primitives.TYPE_NULL)); + valueBuffer.writePrimitive(Variants.PhysicalType.NULL, null); } protected void writeBooleanInternal(boolean value) { - buffer.addByte( - VariantUtil.primitiveHeader( - value ? Variants.Primitives.TYPE_TRUE : Variants.Primitives.TYPE_FALSE)); + valueBuffer.writePrimitive( + value ? Variants.PhysicalType.BOOLEAN_TRUE : Variants.PhysicalType.BOOLEAN_FALSE, value); } /** @@ -108,27 +108,18 @@ protected void writeBooleanInternal(boolean value) { */ protected void writeNumericInternal(long value) { if (value == (byte) value) { - // INT8: Requires 1 byte for header + 1 byte for value - buffer.addByte(VariantUtil.primitiveHeader(Variants.Primitives.TYPE_INT8)); - buffer.writeLittleEndianUnsigned(value, 1); + valueBuffer.writePrimitive(Variants.PhysicalType.INT8, (byte) value); } else if (value == (short) value) { - // INT16: Requires 1 byte for header + 2 bytes for value - buffer.addByte(VariantUtil.primitiveHeader(Variants.Primitives.TYPE_INT16)); - buffer.writeLittleEndianUnsigned(value, 2); + valueBuffer.writePrimitive(Variants.PhysicalType.INT16, (short) value); } else if (value == (int) value) { - // INT32: Requires 1 byte for header + 4 bytes for value - buffer.addByte(VariantUtil.primitiveHeader(Variants.Primitives.TYPE_INT32)); - buffer.writeLittleEndianUnsigned(value, 4); + valueBuffer.writePrimitive(Variants.PhysicalType.INT32, (int) value); } else { - // INT64: Requires 1 byte for header + 8 bytes for value - buffer.addByte(VariantUtil.primitiveHeader(Variants.Primitives.TYPE_INT64)); - buffer.writeLittleEndianUnsigned(value, 8); + valueBuffer.writePrimitive(Variants.PhysicalType.INT64, value); } } protected void writeDoubleInternal(double value) { - 
buffer.addByte(VariantUtil.primitiveHeader(Variants.Primitives.TYPE_DOUBLE)); - buffer.writeLittleEndianUnsigned(Double.doubleToLongBits(value), 8); + valueBuffer.writePrimitive(Variants.PhysicalType.DOUBLE, value); } /** @@ -137,75 +128,44 @@ protected void writeDoubleInternal(double value) { */ public void writeDecimalInternal(BigDecimal value) { Preconditions.checkArgument( - value.precision() <= VariantConstants.MAX_DECIMAL16_PRECISION, + value.precision() <= MAX_DECIMAL16_PRECISION, "Unsupported Decimal precision: %s", value.precision()); - BigInteger unscaled = value.unscaledValue(); - if (value.scale() <= VariantConstants.MAX_DECIMAL4_PRECISION - && value.precision() <= VariantConstants.MAX_DECIMAL4_PRECISION) { - buffer.addByte(VariantUtil.primitiveHeader(Variants.Primitives.TYPE_DECIMAL4)); - buffer.addByte((byte) value.scale()); - buffer.writeLittleEndianUnsigned(unscaled.intValueExact(), 4); - } else if (value.scale() <= VariantConstants.MAX_DECIMAL8_PRECISION - && value.precision() <= VariantConstants.MAX_DECIMAL8_PRECISION) { - buffer.addByte(VariantUtil.primitiveHeader(Variants.Primitives.TYPE_DECIMAL8)); - buffer.addByte((byte) value.scale()); - buffer.writeLittleEndianUnsigned(unscaled.longValueExact(), 8); + if (value.scale() <= MAX_DECIMAL4_PRECISION && value.precision() <= MAX_DECIMAL4_PRECISION) { + valueBuffer.writePrimitive(Variants.PhysicalType.DECIMAL4, value); + } else if (value.scale() <= MAX_DECIMAL8_PRECISION + && value.precision() <= MAX_DECIMAL8_PRECISION) { + valueBuffer.writePrimitive(Variants.PhysicalType.DECIMAL8, value); } else { - buffer.addByte(VariantUtil.primitiveHeader(Variants.Primitives.TYPE_DECIMAL16)); - buffer.addByte((byte) value.scale()); - byte[] bytes = unscaled.toByteArray(); - for (int i = 0; i < 16; i++) { - byte byteValue = - i < bytes.length ? bytes[bytes.length - 1 - i] : (byte) (bytes[0] < 0 ? 
-1 : 0); - buffer.addByte(byteValue); - } + valueBuffer.writePrimitive(Variants.PhysicalType.DECIMAL16, value); } } protected void writeDateInternal(int daysSinceEpoch) { - buffer.addByte(VariantUtil.primitiveHeader(Variants.Primitives.TYPE_DATE)); - buffer.writeLittleEndianUnsigned(daysSinceEpoch, 4); + valueBuffer.writePrimitive(Variants.PhysicalType.DATE, daysSinceEpoch); } /** Writes a timestamp with timezone (microseconds since epoch) to the variant builder. */ protected void writeTimestampTzInternal(long microsSinceEpoch) { - buffer.addByte(VariantUtil.primitiveHeader(Variants.Primitives.TYPE_TIMESTAMPTZ)); - buffer.writeLittleEndianUnsigned(microsSinceEpoch, 8); + valueBuffer.writePrimitive(Variants.PhysicalType.TIMESTAMPTZ, microsSinceEpoch); } /** Writes a timestamp without timezone (microseconds since epoch) to the variant builder. */ protected void writeTimestampNtzInternal(long microsSinceEpoch) { - buffer.addByte(VariantUtil.primitiveHeader(Variants.Primitives.TYPE_TIMESTAMPNTZ)); - buffer.writeLittleEndianUnsigned(microsSinceEpoch, 8); + valueBuffer.writePrimitive(Variants.PhysicalType.TIMESTAMPNTZ, microsSinceEpoch); } - protected void writeFloatInternal(float value) throws VariantSizeLimitException { - buffer.addByte(VariantUtil.primitiveHeader(Variants.Primitives.TYPE_FLOAT)); - buffer.writeLittleEndianUnsigned(Float.floatToIntBits(value), 4); + protected void writeFloatInternal(float value) { + valueBuffer.writePrimitive(Variants.PhysicalType.FLOAT, value); } - protected void writeBinaryInternal(byte[] value) throws VariantSizeLimitException { - buffer.addByte(VariantUtil.primitiveHeader(Variants.Primitives.TYPE_BINARY)); - buffer.writeLittleEndianUnsigned(value.length, 4); - buffer.addBytes(value); + protected void writeBinaryInternal(byte[] value) { + valueBuffer.writePrimitive(Variants.PhysicalType.BINARY, ByteBuffer.wrap(value)); } protected void writeStringInternal(String value) { - byte[] text = value.getBytes(StandardCharsets.UTF_8); - 
boolean longStr = text.length > MAX_SHORT_STR_SIZE; - - // Write header - if (longStr) { - buffer.addByte(VariantUtil.primitiveHeader(Variants.Primitives.TYPE_STRING)); - buffer.writeLittleEndianUnsigned(text.length, 4); - } else { - buffer.addByte(VariantUtil.shortStrHeader(text.length)); - } - - // Write string content - buffer.addBytes(text); + valueBuffer.writePrimitive(Variants.PhysicalType.STRING, value); } /** Choose the smallest number of bytes to store the given value. */ @@ -241,7 +201,7 @@ protected void endObject(int objStartPos, List fields) { } } - int dataSize = buffer.pos - objStartPos; // Total byte size of the object values + int dataSize = valueBuffer.pos - objStartPos; // Total byte size of the object values boolean isLarge = numElements > 0xFF; // Determine whether to use large format int sizeBytes = isLarge ? 4 : 1; // Number of bytes for the object size int fieldIdSize = sizeOf(maxId); // Number of bytes for each field id @@ -250,26 +210,26 @@ protected void endObject(int objStartPos, List fields) { 1 + sizeBytes + numElements * fieldIdSize + (numElements + 1) * fieldOffsetSize; // Shift existing data to make room for header - buffer.shift(objStartPos, headerSize); + valueBuffer.shift(objStartPos, headerSize); - buffer.insertByte( + valueBuffer.insertByte( VariantUtil.objectHeader(isLarge, fieldIdSize, fieldOffsetSize), objStartPos); // Insert header byte - buffer.insertLittleEndianUnsigned( + valueBuffer.insertLittleEndianUnsigned( numElements, sizeBytes, objStartPos + 1); // Insert number of elements // Insert field IDs and offsets int fieldIdStart = objStartPos + 1 + sizeBytes; int fieldOffsetStart = fieldIdStart + numElements * fieldIdSize; for (int i = 0; i < numElements; i++) { - buffer.insertLittleEndianUnsigned( + valueBuffer.insertLittleEndianUnsigned( fields.get(i).id, fieldIdSize, fieldIdStart + i * fieldIdSize); - buffer.insertLittleEndianUnsigned( + valueBuffer.insertLittleEndianUnsigned( fields.get(i).offset, fieldOffsetSize, 
fieldOffsetStart + i * fieldOffsetSize); } // Insert the offset to the end of the data - buffer.insertLittleEndianUnsigned( + valueBuffer.insertLittleEndianUnsigned( dataSize, fieldOffsetSize, fieldOffsetStart + numElements * fieldOffsetSize); } @@ -281,7 +241,7 @@ protected void endObject(int objStartPos, List fields) { * @param offsets The offsets for each array value. */ protected void endArray(int arrStartPos, List offsets) { - int dataSize = buffer.pos - arrStartPos; // Total byte size of the array values + int dataSize = valueBuffer.pos - arrStartPos; // Total byte size of the array values int numElements = offsets.size(); boolean isLarge = numElements > 0xFF; // Determine whether to use large format @@ -291,105 +251,65 @@ protected void endArray(int arrStartPos, List offsets) { int offsetStart = arrStartPos + 1 + sizeBytes; // Start position for offsets // Shift existing data to make room for header - buffer.shift(arrStartPos, headerSize); + valueBuffer.shift(arrStartPos, headerSize); - buffer.insertByte( + valueBuffer.insertByte( VariantUtil.arrayHeader(isLarge, fieldOffsetSize), arrStartPos); // Insert header byte - buffer.insertLittleEndianUnsigned( + valueBuffer.insertLittleEndianUnsigned( numElements, sizeBytes, arrStartPos + 1); // Insert number of elements // Insert field offsets for (int i = 0; i < numElements; i++) { - buffer.insertLittleEndianUnsigned( + valueBuffer.insertLittleEndianUnsigned( offsets.get(i), fieldOffsetSize, offsetStart + i * fieldOffsetSize); } // Insert the offset to the end of the data - buffer.insertLittleEndianUnsigned( + valueBuffer.insertLittleEndianUnsigned( dataSize, fieldOffsetSize, offsetStart + numElements * fieldOffsetSize); } - protected ByteBufferWrapper getBuffer() { - return buffer; - } - - protected Dictionary getDict() { - return dict; - } - - protected int getStartPos() { - return startPos; - } - /** An auto-growing byte buffer that doubles its size whenever the capacity is exceeded. 
*/ protected static class ByteBufferWrapper { private static final int INITIAL_CAPACITY = 128; // Starting capacity - private byte[] buffer; + private ByteBuffer buffer; private int pos = 0; - private final int sizeLimit; ByteBufferWrapper() { - this(INITIAL_CAPACITY, VariantConstants.SIZE_LIMIT); + this(INITIAL_CAPACITY); } - ByteBufferWrapper(int initialCapacity, int sizeLimit) { + ByteBufferWrapper(int initialCapacity) { if (initialCapacity <= 0) { throw new IllegalArgumentException("Initial capacity must be positive"); } - this.buffer = new byte[initialCapacity]; - this.sizeLimit = sizeLimit; + this.buffer = ByteBuffer.allocate(initialCapacity).order(ByteOrder.LITTLE_ENDIAN); } /** * Ensures the buffer has enough capacity to hold additional bytes. * * @param additional The number of additional bytes required. - * @throws VariantSizeLimitException If the required capacity exceeds the size limit. */ private void ensureCapacity(int additional) { int required = pos + additional; - if (required > buffer.length) { + if (required > buffer.capacity()) { int newCapacity = Integer.highestOneBit(required); newCapacity = newCapacity < required ? newCapacity * 2 : newCapacity; // Double the capacity - if (newCapacity > this.sizeLimit) { - throw new VariantSizeLimitException(); - } - byte[] newBuffer = new byte[newCapacity]; - System.arraycopy(buffer, 0, newBuffer, 0, pos); + ByteBuffer newBuffer = + ByteBuffer.allocate(newCapacity) + .order(ByteOrder.LITTLE_ENDIAN) + .put(buffer.array(), 0, pos); buffer = newBuffer; } } - /** Adds a byte to the buffer, growing the buffer if necessary. */ - void addByte(byte value) throws VariantSizeLimitException { - ensureCapacity(1); - buffer[pos++] = value; - } - - /** Adds an array of bytes to the buffer, growing the buffer if necessary. 
*/ - void addBytes(byte[] values) throws VariantSizeLimitException { - ensureCapacity(values.length); - System.arraycopy(values, 0, buffer, pos, values.length); - pos += values.length; - } - - /** - * Writes a numeric value in little-endian order to the buffer, growing the buffer if necessary. - * - * @param value The numeric value to write. - * @param numBytes The number of bytes to write (e.g., 2 for short, 4 for int, 8 for long). - */ - void writeLittleEndianUnsigned(long value, int numBytes) { - if (numBytes < 1 || numBytes > 8) { - throw new IllegalArgumentException("numBytes must be between 1 and 8"); - } - ensureCapacity(numBytes); - - for (int i = 0; i < numBytes; ++i) { - buffer[pos + i] = (byte) ((value >>> (8 * i)) & 0xFF); - } - pos += numBytes; + void writePrimitive(Variants.PhysicalType type, T value) { + PrimitiveWrapper wrapper = new PrimitiveWrapper(type, value); + ensureCapacity(wrapper.sizeInBytes()); // takes the extra byte count, not an end position + wrapper.writeTo(buffer, pos); + pos += wrapper.sizeInBytes(); } /** @@ -402,7 +322,7 @@ void shift(int start, int offset) { ensureCapacity(offset); if (pos > start) { - System.arraycopy(buffer, start, buffer, start + offset, pos - start); + System.arraycopy(buffer.array(), start, buffer.array(), start + offset, pos - start); } pos += offset; @@ -414,8 +334,7 @@ void shift(int start, int offset) { */ void insertByte(byte value, int insertPos) { Preconditions.checkArgument(insertPos < pos, "insertPos must be smaller than pos"); - - buffer[insertPos] = value; + VariantUtil.writeByteAbsolute(buffer, value, insertPos); } /** @@ -428,17 +347,10 @@ void insertLittleEndianUnsigned(long value, int numBytes, int insertPos) { throw new IllegalArgumentException("numBytes must be between 1 and 8"); } - for (int i = 0; i < numBytes; ++i) { - buffer[insertPos + i] = (byte) ((value >>> (8 * i)) & 0xFF); - } - } - - /** Returns the underlying byte array.
*/ - byte[] toByteArray() { - return Arrays.copyOf(buffer, pos); + VariantUtil.writeLittleEndianUnsignedAbsolute(buffer, value, insertPos, numBytes); } - int getPos() { + int pos() { return pos; } } @@ -472,7 +384,7 @@ long totalBytes() { return utf8Strings.stream().mapToLong(key -> key.length).sum(); } - List getKeys() { + List keys() { return utf8Strings; } } diff --git a/core/src/main/java/org/apache/iceberg/variants/VariantConstants.java b/core/src/main/java/org/apache/iceberg/variants/VariantConstants.java deleted file mode 100644 index 8ea93fdc05c1..000000000000 --- a/core/src/main/java/org/apache/iceberg/variants/VariantConstants.java +++ /dev/null @@ -1,33 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.variants; - -public class VariantConstants { - public static final byte VERSION = 1; // Variant version - public static final int SIZE_LIMIT = 1 << 24; // metadata and value size limits - - // The lower 4 bits of the first metadata byte contain the version. 
- public static final byte VERSION_MASK = 0x0F; - - public static final int MAX_DECIMAL4_PRECISION = 9; - public static final int MAX_DECIMAL8_PRECISION = 18; - public static final int MAX_DECIMAL16_PRECISION = 38; - - private VariantConstants() {} -} diff --git a/core/src/main/java/org/apache/iceberg/variants/VariantImpl.java b/core/src/main/java/org/apache/iceberg/variants/VariantImpl.java index b88a0464f718..4fd048478f04 100644 --- a/core/src/main/java/org/apache/iceberg/variants/VariantImpl.java +++ b/core/src/main/java/org/apache/iceberg/variants/VariantImpl.java @@ -18,43 +18,41 @@ */ package org.apache.iceberg.variants; +import java.nio.ByteBuffer; import org.apache.iceberg.relocated.com.google.common.base.Preconditions; public final class VariantImpl implements Variant { + // The mask to retrieve the version from first metadata byte + private static final byte VERSION_MASK = 0x0F; + private final VariantMetadata metadata; private final VariantValue value; - public VariantImpl(byte[] metadata, byte[] value) { + public VariantImpl(ByteBuffer metadata, ByteBuffer value) { Preconditions.checkArgument( - metadata != null && metadata.length >= 1, "Metadata must not be null or empty."); + metadata != null && metadata.limit() > 0, "Metadata must not be null or empty."); Preconditions.checkArgument( - value != null && value.length >= 1, "Value must not be null or empty."); + value != null && value.limit() > 0, "Value must not be null or empty."); Preconditions.checkArgument( - (metadata[0] & VariantConstants.VERSION_MASK) == VariantConstants.VERSION, - "Unsupported metadata version."); - - if (value.length > VariantConstants.SIZE_LIMIT - || metadata.length > VariantConstants.SIZE_LIMIT) { - throw new VariantSizeLimitException(); - } + (metadata.get(0) & VERSION_MASK) == Variants.VERSION, "Unsupported metadata version."); this.metadata = SerializedMetadata.from(metadata); - int header = value[0]; + int header = value.get(0); Variants.BasicType basicType = 
VariantUtil.basicType(header); switch (basicType) { case PRIMITIVE: - this.value = SerializedPrimitive.from(value); + this.value = SerializedPrimitive.from(value.array()); break; case ARRAY: - this.value = SerializedArray.from((SerializedMetadata) this.metadata, value); + this.value = SerializedArray.from((SerializedMetadata) this.metadata, value.array()); break; case OBJECT: - this.value = SerializedObject.from((SerializedMetadata) this.metadata, value); + this.value = SerializedObject.from((SerializedMetadata) this.metadata, value.array()); break; case SHORT_STRING: - this.value = SerializedShortString.from(value); + this.value = SerializedShortString.from(value.array()); break; default: throw new UnsupportedOperationException("Unsupported basic type: " + basicType); diff --git a/core/src/main/java/org/apache/iceberg/variants/VariantObject.java b/core/src/main/java/org/apache/iceberg/variants/VariantObject.java index e65d71ede6db..33696dae41c2 100644 --- a/core/src/main/java/org/apache/iceberg/variants/VariantObject.java +++ b/core/src/main/java/org/apache/iceberg/variants/VariantObject.java @@ -20,10 +20,6 @@ /** An variant object value. */ public interface VariantObject extends VariantValue { - default int numElements() { - throw new UnsupportedOperationException(); - } - /** Returns the {@link VariantValue} for the field named {@code name} in this object. 
*/ VariantValue get(String name); diff --git a/core/src/main/java/org/apache/iceberg/variants/VariantObjectBuilder.java b/core/src/main/java/org/apache/iceberg/variants/VariantObjectBuilder.java index 4960e96f2e50..709d63c910e0 100644 --- a/core/src/main/java/org/apache/iceberg/variants/VariantObjectBuilder.java +++ b/core/src/main/java/org/apache/iceberg/variants/VariantObjectBuilder.java @@ -29,24 +29,24 @@ public class VariantObjectBuilder extends VariantBuilderBase { private final List fields; - VariantObjectBuilder(ByteBufferWrapper buffer, Dictionary dict) { - super(buffer, dict); + VariantObjectBuilder(ByteBufferWrapper valueBuffer, Dictionary dict) { + super(valueBuffer, dict); fields = Lists.newArrayList(); } public VariantObjectBuilder startObject(String key) { writeKey(key); - return new VariantObjectBuilder(getBuffer(), getDict()); + return new VariantObjectBuilder(valueBuffer, dict); } public VariantArrayBuilder startArray(String key) { writeKey(key); - return new VariantArrayBuilder(getBuffer(), getDict()); + return new VariantArrayBuilder(valueBuffer, dict); } private void writeKey(String key) { - int id = getDict().add(key); - fields.add(new FieldEntry(key, id, getBuffer().getPos() - getStartPos())); + int id = dict.add(key); + fields.add(new FieldEntry(key, id, valueBuffer.pos() - startPos)); } public VariantObjectBuilder writeNull(String key) { @@ -116,6 +116,6 @@ public VariantObjectBuilder writeString(String key, String value) { } public void endObject() { - super.endObject(getStartPos(), fields); + super.endObject(startPos, fields); } } diff --git a/core/src/main/java/org/apache/iceberg/variants/VariantPrimitiveBuilder.java b/core/src/main/java/org/apache/iceberg/variants/VariantPrimitiveBuilder.java index 634705a09245..875f6a123cc5 100644 --- a/core/src/main/java/org/apache/iceberg/variants/VariantPrimitiveBuilder.java +++ b/core/src/main/java/org/apache/iceberg/variants/VariantPrimitiveBuilder.java @@ -25,8 +25,8 @@ import 
org.apache.iceberg.util.DateTimeUtil; public class VariantPrimitiveBuilder extends VariantBuilderBase { - public VariantPrimitiveBuilder(ByteBufferWrapper buffer, Dictionary dict) { - super(buffer, dict); + public VariantPrimitiveBuilder(ByteBufferWrapper valueBuffer, Dictionary dict) { + super(valueBuffer, dict); } public VariantPrimitiveBuilder writeNull() { diff --git a/core/src/main/java/org/apache/iceberg/variants/VariantSizeLimitException.java b/core/src/main/java/org/apache/iceberg/variants/VariantSizeLimitException.java deleted file mode 100644 index 3570e3c70666..000000000000 --- a/core/src/main/java/org/apache/iceberg/variants/VariantSizeLimitException.java +++ /dev/null @@ -1,26 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.variants; - -public class VariantSizeLimitException extends RuntimeException { - - public VariantSizeLimitException() { - super("Variant size limit exceeded"); - } -} diff --git a/core/src/main/java/org/apache/iceberg/variants/VariantUtil.java b/core/src/main/java/org/apache/iceberg/variants/VariantUtil.java index 85cde9d11ae9..ecc8f3d919b9 100644 --- a/core/src/main/java/org/apache/iceberg/variants/VariantUtil.java +++ b/core/src/main/java/org/apache/iceberg/variants/VariantUtil.java @@ -44,15 +44,32 @@ static int writeBufferAbsolute(ByteBuffer buffer, int offset, ByteBuffer toCopy) return toCopy.remaining(); } + static void writeByteAbsolute(ByteBuffer buffer, int value, int offset) { + int originalPosition = buffer.position(); + buffer.put(offset, (byte) (value & 0xFF)); + buffer.position(originalPosition); + } + + static void writeLittleEndianUnsignedAbsolute( + ByteBuffer buffer, long value, int offset, int size) { + int originalPosition = buffer.position(); + buffer.position(0); + writeLittleEndianUnsigned(buffer, value, offset, size); + buffer.position(originalPosition); + } + static void writeByte(ByteBuffer buffer, int value, int offset) { buffer.put(buffer.position() + offset, (byte) (value & 0xFF)); } - static void writeLittleEndianUnsigned(ByteBuffer buffer, int value, int offset, int size) { + static void writeLittleEndianUnsigned(ByteBuffer buffer, long value, int offset, int size) { int base = buffer.position() + offset; switch (size) { + case 8: + buffer.putLong(base, value); + return; case 4: - buffer.putInt(base, value); + buffer.putInt(base, (int) (value & 0xFFFFFFFF)); return; case 3: buffer.putShort(base, (short) (value & 0xFFFF)); diff --git a/core/src/main/java/org/apache/iceberg/variants/Variants.java b/core/src/main/java/org/apache/iceberg/variants/Variants.java index e10682fe544a..14b1d4b009a7 100644 --- a/core/src/main/java/org/apache/iceberg/variants/Variants.java +++ 
b/core/src/main/java/org/apache/iceberg/variants/Variants.java @@ -25,6 +25,9 @@ import org.apache.iceberg.util.DateTimeUtil; public class Variants { + // Variant version + public static final byte VERSION = 1; + private Variants() {} enum LogicalType { diff --git a/core/src/test/java/org/apache/iceberg/variants/TestVariantBuilderArray.java b/core/src/test/java/org/apache/iceberg/variants/TestVariantBuilderArray.java index 20d820594b30..117469fb26ba 100644 --- a/core/src/test/java/org/apache/iceberg/variants/TestVariantBuilderArray.java +++ b/core/src/test/java/org/apache/iceberg/variants/TestVariantBuilderArray.java @@ -157,7 +157,7 @@ private void validateVariant(Variant variant) { VariantObject object = arr.get(0).asObject(); assertThat(object.type()).isEqualTo(Variants.PhysicalType.OBJECT); - assertThat(object.numElements()).isEqualTo(5); + assertThat(object.numFields()).isEqualTo(5); assertThat(object.get("firstName").asPrimitive().get()).isEqualTo("John"); assertThat(object.get("lastName").asPrimitive().get()).isEqualTo("Smith"); @@ -165,7 +165,7 @@ private void validateVariant(Variant variant) { VariantObject address = object.get("address").asObject(); assertThat(address.type()).isEqualTo(Variants.PhysicalType.OBJECT); - assertThat(address.numElements()).isEqualTo(4); + assertThat(address.numFields()).isEqualTo(4); assertThat(address.get("streetAddress").asPrimitive().get()).isEqualTo("21 2nd Street"); assertThat(address.get("city").asPrimitive().get()).isEqualTo("New York"); assertThat(address.get("state").asPrimitive().get()).isEqualTo("NY"); diff --git a/core/src/test/java/org/apache/iceberg/variants/TestVariantBuilderObject.java b/core/src/test/java/org/apache/iceberg/variants/TestVariantBuilderObject.java index 26d6a4712fc6..83034e9b24f4 100644 --- a/core/src/test/java/org/apache/iceberg/variants/TestVariantBuilderObject.java +++ b/core/src/test/java/org/apache/iceberg/variants/TestVariantBuilderObject.java @@ -30,7 +30,7 @@ public void 
testEmptyObjectJson() throws IOException { VariantObject object = variant.value().asObject(); assertThat(object.type()).isEqualTo(Variants.PhysicalType.OBJECT); - assertThat(object.numElements()).isEqualTo(0); + assertThat(object.numFields()).isEqualTo(0); } @Test @@ -91,7 +91,7 @@ private void validateVariant(Variant variant) { VariantObject object = variant.value().asObject(); assertThat(object.type()).isEqualTo(Variants.PhysicalType.OBJECT); - assertThat(object.numElements()).isEqualTo(5); + assertThat(object.numFields()).isEqualTo(5); assertThat(object.get("firstName").asPrimitive().get()).isEqualTo("John"); assertThat(object.get("lastName").asPrimitive().get()).isEqualTo("Smith"); @@ -99,7 +99,7 @@ private void validateVariant(Variant variant) { VariantObject address = object.get("address").asObject(); assertThat(address.type()).isEqualTo(Variants.PhysicalType.OBJECT); - assertThat(address.numElements()).isEqualTo(4); + assertThat(address.numFields()).isEqualTo(4); assertThat(address.get("streetAddress").asPrimitive().get()).isEqualTo("21 2nd Street"); assertThat(address.get("city").asPrimitive().get()).isEqualTo("New York"); assertThat(address.get("state").asPrimitive().get()).isEqualTo("NY"); From cc91ec04310b2d7d34e16dcbe2b7c8086661c684 Mon Sep 17 00:00:00 2001 From: Aihua Xu Date: Sun, 2 Feb 2025 21:00:34 -0800 Subject: [PATCH 5/5] refactoring VariantBuilder --- .../iceberg/variants/VariantArrayBuilder.java | 12 +- .../iceberg/variants/VariantBuilder.java | 118 +++++++++--------- .../iceberg/variants/VariantBuilderBase.java | 30 +++-- .../apache/iceberg/variants/VariantImpl.java | 4 +- .../variants/VariantObjectBuilder.java | 14 +-- .../variants/VariantPrimitiveBuilder.java | 4 +- .../variants/TestVariantBuilderArray.java | 12 +- .../variants/TestVariantBuilderObject.java | 32 ++++- .../variants/TestVariantBuilderPrimitive.java | 6 +- 9 files changed, 131 insertions(+), 101 deletions(-) diff --git 
a/core/src/main/java/org/apache/iceberg/variants/VariantArrayBuilder.java b/core/src/main/java/org/apache/iceberg/variants/VariantArrayBuilder.java index 9a8beb1b343d..7be7938add61 100644 --- a/core/src/main/java/org/apache/iceberg/variants/VariantArrayBuilder.java +++ b/core/src/main/java/org/apache/iceberg/variants/VariantArrayBuilder.java @@ -36,12 +36,12 @@ public VariantArrayBuilder(ByteBufferWrapper valueBuffer, Dictionary dict) { public VariantObjectBuilder startObject() { addOffset(); - return new VariantObjectBuilder(valueBuffer, dict); + return new VariantObjectBuilder(valueBuffer(), dict()); } public VariantArrayBuilder startArray() { addOffset(); - return new VariantArrayBuilder(valueBuffer, dict); + return new VariantArrayBuilder(valueBuffer(), dict()); } public VariantArrayBuilder writeNull() { @@ -56,9 +56,9 @@ public VariantArrayBuilder writeBoolean(boolean value) { return this; } - public VariantArrayBuilder writeNumeric(long value) { + public VariantArrayBuilder writeIntegral(long value) { addOffset(); - writeNumericInternal(value); + writeIntegralInternal(value); return this; } @@ -111,10 +111,10 @@ public VariantArrayBuilder writeString(String str) { } private void addOffset() { - offsets.add(valueBuffer.pos() - startPos); + offsets.add(valueBuffer().pos() - startPos()); } public void endArray() { - super.endArray(startPos, offsets); + super.endArray(startPos(), offsets); } } diff --git a/core/src/main/java/org/apache/iceberg/variants/VariantBuilder.java b/core/src/main/java/org/apache/iceberg/variants/VariantBuilder.java index 6984de7d5970..0d3dfae0cb64 100644 --- a/core/src/main/java/org/apache/iceberg/variants/VariantBuilder.java +++ b/core/src/main/java/org/apache/iceberg/variants/VariantBuilder.java @@ -18,16 +18,17 @@ */ package org.apache.iceberg.variants; -import com.fasterxml.jackson.core.JsonFactory; import com.fasterxml.jackson.core.JsonParseException; import com.fasterxml.jackson.core.JsonParser; import 
com.fasterxml.jackson.core.JsonToken; import com.fasterxml.jackson.core.exc.InputCoercionException; import java.io.IOException; +import java.io.UncheckedIOException; import java.math.BigDecimal; import java.util.List; import org.apache.iceberg.relocated.com.google.common.base.Preconditions; import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.util.JsonUtil; /** A builder class to build a primitive/array/object variant. */ public class VariantBuilder extends VariantBuilderBase { @@ -36,15 +37,15 @@ public VariantBuilder() { } public VariantPrimitiveBuilder createPrimitive() { - return new VariantPrimitiveBuilder(valueBuffer, dict); + return new VariantPrimitiveBuilder(valueBuffer(), dict()); } public VariantObjectBuilder startObject() { - return new VariantObjectBuilder(valueBuffer, dict); + return new VariantObjectBuilder(valueBuffer(), dict()); } public VariantArrayBuilder startArray() { - return new VariantArrayBuilder(valueBuffer, dict); + return new VariantArrayBuilder(valueBuffer(), dict()); } /** @@ -52,19 +53,18 @@ public VariantArrayBuilder startArray() { * * @param json The JSON string to parse. * @return The constructed Variant object. - * @throws IOException If an error occurs while reading or parsing the JSON. 
*/ - public static Variant parseJson(String json) throws IOException { + public static Variant parseJson(String json) { Preconditions.checkArgument( json != null && !json.isEmpty(), "Input JSON string cannot be null or empty."); - try (JsonParser parser = new JsonFactory().createParser(json)) { + try (JsonParser parser = JsonUtil.factory().createParser(json)) { parser.nextToken(); - VariantBuilder builder = new VariantBuilder(); builder.parseJson(parser); - return builder.build(); + } catch (IOException e) { + throw new UncheckedIOException(e); } } @@ -77,99 +77,93 @@ private void parseJson(JsonParser parser) throws IOException { switch (token) { case START_OBJECT: - writeObject(parser); + parseObject(parser); break; case START_ARRAY: - writeArray(parser); - break; - case VALUE_STRING: - writeStringInternal(parser.getText()); - break; - case VALUE_NUMBER_INT: - writeInteger(parser); - break; - case VALUE_NUMBER_FLOAT: - writeFloat(parser); - break; - case VALUE_TRUE: - writeBooleanInternal(true); - break; - case VALUE_FALSE: - writeBooleanInternal(false); - break; - case VALUE_NULL: - writeNullInternal(); + parseArray(parser); break; default: - throw new JsonParseException(parser, "Unexpected token " + token); + parsePrimitive(parser); } } - private void writeObject(JsonParser parser) throws IOException { + private void parseObject(JsonParser parser) throws IOException { List fields = Lists.newArrayList(); - int startPos = valueBuffer.pos(); + int startPos = valueBuffer().pos(); // Store object keys to dictionary of metadata while (parser.nextToken() != JsonToken.END_OBJECT) { String key = parser.currentName(); parser.nextToken(); // Move to the value - int id = dict.add(key); - fields.add(new VariantBuilderBase.FieldEntry(key, id, valueBuffer.pos() - startPos)); + int id = dict().add(key); + fields.add(new VariantBuilderBase.FieldEntry(key, id, valueBuffer().pos() - startPos)); parseJson(parser); } endObject(startPos, fields); } - private void 
writeArray(JsonParser parser) throws IOException { + private void parseArray(JsonParser parser) throws IOException { List offsets = Lists.newArrayList(); - int startPos = valueBuffer.pos(); + int startPos = valueBuffer().pos(); while (parser.nextToken() != JsonToken.END_ARRAY) { - offsets.add(valueBuffer.pos() - startPos); + offsets.add(valueBuffer().pos() - startPos); parseJson(parser); } endArray(startPos, offsets); } - private void writeInteger(JsonParser parser) throws IOException { - try { - writeNumericInternal(parser.getLongValue()); - } catch (InputCoercionException ignored) { - writeFloat(parser); // Fallback for large integers - } - } + private void parsePrimitive(JsonParser parser) throws IOException { + JsonToken token = parser.currentToken(); - private void writeFloat(JsonParser parser) throws IOException { - if (!tryWriteDecimal(parser.getText())) { - writeDoubleInternal(parser.getDoubleValue()); + switch (token) { + case VALUE_STRING: + writeStringInternal(parser.getText()); + break; + case VALUE_NUMBER_INT: + try { + writeIntegralInternal(parser.getLongValue()); + } catch (InputCoercionException ignored) { + writeFloatValue(parser); + } + break; + case VALUE_NUMBER_FLOAT: + writeFloatValue(parser); + break; + case VALUE_TRUE: + writeBooleanInternal(true); + break; + case VALUE_FALSE: + writeBooleanInternal(false); + break; + case VALUE_NULL: + writeNullInternal(); + break; + default: + throw new JsonParseException(parser, "Unexpected token " + token); } } /** - * This function attempts to parse a JSON number and write it as a decimal value. + * This function attempts to write floating number in decimal format to store the exact value if + * it fits in the decimal for Variant; otherwise, write as a double value. * - * @param input the input string expecting to be in decimal format, not in scientific notation. - * @return true if the decimal is valid and written successfully; false otherwise. 
+ * @param parser instance of JSONParser with the current token to be floating number */ - private boolean tryWriteDecimal(String input) { - // Validate that the input matches a decimal format and is not in scientific notation. - if (!input.matches("-?\\d+(\\.\\d+)?")) { - return false; - } - - // Parse the input string to BigDecimal. + private void writeFloatValue(JsonParser parser) throws IOException { + String input = parser.getText(); BigDecimal decimalValue = new BigDecimal(input); - // Ensure the decimal value meets precision and scale limits. - if (decimalValue.scale() <= MAX_DECIMAL16_PRECISION + // Decimal values only support a scale in [0, 38] and a precision <= 38 + if (decimalValue.scale() >= 0 + && decimalValue.scale() <= MAX_DECIMAL16_PRECISION && decimalValue.precision() <= MAX_DECIMAL16_PRECISION) { writeDecimalInternal(decimalValue); - return true; + } else { + writeDoubleInternal(parser.getDoubleValue()); } - - return false; } } diff --git a/core/src/main/java/org/apache/iceberg/variants/VariantBuilderBase.java b/core/src/main/java/org/apache/iceberg/variants/VariantBuilderBase.java index 5110e1ecbbdd..a9ddb2256cae 100644 --- a/core/src/main/java/org/apache/iceberg/variants/VariantBuilderBase.java +++ b/core/src/main/java/org/apache/iceberg/variants/VariantBuilderBase.java @@ -34,9 +34,21 @@ abstract class VariantBuilderBase { private static final int MAX_DECIMAL8_PRECISION = 18; protected static final int MAX_DECIMAL16_PRECISION = 38; - protected final ByteBufferWrapper valueBuffer; - protected final Dictionary dict; - protected int startPos; + private final ByteBufferWrapper valueBuffer; + private final Dictionary dict; + private int startPos; + + ByteBufferWrapper valueBuffer() { + return valueBuffer; + } + + Dictionary dict() { + return dict; + } + + int startPos() { + return startPos; + } VariantBuilderBase(ByteBufferWrapper valueBuffer, Dictionary dict) { this.valueBuffer = valueBuffer; @@ -101,12 +113,12 @@ protected void 
writeBooleanInternal(boolean value) { } /** - * Writes a numeric value to the variant builder, automatically choosing the smallest type (INT8, - * INT16, INT32, or INT64) to store the value efficiently. + * Writes an integral value to the variant builder, automatically choosing the smallest type + * (INT8, INT16, INT32, or INT64) to store the value efficiently. * - * @param value The numeric value to append. + * @param value The integral value to append. */ - protected void writeNumericInternal(long value) { + protected void writeIntegralInternal(long value) { if (value == (byte) value) { valueBuffer.writePrimitive(Variants.PhysicalType.INT8, (byte) value); } else if (value == (short) value) { @@ -404,10 +416,6 @@ protected static final class FieldEntry implements Comparable { this.offset = offset; } - FieldEntry withNewOffset(int newOffset) { - return new FieldEntry(key, id, newOffset); - } - @Override public int compareTo(FieldEntry other) { return key.compareTo(other.key); diff --git a/core/src/main/java/org/apache/iceberg/variants/VariantImpl.java b/core/src/main/java/org/apache/iceberg/variants/VariantImpl.java index 4fd048478f04..8672870d0916 100644 --- a/core/src/main/java/org/apache/iceberg/variants/VariantImpl.java +++ b/core/src/main/java/org/apache/iceberg/variants/VariantImpl.java @@ -46,10 +46,10 @@ public VariantImpl(ByteBuffer metadata, ByteBuffer value) { this.value = SerializedPrimitive.from(value.array()); break; case ARRAY: - this.value = SerializedArray.from((SerializedMetadata) this.metadata, value.array()); + this.value = SerializedArray.from(this.metadata, value.array()); break; case OBJECT: - this.value = SerializedObject.from((SerializedMetadata) this.metadata, value.array()); + this.value = SerializedObject.from(this.metadata, value.array()); break; case SHORT_STRING: this.value = SerializedShortString.from(value.array()); diff --git a/core/src/main/java/org/apache/iceberg/variants/VariantObjectBuilder.java 
b/core/src/main/java/org/apache/iceberg/variants/VariantObjectBuilder.java index 709d63c910e0..e5974cb4e33e 100644 --- a/core/src/main/java/org/apache/iceberg/variants/VariantObjectBuilder.java +++ b/core/src/main/java/org/apache/iceberg/variants/VariantObjectBuilder.java @@ -36,17 +36,17 @@ public class VariantObjectBuilder extends VariantBuilderBase { public VariantObjectBuilder startObject(String key) { writeKey(key); - return new VariantObjectBuilder(valueBuffer, dict); + return new VariantObjectBuilder(valueBuffer(), dict()); } public VariantArrayBuilder startArray(String key) { writeKey(key); - return new VariantArrayBuilder(valueBuffer, dict); + return new VariantArrayBuilder(valueBuffer(), dict()); } private void writeKey(String key) { - int id = dict.add(key); - fields.add(new FieldEntry(key, id, valueBuffer.pos() - startPos)); + int id = dict().add(key); + fields.add(new FieldEntry(key, id, valueBuffer().pos() - startPos())); } public VariantObjectBuilder writeNull(String key) { @@ -61,9 +61,9 @@ public VariantObjectBuilder writeBoolean(String key, boolean value) { return this; } - public VariantObjectBuilder writeNumeric(String key, long value) { + public VariantObjectBuilder writeIntegral(String key, long value) { writeKey(key); - writeNumericInternal(value); + writeIntegralInternal(value); return this; } @@ -116,6 +116,6 @@ public VariantObjectBuilder writeString(String key, String value) { } public void endObject() { - super.endObject(startPos, fields); + super.endObject(startPos(), fields); } } diff --git a/core/src/main/java/org/apache/iceberg/variants/VariantPrimitiveBuilder.java b/core/src/main/java/org/apache/iceberg/variants/VariantPrimitiveBuilder.java index 875f6a123cc5..367e790a5948 100644 --- a/core/src/main/java/org/apache/iceberg/variants/VariantPrimitiveBuilder.java +++ b/core/src/main/java/org/apache/iceberg/variants/VariantPrimitiveBuilder.java @@ -39,8 +39,8 @@ public VariantPrimitiveBuilder writeBoolean(boolean value) { return this; } 
- public VariantPrimitiveBuilder writeNumeric(long value) { - writeNumericInternal(value); + public VariantPrimitiveBuilder writeIntegral(long value) { + writeIntegralInternal(value); return this; } diff --git a/core/src/test/java/org/apache/iceberg/variants/TestVariantBuilderArray.java b/core/src/test/java/org/apache/iceberg/variants/TestVariantBuilderArray.java index 117469fb26ba..1c8061b297ef 100644 --- a/core/src/test/java/org/apache/iceberg/variants/TestVariantBuilderArray.java +++ b/core/src/test/java/org/apache/iceberg/variants/TestVariantBuilderArray.java @@ -45,7 +45,7 @@ public void testSimpleArrayJson() throws IOException { } @Test - public void testArrayJson() throws IOException { + public void testArrayJson() { String input = "[{\n" + " \"firstName\": \"John\"," @@ -91,10 +91,10 @@ public void testBuildArray() { .writeNull() .writeBoolean(true) .writeBoolean(false) - .writeNumeric(34) - .writeNumeric(1234) - .writeNumeric(1234567890) - .writeNumeric(1234567890987654321L) + .writeIntegral(34) + .writeIntegral(1234) + .writeIntegral(1234567890) + .writeIntegral(1234567890987654321L) .writeDouble(1234e-2) .writeDecimal(new BigDecimal("123456.789")) .writeDecimal(new BigDecimal("123456789.987654321")) @@ -112,7 +112,7 @@ public void testBuildArray() { .startObject() .writeString("firstName", "John") .writeString("lastName", "Smith") - .writeNumeric("age", 25) + .writeIntegral("age", 25) .endObject(); builder.endArray(); diff --git a/core/src/test/java/org/apache/iceberg/variants/TestVariantBuilderObject.java b/core/src/test/java/org/apache/iceberg/variants/TestVariantBuilderObject.java index 83034e9b24f4..7cad657d0993 100644 --- a/core/src/test/java/org/apache/iceberg/variants/TestVariantBuilderObject.java +++ b/core/src/test/java/org/apache/iceberg/variants/TestVariantBuilderObject.java @@ -19,8 +19,10 @@ package org.apache.iceberg.variants; import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.within; 
import java.io.IOException; +import java.math.BigDecimal; import org.junit.jupiter.api.Test; public class TestVariantBuilderObject { @@ -34,7 +36,33 @@ public void testEmptyObjectJson() throws IOException { } @Test - public void testObjectJson() throws IOException { + public void testNumbers() { + String input = + "{\n" + + " \"intVal1\": 1234," + + " \"intVal2\": 12345678901234567890," + + " \"floatVal1\": 1.234," + + " \"floatVal2\": 1.234e-10," + + " \"floatVal3\": 1.234e10," + + " \"floatVal4\": 12345678901234567890123456789012345678.90" + + " }"; + + Variant variant = VariantBuilder.parseJson(input); + VariantObject object = variant.value().asObject(); + + assertThat(object.type()).isEqualTo(Variants.PhysicalType.OBJECT); + assertThat(object.get("intVal1").asPrimitive().get()).isEqualTo((short) 1234); + assertThat(object.get("intVal2").asPrimitive().get()) + .isEqualTo(new BigDecimal("12345678901234567890")); + assertThat(object.get("floatVal1").asPrimitive().get()).isEqualTo(new BigDecimal("1.234")); + assertThat(object.get("floatVal2").asPrimitive().get()).isEqualTo(new BigDecimal("1.234e-10")); + assertThat(object.get("floatVal3").asPrimitive().get()).isEqualTo(1.234e10); + assertThat((double) object.get("floatVal4").asPrimitive().get()) + .isCloseTo(12345678901234567890123456789012345678.90, within(0.000001)); + } + + @Test + public void testObjectJson() { String input = "{\n" + " \"firstName\": \"John\"," @@ -62,7 +90,7 @@ public void testBuildObject() { .startObject() .writeString("firstName", "John") .writeString("lastName", "Smith") - .writeNumeric("age", 25); + .writeIntegral("age", 25); builder .startObject("address") .writeString("streetAddress", "21 2nd Street") diff --git a/core/src/test/java/org/apache/iceberg/variants/TestVariantBuilderPrimitive.java b/core/src/test/java/org/apache/iceberg/variants/TestVariantBuilderPrimitive.java index aeb3148377bf..ce045b9b58a2 100644 --- 
a/core/src/test/java/org/apache/iceberg/variants/TestVariantBuilderPrimitive.java +++ b/core/src/test/java/org/apache/iceberg/variants/TestVariantBuilderPrimitive.java @@ -44,7 +44,7 @@ private static Stream primitiveInputs() { Arguments.of("1234", Variants.PhysicalType.INT16, (short) 1234), Arguments.of("1234567890", Variants.PhysicalType.INT32, 1234567890), Arguments.of("1234567890987654321", Variants.PhysicalType.INT64, 1234567890987654321L), - Arguments.of("1234e-2", Variants.PhysicalType.DOUBLE, 12.34), + Arguments.of("1234e-2", Variants.PhysicalType.DECIMAL4, new BigDecimal("12.34")), Arguments.of("123456.789", Variants.PhysicalType.DECIMAL4, new BigDecimal("123456.789")), Arguments.of( "123456789.987654321", @@ -63,7 +63,7 @@ private static Stream primitiveInputs() { @ParameterizedTest @MethodSource("primitiveInputs") public void testPrimitiveJson( - String input, Variants.PhysicalType expectedType, Object expectedValue) throws IOException { + String input, Variants.PhysicalType expectedType, Object expectedValue) { Variant variant = VariantBuilder.parseJson(input); VariantPrimitive primitive = variant.value().asPrimitive(); @@ -117,7 +117,7 @@ private static Stream testPrimitiveNumericInputs() { @MethodSource("testPrimitiveNumericInputs") public void testPrimitiveNumeric(long value, Variants.PhysicalType type, Object expectedValue) { VariantPrimitiveBuilder builder = new VariantBuilder().createPrimitive(); - builder.writeNumeric(value); + builder.writeIntegral(value); Variant variant = builder.build(); VariantPrimitive primitive = variant.value().asPrimitive();