From ade1a1ffada528e23e322c5a5a93ab9f7f99f9ee Mon Sep 17 00:00:00 2001 From: Russell_Spitzer Date: Fri, 21 Jan 2022 16:46:46 -0600 Subject: [PATCH 01/12] Core: Adds Utility Class for Implementing ZOrdering --- build.gradle | 1 + .../apache/iceberg/util/ZOrderByteUtils.java | 128 +++++++++ .../iceberg/util/TestZOrderByteUtil.java | 244 ++++++++++++++++++ versions.props | 1 + 4 files changed, 374 insertions(+) create mode 100644 core/src/main/java/org/apache/iceberg/util/ZOrderByteUtils.java create mode 100644 core/src/test/java/org/apache/iceberg/util/TestZOrderByteUtil.java diff --git a/build.gradle b/build.gradle index fa509212fb8b..65346358fe33 100644 --- a/build.gradle +++ b/build.gradle @@ -222,6 +222,7 @@ project(':iceberg-core') { } testImplementation "org.xerial:sqlite-jdbc" + testImplementation "org.apache.commons:commons-lang3" testImplementation project(path: ':iceberg-api', configuration: 'testArtifacts') } } diff --git a/core/src/main/java/org/apache/iceberg/util/ZOrderByteUtils.java b/core/src/main/java/org/apache/iceberg/util/ZOrderByteUtils.java new file mode 100644 index 000000000000..4ef3120a2217 --- /dev/null +++ b/core/src/main/java/org/apache/iceberg/util/ZOrderByteUtils.java @@ -0,0 +1,128 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iceberg.util; + +import java.util.Arrays; + +/** + * Within Z-Ordering the byte representations of objects being compared must be ordered, + * this requires several types to be transformed when converted to bytes. The goal is to + * map object's whose byte representation are not lexicographically ordered into representations + * that are lexicographically ordered. + * Most of these techniques are derived from + * https://aws.amazon.com/blogs/database/z-order-indexing-for-multifaceted-queries-in-amazon-dynamodb-part-2/ + */ +public class ZOrderByteUtils { + + private ZOrderByteUtils() { + + } + + /** + * Signed ints do not have their bytes in magnitude order because of the sign bit. + * To fix this, flip the sign bit so that all negatives are ordered before positives. This essentially + * shifts the 0 value so that we don't break our ordering when we cross the new 0 value. 
+ */ + public static byte[] orderIntLikeBytes(byte[] intBytes, int size) { + if (intBytes == null) { + return new byte[size]; + } + intBytes[0] = (byte) (intBytes[0] ^ (1 << 7)); + return intBytes; + } + + /** + * IEEE 754 : + * “If two floating-point numbers in the same format are ordered (say, x < y), + * they are ordered the same way when their bits are reinterpreted as sign-magnitude integers.” + * + * Which means floats can be treated as sign magnitude integers which can then be converted into lexicographically + * comparable bytes + */ + public static byte[] orderFloatLikeBytes(byte[] floatBytes, int size) { + if (floatBytes == null) { + return new byte[size]; + } + if ((floatBytes[0] & (1 << 7)) == 0) { + // The signed magnitude is positive set the first bit (reversing the sign so positives order after negatives) + floatBytes[0] = (byte) (floatBytes[0] | (1 << 7)); + } else { + // The signed magnitude is negative so flip the first bit (reversing the sign so positives order after negatives) + // Then flip all remaining bits so numbers with greater negative magnitude come before those + // with less magnitude (reverse the order) + for (int i = 0; i < floatBytes.length; i++) { + floatBytes[i] = (byte) ~floatBytes[i]; + } + } + return floatBytes; + } + + /** + * Strings are lexicographically sortable BUT if different byte array lengths will + * ruin the Z-Ordering. (ZOrder requires that a given column contribute the same number of bytes every time). + * This implementation just uses a set size to for all output byte representations. Truncating longer strings + * and right padding 0 for shorter strings. + */ + public static byte[] orderUTF8LikeBytes(byte[] stringBytes, int size) { + if (stringBytes == null) { + return new byte[size]; + } + return Arrays.copyOf(stringBytes, size); + } + + /** + * Interleave bits using a naive loop. 
+ * @param columnsBinary an array of byte arrays, none of which are empty + * @return their bits interleaved + */ + public static byte[] interleaveBits(byte[][] columnsBinary) { + int interleavedSize = Arrays.stream(columnsBinary).mapToInt(a -> a.length).sum(); + byte[] interleavedBytes = new byte[interleavedSize]; + int sourceBit = 7; + int sourceByte = 0; + int sourceColumn = 0; + int interleaveBit = 7; + int interleaveByte = 0; + while (interleaveByte < interleavedSize) { + // Take what we have, Get the source Bit of the source Byte, move it to the interleaveBit position + interleavedBytes[interleaveByte] = + (byte) (interleavedBytes[interleaveByte] | + (columnsBinary[sourceColumn][sourceByte] & 1 << sourceBit) >> sourceBit << interleaveBit); + + if (--interleaveBit == -1) { + // Finished a byte in our interleave byte array start a new byte + interleaveByte++; + interleaveBit = 7; + } + + // Find next column with a byte we can use + do { + if (++sourceColumn == columnsBinary.length) { + sourceColumn = 0; + if (--sourceBit == -1) { + sourceByte++; + sourceBit = 7; + } + } + } while (columnsBinary[sourceColumn].length <= sourceByte && interleaveByte < interleavedSize); + } + return interleavedBytes; + } +} diff --git a/core/src/test/java/org/apache/iceberg/util/TestZOrderByteUtil.java b/core/src/test/java/org/apache/iceberg/util/TestZOrderByteUtil.java new file mode 100644 index 000000000000..87d69dc99182 --- /dev/null +++ b/core/src/test/java/org/apache/iceberg/util/TestZOrderByteUtil.java @@ -0,0 +1,244 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + + +package org.apache.iceberg.util; + +import java.nio.ByteBuffer; +import java.nio.charset.StandardCharsets; +import java.util.Arrays; +import java.util.Random; +import org.apache.commons.lang3.RandomStringUtils; +import org.apache.iceberg.relocated.com.google.common.primitives.UnsignedBytes; +import org.junit.Assert; +import org.junit.Test; + +public class TestZOrderByteUtil { + private static final byte IIIIIIII = (byte) 255; + private static final byte IOIOIOIO = (byte) 170; + private static final byte OIOIOIOI = (byte) 85; + private static final byte OOOOIIII = (byte) 15; + private static final byte OOOOOOOI = (byte) 1; + private static final byte OOOOOOOO = (byte) 0; + + private static final int NUM_TESTS = 100000; + + private final Random random = new Random(42); + + private String bytesToString(byte[] bytes) { + StringBuilder result = new StringBuilder(); + for (byte b : bytes) { + result.append(String.format("%8s", Integer.toBinaryString(b & 0xFF)).replace(' ', '0')); + } + return result.toString(); + } + + /** + * Returns a non-0 length byte array + */ + private byte[] generateRandomBytes() { + int length = Math.abs(random.nextInt(100) + 1); + byte[] result = new byte[length]; + random.nextBytes(result); + return result; + } + + /** + * Test method to ensure correctness of byte interleaving code + */ + private String interleaveStrings(String[] strings) { + StringBuilder result = new StringBuilder(); + int totalLength = Arrays.stream(strings).mapToInt(String::length).sum(); + int substringIndex = 0; + int characterIndex = 0; + while 
(characterIndex < totalLength) { + for (String str : strings) { + if (substringIndex < str.length()) { + result.append(str.charAt(substringIndex)); + characterIndex++; + } + } + substringIndex++; + } + return result.toString(); + } + + /** + * Compares the result of a string based interleaving algorithm implemented above + * versus the binary bit-shifting algorithm used in ZOrderByteUtils. Either both + * algorithms are identically wrong or are both identically correct. + */ + @Test + public void testInterleaveRandomExamples() { + for (int test = 0; test < NUM_TESTS; test++) { + int numByteArrays = Math.abs(random.nextInt(6)) + 1; + byte[][] testBytes = new byte[numByteArrays][]; + String[] testStrings = new String[numByteArrays]; + for (int byteIndex = 0; byteIndex < numByteArrays; byteIndex++) { + testBytes[byteIndex] = generateRandomBytes(); + testStrings[byteIndex] = bytesToString(testBytes[byteIndex]); + } + byte[] byteResult = ZOrderByteUtils.interleaveBits(testBytes); + String byteResultAsString = bytesToString(byteResult); + + String stringResult = interleaveStrings(testStrings); + + Assert.assertEquals("String interleave didn't match byte interleave", stringResult, byteResultAsString); + } + } + + @Test + public void testInterleaveEmptyBits() { + byte[][] test = new byte[4][10]; + byte[] expected = new byte[40]; + + Assert.assertArrayEquals("Should combine empty arrays", + expected, ZOrderByteUtils.interleaveBits(test)); + } + + @Test + public void testInterleaveFullBits() { + byte[][] test = new byte[4][]; + test[0] = new byte[]{IIIIIIII, IIIIIIII}; + test[1] = new byte[]{IIIIIIII}; + test[2] = new byte[0]; + test[3] = new byte[]{IIIIIIII, IIIIIIII, IIIIIIII}; + byte[] expected = new byte[]{IIIIIIII, IIIIIIII, IIIIIIII, IIIIIIII, IIIIIIII, IIIIIIII}; + + Assert.assertArrayEquals("Should combine full arrays", + expected, ZOrderByteUtils.interleaveBits(test)); + } + + @Test + public void testInterleaveMixedBits() { + byte[][] test = new byte[4][]; + test[0] 
= new byte[]{OOOOOOOI, IIIIIIII, OOOOOOOO, OOOOIIII}; + test[1] = new byte[]{OOOOOOOI, OOOOOOOO, IIIIIIII}; + test[2] = new byte[]{OOOOOOOI}; + test[3] = new byte[]{OOOOOOOI}; + byte[] expected = new byte[]{ + OOOOOOOO, OOOOOOOO, OOOOOOOO, OOOOIIII, + IOIOIOIO, IOIOIOIO, + OIOIOIOI, OIOIOIOI, + OOOOIIII}; + Assert.assertArrayEquals("Should combine mixed byte arrays", + expected, ZOrderByteUtils.interleaveBits(test)); + } + + @Test + public void testIntOrdering() { + for (int i = 0; i < NUM_TESTS; i++) { + int aInt = random.nextInt(); + int bInt = random.nextInt(); + int intCompare = Integer.compare(aInt, bInt); + byte[] aBytes = ZOrderByteUtils.orderIntLikeBytes(bytesOf(aInt), 4); + byte[] bBytes = ZOrderByteUtils.orderIntLikeBytes(bytesOf(bInt), 4); + int byteCompare = UnsignedBytes.lexicographicalComparator().compare(aBytes, bBytes); + + Assert.assertTrue(String.format( + "Ordering of ints should match ordering of bytes, %s ~ %s -> %s != %s ~ %s -> %s ", + aInt, bInt, intCompare, Arrays.toString(aBytes), Arrays.toString(bBytes), byteCompare), + (intCompare ^ byteCompare) >= 0); + } + } + + @Test + public void testLongOrdering() { + for (int i = 0; i < NUM_TESTS; i++) { + long aLong = random.nextInt(); + long bLong = random.nextInt(); + int longCompare = Long.compare(aLong, bLong); + byte[] aBytes = ZOrderByteUtils.orderIntLikeBytes(bytesOf(aLong), 8); + byte[] bBytes = ZOrderByteUtils.orderIntLikeBytes(bytesOf(bLong), 8); + int byteCompare = UnsignedBytes.lexicographicalComparator().compare(aBytes, bBytes); + + Assert.assertTrue(String.format( + "Ordering of ints should match ordering of bytes, %s ~ %s -> %s != %s ~ %s -> %s ", + aLong, bLong, longCompare, Arrays.toString(aBytes), Arrays.toString(bBytes), byteCompare), + (longCompare ^ byteCompare) >= 0); + } + } + + @Test + public void testFloatOrdering() { + for (int i = 0; i < NUM_TESTS; i++) { + float aFloat = random.nextFloat(); + float bFloat = random.nextFloat(); + int floatCompare = Float.compare(aFloat, 
bFloat); + byte[] aBytes = ZOrderByteUtils.orderFloatLikeBytes(bytesOf(aFloat), 4); + byte[] bBytes = ZOrderByteUtils.orderFloatLikeBytes(bytesOf(bFloat), 4); + int byteCompare = UnsignedBytes.lexicographicalComparator().compare(aBytes, bBytes); + + Assert.assertTrue(String.format( + "Ordering of ints should match ordering of bytes, %s ~ %s -> %s != %s ~ %s -> %s ", + aFloat, bFloat, floatCompare, Arrays.toString(aBytes), Arrays.toString(bBytes), byteCompare), + (floatCompare ^ byteCompare) >= 0); + } + } + + @Test + public void testDoubleOrdering() { + for (int i = 0; i < NUM_TESTS; i++) { + double aDouble = random.nextDouble(); + double bDouble = random.nextDouble(); + int doubleCompare = Double.compare(aDouble, bDouble); + byte[] aBytes = ZOrderByteUtils.orderFloatLikeBytes(bytesOf(aDouble), 8); + byte[] bBytes = ZOrderByteUtils.orderFloatLikeBytes(bytesOf(bDouble), 8); + int byteCompare = UnsignedBytes.lexicographicalComparator().compare(aBytes, bBytes); + + Assert.assertTrue(String.format( + "Ordering of ints should match ordering of bytes, %s ~ %s -> %s != %s ~ %s -> %s ", + aDouble, bDouble, doubleCompare, Arrays.toString(aBytes), Arrays.toString(bBytes), byteCompare), + (doubleCompare ^ byteCompare) >= 0); + } + } + + @Test + public void testStringOrdering() { + for (int i = 0; i < NUM_TESTS; i++) { + String aString = RandomStringUtils.random(random.nextInt(35), true, true); + String bString = RandomStringUtils.random(random.nextInt(35), true, true); + int stringCompare = aString.compareTo(bString); + byte[] aBytes = ZOrderByteUtils.orderUTF8LikeBytes(aString.getBytes(StandardCharsets.UTF_8), 128); + byte[] bBytes = ZOrderByteUtils.orderUTF8LikeBytes(bString.getBytes(StandardCharsets.UTF_8), 128); + int byteCompare = UnsignedBytes.lexicographicalComparator().compare(aBytes, bBytes); + + Assert.assertTrue(String.format( + "Ordering of ints should match ordering of bytes, %s ~ %s -> %s != %s ~ %s -> %s ", + aString, bString, stringCompare, 
Arrays.toString(aBytes), Arrays.toString(bBytes), byteCompare), + (stringCompare ^ byteCompare) >= 0); + } + } + + private byte[] bytesOf(int num) { + return ByteBuffer.allocate(4).putInt(num).array(); + } + + private byte[] bytesOf(long num) { + return ByteBuffer.allocate(8).putLong(num).array(); + } + + private byte[] bytesOf(float num) { + return ByteBuffer.allocate(4).putFloat(num).array(); + } + + private byte[] bytesOf(double num) { + return ByteBuffer.allocate(8).putDouble(num).array(); + } +} diff --git a/versions.props b/versions.props index c9ec027effe6..3dc3a5041f23 100644 --- a/versions.props +++ b/versions.props @@ -1,6 +1,7 @@ org.slf4j:* = 1.7.25 org.apache.avro:avro = 1.10.1 org.apache.calcite:* = 1.10.0 +org.apache.commons:commons-lang3 = 3.12.0 org.apache.flink:* = 1.12.5 org.apache.hadoop:* = 2.7.3 org.apache.hive:* = 2.3.8 From 781a12158534d6651264e3bbb93952b7c7435175 Mon Sep 17 00:00:00 2001 From: Russell_Spitzer Date: Tue, 25 Jan 2022 15:45:02 -0600 Subject: [PATCH 02/12] Fix JavaDoc --- core/src/main/java/org/apache/iceberg/util/ZOrderByteUtils.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/main/java/org/apache/iceberg/util/ZOrderByteUtils.java b/core/src/main/java/org/apache/iceberg/util/ZOrderByteUtils.java index 4ef3120a2217..759f101b0cc5 100644 --- a/core/src/main/java/org/apache/iceberg/util/ZOrderByteUtils.java +++ b/core/src/main/java/org/apache/iceberg/util/ZOrderByteUtils.java @@ -50,7 +50,7 @@ public static byte[] orderIntLikeBytes(byte[] intBytes, int size) { /** * IEEE 754 : - * “If two floating-point numbers in the same format are ordered (say, x < y), + * “If two floating-point numbers in the same format are ordered (say, x \< y), * they are ordered the same way when their bits are reinterpreted as sign-magnitude integers.” * * Which means floats can be treated as sign magnitude integers which can then be converted into lexicographically From fda817cdb61140208b625bef885483c9866d0b0d Mon Sep 17 
00:00:00 2001 From: Russell_Spitzer Date: Mon, 31 Jan 2022 12:31:49 -0600 Subject: [PATCH 03/12] Switch Implementations to work on Primitives instead of ByteArrays --- .../apache/iceberg/util/ZOrderByteUtils.java | 66 ++++++++------ .../iceberg/util/TestZOrderByteUtil.java | 86 ++++++++----------- 2 files changed, 74 insertions(+), 78 deletions(-) diff --git a/core/src/main/java/org/apache/iceberg/util/ZOrderByteUtils.java b/core/src/main/java/org/apache/iceberg/util/ZOrderByteUtils.java index 759f101b0cc5..571ea24d5039 100644 --- a/core/src/main/java/org/apache/iceberg/util/ZOrderByteUtils.java +++ b/core/src/main/java/org/apache/iceberg/util/ZOrderByteUtils.java @@ -19,6 +19,7 @@ package org.apache.iceberg.util; +import java.nio.ByteBuffer; import java.util.Arrays; /** @@ -28,6 +29,9 @@ * that are lexicographically ordered. * Most of these techniques are derived from * https://aws.amazon.com/blogs/database/z-order-indexing-for-multifaceted-queries-in-amazon-dynamodb-part-2/ + * + * Some implementation is taken from + * https://github.com/apache/hbase/blob/master/hbase-common/src/main/java/org/apache/hadoop/hbase/util/OrderedBytes.java */ public class ZOrderByteUtils { @@ -40,12 +44,19 @@ private ZOrderByteUtils() { * To fix this, flip the sign bit so that all negatives are ordered before positives. This essentially * shifts the 0 value so that we don't break our ordering when we cross the new 0 value. 
*/ - public static byte[] orderIntLikeBytes(byte[] intBytes, int size) { - if (intBytes == null) { - return new byte[size]; - } - intBytes[0] = (byte) (intBytes[0] ^ (1 << 7)); - return intBytes; + public static byte[] intToOrderedBytes(int val) { + ByteBuffer bytes = ByteBuffer.allocate(Integer.BYTES); + bytes.putInt(val ^ 0x80000000); + return bytes.array(); + } + + /** + * Signed longs are treated the same as the signed ints + */ + public static byte[] longToOrderBytes(long val) { + ByteBuffer bytes = ByteBuffer.allocate(Long.BYTES); + bytes.putLong(val ^ 0x8000000000000000L); + return bytes.array(); } /** @@ -56,22 +67,23 @@ public static byte[] orderIntLikeBytes(byte[] intBytes, int size) { * Which means floats can be treated as sign magnitude integers which can then be converted into lexicographically * comparable bytes */ - public static byte[] orderFloatLikeBytes(byte[] floatBytes, int size) { - if (floatBytes == null) { - return new byte[size]; - } - if ((floatBytes[0] & (1 << 7)) == 0) { - // The signed magnitude is positive set the first bit (reversing the sign so positives order after negatives) - floatBytes[0] = (byte) (floatBytes[0] | (1 << 7)); - } else { - // The signed magnitude is negative so flip the first bit (reversing the sign so positives order after negatives) - // Then flip all remaining bits so numbers with greater negative magnitude come before those - // with less magnitude (reverse the order) - for (int i = 0; i < floatBytes.length; i++) { - floatBytes[i] = (byte) ~floatBytes[i]; - } - } - return floatBytes; + public static byte[] floatToOrderedBytes(float val) { + ByteBuffer bytes = ByteBuffer.allocate(Integer.BYTES); + int ival = Float.floatToIntBits(val); + ival ^= ((ival >> (Integer.SIZE - 1)) | Integer.MIN_VALUE); + bytes.putInt(ival); + return bytes.array(); + } + + /** + * Doubles are treated the same as floats + */ + public static byte[] doubleToOrderedBytes(double val) { + ByteBuffer bytes = ByteBuffer.allocate(Long.BYTES); + 
long lng = Double.doubleToLongBits(val); + lng ^= ((lng >> (Long.SIZE - 1)) | Long.MIN_VALUE); + bytes.putLong(lng); + return bytes.array(); } /** @@ -80,11 +92,13 @@ public static byte[] orderFloatLikeBytes(byte[] floatBytes, int size) { * This implementation just uses a set size to for all output byte representations. Truncating longer strings * and right padding 0 for shorter strings. */ - public static byte[] orderUTF8LikeBytes(byte[] stringBytes, int size) { - if (stringBytes == null) { - return new byte[size]; + public static byte[] stringToOrderedBytes(String val, int length) { + ByteBuffer bytes = ByteBuffer.allocate(length); + if (val != null) { + int maxLength = Math.min(length, val.length()); + bytes.put(val.getBytes(), 0, maxLength); } - return Arrays.copyOf(stringBytes, size); + return bytes.array(); } /** diff --git a/core/src/test/java/org/apache/iceberg/util/TestZOrderByteUtil.java b/core/src/test/java/org/apache/iceberg/util/TestZOrderByteUtil.java index 87d69dc99182..b34f950f90c8 100644 --- a/core/src/test/java/org/apache/iceberg/util/TestZOrderByteUtil.java +++ b/core/src/test/java/org/apache/iceberg/util/TestZOrderByteUtil.java @@ -20,8 +20,6 @@ package org.apache.iceberg.util; -import java.nio.ByteBuffer; -import java.nio.charset.StandardCharsets; import java.util.Arrays; import java.util.Random; import org.apache.commons.lang3.RandomStringUtils; @@ -146,15 +144,15 @@ public void testIntOrdering() { for (int i = 0; i < NUM_TESTS; i++) { int aInt = random.nextInt(); int bInt = random.nextInt(); - int intCompare = Integer.compare(aInt, bInt); - byte[] aBytes = ZOrderByteUtils.orderIntLikeBytes(bytesOf(aInt), 4); - byte[] bBytes = ZOrderByteUtils.orderIntLikeBytes(bytesOf(bInt), 4); - int byteCompare = UnsignedBytes.lexicographicalComparator().compare(aBytes, bBytes); + int intCompare = Integer.signum(Integer.compare(aInt, bInt)); + byte[] aBytes = ZOrderByteUtils.intToOrderedBytes(aInt); + byte[] bBytes = ZOrderByteUtils.intToOrderedBytes(bInt); 
+ int byteCompare = Integer.signum(UnsignedBytes.lexicographicalComparator().compare(aBytes, bBytes)); - Assert.assertTrue(String.format( + Assert.assertEquals(String.format( "Ordering of ints should match ordering of bytes, %s ~ %s -> %s != %s ~ %s -> %s ", aInt, bInt, intCompare, Arrays.toString(aBytes), Arrays.toString(bBytes), byteCompare), - (intCompare ^ byteCompare) >= 0); + intCompare, byteCompare); } } @@ -163,15 +161,15 @@ public void testLongOrdering() { for (int i = 0; i < NUM_TESTS; i++) { long aLong = random.nextInt(); long bLong = random.nextInt(); - int longCompare = Long.compare(aLong, bLong); - byte[] aBytes = ZOrderByteUtils.orderIntLikeBytes(bytesOf(aLong), 8); - byte[] bBytes = ZOrderByteUtils.orderIntLikeBytes(bytesOf(bLong), 8); - int byteCompare = UnsignedBytes.lexicographicalComparator().compare(aBytes, bBytes); + int longCompare = Integer.signum(Long.compare(aLong, bLong)); + byte[] aBytes = ZOrderByteUtils.longToOrderBytes(aLong); + byte[] bBytes = ZOrderByteUtils.longToOrderBytes(bLong); + int byteCompare = Integer.signum(UnsignedBytes.lexicographicalComparator().compare(aBytes, bBytes)); - Assert.assertTrue(String.format( - "Ordering of ints should match ordering of bytes, %s ~ %s -> %s != %s ~ %s -> %s ", + Assert.assertEquals(String.format( + "Ordering of longs should match ordering of bytes, %s ~ %s -> %s != %s ~ %s -> %s ", aLong, bLong, longCompare, Arrays.toString(aBytes), Arrays.toString(bBytes), byteCompare), - (longCompare ^ byteCompare) >= 0); + longCompare, byteCompare); } } @@ -180,15 +178,15 @@ public void testFloatOrdering() { for (int i = 0; i < NUM_TESTS; i++) { float aFloat = random.nextFloat(); float bFloat = random.nextFloat(); - int floatCompare = Float.compare(aFloat, bFloat); - byte[] aBytes = ZOrderByteUtils.orderFloatLikeBytes(bytesOf(aFloat), 4); - byte[] bBytes = ZOrderByteUtils.orderFloatLikeBytes(bytesOf(bFloat), 4); - int byteCompare = UnsignedBytes.lexicographicalComparator().compare(aBytes, bBytes); + int 
floatCompare = Integer.signum(Float.compare(aFloat, bFloat)); + byte[] aBytes = ZOrderByteUtils.floatToOrderedBytes(aFloat); + byte[] bBytes = ZOrderByteUtils.floatToOrderedBytes(bFloat); + int byteCompare = Integer.signum(UnsignedBytes.lexicographicalComparator().compare(aBytes, bBytes)); - Assert.assertTrue(String.format( - "Ordering of ints should match ordering of bytes, %s ~ %s -> %s != %s ~ %s -> %s ", + Assert.assertEquals(String.format( + "Ordering of floats should match ordering of bytes, %s ~ %s -> %s != %s ~ %s -> %s ", aFloat, bFloat, floatCompare, Arrays.toString(aBytes), Arrays.toString(bBytes), byteCompare), - (floatCompare ^ byteCompare) >= 0); + floatCompare, byteCompare); } } @@ -197,15 +195,15 @@ public void testDoubleOrdering() { for (int i = 0; i < NUM_TESTS; i++) { double aDouble = random.nextDouble(); double bDouble = random.nextDouble(); - int doubleCompare = Double.compare(aDouble, bDouble); - byte[] aBytes = ZOrderByteUtils.orderFloatLikeBytes(bytesOf(aDouble), 8); - byte[] bBytes = ZOrderByteUtils.orderFloatLikeBytes(bytesOf(bDouble), 8); - int byteCompare = UnsignedBytes.lexicographicalComparator().compare(aBytes, bBytes); + int doubleCompare = Integer.signum(Double.compare(aDouble, bDouble)); + byte[] aBytes = ZOrderByteUtils.doubleToOrderedBytes(aDouble); + byte[] bBytes = ZOrderByteUtils.doubleToOrderedBytes(bDouble); + int byteCompare = Integer.signum(UnsignedBytes.lexicographicalComparator().compare(aBytes, bBytes)); - Assert.assertTrue(String.format( - "Ordering of ints should match ordering of bytes, %s ~ %s -> %s != %s ~ %s -> %s ", + Assert.assertEquals(String.format( + "Ordering of doubles should match ordering of bytes, %s ~ %s -> %s != %s ~ %s -> %s ", aDouble, bDouble, doubleCompare, Arrays.toString(aBytes), Arrays.toString(bBytes), byteCompare), - (doubleCompare ^ byteCompare) >= 0); + doubleCompare, byteCompare); } } @@ -214,31 +212,15 @@ public void testStringOrdering() { for (int i = 0; i < NUM_TESTS; i++) { String 
aString = RandomStringUtils.random(random.nextInt(35), true, true); String bString = RandomStringUtils.random(random.nextInt(35), true, true); - int stringCompare = aString.compareTo(bString); - byte[] aBytes = ZOrderByteUtils.orderUTF8LikeBytes(aString.getBytes(StandardCharsets.UTF_8), 128); - byte[] bBytes = ZOrderByteUtils.orderUTF8LikeBytes(bString.getBytes(StandardCharsets.UTF_8), 128); - int byteCompare = UnsignedBytes.lexicographicalComparator().compare(aBytes, bBytes); + int stringCompare = Integer.signum(aString.compareTo(bString)); + byte[] aBytes = ZOrderByteUtils.stringToOrderedBytes(aString, 128); + byte[] bBytes = ZOrderByteUtils.stringToOrderedBytes(bString, 128); + int byteCompare = Integer.signum(UnsignedBytes.lexicographicalComparator().compare(aBytes, bBytes)); - Assert.assertTrue(String.format( - "Ordering of ints should match ordering of bytes, %s ~ %s -> %s != %s ~ %s -> %s ", + Assert.assertEquals(String.format( + "Ordering of strings should match ordering of bytes, %s ~ %s -> %s != %s ~ %s -> %s ", aString, bString, stringCompare, Arrays.toString(aBytes), Arrays.toString(bBytes), byteCompare), - (stringCompare ^ byteCompare) >= 0); + stringCompare, byteCompare); } } - - private byte[] bytesOf(int num) { - return ByteBuffer.allocate(4).putInt(num).array(); - } - - private byte[] bytesOf(long num) { - return ByteBuffer.allocate(8).putLong(num).array(); - } - - private byte[] bytesOf(float num) { - return ByteBuffer.allocate(4).putFloat(num).array(); - } - - private byte[] bytesOf(double num) { - return ByteBuffer.allocate(8).putDouble(num).array(); - } } From c6954e6b544a9da070329e12a01bc69d0a572d6d Mon Sep 17 00:00:00 2001 From: Russell_Spitzer Date: Mon, 31 Jan 2022 16:47:36 -0600 Subject: [PATCH 04/12] Clean up RandomStringUtilUsage --- build.gradle | 1 - .../main/java/org/apache/iceberg/util/ZOrderByteUtils.java | 6 ++++-- .../java/org/apache/iceberg/util/TestZOrderByteUtil.java | 6 +++--- versions.props | 1 - 4 files changed, 7 
insertions(+), 7 deletions(-) diff --git a/build.gradle b/build.gradle index 65346358fe33..fa509212fb8b 100644 --- a/build.gradle +++ b/build.gradle @@ -222,7 +222,6 @@ project(':iceberg-core') { } testImplementation "org.xerial:sqlite-jdbc" - testImplementation "org.apache.commons:commons-lang3" testImplementation project(path: ':iceberg-api', configuration: 'testArtifacts') } } diff --git a/core/src/main/java/org/apache/iceberg/util/ZOrderByteUtils.java b/core/src/main/java/org/apache/iceberg/util/ZOrderByteUtils.java index 571ea24d5039..750831a9a5e7 100644 --- a/core/src/main/java/org/apache/iceberg/util/ZOrderByteUtils.java +++ b/core/src/main/java/org/apache/iceberg/util/ZOrderByteUtils.java @@ -120,7 +120,8 @@ public static byte[] interleaveBits(byte[][] columnsBinary) { (byte) (interleavedBytes[interleaveByte] | (columnsBinary[sourceColumn][sourceByte] & 1 << sourceBit) >> sourceBit << interleaveBit); - if (--interleaveBit == -1) { + --interleaveBit; + if (interleaveBit == -1) { // Finished a byte in our interleave byte array start a new byte interleaveByte++; interleaveBit = 7; @@ -128,7 +129,8 @@ public static byte[] interleaveBits(byte[][] columnsBinary) { // Find next column with a byte we can use do { - if (++sourceColumn == columnsBinary.length) { + ++sourceColumn; + if (sourceColumn == columnsBinary.length) { sourceColumn = 0; if (--sourceBit == -1) { sourceByte++; diff --git a/core/src/test/java/org/apache/iceberg/util/TestZOrderByteUtil.java b/core/src/test/java/org/apache/iceberg/util/TestZOrderByteUtil.java index b34f950f90c8..17f19ec01af7 100644 --- a/core/src/test/java/org/apache/iceberg/util/TestZOrderByteUtil.java +++ b/core/src/test/java/org/apache/iceberg/util/TestZOrderByteUtil.java @@ -22,8 +22,8 @@ import java.util.Arrays; import java.util.Random; -import org.apache.commons.lang3.RandomStringUtils; import org.apache.iceberg.relocated.com.google.common.primitives.UnsignedBytes; +import org.apache.iceberg.types.Types; import 
org.junit.Assert; import org.junit.Test; @@ -210,8 +210,8 @@ public void testDoubleOrdering() { @Test public void testStringOrdering() { for (int i = 0; i < NUM_TESTS; i++) { - String aString = RandomStringUtils.random(random.nextInt(35), true, true); - String bString = RandomStringUtils.random(random.nextInt(35), true, true); + String aString = (String) RandomUtil.generatePrimitive(Types.StringType.get(), random); + String bString = (String) RandomUtil.generatePrimitive(Types.StringType.get(), random); int stringCompare = Integer.signum(aString.compareTo(bString)); byte[] aBytes = ZOrderByteUtils.stringToOrderedBytes(aString, 128); byte[] bBytes = ZOrderByteUtils.stringToOrderedBytes(bString, 128); diff --git a/versions.props b/versions.props index 3dc3a5041f23..c9ec027effe6 100644 --- a/versions.props +++ b/versions.props @@ -1,7 +1,6 @@ org.slf4j:* = 1.7.25 org.apache.avro:avro = 1.10.1 org.apache.calcite:* = 1.10.0 -org.apache.commons:commons-lang3 = 3.12.0 org.apache.flink:* = 1.12.5 org.apache.hadoop:* = 2.7.3 org.apache.hive:* = 2.3.8 From 3f6fc921ae83cb6d9643fbdb1522682dcbcdf065 Mon Sep 17 00:00:00 2001 From: Russell_Spitzer Date: Mon, 31 Jan 2022 16:56:07 -0600 Subject: [PATCH 05/12] Fix JavaDoc --- core/src/main/java/org/apache/iceberg/util/ZOrderByteUtils.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/main/java/org/apache/iceberg/util/ZOrderByteUtils.java b/core/src/main/java/org/apache/iceberg/util/ZOrderByteUtils.java index 750831a9a5e7..e41a5b3be763 100644 --- a/core/src/main/java/org/apache/iceberg/util/ZOrderByteUtils.java +++ b/core/src/main/java/org/apache/iceberg/util/ZOrderByteUtils.java @@ -61,7 +61,7 @@ public static byte[] longToOrderBytes(long val) { /** * IEEE 754 : - * “If two floating-point numbers in the same format are ordered (say, x \< y), + * “If two floating-point numbers in the same format are ordered (say, x {@literal <} y), * they are ordered the same way when their bits are reinterpreted as 
sign-magnitude integers.” * * Which means floats can be treated as sign magnitude integers which can then be converted into lexicographically From 5ccf8f45bfd57a9305dc935e48807e500d7ebe3f Mon Sep 17 00:00:00 2001 From: Russell_Spitzer Date: Mon, 31 Jan 2022 17:41:05 -0600 Subject: [PATCH 06/12] Add Functions for Smaller Types --- .../apache/iceberg/util/ZOrderByteUtils.java | 18 ++++++++++ .../iceberg/util/TestZOrderByteUtil.java | 34 +++++++++++++++++++ 2 files changed, 52 insertions(+) diff --git a/core/src/main/java/org/apache/iceberg/util/ZOrderByteUtils.java b/core/src/main/java/org/apache/iceberg/util/ZOrderByteUtils.java index e41a5b3be763..ae68b69b745e 100644 --- a/core/src/main/java/org/apache/iceberg/util/ZOrderByteUtils.java +++ b/core/src/main/java/org/apache/iceberg/util/ZOrderByteUtils.java @@ -59,6 +59,24 @@ public static byte[] longToOrderBytes(long val) { return bytes.array(); } + /** + * Signed shorts are treated the same as the signed ints + */ + public static byte[] shortToOrderBytes(short val) { + ByteBuffer bytes = ByteBuffer.allocate(Short.BYTES); + bytes.putShort((short) (val ^ (0x8000))); + return bytes.array(); + } + + /** + * Signed tiny ints are treated the same as the signed ints + */ + public static byte[] tinyintToOrderedBytes(byte val) { + ByteBuffer bytes = ByteBuffer.allocate(Byte.BYTES); + bytes.put((byte) (val ^ (0x80))); + return bytes.array(); + } + /** * IEEE 754 : * “If two floating-point numbers in the same format are ordered (say, x {@literal <} y), diff --git a/core/src/test/java/org/apache/iceberg/util/TestZOrderByteUtil.java b/core/src/test/java/org/apache/iceberg/util/TestZOrderByteUtil.java index 17f19ec01af7..81caf0ad0fb3 100644 --- a/core/src/test/java/org/apache/iceberg/util/TestZOrderByteUtil.java +++ b/core/src/test/java/org/apache/iceberg/util/TestZOrderByteUtil.java @@ -173,6 +173,40 @@ public void testLongOrdering() { } } + @Test + public void testShortOrdering() { + for (int i = 0; i < NUM_TESTS; i++) { + 
short aShort = (short) (random.nextInt() % (Short.MAX_VALUE + 1)); + short bShort = (short) (random.nextInt() % (Short.MAX_VALUE + 1)); + int longCompare = Integer.signum(Long.compare(aShort, bShort)); + byte[] aBytes = ZOrderByteUtils.longToOrderBytes(aShort); + byte[] bBytes = ZOrderByteUtils.longToOrderBytes(bShort); + int byteCompare = Integer.signum(UnsignedBytes.lexicographicalComparator().compare(aBytes, bBytes)); + + Assert.assertEquals(String.format( + "Ordering of longs should match ordering of bytes, %s ~ %s -> %s != %s ~ %s -> %s ", + aShort, bShort, longCompare, Arrays.toString(aBytes), Arrays.toString(bBytes), byteCompare), + longCompare, byteCompare); + } + } + + @Test + public void testTinyOrdering() { + for (int i = 0; i < NUM_TESTS; i++) { + long aByte = (byte) (random.nextInt() % (Byte.MAX_VALUE + 1)); + long bByte = (byte) (random.nextInt() % (Byte.MAX_VALUE + 1)); + int longCompare = Integer.signum(Long.compare(aByte, bByte)); + byte[] aBytes = ZOrderByteUtils.longToOrderBytes(aByte); + byte[] bBytes = ZOrderByteUtils.longToOrderBytes(bByte); + int byteCompare = Integer.signum(UnsignedBytes.lexicographicalComparator().compare(aBytes, bBytes)); + + Assert.assertEquals(String.format( + "Ordering of longs should match ordering of bytes, %s ~ %s -> %s != %s ~ %s -> %s ", + aByte, bByte, longCompare, Arrays.toString(aBytes), Arrays.toString(bBytes), byteCompare), + longCompare, byteCompare); + } + } + @Test public void testFloatOrdering() { for (int i = 0; i < NUM_TESTS; i++) { From 30c4633f36faaf5c12f56cc1e9b07259d5a6eca3 Mon Sep 17 00:00:00 2001 From: Russell_Spitzer Date: Mon, 7 Feb 2022 15:00:45 -0600 Subject: [PATCH 07/12] Updates for reviewer comments --- .../org/apache/iceberg/util/ByteBuffers.java | 10 +++ .../apache/iceberg/util/ZOrderByteUtils.java | 84 +++++++++++-------- .../iceberg/util/TestZOrderByteUtil.java | 50 +++++++---- 3 files changed, 93 insertions(+), 51 deletions(-) diff --git 
a/api/src/main/java/org/apache/iceberg/util/ByteBuffers.java b/api/src/main/java/org/apache/iceberg/util/ByteBuffers.java index 213b222dc507..efc05f179f82 100644 --- a/api/src/main/java/org/apache/iceberg/util/ByteBuffers.java +++ b/api/src/main/java/org/apache/iceberg/util/ByteBuffers.java @@ -21,6 +21,7 @@ import java.nio.ByteBuffer; import java.util.Arrays; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; public class ByteBuffers { @@ -46,6 +47,15 @@ public static byte[] toByteArray(ByteBuffer buffer) { } } + public static ByteBuffer reuse(ByteBuffer reuse, int length) { + Preconditions.checkArgument(reuse.hasArray() && reuse.arrayOffset() == 0 && reuse.capacity() == length, + "Cannot reuse buffer: Should be an array %s, should have an offset of 0 %s, should be of size %s was %s", + reuse.hasArray(), reuse.arrayOffset(), length, reuse.capacity()); + reuse.position(0); + reuse.limit(length); + return reuse; + } + public static ByteBuffer copy(ByteBuffer buffer) { if (buffer == null) { return null; diff --git a/core/src/main/java/org/apache/iceberg/util/ZOrderByteUtils.java b/core/src/main/java/org/apache/iceberg/util/ZOrderByteUtils.java index ae68b69b745e..deab4450a61d 100644 --- a/core/src/main/java/org/apache/iceberg/util/ZOrderByteUtils.java +++ b/core/src/main/java/org/apache/iceberg/util/ZOrderByteUtils.java @@ -44,35 +44,35 @@ private ZOrderByteUtils() { * To fix this, flip the sign bit so that all negatives are ordered before positives. This essentially * shifts the 0 value so that we don't break our ordering when we cross the new 0 value. 
*/ - public static byte[] intToOrderedBytes(int val) { - ByteBuffer bytes = ByteBuffer.allocate(Integer.BYTES); + public static byte[] intToOrderedBytes(int val, ByteBuffer reuse) { + ByteBuffer bytes = ByteBuffers.reuse(reuse, Integer.BYTES); bytes.putInt(val ^ 0x80000000); return bytes.array(); } /** - * Signed longs are treated the same as the signed ints + * Signed longs are treated the same as the signed ints in {@link #intToOrderedBytes(int, ByteBuffer)} */ - public static byte[] longToOrderBytes(long val) { - ByteBuffer bytes = ByteBuffer.allocate(Long.BYTES); + public static byte[] longToOrderedBytes(long val, ByteBuffer reuse) { + ByteBuffer bytes = ByteBuffers.reuse(reuse, Long.BYTES); bytes.putLong(val ^ 0x8000000000000000L); return bytes.array(); } /** - * Signed shorts are treated the same as the signed ints + * Signed shorts are treated the same as the signed ints in {@link #intToOrderedBytes(int, ByteBuffer)} */ - public static byte[] shortToOrderBytes(short val) { - ByteBuffer bytes = ByteBuffer.allocate(Short.BYTES); + public static byte[] shortToOrderedBytes(short val, ByteBuffer reuse) { + ByteBuffer bytes = ByteBuffers.reuse(reuse, Short.BYTES); bytes.putShort((short) (val ^ (0x8000))); return bytes.array(); } /** - * Signed tiny ints are treated the same as the signed ints + * Signed tiny ints are treated the same as the signed ints in {@link #intToOrderedBytes(int, ByteBuffer)} */ - public static byte[] tinyintToOrderedBytes(byte val) { - ByteBuffer bytes = ByteBuffer.allocate(Byte.BYTES); + public static byte[] tinyintToOrderedBytes(byte val, ByteBuffer reuse) { + ByteBuffer bytes = ByteBuffers.reuse(reuse, Byte.BYTES); bytes.put((byte) (val ^ (0x80))); return bytes.array(); } @@ -85,8 +85,8 @@ public static byte[] tinyintToOrderedBytes(byte val) { * Which means floats can be treated as sign magnitude integers which can then be converted into lexicographically * comparable bytes */ - public static byte[] floatToOrderedBytes(float val) { - 
ByteBuffer bytes = ByteBuffer.allocate(Integer.BYTES); + public static byte[] floatToOrderedBytes(float val, ByteBuffer reuse) { + ByteBuffer bytes = ByteBuffers.reuse(reuse, Float.BYTES); int ival = Float.floatToIntBits(val); ival ^= ((ival >> (Integer.SIZE - 1)) | Integer.MIN_VALUE); bytes.putInt(ival); @@ -94,10 +94,10 @@ public static byte[] floatToOrderedBytes(float val) { } /** - * Doubles are treated the same as floats + * Doubles are treated the same as floats in {@link #floatToOrderedBytes(float, ByteBuffer)} */ - public static byte[] doubleToOrderedBytes(double val) { - ByteBuffer bytes = ByteBuffer.allocate(Long.BYTES); + public static byte[] doubleToOrderedBytes(double val, ByteBuffer reuse) { + ByteBuffer bytes = ByteBuffers.reuse(reuse, Double.BYTES); long lng = Double.doubleToLongBits(val); lng ^= ((lng >> (Long.SIZE - 1)) | Long.MIN_VALUE); bytes.putLong(lng); @@ -108,54 +108,70 @@ public static byte[] doubleToOrderedBytes(double val) { * Strings are lexicographically sortable BUT if different byte array lengths will * ruin the Z-Ordering. (ZOrder requires that a given column contribute the same number of bytes every time). * This implementation just uses a set size to for all output byte representations. Truncating longer strings - * and right padding 0 for shorter strings. + * and right padding 0 for shorter strings. Requires UTF8 (or ASCII) encoding for ordering guarantees to hold. */ - public static byte[] stringToOrderedBytes(String val, int length) { - ByteBuffer bytes = ByteBuffer.allocate(length); + public static byte[] stringToOrderedBytes(String val, int length, ByteBuffer reuse) { + ByteBuffer bytes = ByteBuffers.reuse(reuse, length); + Arrays.fill(bytes.array(), 0, length, (byte) 0x00); if (val != null) { int maxLength = Math.min(length, val.length()); + // We may truncate mid-character bytes.put(val.getBytes(), 0, maxLength); } return bytes.array(); } /** - * Interleave bits using a naive loop. 
- * @param columnsBinary an array of byte arrays, none of which are empty - * @return their bits interleaved + * Interleave bits using a naive loop. Variable length inputs are allowed but to get a consistent ordering it is + * required that every column contribute the same number of bytes in each invocation. Bits are interleaved from all + * columns that have a bit available at that position. Once a Column has no more bits to produce it is skipped in the + * interleaving. + * @param columnsBinary an array of ordered byte representations of the columns being ZOrdered + * @return the columnbytes interleaved */ public static byte[] interleaveBits(byte[][] columnsBinary) { int interleavedSize = Arrays.stream(columnsBinary).mapToInt(a -> a.length).sum(); byte[] interleavedBytes = new byte[interleavedSize]; - int sourceBit = 7; - int sourceByte = 0; int sourceColumn = 0; - int interleaveBit = 7; + int sourceByte = 0; + int sourceBit = 7; int interleaveByte = 0; - while (interleaveByte < interleavedSize) { - // Take what we have, Get the source Bit of the source Byte, move it to the interleaveBit position - interleavedBytes[interleaveByte] = - (byte) (interleavedBytes[interleaveByte] | - (columnsBinary[sourceColumn][sourceByte] & 1 << sourceBit) >> sourceBit << interleaveBit); + int interleaveBit = 7; + while (interleaveByte < interleavedSize) { + // Take the source bit from source byte and move it to the output bit position + interleavedBytes[interleaveByte] |= + (columnsBinary[sourceColumn][sourceByte] & 1 << sourceBit) >>> sourceBit << interleaveBit; --interleaveBit; + + // Check if an output byte has been completed if (interleaveBit == -1) { - // Finished a byte in our interleave byte array start a new byte + // Move to the next output byte interleaveByte++; + // Move to the highest order bit of the new output byte interleaveBit = 7; } - // Find next column with a byte we can use + // Check if the last output byte has been completed + if (interleaveByte == 
interleavedSize) { + break; + } + + // Find the next source bit to interleave do { + // Move to next column ++sourceColumn; if (sourceColumn == columnsBinary.length) { + // If the last source column was used, reset to next bit of first column sourceColumn = 0; - if (--sourceBit == -1) { + --sourceBit; + if (sourceBit == -1) { + // If the last bit of the source byte was used, reset to the highest bit of the next byte sourceByte++; sourceBit = 7; } } - } while (columnsBinary[sourceColumn].length <= sourceByte && interleaveByte < interleavedSize); + } while (columnsBinary[sourceColumn].length <= sourceByte); } return interleavedBytes; } diff --git a/core/src/test/java/org/apache/iceberg/util/TestZOrderByteUtil.java b/core/src/test/java/org/apache/iceberg/util/TestZOrderByteUtil.java index 81caf0ad0fb3..e2ff29d76c3a 100644 --- a/core/src/test/java/org/apache/iceberg/util/TestZOrderByteUtil.java +++ b/core/src/test/java/org/apache/iceberg/util/TestZOrderByteUtil.java @@ -20,6 +20,7 @@ package org.apache.iceberg.util; +import java.nio.ByteBuffer; import java.util.Arrays; import java.util.Random; import org.apache.iceberg.relocated.com.google.common.primitives.UnsignedBytes; @@ -36,6 +37,7 @@ public class TestZOrderByteUtil { private static final byte OOOOOOOO = (byte) 0; private static final int NUM_TESTS = 100000; + private static final int NUM_INTERLEAVE_TESTS = 1000; private final Random random = new Random(42); @@ -84,7 +86,7 @@ private String interleaveStrings(String[] strings) { */ @Test public void testInterleaveRandomExamples() { - for (int test = 0; test < NUM_TESTS; test++) { + for (int test = 0; test < NUM_INTERLEAVE_TESTS; test++) { int numByteArrays = Math.abs(random.nextInt(6)) + 1; byte[][] testBytes = new byte[numByteArrays][]; String[] testStrings = new String[numByteArrays]; @@ -141,12 +143,14 @@ public void testInterleaveMixedBits() { @Test public void testIntOrdering() { + ByteBuffer aBuffer = ByteBuffer.allocate(Integer.BYTES); + ByteBuffer bBuffer = 
ByteBuffer.allocate(Integer.BYTES); for (int i = 0; i < NUM_TESTS; i++) { int aInt = random.nextInt(); int bInt = random.nextInt(); int intCompare = Integer.signum(Integer.compare(aInt, bInt)); - byte[] aBytes = ZOrderByteUtils.intToOrderedBytes(aInt); - byte[] bBytes = ZOrderByteUtils.intToOrderedBytes(bInt); + byte[] aBytes = ZOrderByteUtils.intToOrderedBytes(aInt, aBuffer); + byte[] bBytes = ZOrderByteUtils.intToOrderedBytes(bInt, bBuffer); int byteCompare = Integer.signum(UnsignedBytes.lexicographicalComparator().compare(aBytes, bBytes)); Assert.assertEquals(String.format( @@ -158,12 +162,14 @@ public void testIntOrdering() { @Test public void testLongOrdering() { + ByteBuffer aBuffer = ByteBuffer.allocate(Long.BYTES); + ByteBuffer bBuffer = ByteBuffer.allocate(Long.BYTES); for (int i = 0; i < NUM_TESTS; i++) { long aLong = random.nextInt(); long bLong = random.nextInt(); int longCompare = Integer.signum(Long.compare(aLong, bLong)); - byte[] aBytes = ZOrderByteUtils.longToOrderBytes(aLong); - byte[] bBytes = ZOrderByteUtils.longToOrderBytes(bLong); + byte[] aBytes = ZOrderByteUtils.longToOrderedBytes(aLong, aBuffer); + byte[] bBytes = ZOrderByteUtils.longToOrderedBytes(bLong, bBuffer); int byteCompare = Integer.signum(UnsignedBytes.lexicographicalComparator().compare(aBytes, bBytes)); Assert.assertEquals(String.format( @@ -175,12 +181,14 @@ public void testLongOrdering() { @Test public void testShortOrdering() { + ByteBuffer aBuffer = ByteBuffer.allocate(Short.BYTES); + ByteBuffer bBuffer = ByteBuffer.allocate(Short.BYTES); for (int i = 0; i < NUM_TESTS; i++) { short aShort = (short) (random.nextInt() % (Short.MAX_VALUE + 1)); short bShort = (short) (random.nextInt() % (Short.MAX_VALUE + 1)); int longCompare = Integer.signum(Long.compare(aShort, bShort)); - byte[] aBytes = ZOrderByteUtils.longToOrderBytes(aShort); - byte[] bBytes = ZOrderByteUtils.longToOrderBytes(bShort); + byte[] aBytes = ZOrderByteUtils.shortToOrderedBytes(aShort, aBuffer); + byte[] bBytes = 
ZOrderByteUtils.shortToOrderedBytes(bShort, bBuffer); int byteCompare = Integer.signum(UnsignedBytes.lexicographicalComparator().compare(aBytes, bBytes)); Assert.assertEquals(String.format( @@ -192,12 +200,14 @@ public void testShortOrdering() { @Test public void testTinyOrdering() { + ByteBuffer aBuffer = ByteBuffer.allocate(Byte.BYTES); + ByteBuffer bBuffer = ByteBuffer.allocate(Byte.BYTES); for (int i = 0; i < NUM_TESTS; i++) { - long aByte = (byte) (random.nextInt() % (Byte.MAX_VALUE + 1)); - long bByte = (byte) (random.nextInt() % (Byte.MAX_VALUE + 1)); + byte aByte = (byte) (random.nextInt() % (Byte.MAX_VALUE + 1)); + byte bByte = (byte) (random.nextInt() % (Byte.MAX_VALUE + 1)); int longCompare = Integer.signum(Long.compare(aByte, bByte)); - byte[] aBytes = ZOrderByteUtils.longToOrderBytes(aByte); - byte[] bBytes = ZOrderByteUtils.longToOrderBytes(bByte); + byte[] aBytes = ZOrderByteUtils.tinyintToOrderedBytes(aByte, aBuffer); + byte[] bBytes = ZOrderByteUtils.tinyintToOrderedBytes(bByte, bBuffer); int byteCompare = Integer.signum(UnsignedBytes.lexicographicalComparator().compare(aBytes, bBytes)); Assert.assertEquals(String.format( @@ -209,12 +219,14 @@ public void testTinyOrdering() { @Test public void testFloatOrdering() { + ByteBuffer aBuffer = ByteBuffer.allocate(Float.BYTES); + ByteBuffer bBuffer = ByteBuffer.allocate(Float.BYTES); for (int i = 0; i < NUM_TESTS; i++) { float aFloat = random.nextFloat(); float bFloat = random.nextFloat(); int floatCompare = Integer.signum(Float.compare(aFloat, bFloat)); - byte[] aBytes = ZOrderByteUtils.floatToOrderedBytes(aFloat); - byte[] bBytes = ZOrderByteUtils.floatToOrderedBytes(bFloat); + byte[] aBytes = ZOrderByteUtils.floatToOrderedBytes(aFloat, aBuffer); + byte[] bBytes = ZOrderByteUtils.floatToOrderedBytes(bFloat, bBuffer); int byteCompare = Integer.signum(UnsignedBytes.lexicographicalComparator().compare(aBytes, bBytes)); Assert.assertEquals(String.format( @@ -226,12 +238,14 @@ public void testFloatOrdering() 
{ @Test public void testDoubleOrdering() { + ByteBuffer aBuffer = ByteBuffer.allocate(Double.BYTES); + ByteBuffer bBuffer = ByteBuffer.allocate(Double.BYTES); for (int i = 0; i < NUM_TESTS; i++) { double aDouble = random.nextDouble(); double bDouble = random.nextDouble(); int doubleCompare = Integer.signum(Double.compare(aDouble, bDouble)); - byte[] aBytes = ZOrderByteUtils.doubleToOrderedBytes(aDouble); - byte[] bBytes = ZOrderByteUtils.doubleToOrderedBytes(bDouble); + byte[] aBytes = ZOrderByteUtils.doubleToOrderedBytes(aDouble, aBuffer); + byte[] bBytes = ZOrderByteUtils.doubleToOrderedBytes(bDouble, bBuffer); int byteCompare = Integer.signum(UnsignedBytes.lexicographicalComparator().compare(aBytes, bBytes)); Assert.assertEquals(String.format( @@ -243,12 +257,14 @@ public void testDoubleOrdering() { @Test public void testStringOrdering() { + ByteBuffer aBuffer = ByteBuffer.allocate(128); + ByteBuffer bBuffer = ByteBuffer.allocate(128); for (int i = 0; i < NUM_TESTS; i++) { String aString = (String) RandomUtil.generatePrimitive(Types.StringType.get(), random); String bString = (String) RandomUtil.generatePrimitive(Types.StringType.get(), random); int stringCompare = Integer.signum(aString.compareTo(bString)); - byte[] aBytes = ZOrderByteUtils.stringToOrderedBytes(aString, 128); - byte[] bBytes = ZOrderByteUtils.stringToOrderedBytes(bString, 128); + byte[] aBytes = ZOrderByteUtils.stringToOrderedBytes(aString, 128, aBuffer); + byte[] bBytes = ZOrderByteUtils.stringToOrderedBytes(bString, 128, bBuffer); int byteCompare = Integer.signum(UnsignedBytes.lexicographicalComparator().compare(aBytes, bBytes)); Assert.assertEquals(String.format( From 10e561c1a53c60341853631fc1d8936c60d6e41f Mon Sep 17 00:00:00 2001 From: Russell_Spitzer Date: Mon, 7 Feb 2022 19:45:49 -0600 Subject: [PATCH 08/12] Specify Output Size --- .../org/apache/iceberg/util/ZOrderByteUtils.java | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git 
a/core/src/main/java/org/apache/iceberg/util/ZOrderByteUtils.java b/core/src/main/java/org/apache/iceberg/util/ZOrderByteUtils.java index deab4450a61d..f4d28572be84 100644 --- a/core/src/main/java/org/apache/iceberg/util/ZOrderByteUtils.java +++ b/core/src/main/java/org/apache/iceberg/util/ZOrderByteUtils.java @@ -121,16 +121,24 @@ public static byte[] stringToOrderedBytes(String val, int length, ByteBuffer reu return bytes.array(); } + /** + * For Testing interleave all available bytes + */ + static byte[] interleaveBits(byte[][] columnsBinary) { + return interleaveBits(columnsBinary, + Arrays.stream(columnsBinary).mapToInt(column -> column.length).max().getAsInt()); + } + /** * Interleave bits using a naive loop. Variable length inputs are allowed but to get a consistent ordering it is * required that every column contribute the same number of bytes in each invocation. Bits are interleaved from all * columns that have a bit available at that position. Once a Column has no more bits to produce it is skipped in the * interleaving. 
* @param columnsBinary an array of ordered byte representations of the columns being ZOrdered + * @param interleavedSize the number of bytes to use in the output * @return the columnbytes interleaved */ - public static byte[] interleaveBits(byte[][] columnsBinary) { - int interleavedSize = Arrays.stream(columnsBinary).mapToInt(a -> a.length).sum(); + public static byte[] interleaveBits(byte[][] columnsBinary, int interleavedSize) { byte[] interleavedBytes = new byte[interleavedSize]; int sourceColumn = 0; int sourceByte = 0; From 74d20a4ad15d620ba35e55bbfecf774cca663037 Mon Sep 17 00:00:00 2001 From: Russell_Spitzer Date: Tue, 8 Feb 2022 08:48:29 -0600 Subject: [PATCH 09/12] Fix Encoding Also a patch for the test interleave method length calculation --- .../main/java/org/apache/iceberg/util/ZOrderByteUtils.java | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/core/src/main/java/org/apache/iceberg/util/ZOrderByteUtils.java b/core/src/main/java/org/apache/iceberg/util/ZOrderByteUtils.java index f4d28572be84..52180cac7a2d 100644 --- a/core/src/main/java/org/apache/iceberg/util/ZOrderByteUtils.java +++ b/core/src/main/java/org/apache/iceberg/util/ZOrderByteUtils.java @@ -20,6 +20,7 @@ package org.apache.iceberg.util; import java.nio.ByteBuffer; +import java.nio.charset.StandardCharsets; import java.util.Arrays; /** @@ -108,7 +109,7 @@ public static byte[] doubleToOrderedBytes(double val, ByteBuffer reuse) { * Strings are lexicographically sortable BUT if different byte array lengths will * ruin the Z-Ordering. (ZOrder requires that a given column contribute the same number of bytes every time). * This implementation just uses a set size to for all output byte representations. Truncating longer strings - * and right padding 0 for shorter strings. Requires UTF8 (or ASCII) encoding for ordering guarantees to hold. + * and right padding 0 for shorter strings. 
*/ public static byte[] stringToOrderedBytes(String val, int length, ByteBuffer reuse) { ByteBuffer bytes = ByteBuffers.reuse(reuse, length); @@ -116,7 +117,7 @@ public static byte[] stringToOrderedBytes(String val, int length, ByteBuffer reu if (val != null) { int maxLength = Math.min(length, val.length()); // We may truncate mid-character - bytes.put(val.getBytes(), 0, maxLength); + bytes.put(val.getBytes(StandardCharsets.UTF_8), 0, maxLength); } return bytes.array(); } @@ -126,7 +127,7 @@ public static byte[] stringToOrderedBytes(String val, int length, ByteBuffer reu */ static byte[] interleaveBits(byte[][] columnsBinary) { return interleaveBits(columnsBinary, - Arrays.stream(columnsBinary).mapToInt(column -> column.length).max().getAsInt()); + Arrays.stream(columnsBinary).mapToInt(column -> column.length).sum()); } /** From 848de3b11a2fc414438c9029e98b8dfae80e278f Mon Sep 17 00:00:00 2001 From: Russell_Spitzer Date: Tue, 8 Feb 2022 11:58:08 -0600 Subject: [PATCH 10/12] Methods return ByteBuffers, Strings are efit into our buffer using CharsetEncoder.encode --- .../apache/iceberg/util/ZOrderByteUtils.java | 32 ++++++++++--------- .../iceberg/util/TestZOrderByteUtil.java | 31 ++++++++++-------- 2 files changed, 34 insertions(+), 29 deletions(-) diff --git a/core/src/main/java/org/apache/iceberg/util/ZOrderByteUtils.java b/core/src/main/java/org/apache/iceberg/util/ZOrderByteUtils.java index 52180cac7a2d..967aa0bf7c5c 100644 --- a/core/src/main/java/org/apache/iceberg/util/ZOrderByteUtils.java +++ b/core/src/main/java/org/apache/iceberg/util/ZOrderByteUtils.java @@ -20,6 +20,8 @@ package org.apache.iceberg.util; import java.nio.ByteBuffer; +import java.nio.CharBuffer; +import java.nio.charset.CharsetEncoder; import java.nio.charset.StandardCharsets; import java.util.Arrays; @@ -45,37 +47,37 @@ private ZOrderByteUtils() { * To fix this, flip the sign bit so that all negatives are ordered before positives. 
This essentially * shifts the 0 value so that we don't break our ordering when we cross the new 0 value. */ - public static byte[] intToOrderedBytes(int val, ByteBuffer reuse) { + public static ByteBuffer intToOrderedBytes(int val, ByteBuffer reuse) { ByteBuffer bytes = ByteBuffers.reuse(reuse, Integer.BYTES); bytes.putInt(val ^ 0x80000000); - return bytes.array(); + return bytes; } /** * Signed longs are treated the same as the signed ints in {@link #intToOrderedBytes(int, ByteBuffer)} */ - public static byte[] longToOrderedBytes(long val, ByteBuffer reuse) { + public static ByteBuffer longToOrderedBytes(long val, ByteBuffer reuse) { ByteBuffer bytes = ByteBuffers.reuse(reuse, Long.BYTES); bytes.putLong(val ^ 0x8000000000000000L); - return bytes.array(); + return bytes; } /** * Signed shorts are treated the same as the signed ints in {@link #intToOrderedBytes(int, ByteBuffer)} */ - public static byte[] shortToOrderedBytes(short val, ByteBuffer reuse) { + public static ByteBuffer shortToOrderedBytes(short val, ByteBuffer reuse) { ByteBuffer bytes = ByteBuffers.reuse(reuse, Short.BYTES); bytes.putShort((short) (val ^ (0x8000))); - return bytes.array(); + return bytes; } /** * Signed tiny ints are treated the same as the signed ints in {@link #intToOrderedBytes(int, ByteBuffer)} */ - public static byte[] tinyintToOrderedBytes(byte val, ByteBuffer reuse) { + public static ByteBuffer tinyintToOrderedBytes(byte val, ByteBuffer reuse) { ByteBuffer bytes = ByteBuffers.reuse(reuse, Byte.BYTES); bytes.put((byte) (val ^ (0x80))); - return bytes.array(); + return bytes; } /** @@ -86,23 +88,23 @@ public static byte[] tinyintToOrderedBytes(byte val, ByteBuffer reuse) { * Which means floats can be treated as sign magnitude integers which can then be converted into lexicographically * comparable bytes */ - public static byte[] floatToOrderedBytes(float val, ByteBuffer reuse) { + public static ByteBuffer floatToOrderedBytes(float val, ByteBuffer reuse) { ByteBuffer bytes = 
ByteBuffers.reuse(reuse, Float.BYTES); int ival = Float.floatToIntBits(val); ival ^= ((ival >> (Integer.SIZE - 1)) | Integer.MIN_VALUE); bytes.putInt(ival); - return bytes.array(); + return bytes; } /** * Doubles are treated the same as floats in {@link #floatToOrderedBytes(float, ByteBuffer)} */ - public static byte[] doubleToOrderedBytes(double val, ByteBuffer reuse) { + public static ByteBuffer doubleToOrderedBytes(double val, ByteBuffer reuse) { ByteBuffer bytes = ByteBuffers.reuse(reuse, Double.BYTES); long lng = Double.doubleToLongBits(val); lng ^= ((lng >> (Long.SIZE - 1)) | Long.MIN_VALUE); bytes.putLong(lng); - return bytes.array(); + return bytes; } /** @@ -111,15 +113,15 @@ public static byte[] doubleToOrderedBytes(double val, ByteBuffer reuse) { * This implementation just uses a set size to for all output byte representations. Truncating longer strings * and right padding 0 for shorter strings. */ - public static byte[] stringToOrderedBytes(String val, int length, ByteBuffer reuse) { + public static ByteBuffer stringToOrderedBytes(String val, int length, ByteBuffer reuse, CharsetEncoder encoder) { ByteBuffer bytes = ByteBuffers.reuse(reuse, length); Arrays.fill(bytes.array(), 0, length, (byte) 0x00); if (val != null) { int maxLength = Math.min(length, val.length()); // We may truncate mid-character - bytes.put(val.getBytes(StandardCharsets.UTF_8), 0, maxLength); + encoder.encode(CharBuffer.wrap(val), bytes, true); } - return bytes.array(); + return bytes; } /** diff --git a/core/src/test/java/org/apache/iceberg/util/TestZOrderByteUtil.java b/core/src/test/java/org/apache/iceberg/util/TestZOrderByteUtil.java index e2ff29d76c3a..bf84319d0d45 100644 --- a/core/src/test/java/org/apache/iceberg/util/TestZOrderByteUtil.java +++ b/core/src/test/java/org/apache/iceberg/util/TestZOrderByteUtil.java @@ -21,6 +21,8 @@ package org.apache.iceberg.util; import java.nio.ByteBuffer; +import java.nio.charset.CharsetEncoder; +import java.nio.charset.StandardCharsets; 
import java.util.Arrays; import java.util.Random; import org.apache.iceberg.relocated.com.google.common.primitives.UnsignedBytes; @@ -149,8 +151,8 @@ public void testIntOrdering() { int aInt = random.nextInt(); int bInt = random.nextInt(); int intCompare = Integer.signum(Integer.compare(aInt, bInt)); - byte[] aBytes = ZOrderByteUtils.intToOrderedBytes(aInt, aBuffer); - byte[] bBytes = ZOrderByteUtils.intToOrderedBytes(bInt, bBuffer); + byte[] aBytes = ZOrderByteUtils.intToOrderedBytes(aInt, aBuffer).array(); + byte[] bBytes = ZOrderByteUtils.intToOrderedBytes(bInt, bBuffer).array(); int byteCompare = Integer.signum(UnsignedBytes.lexicographicalComparator().compare(aBytes, bBytes)); Assert.assertEquals(String.format( @@ -168,8 +170,8 @@ public void testLongOrdering() { long aLong = random.nextInt(); long bLong = random.nextInt(); int longCompare = Integer.signum(Long.compare(aLong, bLong)); - byte[] aBytes = ZOrderByteUtils.longToOrderedBytes(aLong, aBuffer); - byte[] bBytes = ZOrderByteUtils.longToOrderedBytes(bLong, bBuffer); + byte[] aBytes = ZOrderByteUtils.longToOrderedBytes(aLong, aBuffer).array(); + byte[] bBytes = ZOrderByteUtils.longToOrderedBytes(bLong, bBuffer).array(); int byteCompare = Integer.signum(UnsignedBytes.lexicographicalComparator().compare(aBytes, bBytes)); Assert.assertEquals(String.format( @@ -187,8 +189,8 @@ public void testShortOrdering() { short aShort = (short) (random.nextInt() % (Short.MAX_VALUE + 1)); short bShort = (short) (random.nextInt() % (Short.MAX_VALUE + 1)); int longCompare = Integer.signum(Long.compare(aShort, bShort)); - byte[] aBytes = ZOrderByteUtils.shortToOrderedBytes(aShort, aBuffer); - byte[] bBytes = ZOrderByteUtils.shortToOrderedBytes(bShort, bBuffer); + byte[] aBytes = ZOrderByteUtils.shortToOrderedBytes(aShort, aBuffer).array(); + byte[] bBytes = ZOrderByteUtils.shortToOrderedBytes(bShort, bBuffer).array(); int byteCompare = Integer.signum(UnsignedBytes.lexicographicalComparator().compare(aBytes, bBytes)); 
Assert.assertEquals(String.format( @@ -206,8 +208,8 @@ public void testTinyOrdering() { byte aByte = (byte) (random.nextInt() % (Byte.MAX_VALUE + 1)); byte bByte = (byte) (random.nextInt() % (Byte.MAX_VALUE + 1)); int longCompare = Integer.signum(Long.compare(aByte, bByte)); - byte[] aBytes = ZOrderByteUtils.tinyintToOrderedBytes(aByte, aBuffer); - byte[] bBytes = ZOrderByteUtils.tinyintToOrderedBytes(bByte, bBuffer); + byte[] aBytes = ZOrderByteUtils.tinyintToOrderedBytes(aByte, aBuffer).array(); + byte[] bBytes = ZOrderByteUtils.tinyintToOrderedBytes(bByte, bBuffer).array(); int byteCompare = Integer.signum(UnsignedBytes.lexicographicalComparator().compare(aBytes, bBytes)); Assert.assertEquals(String.format( @@ -225,8 +227,8 @@ public void testFloatOrdering() { float aFloat = random.nextFloat(); float bFloat = random.nextFloat(); int floatCompare = Integer.signum(Float.compare(aFloat, bFloat)); - byte[] aBytes = ZOrderByteUtils.floatToOrderedBytes(aFloat, aBuffer); - byte[] bBytes = ZOrderByteUtils.floatToOrderedBytes(bFloat, bBuffer); + byte[] aBytes = ZOrderByteUtils.floatToOrderedBytes(aFloat, aBuffer).array(); + byte[] bBytes = ZOrderByteUtils.floatToOrderedBytes(bFloat, bBuffer).array(); int byteCompare = Integer.signum(UnsignedBytes.lexicographicalComparator().compare(aBytes, bBytes)); Assert.assertEquals(String.format( @@ -244,8 +246,8 @@ public void testDoubleOrdering() { double aDouble = random.nextDouble(); double bDouble = random.nextDouble(); int doubleCompare = Integer.signum(Double.compare(aDouble, bDouble)); - byte[] aBytes = ZOrderByteUtils.doubleToOrderedBytes(aDouble, aBuffer); - byte[] bBytes = ZOrderByteUtils.doubleToOrderedBytes(bDouble, bBuffer); + byte[] aBytes = ZOrderByteUtils.doubleToOrderedBytes(aDouble, aBuffer).array(); + byte[] bBytes = ZOrderByteUtils.doubleToOrderedBytes(bDouble, bBuffer).array(); int byteCompare = Integer.signum(UnsignedBytes.lexicographicalComparator().compare(aBytes, bBytes)); Assert.assertEquals(String.format( 
@@ -257,14 +259,15 @@ public void testDoubleOrdering() { @Test public void testStringOrdering() { + CharsetEncoder encoder = StandardCharsets.UTF_8.newEncoder(); ByteBuffer aBuffer = ByteBuffer.allocate(128); ByteBuffer bBuffer = ByteBuffer.allocate(128); for (int i = 0; i < NUM_TESTS; i++) { String aString = (String) RandomUtil.generatePrimitive(Types.StringType.get(), random); String bString = (String) RandomUtil.generatePrimitive(Types.StringType.get(), random); int stringCompare = Integer.signum(aString.compareTo(bString)); - byte[] aBytes = ZOrderByteUtils.stringToOrderedBytes(aString, 128, aBuffer); - byte[] bBytes = ZOrderByteUtils.stringToOrderedBytes(bString, 128, bBuffer); + byte[] aBytes = ZOrderByteUtils.stringToOrderedBytes(aString, 128, aBuffer, encoder).array(); + byte[] bBytes = ZOrderByteUtils.stringToOrderedBytes(bString, 128, bBuffer, encoder).array(); int byteCompare = Integer.signum(UnsignedBytes.lexicographicalComparator().compare(aBytes, bBytes)); Assert.assertEquals(String.format( From 83586f1f5262490509bb599649357d8f8189bfc6 Mon Sep 17 00:00:00 2001 From: Russell_Spitzer Date: Tue, 8 Feb 2022 21:49:55 -0600 Subject: [PATCH 11/12] Remove unused string length --- .../java/org/apache/iceberg/util/ZOrderByteUtils.java | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/core/src/main/java/org/apache/iceberg/util/ZOrderByteUtils.java b/core/src/main/java/org/apache/iceberg/util/ZOrderByteUtils.java index 967aa0bf7c5c..3ec4c0f430f4 100644 --- a/core/src/main/java/org/apache/iceberg/util/ZOrderByteUtils.java +++ b/core/src/main/java/org/apache/iceberg/util/ZOrderByteUtils.java @@ -24,6 +24,7 @@ import java.nio.charset.CharsetEncoder; import java.nio.charset.StandardCharsets; import java.util.Arrays; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; /** * Within Z-Ordering the byte representations of objects being compared must be ordered, @@ -114,12 +115,14 @@ public static ByteBuffer 
doubleToOrderedBytes(double val, ByteBuffer reuse) { * and right padding 0 for shorter strings. */ public static ByteBuffer stringToOrderedBytes(String val, int length, ByteBuffer reuse, CharsetEncoder encoder) { + Preconditions.checkArgument(encoder.charset().equals(StandardCharsets.UTF_8), + "Cannot use an encoder not using UTF_8 as it's Charset"); + ByteBuffer bytes = ByteBuffers.reuse(reuse, length); Arrays.fill(bytes.array(), 0, length, (byte) 0x00); if (val != null) { - int maxLength = Math.min(length, val.length()); - // We may truncate mid-character - encoder.encode(CharBuffer.wrap(val), bytes, true); + CharBuffer inputBuffer = CharBuffer.wrap(val); + encoder.encode(inputBuffer, bytes, true); } return bytes; } From 57e1462fb27a70d659c1bdaaac15c659cfa22bbb Mon Sep 17 00:00:00 2001 From: Russell_Spitzer Date: Tue, 8 Feb 2022 22:05:29 -0600 Subject: [PATCH 12/12] Update docs --- .../main/java/org/apache/iceberg/util/ZOrderByteUtils.java | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/core/src/main/java/org/apache/iceberg/util/ZOrderByteUtils.java b/core/src/main/java/org/apache/iceberg/util/ZOrderByteUtils.java index 3ec4c0f430f4..b008461ea8ca 100644 --- a/core/src/main/java/org/apache/iceberg/util/ZOrderByteUtils.java +++ b/core/src/main/java/org/apache/iceberg/util/ZOrderByteUtils.java @@ -30,10 +30,12 @@ * Within Z-Ordering the byte representations of objects being compared must be ordered, * this requires several types to be transformed when converted to bytes. The goal is to * map object's whose byte representation are not lexicographically ordered into representations - * that are lexicographically ordered. + * that are lexicographically ordered. Bytes produced should be compared lexicographically as + * unsigned bytes, big-endian. + *
<p>
* Most of these techniques are derived from * https://aws.amazon.com/blogs/database/z-order-indexing-for-multifaceted-queries-in-amazon-dynamodb-part-2/ - * + *
<p>
* Some implementation is taken from * https://github.com/apache/hbase/blob/master/hbase-common/src/main/java/org/apache/hadoop/hbase/util/OrderedBytes.java */