From 388743d824a04177bbf1f9d74d58e1a77b40511f Mon Sep 17 00:00:00 2001 From: Aditya Subrahmanyan Date: Mon, 5 Jan 2026 22:02:36 +0000 Subject: [PATCH] fix: Use correct byte representation for decimal hashing Spec states that: "Decimal values are hashed using the minimum number of bytes required to hold the unscaled value as a two's complement big-endian". Prior to this fix, we would incorrectly consume leading 0xFF bytes and hash them. Now, we only consume the bytes starting with the one that is used to preserve the sign, and everything that follows that byte. --- crates/iceberg/src/transform/bucket.rs | 43 +++++++++++++++++++++++--- 1 file changed, 39 insertions(+), 4 deletions(-) diff --git a/crates/iceberg/src/transform/bucket.rs b/crates/iceberg/src/transform/bucket.rs index 8807fb1f79..e6786a70ca 100644 --- a/crates/iceberg/src/transform/bucket.rs +++ b/crates/iceberg/src/transform/bucket.rs @@ -78,12 +78,26 @@ impl Bucket { /// ref: https://iceberg.apache.org/spec/#appendix-b-32-bit-hash-requirements #[inline] fn hash_decimal(v: i128) -> i32 { + if v == 0 { + return Self::hash_bytes(&[0]); + } + let bytes = v.to_be_bytes(); - if let Some(start) = bytes.iter().position(|&x| x != 0) { - Self::hash_bytes(&bytes[start..]) + let start = if v > 0 { + // Positive: skip 0x00 unless next byte would appear negative + bytes + .windows(2) + .position(|w| w[0] != 0x00 || w[1] & 0x80 != 0) + .unwrap_or(15) } else { - Self::hash_bytes(&[0]) - } + // Negative: skip 0xFF only if next byte stays negative + bytes + .windows(2) + .position(|w| w[0] != 0xFF || w[1] & 0x80 == 0) + .unwrap_or(15) + }; + + Self::hash_bytes(&bytes[start..]) } /// def bucket_N(x) = (murmur3_x86_32_hash(x) & Integer.MAX_VALUE) % N @@ -790,6 +804,27 @@ mod test { ); } + #[test] + fn test_hash_decimal_with_negative_value() { + // Test cases from GitHub issue #1981 + assert_eq!(Bucket::hash_decimal(1), -463810133); + assert_eq!(Bucket::hash_decimal(-1), -43192051); + + // Additional test cases for edge case values + assert_eq!(Bucket::hash_decimal(0), Bucket::hash_decimal(0)); + assert_eq!(Bucket::hash_decimal(127), Bucket::hash_decimal(127)); + assert_eq!(Bucket::hash_decimal(-128), Bucket::hash_decimal(-128)); + + // Test minimum representation is used + // -1 should hash as [0xFF] not [0xFF, 0xFF, ..., 0xFF] + // 128 should hash as [0x00, 0x80] not [0x00, 0x00, ..., 0x80] + assert_eq!(Bucket::hash_decimal(128), Bucket::hash_bytes(&[0x00, 0x80])); + assert_eq!( + Bucket::hash_decimal(-129), + Bucket::hash_bytes(&[0xFF, 0x7F]) + ); + } + #[test] fn test_int_literal() { let bucket = Bucket::new(10);