From 0a0854749954399b6a70c8369597cba34a7e53fe Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Thu, 26 Feb 2026 16:24:05 +0800 Subject: [PATCH 1/3] refactor(encoding): remove HLL cardinality paths --- Cargo.lock | 1 - Cargo.toml | 1 - rust/lance-encoding/Cargo.toml | 1 - rust/lance-encoding/src/data.rs | 4 +- rust/lance-encoding/src/previous/encoder.rs | 23 +- rust/lance-encoding/src/statistics.rs | 244 +------------------- 6 files changed, 19 insertions(+), 255 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index f7840609716..f27cfb5ba37 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4969,7 +4969,6 @@ dependencies = [ "fsst", "futures", "hex", - "hyperloglogplus", "itertools 0.13.0", "lance-arrow", "lance-bitpacking", diff --git a/Cargo.toml b/Cargo.toml index 33092f9c2b5..2b04e78ebfa 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -145,7 +145,6 @@ geo-traits = "0.3.0" geo-types = "0.7.16" http = "1.1.0" humantime = "2.2.0" -hyperloglogplus = { version = "0.4.1", features = ["const-loop"] } itertools = "0.13" jieba-rs = { version = "0.8.1", default-features = false } jsonb = { version = "0.5.3", default-features = false, features = ["databend"] } diff --git a/rust/lance-encoding/Cargo.toml b/rust/lance-encoding/Cargo.toml index c8f23f9b914..1b92ec5c7a4 100644 --- a/rust/lance-encoding/Cargo.toml +++ b/rust/lance-encoding/Cargo.toml @@ -30,7 +30,6 @@ itertools.workspace = true log.workspace = true num-traits.workspace = true prost.workspace = true -hyperloglogplus.workspace = true prost-types.workspace = true rand.workspace = true snafu.workspace = true diff --git a/rust/lance-encoding/src/data.rs b/rust/lance-encoding/src/data.rs index 8828673326f..803d6b26cce 100644 --- a/rust/lance-encoding/src/data.rs +++ b/rust/lance-encoding/src/data.rs @@ -61,8 +61,8 @@ impl AllNullDataBlock { use std::collections::HashMap; -// `BlockInfo` stores the statistics of this `DataBlock`, such as `NullCount` for `NullableDataBlock`, -// `BitWidth` for `FixedWidthDataBlock`, `Cardinality` for all `DataBlock` +// `BlockInfo` stores the statistics of this `DataBlock`, such as `NullCount` for `NullableDataBlock` +// and `BitWidth` for `FixedWidthDataBlock`. #[derive(Debug, Clone)] pub struct BlockInfo(pub Arc>>>); diff --git a/rust/lance-encoding/src/previous/encoder.rs b/rust/lance-encoding/src/previous/encoder.rs index b6ab35722f7..7233ab884c7 100644 --- a/rust/lance-encoding/src/previous/encoder.rs +++ b/rust/lance-encoding/src/previous/encoder.rs @@ -1,11 +1,14 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright The Lance Authors -use std::{collections::HashMap, env, hash::RandomState, sync::Arc}; +use std::{ + collections::{HashMap, HashSet}, + env, + sync::Arc, +}; use arrow_array::{cast::AsArray, ArrayRef, UInt8Array}; use arrow_schema::DataType; -use hyperloglogplus::{HyperLogLog, HyperLogLogPlus}; use snafu::location; use crate::{ @@ -513,25 +516,21 @@ fn get_dict_encoding_threshold() -> u64 { // by applying a threshold on cardinality // returns true if cardinality < threshold but false if the total number of rows is less than the threshold // The choice to use 100 is just a heuristic for now -// hyperloglog is used for cardinality estimation -// error rate = 1.04 / sqrt(2^p), where p is the precision -// and error rate is 1.04 / sqrt(2^12) = 1.56% fn check_dict_encoding(arrays: &[ArrayRef], threshold: u64) -> bool { let num_total_rows = arrays.iter().map(|arr| arr.len()).sum::(); if num_total_rows < threshold as usize { return false; } - const PRECISION: u8 = 12; - - let mut hll: HyperLogLogPlus = - HyperLogLogPlus::new(PRECISION, RandomState::new()).unwrap(); + let threshold = usize::try_from(threshold).unwrap_or(usize::MAX); + let mut unique_values = HashSet::with_capacity(threshold.min(1024)); for arr in arrays { let string_array = arrow_array::cast::as_string_array(arr); for value in string_array.iter().flatten() { - hll.insert(value); - let estimated_cardinality = hll.count() as u64; - if estimated_cardinality >= threshold { + if !unique_values.contains(value) { + unique_values.insert(value.to_string()); + } + if unique_values.len() >= threshold { return false; } } diff --git a/rust/lance-encoding/src/statistics.rs b/rust/lance-encoding/src/statistics.rs index d3362965c2f..27b40315503 100644 --- a/rust/lance-encoding/src/statistics.rs +++ b/rust/lance-encoding/src/statistics.rs @@ -3,12 +3,10 @@ use std::{ fmt::{self}, - hash::{Hash, RandomState}, sync::Arc, }; use arrow_array::{cast::AsArray, types::UInt64Type, Array, ArrowPrimitiveType, UInt64Array}; -use hyperloglogplus::{HyperLogLog, HyperLogLogPlus}; use num_traits::PrimInt; use crate::data::{ @@ -20,7 +18,6 @@ use crate::data::{ pub enum Stat { BitWidth, DataSize, - Cardinality, FixedSize, NullCount, MaxLength, @@ -33,7 +30,6 @@ impl fmt::Debug for Stat { match self { Self::BitWidth => write!(f, "BitWidth"), Self::DataSize => write!(f, "DataSize"), - Self::Cardinality => write!(f, "Cardinality"), Self::FixedSize => write!(f, "FixedSize"), Self::NullCount => write!(f, "NullCount"), Self::MaxLength => write!(f, "MaxLength"), @@ -186,31 +182,12 @@ impl GetStat for NullableDataBlock { impl GetStat for VariableWidthBlock { fn get_stat(&self, stat: Stat) -> Option> { - { - let block_info = self.block_info.0.read().unwrap(); - if block_info.is_empty() { - panic!("get_stat should be called after statistics are computed."); - } - if let Some(stat_value) = block_info.get(&stat) { - return Some(stat_value.clone()); - } - } - - if stat != Stat::Cardinality { - return None; - } + let block_info = self.block_info.0.read().unwrap(); - let computed = self.compute_cardinality(); - let mut block_info = self.block_info.0.write().unwrap(); if block_info.is_empty() { panic!("get_stat should be called after statistics are computed."); } - Some( - block_info - .entry(stat) - .or_insert_with(|| computed.clone()) - .clone(), - ) + block_info.get(&stat).cloned() } } @@ -230,55 +207,6 @@ impl GetStat for FixedSizeListBlock { } impl VariableWidthBlock { - // Caveat: the computation here assumes VariableWidthBlock.offsets maps directly to VariableWidthBlock.data - // without any adjustment(for example, no null_adjustment for offsets) - fn compute_cardinality(&self) -> Arc { - const PRECISION: u8 = 4; - // The default hasher (currently sip hash 1-3) does not seem to give good results - // with HLL. - // - // In particular, when using randomly generated 12-byte strings, the HLL count was - // suggested a cardinality of 500 (out of 1000 unique items and hashes) at least 10% - // of the time. - // - // Using xxhash3 consistently gives better results. - let mut hll: HyperLogLogPlus<&[u8], xxhash_rust::xxh3::Xxh3Builder> = - HyperLogLogPlus::new(PRECISION, xxhash_rust::xxh3::Xxh3Builder::default()).unwrap(); - - match self.bits_per_offset { - 32 => { - let offsets_ref = self.offsets.borrow_to_typed_slice::(); - let offsets: &[u32] = offsets_ref.as_ref(); - - offsets - .iter() - .zip(offsets.iter().skip(1)) - .for_each(|(&start, &end)| { - hll.insert(&self.data[start as usize..end as usize]); - }); - let cardinality = hll.count() as u64; - Arc::new(UInt64Array::from(vec![cardinality])) - } - 64 => { - let offsets_ref = self.offsets.borrow_to_typed_slice::(); - let offsets: &[u64] = offsets_ref.as_ref(); - - offsets - .iter() - .zip(offsets.iter().skip(1)) - .for_each(|(&start, &end)| { - hll.insert(&self.data[start as usize..end as usize]); - }); - - let cardinality = hll.count() as u64; - Arc::new(UInt64Array::from(vec![cardinality])) - } - _ => { - unreachable!("the bits_per_offset of VariableWidthBlock can only be 32 or 64") - } - } - } - fn max_length(&mut self) -> Arc { match self.bits_per_offset { 32 => { @@ -323,30 +251,13 @@ impl GetStat for AllNullDataBlock { impl GetStat for FixedWidthDataBlock { fn get_stat(&self, stat: Stat) -> Option> { - { - let block_info = self.block_info.0.read().unwrap(); - - if block_info.is_empty() { - panic!("get_stat should be called after statistics are computed."); - } + let block_info = self.block_info.0.read().unwrap(); - if let Some(stat_value) = block_info.get(&stat) { - return Some(stat_value.clone()); - } + if block_info.is_empty() { + panic!("get_stat should be called after statistics are computed."); } - if stat == Stat::Cardinality && (self.bits_per_value == 64 || self.bits_per_value == 128) { - let computed = self.cardinality(); - let mut block_info = self.block_info.0.write().unwrap(); - Some( - block_info - .entry(stat) - .or_insert_with(|| computed.clone()) - .clone(), - ) - } else { - None - } + block_info.get(&stat).cloned() } } @@ -405,39 +316,6 @@ impl FixedWidthDataBlock { } } - fn cardinality(&self) -> Arc { - match self.bits_per_value { - 64 => { - let u64_slice_ref = self.data.borrow_to_typed_slice::(); - let u64_slice = u64_slice_ref.as_ref(); - - const PRECISION: u8 = 4; - let mut hll: HyperLogLogPlus = - HyperLogLogPlus::new(PRECISION, xxhash_rust::xxh3::Xxh3Builder::default()) - .unwrap(); - for val in u64_slice { - hll.insert(val); - } - let cardinality = hll.count() as u64; - Arc::new(UInt64Array::from(vec![cardinality])) - } - 128 => { - let u128_slice_ref = self.data.borrow_to_typed_slice::(); - let u128_slice = u128_slice_ref.as_ref(); - - const PRECISION: u8 = 4; - let mut hll: HyperLogLogPlus = - HyperLogLogPlus::new(PRECISION, RandomState::new()).unwrap(); - for val in u128_slice { - hll.insert(val); - } - let cardinality = hll.count() as u64; - Arc::new(UInt64Array::from(vec![cardinality])) - } - _ => unreachable!(), - } - } - /// Counts the number of runs (consecutive sequences of equal values) in the data. /// /// A "run" is defined as a sequence of one or more consecutive equal values. @@ -1067,62 +945,6 @@ mod tests { assert!(block.get_stat(Stat::BitWidth).is_none(),); } - #[test] - fn test_cardinality_variable_width_datablock() { - let string_array = StringArray::from(vec![Some("hello"), Some("world")]); - let block = DataBlock::from_array(string_array); - let expected_cardinality = 2; - let actual_cardinality = block.expect_single_stat::(Stat::Cardinality); - assert_eq!(actual_cardinality, expected_cardinality,); - - let string_array = StringArray::from(vec![ - Some("to be named by variables"), - Some("to be passed as arguments to procedures"), - Some("to be returned as values of procedures"), - ]); - let block = DataBlock::from_array(string_array); - let expected_cardinality = 3; - let actual_cardinality = block.expect_single_stat::(Stat::Cardinality); - - assert_eq!(actual_cardinality, expected_cardinality,); - - let string_array = StringArray::from(vec![ - Some("Samuel Eilenberg"), - Some("Saunders Mac Lane"), - Some("Samuel Eilenberg"), - ]); - let block = DataBlock::from_array(string_array); - let expected_cardinality = 2; - let actual_cardinality = block.expect_single_stat::(Stat::Cardinality); - assert_eq!(actual_cardinality, expected_cardinality,); - - let string_array = LargeStringArray::from(vec![Some("hello"), Some("world")]); - let block = DataBlock::from_array(string_array); - let expected_cardinality = 2; - let actual_cardinality = block.expect_single_stat::(Stat::Cardinality); - assert_eq!(actual_cardinality, expected_cardinality,); - - let string_array = LargeStringArray::from(vec![ - Some("to be named by variables"), - Some("to be passed as arguments to procedures"), - Some("to be returned as values of procedures"), - ]); - let block = DataBlock::from_array(string_array); - let expected_cardinality = 3; - let actual_cardinality = block.expect_single_stat::(Stat::Cardinality); - assert_eq!(actual_cardinality, expected_cardinality,); - - let string_array = LargeStringArray::from(vec![ - Some("Samuel Eilenberg"), - Some("Saunders Mac Lane"), - Some("Samuel Eilenberg"), - ]); - let block = DataBlock::from_array(string_array); - let expected_cardinality = 2; - let actual_cardinality = block.expect_single_stat::(Stat::Cardinality); - assert_eq!(actual_cardinality, expected_cardinality,); - } - #[test] fn test_max_length_variable_width_datablock() { let string_array = StringArray::from(vec![Some("hello"), Some("world")]); @@ -1212,58 +1034,4 @@ mod tests { let actual_run_count = block.expect_single_stat::(Stat::RunCount); assert_eq!(actual_run_count, expected_run_count); } - - #[test] - fn test_fixed_width_cardinality_is_lazy() { - let int64_array = Int64Array::from(vec![1, 2, 3, 1, 2, 3, 1]); - let block = DataBlock::from_array(int64_array); - - let DataBlock::FixedWidth(fixed) = &block else { - panic!("Expected FixedWidth datablock"); - }; - - let info = fixed.block_info.0.read().unwrap(); - assert!(info.contains_key(&Stat::DataSize)); - assert!(info.contains_key(&Stat::BitWidth)); - assert!(!info.contains_key(&Stat::Cardinality)); - } - - #[test] - fn test_fixed_width_cardinality_computed_on_demand() { - let int64_array = Int64Array::from(vec![1, 2, 3, 1, 2, 3, 1]); - let block = DataBlock::from_array(int64_array); - - let cardinality = block.expect_single_stat::(Stat::Cardinality); - assert_eq!(cardinality, 3); - - let DataBlock::FixedWidth(fixed) = &block else { - panic!("Expected FixedWidth datablock"); - }; - - let info = fixed.block_info.0.read().unwrap(); - assert!(info.contains_key(&Stat::Cardinality)); - } - - #[test] - fn test_variable_width_cardinality_is_lazy() { - let string_array = StringArray::from(vec!["a", "b", "a"]); - let block = DataBlock::from_array(string_array); - - let DataBlock::VariableWidth(var) = &block else { - panic!("Expected VariableWidth datablock"); - }; - - { - let info = var.block_info.0.read().unwrap(); - assert!(info.contains_key(&Stat::DataSize)); - assert!(info.contains_key(&Stat::MaxLength)); - assert!(!info.contains_key(&Stat::Cardinality)); - } - - let cardinality = block.expect_single_stat::(Stat::Cardinality); - assert_eq!(cardinality, 2); - - let info = var.block_info.0.read().unwrap(); - assert!(info.contains_key(&Stat::Cardinality)); - } } From 4345b287f1398ef52733d60ac658d12f3d50ab6d Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Thu, 26 Feb 2026 16:26:00 +0800 Subject: [PATCH 2/3] docs(encoding): clarify legacy dict decision comment --- rust/lance-encoding/src/previous/encoder.rs | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/rust/lance-encoding/src/previous/encoder.rs b/rust/lance-encoding/src/previous/encoder.rs index 7233ab884c7..03cddfe2ecd 100644 --- a/rust/lance-encoding/src/previous/encoder.rs +++ b/rust/lance-encoding/src/previous/encoder.rs @@ -512,10 +512,15 @@ fn get_dict_encoding_threshold() -> u64 { .unwrap_or(100) } -// check whether we want to use dictionary encoding or not -// by applying a threshold on cardinality -// returns true if cardinality < threshold but false if the total number of rows is less than the threshold -// The choice to use 100 is just a heuristic for now +// Check whether dictionary encoding is worthwhile for legacy UTF8 pages. +// +// We track exact unique values until `threshold` and bail out early once we hit +// the limit. This avoids building a full set for high-cardinality inputs while +// keeping the decision deterministic. +// +// Returns true only when: +// 1. total row count is at least `threshold`, and +// 2. exact distinct count is strictly less than `threshold`. fn check_dict_encoding(arrays: &[ArrayRef], threshold: u64) -> bool { let num_total_rows = arrays.iter().map(|arr| arr.len()).sum::(); if num_total_rows < threshold as usize { From ba72d2838dbbebad52ecf0c27bc3e1678d1e7ad7 Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Fri, 27 Feb 2026 00:33:15 +0800 Subject: [PATCH 3/3] ci: update lockfiles after hll removal --- java/lance-jni/Cargo.lock | 1 - python/Cargo.lock | 1 - 2 files changed, 2 deletions(-) diff --git a/java/lance-jni/Cargo.lock b/java/lance-jni/Cargo.lock index a4c138413fd..95723eb3c20 100644 --- a/java/lance-jni/Cargo.lock +++ b/java/lance-jni/Cargo.lock @@ -3550,7 +3550,6 @@ dependencies = [ "fsst", "futures", "hex", - "hyperloglogplus", "itertools 0.13.0", "lance-arrow", "lance-bitpacking", diff --git a/python/Cargo.lock b/python/Cargo.lock index a8a97502f43..089c7211513 100644 --- a/python/Cargo.lock +++ b/python/Cargo.lock @@ -4109,7 +4109,6 @@ dependencies = [ "fsst", "futures", "hex", - "hyperloglogplus", "itertools 0.13.0", "lance-arrow", "lance-bitpacking",