From 9d9da4b24504ee8f40f2a05153c4f0d4fbab7312 Mon Sep 17 00:00:00 2001 From: forwardxu Date: Mon, 22 Dec 2025 10:36:27 +0800 Subject: [PATCH 1/6] fix: improve error handling for environment variable parsing --- rust/lance/src/dataset/scanner.rs | 111 ++++++++++++++++++++++++++---- 1 file changed, 96 insertions(+), 15 deletions(-) diff --git a/rust/lance/src/dataset/scanner.rs b/rust/lance/src/dataset/scanner.rs index fc5f636f5b1..08e91eb4652 100644 --- a/rust/lance/src/dataset/scanner.rs +++ b/rust/lance/src/dataset/scanner.rs @@ -101,34 +101,48 @@ use lance_datafusion::substrait::parse_substrait; use snafu::location; pub(crate) const BATCH_SIZE_FALLBACK: usize = 8192; + +/// Parse an environment variable as a specific type, logging a warning on parse failure. +fn parse_env_var(env_var_name: &str, default_hint: &str) -> Option +where + T::Err: std::fmt::Display, +{ + std::env::var(env_var_name) + .ok() + .and_then(|val| match val.parse() { + Ok(value) => Some(value), + Err(e) => { + log::warn!( + "Failed to parse {}='{}': {}. {}", + env_var_name, + val, + e, + default_hint + ); + None + } + }) +} + // For backwards compatibility / historical reasons we re-calculate the default batch size // on each call pub fn get_default_batch_size() -> Option { - std::env::var("LANCE_DEFAULT_BATCH_SIZE") - .map(|val| Some(val.parse().unwrap())) - .unwrap_or(None) + parse_env_var("LANCE_DEFAULT_BATCH_SIZE", "Using default.") } pub const LEGACY_DEFAULT_FRAGMENT_READAHEAD: usize = 4; -pub static DEFAULT_FRAGMENT_READAHEAD: LazyLock> = LazyLock::new(|| { - std::env::var("LANCE_DEFAULT_FRAGMENT_READAHEAD") - .map(|val| Some(val.parse().unwrap())) - .unwrap_or(None) -}); +pub static DEFAULT_FRAGMENT_READAHEAD: LazyLock> = + LazyLock::new(|| parse_env_var("LANCE_DEFAULT_FRAGMENT_READAHEAD", "Using default.")); -pub static DEFAULT_XTR_OVERFETCH: LazyLock = LazyLock::new(|| { - std::env::var("LANCE_XTR_OVERFETCH") - .map(|val| val.parse().unwrap()) - .unwrap_or(10) -}); +pub static DEFAULT_XTR_OVERFETCH: LazyLock = + LazyLock::new(|| parse_env_var("LANCE_XTR_OVERFETCH", "Using default value 10.").unwrap_or(10)); // We want to support ~256 concurrent reads to maximize throughput on cloud storage systems // Our typical page size is 8MiB (though not all reads are this large yet due to offset buffers, validity buffers, etc.) // So we want to support 256 * 8MiB ~= 2GiB of queued reads pub static DEFAULT_IO_BUFFER_SIZE: LazyLock = LazyLock::new(|| { - std::env::var("LANCE_DEFAULT_IO_BUFFER_SIZE") - .map(|val| val.parse().unwrap()) + parse_env_var("LANCE_DEFAULT_IO_BUFFER_SIZE", "Using default value 2GiB.") .unwrap_or(2 * 1024 * 1024 * 1024) }); @@ -4247,6 +4261,73 @@ mod test { assert_plan_node_equals, DatagenExt, FragmentCount, FragmentRowCount, ThrottledStoreWrapper, }; + #[test] + fn test_env_var_parsing() { + // Test that invalid environment variable values don't panic + + // Test invalid LANCE_DEFAULT_BATCH_SIZE + std::env::set_var("LANCE_DEFAULT_BATCH_SIZE", "not_a_number"); + let result = get_default_batch_size(); + assert_eq!(result, None, "Should return None for invalid batch size"); + + // Test valid LANCE_DEFAULT_BATCH_SIZE + std::env::set_var("LANCE_DEFAULT_BATCH_SIZE", "2048"); + let result = get_default_batch_size(); + assert_eq!(result, Some(2048), "Should parse valid batch size"); + + // Test unset LANCE_DEFAULT_BATCH_SIZE + std::env::remove_var("LANCE_DEFAULT_BATCH_SIZE"); + let result = get_default_batch_size(); + assert_eq!(result, None, "Should return None when env var is not set"); + } + + #[test] + fn test_parse_env_var() { + // Test parse_env_var with different types to ensure full coverage + + // Test with a unique env var name to avoid conflicts + let test_var = "LANCE_TEST_PARSE_ENV_VAR_USIZE"; + + // Test valid usize parsing + std::env::set_var(test_var, "12345"); + let result: Option = parse_env_var(test_var, "Using default."); + assert_eq!(result, Some(12345)); + + // Test invalid usize parsing (triggers warning log) + std::env::set_var(test_var, "not_a_number"); + let result: Option = parse_env_var(test_var, "Using default."); + assert_eq!(result, None); + + // Test unset env var + std::env::remove_var(test_var); + let result: Option = parse_env_var(test_var, "Using default."); + assert_eq!(result, None); + + // Test with u32 type + let test_var_u32 = "LANCE_TEST_PARSE_ENV_VAR_U32"; + std::env::set_var(test_var_u32, "42"); + let result: Option = parse_env_var(test_var_u32, "Using default value."); + assert_eq!(result, Some(42)); + + std::env::set_var(test_var_u32, "invalid"); + let result: Option = parse_env_var(test_var_u32, "Using default value."); + assert_eq!(result, None); + + std::env::remove_var(test_var_u32); + + // Test with u64 type + let test_var_u64 = "LANCE_TEST_PARSE_ENV_VAR_U64"; + std::env::set_var(test_var_u64, "9999999999"); + let result: Option = parse_env_var(test_var_u64, "Using default value."); + assert_eq!(result, Some(9999999999)); + + std::env::set_var(test_var_u64, "-1"); + let result: Option = parse_env_var(test_var_u64, "Using default value."); + assert_eq!(result, None); + + std::env::remove_var(test_var_u64); + } + async fn make_binary_vector_dataset() -> Result<(TempStrDir, Dataset)> { let tmp_dir = TempStrDir::default(); let dim = 4; From e2640bd476be11289e6637f1d675954b6257373b Mon Sep 17 00:00:00 2001 From: forwardxu Date: Fri, 26 Dec 2025 16:13:35 +0800 Subject: [PATCH 2/6] fix: improve error handling for environment variable parsing --- rust/lance/src/dataset/scanner.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/rust/lance/src/dataset/scanner.rs b/rust/lance/src/dataset/scanner.rs index 08e91eb4652..eb9f3672ee9 100644 --- a/rust/lance/src/dataset/scanner.rs +++ b/rust/lance/src/dataset/scanner.rs @@ -103,7 +103,7 @@ use snafu::location; pub(crate) const BATCH_SIZE_FALLBACK: usize = 8192; /// Parse an environment variable as a specific type, logging a warning on parse failure. -fn parse_env_var(env_var_name: &str, default_hint: &str) -> Option +fn parse_env_var(env_var_name: &str, default_val: &str) -> Option where T::Err: std::fmt::Display, { @@ -113,11 +113,11 @@ where Ok(value) => Some(value), Err(e) => { log::warn!( - "Failed to parse {}='{}': {}. {}", + "Failed to parse the environment variable {}='{}': {}, the default value is: {}.", env_var_name, val, e, - default_hint + default_val ); None } From 8b78a436990f2f14f1e248923d9d0502fdbd110b Mon Sep 17 00:00:00 2001 From: forwardxu Date: Fri, 16 Jan 2026 16:14:01 +0800 Subject: [PATCH 3/6] fix: improve error handling for environment variable parsing --- rust/lance/src/dataset/scanner.rs | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/rust/lance/src/dataset/scanner.rs b/rust/lance/src/dataset/scanner.rs index eb9f3672ee9..4380cc0d339 100644 --- a/rust/lance/src/dataset/scanner.rs +++ b/rust/lance/src/dataset/scanner.rs @@ -101,6 +101,7 @@ use lance_datafusion::substrait::parse_substrait; use snafu::location; pub(crate) const BATCH_SIZE_FALLBACK: usize = 8192; +const BATCH_SIZE_FALLBACK_STR: &str = "8192"; /// Parse an environment variable as a specific type, logging a warning on parse failure. fn parse_env_var(env_var_name: &str, default_val: &str) -> Option @@ -127,23 +128,36 @@ where // For backwards compatibility / historical reasons we re-calculate the default batch size // on each call pub fn get_default_batch_size() -> Option { - parse_env_var("LANCE_DEFAULT_BATCH_SIZE", "Using default.") + parse_env_var("LANCE_DEFAULT_BATCH_SIZE", BATCH_SIZE_FALLBACK_STR) } pub const LEGACY_DEFAULT_FRAGMENT_READAHEAD: usize = 4; +const LEGACY_DEFAULT_FRAGMENT_READAHEAD_STR: &str = "4"; -pub static DEFAULT_FRAGMENT_READAHEAD: LazyLock> = - LazyLock::new(|| parse_env_var("LANCE_DEFAULT_FRAGMENT_READAHEAD", "Using default.")); +pub static DEFAULT_FRAGMENT_READAHEAD: LazyLock> = LazyLock::new(|| { + parse_env_var( + "LANCE_DEFAULT_FRAGMENT_READAHEAD", + LEGACY_DEFAULT_FRAGMENT_READAHEAD_STR, + ) +}); + +const DEFAULT_XTR_OVERFETCH_VALUE: u32 = 10; +const DEFAULT_XTR_OVERFETCH_STR: &str = "10"; -pub static DEFAULT_XTR_OVERFETCH: LazyLock = - LazyLock::new(|| parse_env_var("LANCE_XTR_OVERFETCH", "Using default value 10.").unwrap_or(10)); +pub static DEFAULT_XTR_OVERFETCH: LazyLock = LazyLock::new(|| { + parse_env_var("LANCE_XTR_OVERFETCH", DEFAULT_XTR_OVERFETCH_STR) + .unwrap_or(DEFAULT_XTR_OVERFETCH_VALUE) +}); // We want to support ~256 concurrent reads to maximize throughput on cloud storage systems // Our typical page size is 8MiB (though not all reads are this large yet due to offset buffers, validity buffers, etc.) // So we want to support 256 * 8MiB ~= 2GiB of queued reads +const DEFAULT_IO_BUFFER_SIZE_VALUE: u64 = 2 * 1024 * 1024 * 1024; +const DEFAULT_IO_BUFFER_SIZE_STR: &str = "2147483648"; + pub static DEFAULT_IO_BUFFER_SIZE: LazyLock = LazyLock::new(|| { - parse_env_var("LANCE_DEFAULT_IO_BUFFER_SIZE", "Using default value 2GiB.") - .unwrap_or(2 * 1024 * 1024 * 1024) + parse_env_var("LANCE_DEFAULT_IO_BUFFER_SIZE", DEFAULT_IO_BUFFER_SIZE_STR) + .unwrap_or(DEFAULT_IO_BUFFER_SIZE_VALUE) }); /// Defines an ordering for a single column From 9a32bf850b272f118c961fb0de8a815f222292da Mon Sep 17 00:00:00 2001 From: forwardxu Date: Fri, 16 Jan 2026 16:19:05 +0800 Subject: [PATCH 4/6] fix: improve error handling for environment variable parsing --- rust/lance/src/dataset/scanner.rs | 27 ++++++--------------------- 1 file changed, 6 insertions(+), 21 deletions(-) diff --git a/rust/lance/src/dataset/scanner.rs b/rust/lance/src/dataset/scanner.rs index 4380cc0d339..a62c8593898 100644 --- a/rust/lance/src/dataset/scanner.rs +++ b/rust/lance/src/dataset/scanner.rs @@ -101,7 +101,6 @@ use lance_datafusion::substrait::parse_substrait; use snafu::location; pub(crate) const BATCH_SIZE_FALLBACK: usize = 8192; -const BATCH_SIZE_FALLBACK_STR: &str = "8192"; /// Parse an environment variable as a specific type, logging a warning on parse failure. fn parse_env_var(env_var_name: &str, default_val: &str) -> Option @@ -128,36 +127,22 @@ where // For backwards compatibility / historical reasons we re-calculate the default batch size // on each call pub fn get_default_batch_size() -> Option { - parse_env_var("LANCE_DEFAULT_BATCH_SIZE", BATCH_SIZE_FALLBACK_STR) + parse_env_var("LANCE_DEFAULT_BATCH_SIZE", "8192") } pub const LEGACY_DEFAULT_FRAGMENT_READAHEAD: usize = 4; -const LEGACY_DEFAULT_FRAGMENT_READAHEAD_STR: &str = "4"; -pub static DEFAULT_FRAGMENT_READAHEAD: LazyLock> = LazyLock::new(|| { - parse_env_var( - "LANCE_DEFAULT_FRAGMENT_READAHEAD", - LEGACY_DEFAULT_FRAGMENT_READAHEAD_STR, - ) -}); - -const DEFAULT_XTR_OVERFETCH_VALUE: u32 = 10; -const DEFAULT_XTR_OVERFETCH_STR: &str = "10"; +pub static DEFAULT_FRAGMENT_READAHEAD: LazyLock> = + LazyLock::new(|| parse_env_var("LANCE_DEFAULT_FRAGMENT_READAHEAD", "4")); -pub static DEFAULT_XTR_OVERFETCH: LazyLock = LazyLock::new(|| { - parse_env_var("LANCE_XTR_OVERFETCH", DEFAULT_XTR_OVERFETCH_STR) - .unwrap_or(DEFAULT_XTR_OVERFETCH_VALUE) -}); +pub static DEFAULT_XTR_OVERFETCH: LazyLock = + LazyLock::new(|| parse_env_var("LANCE_XTR_OVERFETCH", "10").unwrap_or(10)); // We want to support ~256 concurrent reads to maximize throughput on cloud storage systems // Our typical page size is 8MiB (though not all reads are this large yet due to offset buffers, validity buffers, etc.) // So we want to support 256 * 8MiB ~= 2GiB of queued reads -const DEFAULT_IO_BUFFER_SIZE_VALUE: u64 = 2 * 1024 * 1024 * 1024; -const DEFAULT_IO_BUFFER_SIZE_STR: &str = "2147483648"; - pub static DEFAULT_IO_BUFFER_SIZE: LazyLock = LazyLock::new(|| { - parse_env_var("LANCE_DEFAULT_IO_BUFFER_SIZE", DEFAULT_IO_BUFFER_SIZE_STR) - .unwrap_or(DEFAULT_IO_BUFFER_SIZE_VALUE) + parse_env_var("LANCE_DEFAULT_IO_BUFFER_SIZE", "2147483648").unwrap_or(2 * 1024 * 1024 * 1024) }); /// Defines an ordering for a single column From 0ea3d8f350ab138660388790dfeb094b2ca567b3 Mon Sep 17 00:00:00 2001 From: forwardxu Date: Fri, 16 Jan 2026 17:24:31 +0800 Subject: [PATCH 5/6] fix: improve error handling for environment variable parsing --- rust/lance/src/dataset/scanner.rs | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/rust/lance/src/dataset/scanner.rs b/rust/lance/src/dataset/scanner.rs index a62c8593898..68203caa111 100644 --- a/rust/lance/src/dataset/scanner.rs +++ b/rust/lance/src/dataset/scanner.rs @@ -127,13 +127,17 @@ where // For backwards compatibility / historical reasons we re-calculate the default batch size // on each call pub fn get_default_batch_size() -> Option { - parse_env_var("LANCE_DEFAULT_BATCH_SIZE", "8192") + parse_env_var("LANCE_DEFAULT_BATCH_SIZE", &BATCH_SIZE_FALLBACK.to_string()) } pub const LEGACY_DEFAULT_FRAGMENT_READAHEAD: usize = 4; -pub static DEFAULT_FRAGMENT_READAHEAD: LazyLock> = - LazyLock::new(|| parse_env_var("LANCE_DEFAULT_FRAGMENT_READAHEAD", "4")); +pub static DEFAULT_FRAGMENT_READAHEAD: LazyLock> = LazyLock::new(|| { + parse_env_var( + "LANCE_DEFAULT_FRAGMENT_READAHEAD", + &LEGACY_DEFAULT_FRAGMENT_READAHEAD.to_string(), + ) +}); pub static DEFAULT_XTR_OVERFETCH: LazyLock = LazyLock::new(|| parse_env_var("LANCE_XTR_OVERFETCH", "10").unwrap_or(10)); From 5d3e01ccc5c7b50200ea40812f5fb0c64b6fae6c Mon Sep 17 00:00:00 2001 From: forwardxu Date: Fri, 16 Jan 2026 17:55:54 +0800 Subject: [PATCH 6/6] fix: improve error handling for environment variable parsing --- rust/lance/src/dataset/scanner.rs | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/rust/lance/src/dataset/scanner.rs b/rust/lance/src/dataset/scanner.rs index 68203caa111..78079ac30d0 100644 --- a/rust/lance/src/dataset/scanner.rs +++ b/rust/lance/src/dataset/scanner.rs @@ -139,14 +139,27 @@ pub static DEFAULT_FRAGMENT_READAHEAD: LazyLock> = LazyLock::new(| ) }); -pub static DEFAULT_XTR_OVERFETCH: LazyLock = - LazyLock::new(|| parse_env_var("LANCE_XTR_OVERFETCH", "10").unwrap_or(10)); +const DEFAULT_XTR_OVERFETCH_VALUE: u32 = 10; + +pub static DEFAULT_XTR_OVERFETCH: LazyLock = LazyLock::new(|| { + parse_env_var( + "LANCE_XTR_OVERFETCH", + &DEFAULT_XTR_OVERFETCH_VALUE.to_string(), + ) + .unwrap_or(DEFAULT_XTR_OVERFETCH_VALUE) +}); // We want to support ~256 concurrent reads to maximize throughput on cloud storage systems // Our typical page size is 8MiB (though not all reads are this large yet due to offset buffers, validity buffers, etc.) // So we want to support 256 * 8MiB ~= 2GiB of queued reads +const DEFAULT_IO_BUFFER_SIZE_VALUE: u64 = 2 * 1024 * 1024 * 1024; + pub static DEFAULT_IO_BUFFER_SIZE: LazyLock = LazyLock::new(|| { - parse_env_var("LANCE_DEFAULT_IO_BUFFER_SIZE", "2147483648").unwrap_or(2 * 1024 * 1024 * 1024) + parse_env_var( + "LANCE_DEFAULT_IO_BUFFER_SIZE", + &DEFAULT_IO_BUFFER_SIZE_VALUE.to_string(), + ) + .unwrap_or(DEFAULT_IO_BUFFER_SIZE_VALUE) }); /// Defines an ordering for a single column