From 95f55d72a407f7effb1d3476d228c4df27cb2b3c Mon Sep 17 00:00:00 2001 From: Miroslav Drbal Date: Wed, 11 Feb 2026 14:30:56 +0100 Subject: [PATCH 1/3] feat: make geodatafusion/geoarrow optional via `geo` feature flag Make geospatial dependencies (geodatafusion, geoarrow-array, geoarrow-schema, geo-traits, geo-types) optional across the lance crate stack via a new `geo` feature flag. This allows consumers that don't need spatial indexing or geospatial UDFs to avoid pulling in ~40 transitive dependencies and the associated runtime overhead of registering geo UDFs in every DataFusion SessionContext. The `geo` feature is enabled by default in the top-level `lance` crate, preserving backward compatibility. Consumers can opt out with `default-features = false`. Crates modified: - lance-geo: all geo deps made optional, bbox module gated - lance-datafusion: lance-geo made optional, geo UDF registration gated - lance-index: lance-geo + geoarrow deps optional, rtree module and RTreeIndexPlugin registration gated, geo bench requires feature - lance: geo feature added to defaults, propagates to sub-crates Co-Authored-By: Claude Opus 4.6 --- rust/lance-datafusion/Cargo.toml | 3 ++- rust/lance-datafusion/src/udf.rs | 1 + rust/lance-geo/Cargo.toml | 13 ++++++++----- rust/lance-geo/src/lib.rs | 4 ++++ rust/lance-index/Cargo.toml | 10 ++++++---- rust/lance-index/src/registry.rs | 5 ++++- rust/lance-index/src/scalar.rs | 1 + rust/lance/Cargo.toml | 5 +++-- rust/lance/src/dataset/tests/mod.rs | 1 + 9 files changed, 30 insertions(+), 13 deletions(-) diff --git a/rust/lance-datafusion/Cargo.toml b/rust/lance-datafusion/Cargo.toml index 47315ce4712..b0aa7a67d67 100644 --- a/rust/lance-datafusion/Cargo.toml +++ b/rust/lance-datafusion/Cargo.toml @@ -27,7 +27,7 @@ jsonb = {workspace = true} lance-arrow.workspace = true lance-core = {workspace = true, features = ["datafusion"]} lance-datagen.workspace = true -lance-geo = {workspace = true} +lance-geo = {workspace = true, optional = true} chrono.workspace = true log.workspace = true pin-project.workspace = true @@ -40,6 +40,7 @@ tracing.workspace = true lance-datagen.workspace = true [features] +geo = ["dep:lance-geo"] substrait = ["dep:datafusion-substrait"] [lints] diff --git a/rust/lance-datafusion/src/udf.rs b/rust/lance-datafusion/src/udf.rs index 9117b67f82e..224f3f5c7e0 100644 --- a/rust/lance-datafusion/src/udf.rs +++ b/rust/lance-datafusion/src/udf.rs @@ -27,6 +27,7 @@ pub fn register_functions(ctx: &SessionContext) { ctx.register_udf(json::json_array_contains_udf()); ctx.register_udf(json::json_array_length_udf()); // GEO functions + #[cfg(feature = "geo")] lance_geo::register_functions(ctx); } diff --git a/rust/lance-geo/Cargo.toml b/rust/lance-geo/Cargo.toml index b8e1103027b..d8a1decfccb 100644 --- a/rust/lance-geo/Cargo.toml +++ b/rust/lance-geo/Cargo.toml @@ -13,13 +13,16 @@ description = "Lance's geospatial extension providing geospatial UDFs." [dependencies] datafusion.workspace = true -geoarrow-array.workspace = true -geoarrow-schema.workspace = true -geodatafusion.workspace = true -geo-traits.workspace = true -geo-types.workspace = true +geoarrow-array = { workspace = true, optional = true } +geoarrow-schema = { workspace = true, optional = true } +geodatafusion = { workspace = true, optional = true } +geo-traits = { workspace = true, optional = true } +geo-types = { workspace = true, optional = true } lance-core.workspace = true serde.workspace = true +[features] +geo = ["dep:geoarrow-array", "dep:geoarrow-schema", "dep:geodatafusion", "dep:geo-traits", "dep:geo-types"] + [lints] workspace = true diff --git a/rust/lance-geo/src/lib.rs b/rust/lance-geo/src/lib.rs index 8eb562333b2..238ce3ee004 100644 --- a/rust/lance-geo/src/lib.rs +++ b/rust/lance-geo/src/lib.rs @@ -3,8 +3,12 @@ use datafusion::prelude::SessionContext; +#[cfg(feature = "geo")] pub mod bbox; pub fn register_functions(ctx: &SessionContext) { + #[cfg(feature = "geo")] geodatafusion::register(ctx); + #[cfg(not(feature = "geo"))] + let _ = ctx; } diff --git a/rust/lance-index/Cargo.toml b/rust/lance-index/Cargo.toml index 70262b8de92..3ce4e45ed34 100644 --- a/rust/lance-index/Cargo.toml +++ b/rust/lance-index/Cargo.toml @@ -30,9 +30,9 @@ deepsize.workspace = true dirs.workspace = true fst.workspace = true futures.workspace = true -geoarrow-array = { workspace = true } -geoarrow-schema = { workspace = true } -geo-types = { workspace = true } +geoarrow-array = { workspace = true, optional = true } +geoarrow-schema = { workspace = true, optional = true } +geo-types = { workspace = true, optional = true } half.workspace = true itertools.workspace = true jieba-rs = { workspace = true, optional = true } @@ -42,7 +42,7 @@ lance-core.workspace = true lance-datafusion.workspace = true lance-encoding.workspace = true lance-file.workspace = true -lance-geo.workspace = true +lance-geo = { workspace = true, optional = true } lance-io.workspace = true lance-linalg.workspace = true lance-table.workspace = true @@ -88,6 +88,7 @@ rstest.workspace = true chrono.workspace = true [features] +geo = ["dep:lance-geo", "lance-geo/geo", "dep:geoarrow-array", "dep:geoarrow-schema", "dep:geo-types"] protoc = ["dep:protobuf-src"] tokenizer-lindera = ["dep:lindera", "dep:lindera-tantivy"] tokenizer-jieba = ["dep:jieba-rs"] @@ -162,6 +163,7 @@ harness = false [[bench]] name = "geo" harness = false +required-features = ["geo"] [lints] workspace = true diff --git a/rust/lance-index/src/registry.rs b/rust/lance-index/src/registry.rs index 82d24649049..f5e9209088e 100644 --- a/rust/lance-index/src/registry.rs +++ b/rust/lance-index/src/registry.rs @@ -10,10 +10,12 @@ use crate::{ scalar::{ bitmap::BitmapIndexPlugin, bloomfilter::BloomFilterIndexPlugin, btree::BTreeIndexPlugin, inverted::InvertedIndexPlugin, json::JsonIndexPlugin, label_list::LabelListIndexPlugin, - ngram::NGramIndexPlugin, registry::ScalarIndexPlugin, rtree::RTreeIndexPlugin, + ngram::NGramIndexPlugin, registry::ScalarIndexPlugin, zonemap::ZoneMapIndexPlugin, }, }; +#[cfg(feature = "geo")] +use crate::scalar::rtree::RTreeIndexPlugin; /// A registry of index plugins pub struct IndexPluginRegistry { @@ -62,6 +64,7 @@ impl IndexPluginRegistry { registry.add_plugin::(); registry.add_plugin::(); registry.add_plugin::(); + #[cfg(feature = "geo")] registry.add_plugin::(); let registry = Arc::new(registry); diff --git a/rust/lance-index/src/scalar.rs b/rust/lance-index/src/scalar.rs index 6d07b5b8218..13562bc7587 100644 --- a/rust/lance-index/src/scalar.rs +++ b/rust/lance-index/src/scalar.rs @@ -39,6 +39,7 @@ pub mod label_list; pub mod lance_format; pub mod ngram; pub mod registry; +#[cfg(feature = "geo")] pub mod rtree; pub mod zoned; pub mod zonemap; diff --git a/rust/lance/Cargo.toml b/rust/lance/Cargo.toml index 25c30230b35..4207d62e62d 100644 --- a/rust/lance/Cargo.toml +++ b/rust/lance/Cargo.toml @@ -26,7 +26,7 @@ lance-linalg = { workspace = true } lance-index = { workspace = true } lance-namespace = { workspace = true } lance-table = { workspace = true } -lance-geo = { workspace = true } +lance-geo = { workspace = true, optional = true } arrow-arith = { workspace = true } arrow-array = { workspace = true } arrow-buffer = { workspace = true } @@ -116,7 +116,7 @@ geo-types = { workspace = true } [features] -default = ["aws", "azure", "gcp", "oss", "huggingface"] +default = ["aws", "azure", "gcp", "oss", "huggingface", "geo"] fp16kernels = ["lance-linalg/fp16kernels"] # Prevent dynamic linking of lzma, which comes from datafusion cli = ["dep:clap", "lzma-sys/static"] @@ -134,6 +134,7 @@ gcp = ["lance-io/gcp"] azure = ["lance-io/azure"] oss = ["lance-io/oss"] huggingface = ["lance-io/huggingface"] +geo = ["dep:lance-geo", "lance-geo/geo", "lance-datafusion/geo", "lance-index/geo"] # Enable slow integration tests (disabled by default in CI) slow_tests = [] diff --git a/rust/lance/src/dataset/tests/mod.rs b/rust/lance/src/dataset/tests/mod.rs index a1197dcc198..66ead418f5a 100644 --- a/rust/lance/src/dataset/tests/mod.rs +++ b/rust/lance/src/dataset/tests/mod.rs @@ -3,6 +3,7 @@ mod dataset_common; mod dataset_concurrency_store; +#[cfg(feature = "geo")] mod dataset_geo; mod dataset_index; mod dataset_io; From 435404e9914d2ece429393d0ae22d75cc82d8f7c Mon Sep 17 00:00:00 2001 From: Miroslav Drbal Date: Thu, 12 Feb 2026 13:21:43 +0100 Subject: [PATCH 2/3] feat: graceful handling when geo feature is disabled When the `geo` feature is not enabled, users now get actionable errors and graceful degradation instead of confusing failures: - Registry: "No plugin for name 'rtree'" error now suggests enabling the `geo` feature - scalar_index_info: skips unsupported geo indices with a warning, allowing non-geo filtered scans to work on datasets with RTree indices - remap_index: drops geo indices gracefully during compaction instead of failing the entire operation - merge_indices: skips index merge for unavailable plugins instead of blocking append operations - UDFs: registers stub ST_* functions that return clear "enable geo feature" errors instead of DataFusion's generic "Unknown function" - Dead code: gates GeoQuery, RelationQuery, and GeoQueryParser behind #[cfg(feature = "geo")] since they are unreachable without it Co-Authored-By: Claude Opus 4.6 --- rust/lance-datafusion/src/udf.rs | 44 +++++++++++++++++++++++ rust/lance-index/src/registry.rs | 14 ++++++-- rust/lance-index/src/scalar.rs | 3 ++ rust/lance-index/src/scalar/expression.rs | 9 +++-- rust/lance/src/index.rs | 27 ++++++++++++-- rust/lance/src/index/append.rs | 16 +++++++-- 6 files changed, 102 insertions(+), 11 deletions(-) diff --git a/rust/lance-datafusion/src/udf.rs b/rust/lance-datafusion/src/udf.rs index 224f3f5c7e0..7d20c74071e 100644 --- a/rust/lance-datafusion/src/udf.rs +++ b/rust/lance-datafusion/src/udf.rs @@ -29,6 +29,50 @@ pub fn register_functions(ctx: &SessionContext) { // GEO functions #[cfg(feature = "geo")] lance_geo::register_functions(ctx); + #[cfg(not(feature = "geo"))] + register_geo_stub_functions(ctx); +} + +/// When the `geo` feature is disabled, register stub UDFs for spatial SQL functions +/// so that users get a clear error mentioning the feature flag instead of +/// DataFusion's generic "Unknown function" error. +#[cfg(not(feature = "geo"))] +fn register_geo_stub_functions(ctx: &SessionContext) { + let geo_funcs = [ + "st_intersects", + "st_contains", + "st_within", + "st_touches", + "st_crosses", + "st_overlaps", + "st_covers", + "st_coveredby", + "st_distance", + "st_area", + "st_length", + ]; + + for name in geo_funcs { + let func_name = name.to_string(); + let stub = Arc::new(make_scalar_function( + move |_args: &[ArrayRef]| { + Err(datafusion::error::DataFusionError::Plan(format!( + "Function '{}' requires the `geo` feature. \ + Rebuild with `--features geo` to enable geospatial functions.", + func_name + ))) + }, + vec![], + )); + + ctx.register_udf(create_udf( + name, + vec![DataType::Binary, DataType::Binary], + DataType::Boolean, + Volatility::Immutable, + stub, + )); + } } /// This method checks whether a string contains all specified tokens. The tokens are separated by diff --git a/rust/lance-index/src/registry.rs b/rust/lance-index/src/registry.rs index f5e9209088e..4ffd7dbbe88 100644 --- a/rust/lance-index/src/registry.rs +++ b/rust/lance-index/src/registry.rs @@ -80,9 +80,17 @@ impl IndexPluginRegistry { self.plugins .get(name) .map(|plugin| plugin.as_ref()) - .ok_or_else(|| Error::InvalidInput { - source: format!("No scalar index plugin found for name {}", name).into(), - location: location!(), + .ok_or_else(|| { + let hint = if name == "rtree" { + ". The 'rtree' index requires the `geo` feature. \ + Rebuild with `--features geo` to enable geospatial support" + } else { + "" + }; + Error::InvalidInput { + source: format!("No scalar index plugin found for name '{name}'{hint}").into(), + location: location!(), + } }) } diff --git a/rust/lance-index/src/scalar.rs b/rust/lance-index/src/scalar.rs index 13562bc7587..d5b37bd181a 100644 --- a/rust/lance-index/src/scalar.rs +++ b/rust/lance-index/src/scalar.rs @@ -736,6 +736,7 @@ impl AnyQuery for TokenQuery { } } +#[cfg(feature = "geo")] #[derive(Debug, Clone, PartialEq)] pub struct RelationQuery { pub value: ScalarValue, @@ -743,12 +744,14 @@ pub struct RelationQuery { } /// A query that a Geo index can satisfy +#[cfg(feature = "geo")] #[derive(Debug, Clone, PartialEq)] pub enum GeoQuery { IntersectQuery(RelationQuery), IsNull, } +#[cfg(feature = "geo")] impl AnyQuery for GeoQuery { fn as_any(&self) -> &dyn Any { self diff --git a/rust/lance-index/src/scalar/expression.rs b/rust/lance-index/src/scalar/expression.rs index 6aacfc565d0..04b8bfeddb6 100644 --- a/rust/lance-index/src/scalar/expression.rs +++ b/rust/lance-index/src/scalar/expression.rs @@ -19,9 +19,11 @@ use datafusion_expr::{ use tokio::try_join; use super::{ - AnyQuery, BloomFilterQuery, GeoQuery, LabelListQuery, MetricsCollector, RelationQuery, - SargableQuery, ScalarIndex, SearchResult, TextQuery, TokenQuery, + AnyQuery, BloomFilterQuery, LabelListQuery, MetricsCollector, SargableQuery, ScalarIndex, + SearchResult, TextQuery, TokenQuery, }; +#[cfg(feature = "geo")] +use super::{GeoQuery, RelationQuery}; use lance_core::{ utils::mask::{NullableRowAddrMask, RowAddrMask}, Error, Result, @@ -689,17 +691,20 @@ impl ScalarQueryParser for FtsQueryParser { } /// A parser for geo indices that handles spatial queries +#[cfg(feature = "geo")] #[derive(Debug, Clone)] pub struct GeoQueryParser { index_name: String, } +#[cfg(feature = "geo")] impl GeoQueryParser { pub fn new(index_name: String) -> Self { Self { index_name } } } +#[cfg(feature = "geo")] impl ScalarQueryParser for GeoQueryParser { fn visit_between( &self, diff --git a/rust/lance/src/index.rs b/rust/lance/src/index.rs index a2b02dd5cf4..92fcf15adec 100644 --- a/rust/lance/src/index.rs +++ b/rust/lance/src/index.rs @@ -291,9 +291,20 @@ pub(crate) async fn remap_index( let new_id = Uuid::new_v4(); - let generic = dataset + let generic = match dataset .open_generic_index(&field_path, &index_id.to_string(), &NoOpMetricsCollector) - .await?; + .await + { + Ok(g) => g, + Err(e) => { + log::warn!( + "Cannot open index '{}' on '{}': {}. \ + Index will be dropped during compaction.", + index_id, field_path, e + ); + return Ok(RemapResult::Drop); + } + }; let created_index = match generic.index_type() { it if it.is_scalar() => { @@ -1663,7 +1674,17 @@ impl DatasetIndexInternalExt for Dataset { continue; } - let plugin = index_details.get_plugin()?; + let plugin = match index_details.get_plugin() { + Ok(plugin) => plugin, + Err(e) => { + log::warn!( + "Skipping index '{}' on column '{}': {}. \ + Queries on this column will fall back to a full scan.", + index.name, field_path, e + ); + continue; + } + }; let query_parser = plugin.new_query_parser(index.name.clone(), &index_details.0); if let Some(query_parser) = query_parser { diff --git a/rust/lance/src/index/append.rs b/rust/lance/src/index/append.rs index 9657f144dc8..1e12f7f4755 100644 --- a/rust/lance/src/index/append.rs +++ b/rust/lance/src/index/append.rs @@ -86,10 +86,20 @@ pub async fn merge_indices_with_unindexed_frags<'a>( let field_path = dataset.schema().field_path(old_indices[0].fields[0])?; let mut indices = Vec::with_capacity(old_indices.len()); for idx in old_indices { - let index = dataset + match dataset .open_generic_index(&field_path, &idx.uuid.to_string(), &NoOpMetricsCollector) - .await?; - indices.push(index); + .await + { + Ok(index) => indices.push(index), + Err(e) => { + log::warn!( + "Cannot open index on column '{}': {}. \ + Skipping index merge for this column.", + field_path, e + ); + return Ok(None); + } + } } if indices From 09188a67c750c169e9c25cf4d3624f23734cf261 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Mon, 23 Feb 2026 10:41:54 -0800 Subject: [PATCH 3/3] format --- rust/lance-index/src/registry.rs | 7 +++---- rust/lance/src/index.rs | 8 ++++++-- rust/lance/src/index/append.rs | 3 ++- 3 files changed, 11 insertions(+), 7 deletions(-) diff --git a/rust/lance-index/src/registry.rs b/rust/lance-index/src/registry.rs index 4ffd7dbbe88..0c61a0b77ce 100644 --- a/rust/lance-index/src/registry.rs +++ b/rust/lance-index/src/registry.rs @@ -5,17 +5,16 @@ use std::{collections::HashMap, sync::Arc}; use lance_core::{Error, Result}; use snafu::location; +#[cfg(feature = "geo")] +use crate::scalar::rtree::RTreeIndexPlugin; use crate::{ pb, pbold, scalar::{ bitmap::BitmapIndexPlugin, bloomfilter::BloomFilterIndexPlugin, btree::BTreeIndexPlugin, inverted::InvertedIndexPlugin, json::JsonIndexPlugin, label_list::LabelListIndexPlugin, - ngram::NGramIndexPlugin, registry::ScalarIndexPlugin, - zonemap::ZoneMapIndexPlugin, + ngram::NGramIndexPlugin, registry::ScalarIndexPlugin, zonemap::ZoneMapIndexPlugin, }, }; -#[cfg(feature = "geo")] -use crate::scalar::rtree::RTreeIndexPlugin; /// A registry of index plugins pub struct IndexPluginRegistry { diff --git a/rust/lance/src/index.rs b/rust/lance/src/index.rs index 92fcf15adec..3f74b5c3b8d 100644 --- a/rust/lance/src/index.rs +++ b/rust/lance/src/index.rs @@ -300,7 +300,9 @@ pub(crate) async fn remap_index( log::warn!( "Cannot open index '{}' on '{}': {}. \ Index will be dropped during compaction.", - index_id, field_path, e + index_id, + field_path, + e ); return Ok(RemapResult::Drop); } @@ -1680,7 +1682,9 @@ impl DatasetIndexInternalExt for Dataset { log::warn!( "Skipping index '{}' on column '{}': {}. \ Queries on this column will fall back to a full scan.", - index.name, field_path, e + index.name, + field_path, + e ); continue; } diff --git a/rust/lance/src/index/append.rs b/rust/lance/src/index/append.rs index 1e12f7f4755..5a3366ed308 100644 --- a/rust/lance/src/index/append.rs +++ b/rust/lance/src/index/append.rs @@ -95,7 +95,8 @@ pub async fn merge_indices_with_unindexed_frags<'a>( log::warn!( "Cannot open index on column '{}': {}. \ Skipping index merge for this column.", - field_path, e + field_path, + e ); return Ok(None); }