diff --git a/rust/lance-datafusion/Cargo.toml b/rust/lance-datafusion/Cargo.toml index 9a561fa9b88..36ea639e13d 100644 --- a/rust/lance-datafusion/Cargo.toml +++ b/rust/lance-datafusion/Cargo.toml @@ -27,7 +27,7 @@ jsonb = {workspace = true} lance-arrow.workspace = true lance-core = {workspace = true, features = ["datafusion"]} lance-datagen.workspace = true -lance-geo = {workspace = true} +lance-geo = {workspace = true, optional = true} chrono.workspace = true log.workspace = true pin-project.workspace = true @@ -44,6 +44,7 @@ protobuf-src = {version = "2.1", optional = true} lance-datagen.workspace = true [features] +geo = ["dep:lance-geo"] substrait = ["dep:datafusion-substrait"] protoc = ["dep:protobuf-src"] diff --git a/rust/lance-datafusion/src/udf.rs b/rust/lance-datafusion/src/udf.rs index 9117b67f82e..7d20c74071e 100644 --- a/rust/lance-datafusion/src/udf.rs +++ b/rust/lance-datafusion/src/udf.rs @@ -27,7 +27,52 @@ pub fn register_functions(ctx: &SessionContext) { ctx.register_udf(json::json_array_contains_udf()); ctx.register_udf(json::json_array_length_udf()); // GEO functions + #[cfg(feature = "geo")] lance_geo::register_functions(ctx); + #[cfg(not(feature = "geo"))] + register_geo_stub_functions(ctx); +} + +/// When the `geo` feature is disabled, register stub UDFs for spatial SQL functions +/// so that users get a clear error mentioning the feature flag instead of +/// DataFusion's generic "Unknown function" error. +#[cfg(not(feature = "geo"))] +fn register_geo_stub_functions(ctx: &SessionContext) { + let geo_funcs = [ + "st_intersects", + "st_contains", + "st_within", + "st_touches", + "st_crosses", + "st_overlaps", + "st_covers", + "st_coveredby", + "st_distance", + "st_area", + "st_length", + ]; + + for name in geo_funcs { + let func_name = name.to_string(); + let stub = Arc::new(make_scalar_function( + move |_args: &[ArrayRef]| { + Err(datafusion::error::DataFusionError::Plan(format!( + "Function '{}' requires the `geo` feature. \ + Rebuild with `--features geo` to enable geospatial functions.", + func_name + ))) + }, + vec![], + )); + + ctx.register_udf(create_udf( + name, + vec![DataType::Binary, DataType::Binary], + DataType::Boolean, + Volatility::Immutable, + stub, + )); + } } /// This method checks whether a string contains all specified tokens. The tokens are separated by diff --git a/rust/lance-geo/Cargo.toml b/rust/lance-geo/Cargo.toml index b8e1103027b..d8a1decfccb 100644 --- a/rust/lance-geo/Cargo.toml +++ b/rust/lance-geo/Cargo.toml @@ -13,13 +13,16 @@ description = "Lance's geospatial extension providing geospatial UDFs." [dependencies] datafusion.workspace = true -geoarrow-array.workspace = true -geoarrow-schema.workspace = true -geodatafusion.workspace = true -geo-traits.workspace = true -geo-types.workspace = true +geoarrow-array = { workspace = true, optional = true } +geoarrow-schema = { workspace = true, optional = true } +geodatafusion = { workspace = true, optional = true } +geo-traits = { workspace = true, optional = true } +geo-types = { workspace = true, optional = true } lance-core.workspace = true serde.workspace = true +[features] +geo = ["dep:geoarrow-array", "dep:geoarrow-schema", "dep:geodatafusion", "dep:geo-traits", "dep:geo-types"] + [lints] workspace = true diff --git a/rust/lance-geo/src/lib.rs b/rust/lance-geo/src/lib.rs index 8eb562333b2..238ce3ee004 100644 --- a/rust/lance-geo/src/lib.rs +++ b/rust/lance-geo/src/lib.rs @@ -3,8 +3,12 @@ use datafusion::prelude::SessionContext; +#[cfg(feature = "geo")] pub mod bbox; pub fn register_functions(ctx: &SessionContext) { + #[cfg(feature = "geo")] geodatafusion::register(ctx); + #[cfg(not(feature = "geo"))] + let _ = ctx; } diff --git a/rust/lance-index/Cargo.toml b/rust/lance-index/Cargo.toml index 70262b8de92..3ce4e45ed34 100644 --- a/rust/lance-index/Cargo.toml +++ b/rust/lance-index/Cargo.toml @@ -30,9 +30,9 @@ deepsize.workspace = true dirs.workspace = true fst.workspace = true futures.workspace = true -geoarrow-array = { workspace = true } -geoarrow-schema = { workspace = true } -geo-types = { workspace = true } +geoarrow-array = { workspace = true, optional = true } +geoarrow-schema = { workspace = true, optional = true } +geo-types = { workspace = true, optional = true } half.workspace = true itertools.workspace = true jieba-rs = { workspace = true, optional = true } @@ -42,7 +42,7 @@ lance-core.workspace = true lance-datafusion.workspace = true lance-encoding.workspace = true lance-file.workspace = true -lance-geo.workspace = true +lance-geo = { workspace = true, optional = true } lance-io.workspace = true lance-linalg.workspace = true lance-table.workspace = true @@ -88,6 +88,7 @@ rstest.workspace = true chrono.workspace = true [features] +geo = ["dep:lance-geo", "lance-geo/geo", "dep:geoarrow-array", "dep:geoarrow-schema", "dep:geo-types"] protoc = ["dep:protobuf-src"] tokenizer-lindera = ["dep:lindera", "dep:lindera-tantivy"] tokenizer-jieba = ["dep:jieba-rs"] @@ -162,6 +163,7 @@ harness = false [[bench]] name = "geo" harness = false +required-features = ["geo"] [lints] workspace = true diff --git a/rust/lance-index/src/registry.rs b/rust/lance-index/src/registry.rs index 82d24649049..0c61a0b77ce 100644 --- a/rust/lance-index/src/registry.rs +++ b/rust/lance-index/src/registry.rs @@ -5,13 +5,14 @@ use std::{collections::HashMap, sync::Arc}; use lance_core::{Error, Result}; use snafu::location; +#[cfg(feature = "geo")] +use crate::scalar::rtree::RTreeIndexPlugin; use crate::{ pb, pbold, scalar::{ bitmap::BitmapIndexPlugin, bloomfilter::BloomFilterIndexPlugin, btree::BTreeIndexPlugin, inverted::InvertedIndexPlugin, json::JsonIndexPlugin, label_list::LabelListIndexPlugin, - ngram::NGramIndexPlugin, registry::ScalarIndexPlugin, rtree::RTreeIndexPlugin, - zonemap::ZoneMapIndexPlugin, + ngram::NGramIndexPlugin, registry::ScalarIndexPlugin, zonemap::ZoneMapIndexPlugin, }, }; @@ -62,6 +63,7 @@ impl IndexPluginRegistry { registry.add_plugin::(); registry.add_plugin::(); registry.add_plugin::(); + #[cfg(feature = "geo")] registry.add_plugin::(); let registry = Arc::new(registry); @@ -77,9 +79,17 @@ impl IndexPluginRegistry { self.plugins .get(name) .map(|plugin| plugin.as_ref()) - .ok_or_else(|| Error::InvalidInput { - source: format!("No scalar index plugin found for name {}", name).into(), - location: location!(), + .ok_or_else(|| { + let hint = if name == "rtree" { + ". The 'rtree' index requires the `geo` feature. \ + Rebuild with `--features geo` to enable geospatial support" + } else { + "" + }; + Error::InvalidInput { + source: format!("No scalar index plugin found for name '{name}'{hint}").into(), + location: location!(), + } }) } diff --git a/rust/lance-index/src/scalar.rs b/rust/lance-index/src/scalar.rs index 5d7d93331b3..ead2b3bc526 100644 --- a/rust/lance-index/src/scalar.rs +++ b/rust/lance-index/src/scalar.rs @@ -40,6 +40,7 @@ pub mod label_list; pub mod lance_format; pub mod ngram; pub mod registry; +#[cfg(feature = "geo")] pub mod rtree; pub mod zoned; pub mod zonemap; @@ -736,6 +737,7 @@ impl AnyQuery for TokenQuery { } } +#[cfg(feature = "geo")] #[derive(Debug, Clone, PartialEq)] pub struct RelationQuery { pub value: ScalarValue, @@ -743,12 +745,14 @@ pub struct RelationQuery { } /// A query that a Geo index can satisfy +#[cfg(feature = "geo")] #[derive(Debug, Clone, PartialEq)] pub enum GeoQuery { IntersectQuery(RelationQuery), IsNull, } +#[cfg(feature = "geo")] impl AnyQuery for GeoQuery { fn as_any(&self) -> &dyn Any { self diff --git a/rust/lance-index/src/scalar/expression.rs b/rust/lance-index/src/scalar/expression.rs index 62c2cb57b8a..ec01e220115 100644 --- a/rust/lance-index/src/scalar/expression.rs +++ b/rust/lance-index/src/scalar/expression.rs @@ -19,9 +19,11 @@ use datafusion_expr::{ use tokio::try_join; use super::{ - AnyQuery, BloomFilterQuery, GeoQuery, LabelListQuery, MetricsCollector, RelationQuery, - SargableQuery, ScalarIndex, SearchResult, TextQuery, TokenQuery, + AnyQuery, BloomFilterQuery, LabelListQuery, MetricsCollector, SargableQuery, ScalarIndex, + SearchResult, TextQuery, TokenQuery, }; +#[cfg(feature = "geo")] +use super::{GeoQuery, RelationQuery}; use lance_core::{ utils::mask::{NullableRowAddrMask, RowAddrMask}, Error, Result, @@ -692,17 +694,20 @@ impl ScalarQueryParser for FtsQueryParser { } /// A parser for geo indices that handles spatial queries +#[cfg(feature = "geo")] #[derive(Debug, Clone)] pub struct GeoQueryParser { index_name: String, } +#[cfg(feature = "geo")] impl GeoQueryParser { pub fn new(index_name: String) -> Self { Self { index_name } } } +#[cfg(feature = "geo")] impl ScalarQueryParser for GeoQueryParser { fn visit_between( &self, diff --git a/rust/lance/Cargo.toml b/rust/lance/Cargo.toml index 427de2aa30e..f630905369a 100644 --- a/rust/lance/Cargo.toml +++ b/rust/lance/Cargo.toml @@ -26,7 +26,7 @@ lance-linalg = { workspace = true } lance-index = { workspace = true } lance-namespace = { workspace = true } lance-table = { workspace = true } -lance-geo = { workspace = true } +lance-geo = { workspace = true, optional = true } arrow-arith = { workspace = true } arrow-array = { workspace = true } arrow-buffer = { workspace = true } @@ -119,7 +119,7 @@ geo-types = { workspace = true } datafusion-substrait = { workspace = true } [features] -default = ["aws", "azure", "gcp", "oss", "huggingface", "tencent"] +default = ["aws", "azure", "gcp", "oss", "huggingface", "tencent", "geo"] fp16kernels = ["lance-linalg/fp16kernels"] # Prevent dynamic linking of lzma, which comes from datafusion cli = ["dep:clap", "lzma-sys/static"] @@ -138,6 +138,7 @@ azure = ["lance-io/azure"] oss = ["lance-io/oss"] tencent = ["lance-io/tencent"] huggingface = ["lance-io/huggingface"] +geo = ["dep:lance-geo", "lance-geo/geo", "lance-datafusion/geo", "lance-index/geo"] # Enable slow integration tests (disabled by default in CI) slow_tests = [] diff --git a/rust/lance/src/dataset/tests/mod.rs b/rust/lance/src/dataset/tests/mod.rs index c0a6002debb..ecc64587b0c 100644 --- a/rust/lance/src/dataset/tests/mod.rs +++ b/rust/lance/src/dataset/tests/mod.rs @@ -5,6 +5,7 @@ mod dataset_aggregate; mod dataset_common; mod dataset_concurrency_store; +#[cfg(feature = "geo")] mod dataset_geo; mod dataset_index; mod dataset_io; diff --git a/rust/lance/src/index.rs b/rust/lance/src/index.rs index 6d846cff4a5..828374e4d11 100644 --- a/rust/lance/src/index.rs +++ b/rust/lance/src/index.rs @@ -292,9 +292,22 @@ pub(crate) async fn remap_index( let new_id = Uuid::new_v4(); - let generic = dataset + let generic = match dataset .open_generic_index(&field_path, &index_id.to_string(), &NoOpMetricsCollector) - .await?; + .await + { + Ok(g) => g, + Err(e) => { + log::warn!( + "Cannot open index '{}' on '{}': {}. \ + Index will be dropped during compaction.", + index_id, + field_path, + e + ); + return Ok(RemapResult::Drop); + } + }; let created_index = match generic.index_type() { it if it.is_scalar() => { @@ -1718,7 +1731,19 @@ impl DatasetIndexInternalExt for Dataset { continue; } - let plugin = index_details.get_plugin()?; + let plugin = match index_details.get_plugin() { + Ok(plugin) => plugin, + Err(e) => { + log::warn!( + "Skipping index '{}' on column '{}': {}. \ + Queries on this column will fall back to a full scan.", + index.name, + field_path, + e + ); + continue; + } + }; let query_parser = plugin.new_query_parser(index.name.clone(), &index_details.0); if let Some(query_parser) = query_parser { diff --git a/rust/lance/src/index/append.rs b/rust/lance/src/index/append.rs index f03503fb61d..e552afcaa8f 100644 --- a/rust/lance/src/index/append.rs +++ b/rust/lance/src/index/append.rs @@ -87,10 +87,21 @@ pub async fn merge_indices_with_unindexed_frags<'a>( let field_path = dataset.schema().field_path(old_indices[0].fields[0])?; let mut indices = Vec::with_capacity(old_indices.len()); for idx in old_indices { - let index = dataset + match dataset .open_generic_index(&field_path, &idx.uuid.to_string(), &NoOpMetricsCollector) - .await?; - indices.push(index); + .await + { + Ok(index) => indices.push(index), + Err(e) => { + log::warn!( + "Cannot open index on column '{}': {}. \ + Skipping index merge for this column.", + field_path, + e + ); + return Ok(None); + } + } } if indices