diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 853178634f6..adf923f0c71 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -185,7 +185,7 @@ jobs: ALL_FEATURES=`cargo metadata --format-version=1 --no-deps | jq -r '.packages[] | .features | keys | .[]' | grep -v protoc | sort | uniq | paste -s -d "," -` cargo build --benches --features ${ALL_FEATURES} --tests mac-build: - runs-on: "macos-14" + runs-on: "warp-macos-14-arm64-6x" timeout-minutes: 45 strategy: matrix: @@ -221,7 +221,7 @@ jobs: run: | cargo check --benches --features fp16kernels,cli,tensorflow,dynamodb,substrait windows-build: - runs-on: windows-latest + runs-on: warp-windows-latest-x64-4x defaults: run: working-directory: rust diff --git a/python/python/tests/forward_compat/datagen.py b/python/python/tests/forward_compat/datagen.py index dcd7b6af6a6..2e4104afbfd 100644 --- a/python/python/tests/forward_compat/datagen.py +++ b/python/python/tests/forward_compat/datagen.py @@ -99,9 +99,26 @@ def write_dataset_scalar_index(): dataset.create_scalar_index("bloomfilter", "BLOOMFILTER") +def write_dataset_fts_index(): + shutil.rmtree(get_path("fts_index"), ignore_errors=True) + + data = pa.table( + { + "idx": pa.array(range(1000)), + "text": pa.array( + [f"document with words {i} and more text" for i in range(1000)] + ), + } + ) + + dataset = lance.write_dataset(data, get_path("fts_index")) + dataset.create_scalar_index("text", "INVERTED") + + if __name__ == "__main__": write_basic_types() write_large() write_dataset_pq_buffer() write_dataset_scalar_index() write_dataset_json() + write_dataset_fts_index() diff --git a/python/python/tests/forward_compat/test_compat.py b/python/python/tests/forward_compat/test_compat.py index 63d540df9af..a9b3ec8fbee 100644 --- a/python/python/tests/forward_compat/test_compat.py +++ b/python/python/tests/forward_compat/test_compat.py @@ -101,3 +101,18 @@ def test_pq_buffer(): "column": "vec", } ) + + +@pytest.mark.forward +@pytest.mark.skipif( + Version(lance.__version__) < Version("0.36.0"), + reason="FTS token set format was introduced in 0.36.0", +) +def test_list_indices_ignores_new_fts_index_version(): + # Dataset::load_manifest does not do retain_supported_indices + # so this can only work with no cache + session = lance.Session(index_cache_size_bytes=0, metadata_cache_size_bytes=0) + ds = lance.dataset(get_path("fts_index"), session=session) + indices = ds.list_indices() + # the new index version should be ignored + assert len(indices) == 0 diff --git a/rust/lance-index/src/scalar/inverted/index.rs b/rust/lance-index/src/scalar/inverted/index.rs index a12fcf1eeb0..14bd9910ef6 100644 --- a/rust/lance-index/src/scalar/inverted/index.rs +++ b/rust/lance-index/src/scalar/inverted/index.rs @@ -81,7 +81,9 @@ use crate::Index; use crate::{prefilter::PreFilter, scalar::inverted::iter::take_fst_keys}; use std::str::FromStr; -pub const INVERTED_INDEX_VERSION: u32 = 0; +// Version 0: Arrow TokenSetFormat (legacy) +// Version 1: Fst TokenSetFormat (new default, incompatible clients < 0.38) +pub const INVERTED_INDEX_VERSION: u32 = 1; pub const TOKENS_FILE: &str = "tokens.lance"; pub const INVERT_LIST_FILE: &str = "invert.lance"; pub const DOCS_FILE: &str = "docs.lance"; @@ -552,9 +554,15 @@ impl ScalarIndex for InvertedIndex { let details = pbold::InvertedIndexDetails::try_from(&self.params)?; + // Use version 0 for Arrow format (legacy), version 1 for Fst format (new) + let index_version = match self.token_set_format { + TokenSetFormat::Arrow => 0, + TokenSetFormat::Fst => INVERTED_INDEX_VERSION, + }; + Ok(CreatedIndex { index_details: prost_types::Any::from_msg(&details).unwrap(), - index_version: INVERTED_INDEX_VERSION, + index_version, }) } @@ -567,9 +575,15 @@ impl ScalarIndex for InvertedIndex { let details = pbold::InvertedIndexDetails::try_from(&self.params)?; + // Use version 0 for Arrow format (legacy), version 1 for Fst format (new) + let index_version = match self.token_set_format { + TokenSetFormat::Arrow => 0, + TokenSetFormat::Fst => INVERTED_INDEX_VERSION, + }; + Ok(CreatedIndex { index_details: prost_types::Any::from_msg(&details).unwrap(), - index_version: INVERTED_INDEX_VERSION, + index_version, }) } diff --git a/rust/lance/src/dataset.rs b/rust/lance/src/dataset.rs index a00e6418a2e..d66237d3a0b 100644 --- a/rust/lance/src/dataset.rs +++ b/rust/lance/src/dataset.rs @@ -97,6 +97,7 @@ use crate::dataset::refs::{BranchContents, Branches, Tags}; use crate::dataset::sql::SqlQueryBuilder; use crate::datatypes::Schema; use crate::error::box_error; +use crate::index::retain_supported_indices; use crate::io::commit::{ commit_detached_transaction, commit_new_dataset, commit_transaction, detect_overlapping_fragments, read_transaction_file, @@ -628,12 +629,12 @@ impl Dataset { let message_data = &last_block[offset_in_block + 4..offset_in_block + 4 + message_len]; let section = lance_table::format::pb::IndexSection::decode(message_data)?; - let indices: Vec = section + let mut indices: Vec = section .indices .into_iter() .map(IndexMetadata::try_from) .collect::>>()?; - + retain_supported_indices(&mut indices); let ds_index_cache = session.index_cache.for_dataset(uri); let metadata_key = crate::session::index_caches::IndexMetadataKey { version: manifest_location.version, diff --git a/rust/lance/src/index.rs b/rust/lance/src/index.rs index 2d5d0e491c8..09522a4aa2a 100644 --- a/rust/lance/src/index.rs +++ b/rust/lance/src/index.rs @@ -889,7 +889,7 @@ impl DatasetIndexExt for Dataset { } } -fn retain_supported_indices(indices: &mut Vec) { +pub(crate) fn retain_supported_indices(indices: &mut Vec) { indices.retain(|idx| { let max_supported_version = idx .index_details