From 8fad3586d571d82e453bf7293a647bfaab22150c Mon Sep 17 00:00:00 2001 From: Jack Ye Date: Tue, 7 Oct 2025 10:27:30 -0700 Subject: [PATCH 1/7] fix: forward compatibility of FTS index --- python/python/tests/forward_compat/datagen.py | 15 +++++++++++++ .../tests/forward_compat/test_compat.py | 22 +++++++++++++++++++ 2 files changed, 37 insertions(+) diff --git a/python/python/tests/forward_compat/datagen.py b/python/python/tests/forward_compat/datagen.py index dcd7b6af6a6..eabbf5cba8e 100644 --- a/python/python/tests/forward_compat/datagen.py +++ b/python/python/tests/forward_compat/datagen.py @@ -99,9 +99,24 @@ def write_dataset_scalar_index(): dataset.create_scalar_index("bloomfilter", "BLOOMFILTER") +def write_dataset_fts_index(): + shutil.rmtree(get_path("fts_index"), ignore_errors=True) + + data = pa.table( + { + "idx": pa.array(range(1000)), + "text": pa.array([f"document with words {i} and more text" for i in range(1000)]), + } + ) + + dataset = lance.write_dataset(data, get_path("fts_index")) + dataset.create_scalar_index("text", "INVERTED") + + if __name__ == "__main__": write_basic_types() write_large() write_dataset_pq_buffer() write_dataset_scalar_index() write_dataset_json() + write_dataset_fts_index() diff --git a/python/python/tests/forward_compat/test_compat.py b/python/python/tests/forward_compat/test_compat.py index 63d540df9af..0c306c16a17 100644 --- a/python/python/tests/forward_compat/test_compat.py +++ b/python/python/tests/forward_compat/test_compat.py @@ -101,3 +101,25 @@ def test_pq_buffer(): "column": "vec", } ) + + +@pytest.mark.forward +@pytest.mark.skipif( + Version(lance.__version__) < Version("0.36.0"), + reason="FTS index was introduced in 0.36.0", +) +def test_fts_index_list_indices(): + # Test that list_indices() doesn't panic when reading FTS index created by newer version + # This is a regression test for forward compatibility issue where Lance 0.36 would panic + # when calling list_indices() on datasets with FTS indices created by Lance 0.38+ + ds = lance.dataset(get_path("fts_index")) + + # This should not panic + indices = ds.list_indices() + + # Should have at least one index + assert len(indices) > 0 + + # Should be able to find the FTS index + fts_indices = [idx for idx in indices if idx["type"] == "FTS" or idx["type"] == "Inverted"] + assert len(fts_indices) > 0 From 455f78995c5df241a581e88461394016e88c9871 Mon Sep 17 00:00:00 2001 From: Jack Ye Date: Tue, 7 Oct 2025 13:54:14 -0700 Subject: [PATCH 2/7] fix --- python/python/tests/forward_compat/datagen.py | 4 +++- .../tests/forward_compat/test_compat.py | 18 ++++------------- rust/lance-index/src/scalar/inverted/index.rs | 20 ++++++++++++++++--- 3 files changed, 24 insertions(+), 18 deletions(-) diff --git a/python/python/tests/forward_compat/datagen.py b/python/python/tests/forward_compat/datagen.py index eabbf5cba8e..2e4104afbfd 100644 --- a/python/python/tests/forward_compat/datagen.py +++ b/python/python/tests/forward_compat/datagen.py @@ -105,7 +105,9 @@ def write_dataset_fts_index(): data = pa.table( { "idx": pa.array(range(1000)), - "text": pa.array([f"document with words {i} and more text" for i in range(1000)]), + "text": pa.array( + [f"document with words {i} and more text" for i in range(1000)] + ), } ) diff --git a/python/python/tests/forward_compat/test_compat.py b/python/python/tests/forward_compat/test_compat.py index 0c306c16a17..d36264a3fae 100644 --- a/python/python/tests/forward_compat/test_compat.py +++ b/python/python/tests/forward_compat/test_compat.py @@ -106,20 +106,10 @@ def test_pq_buffer(): @pytest.mark.forward @pytest.mark.skipif( Version(lance.__version__) < Version("0.36.0"), - reason="FTS index was introduced in 0.36.0", + reason="FTS token set format was introduced in 0.36.0", ) -def test_fts_index_list_indices(): - # Test that list_indices() doesn't panic when reading FTS index created by newer version - # This is a regression test for forward compatibility issue where Lance 0.36 would panic - # when calling list_indices() on datasets with FTS indices created by Lance 0.38+ +def test_list_indices_ignores_new_fts_index_version(): ds = lance.dataset(get_path("fts_index")) - - # This should not panic indices = ds.list_indices() - - # Should have at least one index - assert len(indices) > 0 - - # Should be able to find the FTS index - fts_indices = [idx for idx in indices if idx["type"] == "FTS" or idx["type"] == "Inverted"] - assert len(fts_indices) > 0 + # the new index version should be ignored + assert len(indices) == 0 diff --git a/rust/lance-index/src/scalar/inverted/index.rs b/rust/lance-index/src/scalar/inverted/index.rs index a12fcf1eeb0..bc5370ad93f 100644 --- a/rust/lance-index/src/scalar/inverted/index.rs +++ b/rust/lance-index/src/scalar/inverted/index.rs @@ -81,7 +81,9 @@ use crate::Index; use crate::{prefilter::PreFilter, scalar::inverted::iter::take_fst_keys}; use std::str::FromStr; -pub const INVERTED_INDEX_VERSION: u32 = 0; +// Version 0: Arrow TokenSetFormat (legacy) +// Version 1: Fst TokenSetFormat (new default, incompatible with old clients) +pub const INVERTED_INDEX_VERSION: u32 = 1; pub const TOKENS_FILE: &str = "tokens.lance"; pub const INVERT_LIST_FILE: &str = "invert.lance"; pub const DOCS_FILE: &str = "docs.lance"; @@ -552,9 +554,15 @@ impl ScalarIndex for InvertedIndex { let details = pbold::InvertedIndexDetails::try_from(&self.params)?; + // Use version 0 for Arrow format (legacy), version 1 for Fst format (new) + let index_version = match self.token_set_format { + TokenSetFormat::Arrow => 0, + TokenSetFormat::Fst => INVERTED_INDEX_VERSION, + }; + Ok(CreatedIndex { index_details: prost_types::Any::from_msg(&details).unwrap(), - index_version: INVERTED_INDEX_VERSION, + index_version, }) } @@ -567,9 +575,15 @@ impl ScalarIndex for InvertedIndex { let details = pbold::InvertedIndexDetails::try_from(&self.params)?; + // Use version 0 for Arrow format (legacy), version 1 for Fst format (new) + let index_version = match self.token_set_format { + TokenSetFormat::Arrow => 0, + TokenSetFormat::Fst => INVERTED_INDEX_VERSION, + }; + Ok(CreatedIndex { index_details: prost_types::Any::from_msg(&details).unwrap(), - index_version: INVERTED_INDEX_VERSION, + index_version, }) } From 7a30206a1dc004f9ef8842fa86c1f65c560a4e65 Mon Sep 17 00:00:00 2001 From: Jack Ye Date: Tue, 7 Oct 2025 14:24:29 -0700 Subject: [PATCH 3/7] use an empty session --- python/python/tests/forward_compat/test_compat.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/python/tests/forward_compat/test_compat.py b/python/python/tests/forward_compat/test_compat.py index d36264a3fae..6ffe2cec37d 100644 --- a/python/python/tests/forward_compat/test_compat.py +++ b/python/python/tests/forward_compat/test_compat.py @@ -109,7 +109,8 @@ def test_pq_buffer(): reason="FTS token set format was introduced in 0.36.0", ) def test_list_indices_ignores_new_fts_index_version(): - ds = lance.dataset(get_path("fts_index")) + session = lance.Session(index_cache_size_bytes=0, metadata_cache_size_bytes=0) + ds = lance.dataset(get_path("fts_index"), session=session) indices = ds.list_indices() # the new index version should be ignored assert len(indices) == 0 From 59d0aa39ffca19aac3eaddfbf3a62812efbf796f Mon Sep 17 00:00:00 2001 From: Jack Ye Date: Tue, 7 Oct 2025 16:29:05 -0700 Subject: [PATCH 4/7] comments --- rust/lance-index/src/scalar/inverted/index.rs | 2 +- rust/lance/src/dataset.rs | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/rust/lance-index/src/scalar/inverted/index.rs b/rust/lance-index/src/scalar/inverted/index.rs index bc5370ad93f..14bd9910ef6 100644 --- a/rust/lance-index/src/scalar/inverted/index.rs +++ b/rust/lance-index/src/scalar/inverted/index.rs @@ -82,7 +82,7 @@ use crate::{prefilter::PreFilter, scalar::inverted::iter::take_fst_keys}; use std::str::FromStr; // Version 0: Arrow TokenSetFormat (legacy) -// Version 1: Fst TokenSetFormat (new default, incompatible with old clients) +// Version 1: Fst TokenSetFormat (new default, incompatible clients < 0.38) pub const INVERTED_INDEX_VERSION: u32 = 1; pub const TOKENS_FILE: &str = "tokens.lance"; pub const INVERT_LIST_FILE: &str = "invert.lance"; diff --git a/rust/lance/src/dataset.rs b/rust/lance/src/dataset.rs index a00e6418a2e..941950776cd 100644 --- a/rust/lance/src/dataset.rs +++ b/rust/lance/src/dataset.rs @@ -628,12 +628,12 @@ impl Dataset { let message_data = &last_block[offset_in_block + 4..offset_in_block + 4 + message_len]; let section = lance_table::format::pb::IndexSection::decode(message_data)?; - let indices: Vec = section + let mut indices: Vec = section .indices .into_iter() .map(IndexMetadata::try_from) .collect::>>()?; - + retain_supported_indices(&mut indices); let ds_index_cache = session.index_cache.for_dataset(uri); let metadata_key = crate::session::index_caches::IndexMetadataKey { version: manifest_location.version, From cde7989541baaae99f3c67fad7876eb1bf65bb2a Mon Sep 17 00:00:00 2001 From: Jack Ye Date: Tue, 7 Oct 2025 16:32:44 -0700 Subject: [PATCH 5/7] comments --- python/python/tests/forward_compat/test_compat.py | 2 ++ rust/lance/src/dataset.rs | 1 + rust/lance/src/index.rs | 2 +- 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/python/python/tests/forward_compat/test_compat.py b/python/python/tests/forward_compat/test_compat.py index 6ffe2cec37d..a9b3ec8fbee 100644 --- a/python/python/tests/forward_compat/test_compat.py +++ b/python/python/tests/forward_compat/test_compat.py @@ -109,6 +109,8 @@ def test_pq_buffer(): reason="FTS token set format was introduced in 0.36.0", ) def test_list_indices_ignores_new_fts_index_version(): + # Dataset::load_manifest does not do retain_supported_indices + # so this can only work with no cache session = lance.Session(index_cache_size_bytes=0, metadata_cache_size_bytes=0) ds = lance.dataset(get_path("fts_index"), session=session) indices = ds.list_indices() diff --git a/rust/lance/src/dataset.rs b/rust/lance/src/dataset.rs index 941950776cd..2b09334a930 100644 --- a/rust/lance/src/dataset.rs +++ b/rust/lance/src/dataset.rs @@ -122,6 +122,7 @@ pub use write::{ write_fragments, AutoCleanupParams, CommitBuilder, DeleteBuilder, InsertBuilder, WriteDestination, WriteMode, WriteParams, }; +use crate::index::retain_supported_indices; const INDICES_DIR: &str = "_indices"; diff --git a/rust/lance/src/index.rs b/rust/lance/src/index.rs index 2d5d0e491c8..09522a4aa2a 100644 --- a/rust/lance/src/index.rs +++ b/rust/lance/src/index.rs @@ -889,7 +889,7 @@ impl DatasetIndexExt for Dataset { } } -fn retain_supported_indices(indices: &mut Vec) { +pub(crate) fn retain_supported_indices(indices: &mut Vec) { indices.retain(|idx| { let max_supported_version = idx .index_details From cf50a8f26583904a95bce0fe10ae34e484b0417f Mon Sep 17 00:00:00 2001 From: Jack Ye Date: Tue, 7 Oct 2025 17:59:02 -0700 Subject: [PATCH 6/7] fmt --- rust/lance/src/dataset.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rust/lance/src/dataset.rs b/rust/lance/src/dataset.rs index 2b09334a930..d66237d3a0b 100644 --- a/rust/lance/src/dataset.rs +++ b/rust/lance/src/dataset.rs @@ -97,6 +97,7 @@ use crate::dataset::refs::{BranchContents, Branches, Tags}; use crate::dataset::sql::SqlQueryBuilder; use crate::datatypes::Schema; use crate::error::box_error; +use crate::index::retain_supported_indices; use crate::io::commit::{ commit_detached_transaction, commit_new_dataset, commit_transaction, detect_overlapping_fragments, read_transaction_file, @@ -122,7 +123,6 @@ pub use write::{ write_fragments, AutoCleanupParams, CommitBuilder, DeleteBuilder, InsertBuilder, WriteDestination, WriteMode, WriteParams, }; -use crate::index::retain_supported_indices; const INDICES_DIR: &str = "_indices"; From 83bb81f406e2526ba6bbda462fb31bded73a5c1a Mon Sep 17 00:00:00 2001 From: Jack Ye Date: Tue, 7 Oct 2025 19:50:27 -0700 Subject: [PATCH 7/7] use faster runners --- .github/workflows/rust.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 853178634f6..adf923f0c71 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -185,7 +185,7 @@ jobs: ALL_FEATURES=`cargo metadata --format-version=1 --no-deps | jq -r '.packages[] | .features | keys | .[]' | grep -v protoc | sort | uniq | paste -s -d "," -` cargo build --benches --features ${ALL_FEATURES} --tests mac-build: - runs-on: "macos-14" + runs-on: "warp-macos-14-arm64-6x" timeout-minutes: 45 strategy: matrix: @@ -221,7 +221,7 @@ jobs: run: | cargo check --benches --features fp16kernels,cli,tensorflow,dynamodb,substrait windows-build: - runs-on: windows-latest + runs-on: warp-windows-latest-x64-4x defaults: run: working-directory: rust