Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/workflows/rust.yml
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,7 @@ jobs:
ALL_FEATURES=`cargo metadata --format-version=1 --no-deps | jq -r '.packages[] | .features | keys | .[]' | grep -v protoc | sort | uniq | paste -s -d "," -`
cargo build --benches --features ${ALL_FEATURES} --tests
mac-build:
runs-on: "macos-14"
runs-on: "warp-macos-14-arm64-6x"
timeout-minutes: 45
strategy:
matrix:
Expand Down Expand Up @@ -221,7 +221,7 @@ jobs:
run: |
cargo check --benches --features fp16kernels,cli,tensorflow,dynamodb,substrait
windows-build:
runs-on: windows-latest
runs-on: warp-windows-latest-x64-4x
defaults:
run:
working-directory: rust
Expand Down
17 changes: 17 additions & 0 deletions python/python/tests/forward_compat/datagen.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,9 +99,26 @@ def write_dataset_scalar_index():
dataset.create_scalar_index("bloomfilter", "BLOOMFILTER")


def write_dataset_fts_index():
    """Rebuild the ``fts_index`` dataset and index its ``text`` column.

    Any existing copy at ``get_path("fts_index")`` is removed first. The
    table written has 1000 rows: ``idx`` holds 0..999 and ``text`` holds a
    short sentence embedding the row number. An ``INVERTED`` scalar index is
    then created on ``text``.
    """
    # Start from a clean directory; a missing directory is not an error.
    shutil.rmtree(get_path("fts_index"), ignore_errors=True)

    row_ids = range(1000)
    documents = [f"document with words {i} and more text" for i in row_ids]
    table = pa.table(
        {
            "idx": pa.array(row_ids),
            "text": pa.array(documents),
        }
    )

    fts_dataset = lance.write_dataset(table, get_path("fts_index"))
    fts_dataset.create_scalar_index("text", "INVERTED")


if __name__ == "__main__":
    # Run every dataset writer, one after another, in this fixed order.
    for generate in (
        write_basic_types,
        write_large,
        write_dataset_pq_buffer,
        write_dataset_scalar_index,
        write_dataset_json,
        write_dataset_fts_index,
    ):
        generate()
15 changes: 15 additions & 0 deletions python/python/tests/forward_compat/test_compat.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,3 +101,18 @@ def test_pq_buffer():
"column": "vec",
}
)


@pytest.mark.forward
@pytest.mark.skipif(
Version(lance.__version__) < Version("0.36.0"),
reason="FTS token set format was introduced in 0.36.0",
)
def test_list_indices_ignores_new_fts_index_version():
# Dataset::load_manifest does not call retain_supported_indices,
# so this only works when the caches are disabled.
session = lance.Session(index_cache_size_bytes=0, metadata_cache_size_bytes=0)
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

wait, are we supposed to set index_cache_size_bytes=0 and metadata_cache_size_bytes=0?

ds = lance.dataset(get_path("fts_index"), session=session)
indices = ds.list_indices()
# the new index version should be ignored
assert len(indices) == 0
20 changes: 17 additions & 3 deletions rust/lance-index/src/scalar/inverted/index.rs
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,9 @@ use crate::Index;
use crate::{prefilter::PreFilter, scalar::inverted::iter::take_fst_keys};
use std::str::FromStr;

pub const INVERTED_INDEX_VERSION: u32 = 0;
// Version 0: Arrow TokenSetFormat (legacy)
// Version 1: Fst TokenSetFormat (new default, incompatible with clients < 0.38)
pub const INVERTED_INDEX_VERSION: u32 = 1;
pub const TOKENS_FILE: &str = "tokens.lance";
pub const INVERT_LIST_FILE: &str = "invert.lance";
pub const DOCS_FILE: &str = "docs.lance";
Expand Down Expand Up @@ -552,9 +554,15 @@ impl ScalarIndex for InvertedIndex {

let details = pbold::InvertedIndexDetails::try_from(&self.params)?;

// Use version 0 for Arrow format (legacy), version 1 for Fst format (new)
let index_version = match self.token_set_format {
TokenSetFormat::Arrow => 0,
TokenSetFormat::Fst => INVERTED_INDEX_VERSION,
};

Ok(CreatedIndex {
index_details: prost_types::Any::from_msg(&details).unwrap(),
index_version: INVERTED_INDEX_VERSION,
index_version,
})
}

Expand All @@ -567,9 +575,15 @@ impl ScalarIndex for InvertedIndex {

let details = pbold::InvertedIndexDetails::try_from(&self.params)?;

// Use version 0 for Arrow format (legacy), version 1 for Fst format (new)
let index_version = match self.token_set_format {
TokenSetFormat::Arrow => 0,
TokenSetFormat::Fst => INVERTED_INDEX_VERSION,
};

Ok(CreatedIndex {
index_details: prost_types::Any::from_msg(&details).unwrap(),
index_version: INVERTED_INDEX_VERSION,
index_version,
})
}

Expand Down
5 changes: 3 additions & 2 deletions rust/lance/src/dataset.rs
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@ use crate::dataset::refs::{BranchContents, Branches, Tags};
use crate::dataset::sql::SqlQueryBuilder;
use crate::datatypes::Schema;
use crate::error::box_error;
use crate::index::retain_supported_indices;
use crate::io::commit::{
commit_detached_transaction, commit_new_dataset, commit_transaction,
detect_overlapping_fragments, read_transaction_file,
Expand Down Expand Up @@ -628,12 +629,12 @@ impl Dataset {
let message_data =
&last_block[offset_in_block + 4..offset_in_block + 4 + message_len];
let section = lance_table::format::pb::IndexSection::decode(message_data)?;
let indices: Vec<IndexMetadata> = section
let mut indices: Vec<IndexMetadata> = section
.indices
.into_iter()
.map(IndexMetadata::try_from)
.collect::<Result<Vec<_>>>()?;

retain_supported_indices(&mut indices);
let ds_index_cache = session.index_cache.for_dataset(uri);
let metadata_key = crate::session::index_caches::IndexMetadataKey {
version: manifest_location.version,
Expand Down
2 changes: 1 addition & 1 deletion rust/lance/src/index.rs
Original file line number Diff line number Diff line change
Expand Up @@ -889,7 +889,7 @@ impl DatasetIndexExt for Dataset {
}
}

fn retain_supported_indices(indices: &mut Vec<IndexMetadata>) {
pub(crate) fn retain_supported_indices(indices: &mut Vec<IndexMetadata>) {
indices.retain(|idx| {
let max_supported_version = idx
.index_details
Expand Down