-
Notifications
You must be signed in to change notification settings - Fork 638
fix: preserve create index transaction semantics #6204
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
3054240
42e6180
cc27f5e
af21acf
98e2401
a769946
3e51b29
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -171,19 +171,24 @@ impl<'a> CreateIndexBuilder<'a> { | |
| } | ||
| candidate | ||
| }; | ||
| if let Some(idx) = indices.iter().find(|i| i.name == index_name) { | ||
| if idx.fields == [field.id] && !self.replace { | ||
| return Err(Error::index(format!( | ||
| "Index name '{index_name} already exists, \ | ||
| please specify a different name or use replace=True" | ||
| ))); | ||
| }; | ||
| if idx.fields != [field.id] { | ||
| return Err(Error::index(format!( | ||
| "Index name '{index_name} already exists with different fields, \ | ||
| please specify a different name" | ||
| ))); | ||
| } | ||
| let existing_named_indices = indices | ||
| .iter() | ||
| .filter(|idx| idx.name == index_name) | ||
| .collect::<Vec<_>>(); | ||
| if existing_named_indices | ||
| .iter() | ||
| .any(|idx| idx.fields != [field.id]) | ||
| { | ||
| return Err(Error::index(format!( | ||
| "Index name '{index_name}' already exists with different fields, \ | ||
| please specify a different name" | ||
| ))); | ||
| } | ||
| if !existing_named_indices.is_empty() && !self.replace { | ||
| return Err(Error::index(format!( | ||
| "Index name '{index_name}' already exists, \ | ||
| please specify a different name or use replace=True" | ||
| ))); | ||
|
Comment on lines
+187
to
+191
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Does this mean I will need
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In fact, our current API didn't support adding a UUID to an existing index, it's our next step. We'll need a new public API for that. |
||
| } | ||
|
|
||
| let index_id = match &self.index_uuid { | ||
|
|
@@ -435,11 +440,22 @@ impl<'a> CreateIndexBuilder<'a> { | |
| async fn execute(mut self) -> Result<IndexMetadata> { | ||
| let new_idx = self.execute_uncommitted().await?; | ||
| let index_uuid = new_idx.uuid; | ||
| let removed_indices = if self.replace { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. does this mean if we need to manually commit a this seems a breaking change, are we replying on this somewhere (distributed indexing?)?
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes. With this change, That means callers which want replace semantics need to populate |
||
| self.dataset | ||
| .load_indices() | ||
| .await? | ||
| .iter() | ||
| .filter(|idx| idx.name == new_idx.name) | ||
| .cloned() | ||
| .collect() | ||
| } else { | ||
| vec![] | ||
| }; | ||
| let transaction = Transaction::new( | ||
| new_idx.dataset_version, | ||
| Operation::CreateIndex { | ||
| new_indices: vec![new_idx], | ||
| removed_indices: vec![], | ||
| removed_indices, | ||
| }, | ||
| None, | ||
| ); | ||
|
|
@@ -628,6 +644,92 @@ mod tests { | |
| assert!(err.to_string().contains("already exists")); | ||
| } | ||
|
|
||
| #[tokio::test] | ||
| async fn test_concurrent_create_index_same_name_returns_retryable_conflict() { | ||
| let tmpdir = TempStrDir::default(); | ||
| let dataset_uri = format!("file://{}", tmpdir.as_str()); | ||
| let reader = gen_batch() | ||
| .col("a", lance_datagen::array::step::<Int32Type>()) | ||
| .into_reader_rows( | ||
| lance_datagen::RowCount::from(100), | ||
| lance_datagen::BatchCount::from(1), | ||
| ); | ||
| let dataset = Dataset::write(reader, &dataset_uri, None).await.unwrap(); | ||
|
|
||
| let params = ScalarIndexParams::for_builtin(lance_index::scalar::BuiltinIndexType::BTree); | ||
| let read_version = dataset.manifest.version; | ||
| let mut reader1 = dataset.checkout_version(read_version).await.unwrap(); | ||
| let mut reader2 = dataset.checkout_version(read_version).await.unwrap(); | ||
|
|
||
| let first = CreateIndexBuilder::new(&mut reader1, &["a"], IndexType::BTree, ¶ms) | ||
| .name("a_idx".to_string()) | ||
| .execute() | ||
| .await; | ||
| assert!( | ||
| first.is_ok(), | ||
| "first create_index should succeed: {first:?}" | ||
| ); | ||
|
|
||
| let second = CreateIndexBuilder::new(&mut reader2, &["a"], IndexType::BTree, ¶ms) | ||
| .name("a_idx".to_string()) | ||
| .execute() | ||
| .await; | ||
| assert!( | ||
| matches!(second, Err(Error::RetryableCommitConflict { .. })), | ||
| "second concurrent create_index should be retryable, got {second:?}" | ||
| ); | ||
|
|
||
| let latest_indices = reader1.load_indices_by_name("a_idx").await.unwrap(); | ||
| assert_eq!(latest_indices.len(), 1); | ||
| } | ||
|
|
||
| #[tokio::test] | ||
| async fn test_concurrent_replace_index_same_name_returns_retryable_conflict() { | ||
| let tmpdir = TempStrDir::default(); | ||
| let dataset_uri = format!("file://{}", tmpdir.as_str()); | ||
| let reader = gen_batch() | ||
| .col("a", lance_datagen::array::step::<Int32Type>()) | ||
| .into_reader_rows( | ||
| lance_datagen::RowCount::from(100), | ||
| lance_datagen::BatchCount::from(1), | ||
| ); | ||
| let mut dataset = Dataset::write(reader, &dataset_uri, None).await.unwrap(); | ||
|
|
||
| let params = ScalarIndexParams::for_builtin(lance_index::scalar::BuiltinIndexType::BTree); | ||
| let original = CreateIndexBuilder::new(&mut dataset, &["a"], IndexType::BTree, ¶ms) | ||
| .name("a_idx".to_string()) | ||
| .execute() | ||
| .await | ||
| .unwrap(); | ||
|
|
||
| let read_version = dataset.manifest.version; | ||
| let mut reader1 = dataset.checkout_version(read_version).await.unwrap(); | ||
| let mut reader2 = dataset.checkout_version(read_version).await.unwrap(); | ||
|
|
||
| let replacement = CreateIndexBuilder::new(&mut reader1, &["a"], IndexType::BTree, ¶ms) | ||
| .name("a_idx".to_string()) | ||
| .replace(true) | ||
| .execute() | ||
| .await | ||
| .unwrap(); | ||
| assert_ne!(replacement.uuid, original.uuid); | ||
|
|
||
| let second = CreateIndexBuilder::new(&mut reader2, &["a"], IndexType::BTree, ¶ms) | ||
| .name("a_idx".to_string()) | ||
| .replace(true) | ||
| .execute() | ||
| .await; | ||
| assert!( | ||
| matches!(second, Err(Error::RetryableCommitConflict { .. })), | ||
| "second concurrent replace should be retryable, got {second:?}" | ||
| ); | ||
|
|
||
| let latest_indices = reader1.load_indices_by_name("a_idx").await.unwrap(); | ||
| assert_eq!(latest_indices.len(), 1); | ||
| assert_eq!(latest_indices[0].uuid, replacement.uuid); | ||
| assert_ne!(latest_indices[0].uuid, original.uuid); | ||
| } | ||
|
|
||
| // Helper function to create test data with text field suitable for inverted index | ||
| fn create_text_batch(start: i32, end: i32) -> RecordBatch { | ||
| let schema = Arc::new(ArrowSchema::new(vec![ | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
does this duplicate https://github.com/lance-format/lance/pull/6204/changes#diff-ed935ef0e993a9a039bc75398d27abe79036df7c41f0c40fa0a9d7325c0ad3a6R443
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Not exactly. These two checks happen at different stages and serve different purposes.
It's possible to have a helper function to make it more clear, do you want it?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
yeah, the two implementations are diff already, maybe we don't need to check
fieldsas we don't allow indices with the same name but diff fields. Good to have a helper function to get consistent behavior