-
Notifications
You must be signed in to change notification settings - Fork 638
perf: speed up filtered scan by up to 18.9× by moving the heavy CPU task out #5165
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
98272a3
dd9050c
2951c3a
1990809
19d7264
1bd887f
7fce0dc
98d6c07
c4acd47
ed34990
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1437,7 +1437,15 @@ impl BatchDecodeStream { | |
| let emitted_batch_size_warning = slf.emitted_batch_size_warning.clone(); | ||
| let task = async move { | ||
| let next_task = next_task?; | ||
| next_task.into_batch(emitted_batch_size_warning) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I thought we were going to do the spawn fix by replacing the existing
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Oh, I misunderstood your previous comments. The
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I rethink about this and sure that we can remove the But the spawn inside
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Hmm, I'm not entirely sure I agree but I don't want to go back and forth too much. We can merge this and revisit later (I still want to get rid of some of the I/O tasks) if you would like. I think we will also want a more complex benchmark; we could use one of the more compute-intensive TPC-H queries. We will also need to add support for The goal should be that one thread task does decoding and filtering. This way, when we reach the filtering stage, the data is already in the CPU cache. If we put a spawn here, then the decoding will happen on one thread task and the filtering on another. This means we will have to transfer the data through main memory.
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Tracked in #5242 I agree most of your comments. The blocker here is the change set might be bigger than we expected. Let's revisit this part as follow ups. |
||
| // Real decode work happens inside into_batch, which can block the current | ||
| // thread for a long time. By spawning it as a new task, we allow Tokio's | ||
| // worker threads to keep making progress. | ||
| tokio::spawn(async move { next_task.into_batch(emitted_batch_size_warning) }) | ||
| .await | ||
| .map_err(|err| Error::Wrapped { | ||
| error: err.into(), | ||
| location: location!(), | ||
| })? | ||
| }; | ||
| (task, num_rows) | ||
| }); | ||
|
|
@@ -1760,7 +1768,15 @@ impl StructuralBatchDecodeStream { | |
| let emitted_batch_size_warning = slf.emitted_batch_size_warning.clone(); | ||
| let task = async move { | ||
| let next_task = next_task?; | ||
| next_task.into_batch(emitted_batch_size_warning) | ||
| // Real decode work happens inside into_batch, which can block the current | ||
| // thread for a long time. By spawning it as a new task, we allow Tokio's | ||
| // worker threads to keep making progress. | ||
| tokio::spawn(async move { next_task.into_batch(emitted_batch_size_warning) }) | ||
| .await | ||
| .map_err(|err| Error::Wrapped { | ||
| error: err.into(), | ||
| location: location!(), | ||
| })? | ||
| }; | ||
| (task, num_rows) | ||
| }); | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,167 @@ | ||
| // SPDX-License-Identifier: Apache-2.0 | ||
| // SPDX-FileCopyrightText: Copyright The Lance Authors | ||
|
|
||
| use std::sync::Arc; | ||
|
|
||
| use arrow_array::{Float64Array, Int64Array, RecordBatch, RecordBatchIterator, StringArray}; | ||
| use arrow_schema::{DataType, Field, Schema as ArrowSchema}; | ||
| use criterion::{criterion_group, criterion_main, Criterion}; | ||
| use lance::dataset::{Dataset, ProjectionRequest, WriteParams}; | ||
| use lance_file::version::LanceFileVersion; | ||
| use std::collections::HashMap; | ||
| use tokio::runtime::Runtime; | ||
| use uuid::Uuid; | ||
|
|
||
/// Number of rows written into each benchmark dataset.
const TOTAL_ROWS: usize = 500_000;
/// Rows per RecordBatch when generating the input data.
const BATCH_SIZE: usize = 1024;
/// Row limit applied to the filtered-scan benchmark query.
const LIMIT: i64 = 10_000;
/// Values cycled through for the `l_shipmode` column (TPC-H-like).
const SHIP_MODES: [&str; 5] = ["FOB", "RAIL", "AIR", "MAIL", "TRUCK"];
/// Fixed row ids fetched by the random-take benchmark.
const ROW_IDS: [u64; 5] = [1, 40, 100, 130, 200];
|
|
||
| fn bench_random_access(c: &mut Criterion) { | ||
| let runtime = Runtime::new().expect("failed to build tokio runtime"); | ||
|
|
||
| let dataset_v2_0 = runtime.block_on(prepare_dataset(LanceFileVersion::V2_0, true)); | ||
| let dataset_v2_1_fsst = runtime.block_on(prepare_dataset(LanceFileVersion::V2_1, true)); | ||
| let dataset_v2_1_no_fsst = runtime.block_on(prepare_dataset(LanceFileVersion::V2_1, false)); | ||
|
|
||
| benchmark_dataset(&runtime, c, dataset_v2_0, "V2_0"); | ||
| benchmark_dataset(&runtime, c, dataset_v2_1_fsst, "V2_1 (FSST)"); | ||
| benchmark_dataset(&runtime, c, dataset_v2_1_no_fsst, "V2_1 (FSST disabled)"); | ||
| } | ||
|
|
||
| fn benchmark_dataset(rt: &Runtime, c: &mut Criterion, dataset: Dataset, label: &str) { | ||
| let dataset = Arc::new(dataset); | ||
| bench_filtered_scan(rt, c, dataset.clone(), label); | ||
| bench_random_take(rt, c, dataset, label); | ||
| } | ||
|
|
||
| fn bench_filtered_scan(rt: &Runtime, c: &mut Criterion, dataset: Arc<Dataset>, label: &str) { | ||
| let bench_name = format!("{label} Filtered Scan ({LIMIT} limit)"); | ||
| c.bench_function(&bench_name, |b| { | ||
| let dataset = dataset.clone(); | ||
| b.to_async(rt).iter(move || { | ||
| let dataset = dataset.clone(); | ||
| async move { | ||
| let batch = dataset | ||
| .scan() | ||
| .filter("l_shipmode = 'FOB'") | ||
| .expect("failed to apply filter") | ||
| .limit(Some(LIMIT), None) | ||
| .expect("failed to set limit") | ||
| .try_into_batch() | ||
| .await | ||
| .expect("scan execution failed"); | ||
| assert_eq!(batch.num_rows(), LIMIT as usize); | ||
| } | ||
| }); | ||
| }); | ||
| } | ||
|
|
||
| fn bench_random_take(rt: &Runtime, c: &mut Criterion, dataset: Arc<Dataset>, label: &str) { | ||
| let bench_name = format!("{label} Random Take {} rows", ROW_IDS.len()); | ||
| let projection = Arc::new(dataset.schema().clone()); | ||
| c.bench_function(&bench_name, |b| { | ||
| let dataset = dataset.clone(); | ||
| let projection = projection.clone(); | ||
| b.to_async(rt).iter(move || { | ||
| let dataset = dataset.clone(); | ||
| let projection = projection.clone(); | ||
| async move { | ||
| let batch = dataset | ||
| .take_rows(&ROW_IDS, ProjectionRequest::Schema(projection.clone())) | ||
| .await | ||
| .expect("take_rows failed"); | ||
| assert_eq!(batch.num_rows(), ROW_IDS.len()); | ||
| } | ||
| }); | ||
| }); | ||
| } | ||
|
|
||
| fn utf8_field_without_fsst(name: &str) -> Field { | ||
| let mut metadata = HashMap::new(); | ||
| metadata.insert("lance-encoding:compression".to_string(), "none".to_string()); | ||
| Field::new(name, DataType::Utf8, false).with_metadata(metadata) | ||
| } | ||
|
|
||
| fn utf8_field_for(version: LanceFileVersion, enable_fsst: bool, name: &str) -> Field { | ||
| if enable_fsst && version >= LanceFileVersion::V2_1 { | ||
| Field::new(name, DataType::Utf8, false) | ||
| } else { | ||
| utf8_field_without_fsst(name) | ||
| } | ||
| } | ||
|
|
||
| async fn prepare_dataset(version: LanceFileVersion, enable_fsst: bool) -> Dataset { | ||
| let schema = Arc::new(ArrowSchema::new(vec![ | ||
| Field::new("l_orderkey", DataType::Int64, false), | ||
| utf8_field_for(version, enable_fsst, "l_shipmode"), | ||
| Field::new("l_extendedprice", DataType::Float64, false), | ||
| utf8_field_for(version, enable_fsst, "l_comment"), | ||
| ])); | ||
|
|
||
| let batches = generate_batches(schema.clone()); | ||
| let reader = RecordBatchIterator::new(batches.into_iter().map(Ok), schema); | ||
|
|
||
| let params = WriteParams { | ||
| data_storage_version: Some(version), | ||
| ..Default::default() | ||
| }; | ||
|
|
||
| let uri = format!( | ||
| "memory://random-access-{}-{}", | ||
| version_label(version), | ||
| Uuid::new_v4() | ||
| ); | ||
|
|
||
| Dataset::write(reader, uri.as_str(), Some(params)) | ||
| .await | ||
| .expect("failed to write dataset") | ||
| } | ||
|
|
||
| fn generate_batches(schema: Arc<ArrowSchema>) -> Vec<RecordBatch> { | ||
| let mut batches = Vec::with_capacity(TOTAL_ROWS.div_ceil(BATCH_SIZE)); | ||
| let mut start = 0usize; | ||
|
|
||
| while start < TOTAL_ROWS { | ||
| let end = usize::min(start + BATCH_SIZE, TOTAL_ROWS); | ||
| let order_key = Int64Array::from_iter_values((start as i64)..(end as i64)); | ||
| let ship_mode = StringArray::from_iter_values( | ||
| (start..end).map(|idx| SHIP_MODES[idx % SHIP_MODES.len()].to_string()), | ||
| ); | ||
| let extended_price = Float64Array::from_iter_values((start..end).map(|idx| { | ||
| let base = (idx % 10_000) as f64; | ||
| base * 1.5 + 42.0 | ||
| })); | ||
| let comment = StringArray::from_iter_values( | ||
| (start..end).map(|idx| format!("Shipment comment #{idx}")), | ||
| ); | ||
|
|
||
| let batch = RecordBatch::try_new( | ||
| schema.clone(), | ||
| vec![ | ||
| Arc::new(order_key), | ||
| Arc::new(ship_mode), | ||
| Arc::new(extended_price), | ||
| Arc::new(comment), | ||
| ], | ||
| ) | ||
| .expect("failed to build record batch"); | ||
|
|
||
| batches.push(batch); | ||
| start = end; | ||
| } | ||
|
|
||
| batches | ||
| } | ||
|
|
||
| fn version_label(version: LanceFileVersion) -> &'static str { | ||
| match version { | ||
| LanceFileVersion::V2_0 => "v2_0", | ||
| LanceFileVersion::V2_1 => "v2_1", | ||
| _ => "other", | ||
| } | ||
| } | ||
|
|
||
// Register the benchmark entry point with criterion's harness.
criterion_group!(benches, bench_random_access);
criterion_main!(benches);
Uh oh!
There was an error while loading. Please reload this page.