From 03c3878eb28d7c4eff404129ce9c8f462f8f8846 Mon Sep 17 00:00:00 2001 From: kclaka <33363343+kclaka@users.noreply.github.com> Date: Wed, 25 Feb 2026 08:32:41 -0800 Subject: [PATCH] v1.2.1: Add criterion benchmarks and fix CI integration test parallelism - Add criterion benchmark suite (engine, classify, output formatters) measuring rows/sec throughput across generation strategies - Fix MySQL integration test failures caused by parallel test execution against a shared database (--test-threads=1) - Add performance table to README with baseline benchmark results - Bump version to 1.2.1 --- .github/workflows/ci.yml | 2 +- Cargo.lock | 236 +++++++++++++++++- Cargo.toml | 3 +- README.md | 21 +- crates/seedkit-core/Cargo.toml | 13 + crates/seedkit-core/benches/classify.rs | 128 ++++++++++ crates/seedkit-core/benches/engine.rs | 310 ++++++++++++++++++++++++ crates/seedkit-core/benches/output.rs | 138 +++++++++++ 8 files changed, 843 insertions(+), 8 deletions(-) create mode 100644 crates/seedkit-core/benches/classify.rs create mode 100644 crates/seedkit-core/benches/engine.rs create mode 100644 crates/seedkit-core/benches/output.rs diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 83ee00e..4411561 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -69,4 +69,4 @@ jobs: env: TEST_POSTGRES_URL: postgres://seedkit:seedkit@localhost:5432/seedkit_test TEST_MYSQL_URL: mysql://seedkit:seedkit@localhost:3306/seedkit_test - run: cargo test --test '*' + run: cargo test --test '*' -- --test-threads=1 diff --git a/Cargo.lock b/Cargo.lock index d1867cc..415e340 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -26,6 +26,12 @@ dependencies = [ "libc", ] +[[package]] +name = "anes" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" + [[package]] name = "anstream" version = "0.6.21" @@ -166,6 +172,12 @@ version = "1.11.1" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "1e748733b7cbc798e1434b6ac524f0c1ff2ab456fe201501e6497c8417a4fc33" +[[package]] +name = "cast" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" + [[package]] name = "cc" version = "1.2.56" @@ -196,6 +208,33 @@ dependencies = [ "windows-link", ] +[[package]] +name = "ciborium" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42e69ffd6f0917f5c029256a24d0161db17cea3997d185db0d35926308770f0e" +dependencies = [ + "ciborium-io", + "ciborium-ll", + "serde", +] + +[[package]] +name = "ciborium-io" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05afea1e0a06c9be33d539b876f1ce3692f4afea2cb41f740e7743225ed1c757" + +[[package]] +name = "ciborium-ll" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57663b653d948a338bfb3eeba9bb2fd5fcfaecb9e199e87e1eda4d9e8b240fd9" +dependencies = [ + "ciborium-io", + "half", +] + [[package]] name = "clap" version = "4.5.60" @@ -331,6 +370,61 @@ version = "2.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "19d374276b40fb8bbdee95aef7c7fa6b5316ec764510eb64b8dd0e2ed0d7e7f5" +[[package]] +name = "criterion" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2b12d017a929603d80db1831cd3a24082f8137ce19c69e6447f54f5fc8d692f" +dependencies = [ + "anes", + "cast", + "ciborium", + "clap", + "criterion-plot", + "is-terminal", + "itertools", + "num-traits", + "once_cell", + "oorandom", + "plotters", + "rayon", + "regex", + "serde", + "serde_derive", + "serde_json", + "tinytemplate", + "walkdir", +] + +[[package]] +name = "criterion-plot" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1" +dependencies = [ + "cast", + "itertools", +] + +[[package]] +name = "crossbeam-deque" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51" +dependencies = [ + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" +dependencies = [ + "crossbeam-utils", +] + [[package]] name = "crossbeam-queue" version = "0.3.12" @@ -369,6 +463,12 @@ dependencies = [ "winapi", ] +[[package]] +name = "crunchy" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "460fbee9c2c2f33933d720630a6a0bac33ba7053db5344fac858d4b8952d77d5" + [[package]] name = "crypto-common" version = "0.1.7" @@ -710,6 +810,17 @@ dependencies = [ "tracing", ] +[[package]] +name = "half" +version = "2.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ea2d84b969582b4b1864a92dc5d27cd2b77b622a8d79306834f1be5ba20d84b" +dependencies = [ + "cfg-if", + "crunchy", + "zerocopy", +] + [[package]] name = "hashbrown" version = "0.15.5" @@ -742,6 +853,12 @@ version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" +[[package]] +name = "hermit-abi" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc0fef456e4baa96da950455cd02c081ca953b141298e41db3fc7e36b1da849c" + [[package]] name = "hex" version = "0.4.3" @@ -1066,12 +1183,32 @@ dependencies = [ "serde", ] +[[package]] +name = "is-terminal" +version = "0.4.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"3640c1c38b8e4e43584d8df18be5fc6b0aa314ce6ebf51b53313d4306cca8e46" +dependencies = [ + "hermit-abi", + "libc", + "windows-sys 0.61.2", +] + [[package]] name = "is_terminal_polyfill" version = "1.70.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695" +[[package]] +name = "itertools" +version = "0.10.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473" +dependencies = [ + "either", +] + [[package]] name = "itoa" version = "1.0.17" @@ -1302,6 +1439,12 @@ version = "1.70.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe" +[[package]] +name = "oorandom" +version = "11.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d6790f58c7ff633d8771f42965289203411a5e5c68388703c06e14f24770b41e" + [[package]] name = "openssl" version = "0.10.75" @@ -1439,6 +1582,34 @@ version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c" +[[package]] +name = "plotters" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5aeb6f403d7a4911efb1e33402027fc44f29b5bf6def3effcc22d7bb75f2b747" +dependencies = [ + "num-traits", + "plotters-backend", + "plotters-svg", + "wasm-bindgen", + "web-sys", +] + +[[package]] +name = "plotters-backend" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df42e13c12958a16b3f7f4386b9ab1f3e7933914ecea48da7139435263a4172a" + +[[package]] +name = "plotters-svg" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "51bae2ac328883f7acdfea3d66a7c35751187f870bc81f94563733a154d7a670" +dependencies = [ + "plotters-backend", +] + 
[[package]] name = "portable-atomic" version = "1.13.1" @@ -1556,6 +1727,26 @@ dependencies = [ "getrandom 0.3.4", ] +[[package]] +name = "rayon" +version = "1.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "368f01d005bf8fd9b1206fb6fa653e6c4a81ceb1466406b81792d87c5677a58f" +dependencies = [ + "either", + "rayon-core", +] + +[[package]] +name = "rayon-core" +version = "1.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22e18b0f0062d30d4230b2e85ff77fdfe4326feb054b9783a3460d8435c8ab91" +dependencies = [ + "crossbeam-deque", + "crossbeam-utils", +] + [[package]] name = "redox_syscall" version = "0.5.18" @@ -1736,6 +1927,15 @@ version = "1.0.23" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9774ba4a74de5f7b1c1451ed6cd5285a32eddb5cccb8cc655a4e50009e06477f" +[[package]] +name = "same-file" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" +dependencies = [ + "winapi-util", +] + [[package]] name = "schannel" version = "0.1.28" @@ -1776,7 +1976,7 @@ dependencies = [ [[package]] name = "seedkit-cli" -version = "1.2.0" +version = "1.2.1" dependencies = [ "anyhow", "clap", @@ -1794,12 +1994,13 @@ dependencies = [ [[package]] name = "seedkit-core" -version = "1.2.0" +version = "1.2.1" dependencies = [ "anyhow", "base64", "chrono", "comfy-table", + "criterion", "dotenvy", "fake", "indexmap", @@ -1823,7 +2024,7 @@ dependencies = [ [[package]] name = "seedkit-testutil" -version = "1.2.0" +version = "1.2.1" dependencies = [ "indexmap", "seedkit-core", @@ -2339,6 +2540,16 @@ dependencies = [ "zerovec", ] +[[package]] +name = "tinytemplate" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc" +dependencies = [ + "serde", + "serde_json", +] + [[package]] name = 
"tinyvec" version = "1.10.0" @@ -2694,6 +2905,16 @@ version = "0.9.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" +[[package]] +name = "walkdir" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b" +dependencies = [ + "same-file", + "winapi-util", +] + [[package]] name = "want" version = "0.3.1" @@ -2890,6 +3111,15 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" +[[package]] +name = "winapi-util" +version = "0.1.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22" +dependencies = [ + "windows-sys 0.61.2", +] + [[package]] name = "winapi-x86_64-pc-windows-gnu" version = "0.4.0" diff --git a/Cargo.toml b/Cargo.toml index fd9aefb..6cabcce 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,7 +3,7 @@ members = ["crates/seedkit-core", "crates/seedkit-cli", "crates/seedkit-testutil resolver = "2" [workspace.package] -version = "1.2.0" +version = "1.2.1" edition = "2021" authors = ["SeedKit Contributors"] license = "MIT" @@ -69,6 +69,7 @@ dotenvy = "0.15" # Testing tempfile = "3" +criterion = { version = "0.5", features = ["html_reports"] } # Logging tracing = "0.1" diff --git a/README.md b/README.md index f26e1a4..d7d9c8d 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@

CI Tests - Version + Version Rust License: MIT Databases @@ -81,7 +81,7 @@ cargo install --path crates/seedkit-cli ```bash seedkit --version -# seedkit 1.2.0 +# seedkit 1.2.1 ``` ### Zero-Config Database Detection @@ -271,6 +271,21 @@ git checkout --ours seedkit.lock seedkit generate --force ``` +## Performance + +Benchmarked with [criterion](https://github.com/bheisler/criterion.rs) on Apple Silicon (M-series). Run `cargo bench` to reproduce. + +| Operation | Throughput | +|---|---| +| Generation (10 cols, semantic providers) | ~480K rows/sec | +| Generation (FK references only) | ~3.7M rows/sec | +| Generation (weighted value lists) | ~6.9M rows/sec | +| Generation (distribution sampling) | ~8.6M rows/sec | +| Classification (100 tables x 20 cols) | ~2.1M cols/sec | +| SQL output formatting | ~1.5M rows/sec | +| JSON output formatting | ~1.1M rows/sec | +| CSV output formatting | ~1.5M rows/sec | + ## Comparison | Feature | SeedKit | Faker/factory_bot | Snaplet | @@ -314,7 +329,7 @@ cargo test docker compose -f docker/docker-compose.test.yml up -d TEST_POSTGRES_URL=postgres://seedkit:seedkit@localhost:5432/seedkit_test \ TEST_MYSQL_URL=mysql://seedkit:seedkit@localhost:3307/seedkit_test \ - cargo test --test '*' + cargo test --test '*' -- --test-threads=1 ``` ## License diff --git a/crates/seedkit-core/Cargo.toml b/crates/seedkit-core/Cargo.toml index f017e3e..6a7460c 100644 --- a/crates/seedkit-core/Cargo.toml +++ b/crates/seedkit-core/Cargo.toml @@ -33,3 +33,16 @@ url.workspace = true [dev-dependencies] tokio = { workspace = true, features = ["test-util"] } tempfile.workspace = true +criterion.workspace = true + +[[bench]] +name = "engine" +harness = false + +[[bench]] +name = "classify" +harness = false + +[[bench]] +name = "output" +harness = false diff --git a/crates/seedkit-core/benches/classify.rs b/crates/seedkit-core/benches/classify.rs new file mode 100644 index 0000000..ad4997c --- /dev/null +++ b/crates/seedkit-core/benches/classify.rs @@ -0,0 +1,128 
@@
+//! Benchmarks for column classification — regex matching and schema-wide classification.
+//!
+//! Classification runs once per introspection, but regex compilation and matching
+//! are worth measuring to catch regressions from rule changes.
+
+use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
+
+use seedkit_core::classify::rules::{classify_column, classify_schema};
+use seedkit_core::schema::types::*;
+
+/// Column name/type pairs representing a realistic mix of columns.
+fn realistic_columns() -> Vec<(&'static str, DataType)> {
+    vec![
+        ("id", DataType::Serial),
+        ("email", DataType::VarChar),
+        ("first_name", DataType::VarChar),
+        ("last_name", DataType::VarChar),
+        ("password_hash", DataType::VarChar),
+        ("created_at", DataType::TimestampTz),
+        ("updated_at", DataType::TimestampTz),
+        ("is_active", DataType::Boolean),
+        ("age", DataType::Integer),
+        ("price", DataType::Numeric),
+        ("description", DataType::Text),
+        ("avatar_url", DataType::VarChar),
+        ("phone", DataType::VarChar),
+        ("city", DataType::VarChar),
+        ("zip_code", DataType::VarChar),
+        ("country", DataType::VarChar),
+        ("status", DataType::VarChar),
+        ("metadata", DataType::Jsonb),
+        ("slug", DataType::VarChar),
+        ("quantity", DataType::Integer),
+    ]
+}
+
+fn bench_classify_single_column(c: &mut Criterion) {
+    let mut group = c.benchmark_group("classify/single_column");
+
+    // Benchmark individual column classification across different match depths.
+    // "email" is an early match, "status" is mid-list, "zzz_unknown" misses all rules.
+    let cases = [
+        ("early_match", "email", DataType::VarChar),
+        ("mid_match", "status", DataType::VarChar),
+        ("late_match", "tenant_id", DataType::Integer),
+        ("no_match", "zzz_unknown", DataType::VarChar),
+        ("camel_case", "firstName", DataType::VarChar),
+        ("type_constrained", "age", DataType::Integer),
+    ];
+
+    for (label, col_name, data_type) in &cases {
+        group.bench_with_input(
+            BenchmarkId::new("type", label),
+            &(col_name, data_type),
+            |b, &(name, dt)| {
+                b.iter(|| {
+                    classify_column(name, dt, "users", false, false, None) // returned (no `;`) so criterion black-boxes the result
+                });
+            },
+        );
+    }
+    group.finish();
+}
+
+fn bench_classify_schema(c: &mut Criterion) {
+    let mut group = c.benchmark_group("classify/schema");
+    let columns = realistic_columns();
+
+    for table_count in [10, 50, 100] {
+        let schema = build_schema(table_count, &columns);
+        let total_columns = table_count * columns.len();
+
+        group.throughput(Throughput::Elements(total_columns as u64));
+        group.bench_with_input(
+            BenchmarkId::new("tables", table_count),
+            &schema,
+            |b, schema| {
+                b.iter(|| {
+                    classify_schema(schema) // returned (no `;`) so criterion black-boxes the result
+                });
+            },
+        );
+    }
+    group.finish();
+}
+
+/// Build a schema with N tables, each containing the realistic column set.
+fn build_schema(table_count: usize, columns: &[(&str, DataType)]) -> DatabaseSchema {
+    let mut schema = DatabaseSchema::new(DatabaseType::PostgreSQL, "bench".to_string());
+
+    let table_prefixes = [
+        "users", "orders", "products", "reviews", "categories",
+        "tags", "comments", "posts", "sessions", "notifications",
+        "invoices", "payments", "addresses", "companies", "departments",
+        "employees", "projects", "tasks", "events", "logs",
+    ];
+
+    for i in 0..table_count {
+        let table_name = if i < table_prefixes.len() {
+            table_prefixes[i].to_string()
+        } else {
+            format!("table_{}", i)
+        };
+
+        let mut table = Table::new(table_name.clone());
+        for (col_name, data_type) in columns {
+            let mut col = Column::new(
+                col_name.to_string(),
+                data_type.clone(),
+                data_type.to_string(),
+            );
+            if *col_name == "id" {
+                col.is_auto_increment = true;
+            }
+            table.columns.insert(col_name.to_string(), col);
+        }
+        table.primary_key = Some(PrimaryKey {
+            columns: vec!["id".to_string()],
+            name: None,
+        });
+        schema.tables.insert(table_name, table);
+    }
+
+    schema
+}
+
+criterion_group!(benches, bench_classify_single_column, bench_classify_schema);
+criterion_main!(benches);
diff --git a/crates/seedkit-core/benches/engine.rs b/crates/seedkit-core/benches/engine.rs
new file mode 100644
index 0000000..5dd1d82
--- /dev/null
+++ b/crates/seedkit-core/benches/engine.rs
@@ -0,0 +1,310 @@
+//! Benchmarks for the generation engine — the core hot path.
+//!
+//! Measures rows-per-second throughput for `execute_plan` across
+//! different table sizes, column counts, and strategy mixes.
+
+use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
+use std::collections::{BTreeMap, HashMap};
+
+use seedkit_core::classify::semantic::SemanticType;
+use seedkit_core::generate::engine::execute_plan;
+use seedkit_core::generate::plan::*;
+use seedkit_core::sample::stats::ColumnDistribution;
+use seedkit_core::schema::types::*;
+
+/// Build a schema with one table containing N semantic columns (no FKs).
+fn single_table_schema(num_columns: usize) -> (DatabaseSchema, HashMap<(String, String), SemanticType>) {
+    let mut schema = DatabaseSchema::new(DatabaseType::PostgreSQL, "bench".to_string());
+    let mut table = Table::new("items".to_string());
+    let mut classifications = HashMap::new();
+
+    let types = [
+        ("email", DataType::VarChar, SemanticType::Email),
+        ("first_name", DataType::VarChar, SemanticType::FirstName),
+        ("last_name", DataType::VarChar, SemanticType::LastName),
+        ("price", DataType::Numeric, SemanticType::Price),
+        ("created_at", DataType::TimestampTz, SemanticType::CreatedAt),
+        ("is_active", DataType::Boolean, SemanticType::BooleanFlag),
+        ("description", DataType::Text, SemanticType::Description),
+        ("status", DataType::VarChar, SemanticType::Status),
+        ("quantity", DataType::Integer, SemanticType::Quantity),
+        ("url", DataType::VarChar, SemanticType::Url),
+    ];
+
+    for i in 0..num_columns {
+        let (name, dt, st) = &types[i % types.len()];
+        let col_name = if i < types.len() {
+            name.to_string()
+        } else {
+            format!("{}_{}", name, i / types.len())
+        };
+        let col = Column::new(col_name.clone(), dt.clone(), dt.to_string());
+        table.columns.insert(col_name.clone(), col);
+        classifications.insert(("items".to_string(), col_name), *st);
+    }
+
+    schema.tables.insert("items".to_string(), table);
+    (schema, classifications)
+}
+
+/// Build a schema with parent/child FK relationship.
+fn fk_schema() -> (DatabaseSchema, HashMap<(String, String), SemanticType>) {
+    let mut schema = DatabaseSchema::new(DatabaseType::PostgreSQL, "bench".to_string());
+    let mut classifications = HashMap::new();
+
+    // Parent: users
+    let mut users = Table::new("users".to_string());
+    let mut id_col = Column::new("id".to_string(), DataType::Serial, "serial".to_string());
+    id_col.is_auto_increment = true;
+    users.columns.insert("id".to_string(), id_col);
+    users.primary_key = Some(PrimaryKey {
+        columns: vec!["id".to_string()],
+        name: None,
+    });
+    let email_col = Column::new("email".to_string(), DataType::VarChar, "varchar".to_string());
+    users.columns.insert("email".to_string(), email_col);
+    classifications.insert(("users".to_string(), "id".to_string()), SemanticType::AutoIncrement);
+    classifications.insert(("users".to_string(), "email".to_string()), SemanticType::Email);
+
+    // Child: orders
+    let mut orders = Table::new("orders".to_string());
+    let mut order_id = Column::new("id".to_string(), DataType::Serial, "serial".to_string());
+    order_id.is_auto_increment = true;
+    orders.columns.insert("id".to_string(), order_id);
+    orders.primary_key = Some(PrimaryKey {
+        columns: vec!["id".to_string()],
+        name: None,
+    });
+    let user_id_col = Column::new("user_id".to_string(), DataType::Integer, "integer".to_string());
+    orders.columns.insert("user_id".to_string(), user_id_col);
+    orders.foreign_keys.push(ForeignKey {
+        name: Some("orders_user_id_fkey".to_string()),
+        source_columns: vec!["user_id".to_string()],
+        referenced_table: "users".to_string(),
+        referenced_columns: vec!["id".to_string()],
+        on_delete: ForeignKeyAction::Cascade,
+        on_update: ForeignKeyAction::NoAction,
+        is_deferrable: false,
+    });
+    let amount_col = Column::new("amount".to_string(), DataType::Numeric, "numeric".to_string());
+    orders.columns.insert("amount".to_string(), amount_col);
+
+    classifications.insert(("orders".to_string(), "id".to_string()), SemanticType::AutoIncrement);
+    classifications.insert(("orders".to_string(), "user_id".to_string()), SemanticType::ExternalId);
+    classifications.insert(("orders".to_string(), "amount".to_string()), SemanticType::Price);
+
+    schema.tables.insert("users".to_string(), users);
+    schema.tables.insert("orders".to_string(), orders);
+    (schema, classifications)
+}
+
+fn bench_single_table_generation(c: &mut Criterion) {
+    let mut group = c.benchmark_group("engine/single_table");
+
+    let (schema, classifications) = single_table_schema(10);
+    let insertion_order = vec!["items".to_string()];
+    let empty_overrides = BTreeMap::new();
+    let empty_col_overrides = BTreeMap::new();
+
+    for row_count in [100, 1000, 10_000] {
+        group.throughput(Throughput::Elements(row_count as u64));
+        group.bench_with_input(
+            BenchmarkId::new("rows", row_count),
+            &row_count,
+            |b, &rows| {
+                let plan = GenerationPlan::build(
+                    &schema,
+                    &classifications,
+                    &insertion_order,
+                    Vec::new(),
+                    rows,
+                    &empty_overrides,
+                    42,
+                    None,
+                    &empty_col_overrides,
+                    None,
+                );
+                b.iter(|| {
+                    execute_plan(&plan, &schema, None).unwrap() // returned (no `;`) so criterion black-boxes the generated data
+                });
+            },
+        );
+    }
+    group.finish();
+}
+
+fn bench_column_count(c: &mut Criterion) {
+    let mut group = c.benchmark_group("engine/column_count");
+    let row_count = 1000;
+    let empty_overrides = BTreeMap::new();
+    let empty_col_overrides = BTreeMap::new();
+
+    for col_count in [5, 10, 20] {
+        let (schema, classifications) = single_table_schema(col_count);
+        let insertion_order = vec!["items".to_string()];
+
+        group.throughput(Throughput::Elements(row_count as u64));
+        group.bench_with_input(
+            BenchmarkId::new("cols", col_count),
+            &col_count,
+            |b, _| {
+                let plan = GenerationPlan::build(
+                    &schema,
+                    &classifications,
+                    &insertion_order,
+                    Vec::new(),
+                    row_count,
+                    &empty_overrides,
+                    42,
+                    None,
+                    &empty_col_overrides,
+                    None,
+                );
+                b.iter(|| {
+                    execute_plan(&plan, &schema, None).unwrap() // returned (no `;`) so criterion black-boxes the generated data
+                });
+            },
+        );
+    }
+    group.finish();
+}
+
+fn bench_fk_generation(c: &mut Criterion) {
+    let mut group = c.benchmark_group("engine/foreign_keys");
+    let (schema, classifications) = fk_schema();
+    let insertion_order = vec!["users".to_string(), "orders".to_string()];
+    let empty_col_overrides = BTreeMap::new();
+
+    // Parent:child ratios — 100 users + varying order counts
+    for order_count in [500, 2000, 10_000] {
+        let mut overrides = BTreeMap::new();
+        overrides.insert("users".to_string(), 100);
+        overrides.insert("orders".to_string(), order_count);
+        let total = 100 + order_count;
+
+        group.throughput(Throughput::Elements(total as u64));
+        group.bench_with_input(
+            BenchmarkId::new("orders", order_count),
+            &order_count,
+            |b, _| {
+                let plan = GenerationPlan::build(
+                    &schema,
+                    &classifications,
+                    &insertion_order,
+                    Vec::new(),
+                    100,
+                    &overrides,
+                    42,
+                    None,
+                    &empty_col_overrides,
+                    None,
+                );
+                b.iter(|| {
+                    execute_plan(&plan, &schema, None).unwrap() // returned (no `;`) so criterion black-boxes the generated data
+                });
+            },
+        );
+    }
+    group.finish();
+}
+
+fn bench_value_list_strategy(c: &mut Criterion) {
+    let mut group = c.benchmark_group("engine/value_list");
+
+    let mut schema = DatabaseSchema::new(DatabaseType::PostgreSQL, "bench".to_string());
+    let mut table = Table::new("items".to_string());
+    let col = Column::new("color".to_string(), DataType::VarChar, "varchar".to_string());
+    table.columns.insert("color".to_string(), col);
+    schema.tables.insert("items".to_string(), table);
+
+    let plan = GenerationPlan {
+        table_plans: vec![TableGenerationPlan {
+            table_name: "items".to_string(),
+            row_count: 10_000,
+            column_plans: vec![ColumnGenerationPlan {
+                column_name: "color".to_string(),
+                semantic_type: SemanticType::Unknown,
+                strategy: GenerationStrategy::ValueList {
+                    values: vec![
+                        "red".into(), "blue".into(), "green".into(),
+                        "black".into(), "white".into(),
+                    ],
+                    weights: Some(vec![0.25, 0.20, 0.20, 0.20, 0.15]),
+                },
+                nullable: false,
+                null_probability: 0.0,
+                check_constraints: Vec::new(),
+            }],
+            correlation_groups: Vec::new(),
+        }],
+        deferred_edges: Vec::new(),
+        seed: 42,
+        default_row_count: 10_000,
+        base_time: chrono::Utc::now().naive_utc(),
+        sequence_offset: 0,
+    };
+
+    group.throughput(Throughput::Elements(10_000));
+    group.bench_function("weighted_10k", |b| {
+        b.iter(|| {
+            execute_plan(&plan, &schema, None).unwrap() // returned (no `;`) so criterion black-boxes the generated data
+        });
+    });
+    group.finish();
+}
+
+fn bench_distribution_strategy(c: &mut Criterion) {
+    let mut group = c.benchmark_group("engine/distribution");
+
+    let mut schema = DatabaseSchema::new(DatabaseType::PostgreSQL, "bench".to_string());
+    let mut table = Table::new("items".to_string());
+    let col = Column::new("price".to_string(), DataType::Numeric, "numeric".to_string());
+    table.columns.insert("price".to_string(), col);
+    schema.tables.insert("items".to_string(), table);
+
+    let plan = GenerationPlan {
+        table_plans: vec![TableGenerationPlan {
+            table_name: "items".to_string(),
+            row_count: 10_000,
+            column_plans: vec![ColumnGenerationPlan {
+                column_name: "price".to_string(),
+                semantic_type: SemanticType::Unknown,
+                strategy: GenerationStrategy::Distribution {
+                    distribution: ColumnDistribution::Numeric {
+                        min: 0.0,
+                        max: 1000.0,
+                        mean: 49.99,
+                        stddev: 25.0,
+                    },
+                },
+                nullable: false,
+                null_probability: 0.0,
+                check_constraints: Vec::new(),
+            }],
+            correlation_groups: Vec::new(),
+        }],
+        deferred_edges: Vec::new(),
+        seed: 42,
+        default_row_count: 10_000,
+        base_time: chrono::Utc::now().naive_utc(),
+        sequence_offset: 0,
+    };
+
+    group.throughput(Throughput::Elements(10_000));
+    group.bench_function("numeric_normal_10k", |b| {
+        b.iter(|| {
+            execute_plan(&plan, &schema, None).unwrap() // returned (no `;`) so criterion black-boxes the generated data
+        });
+    });
+    group.finish();
+}
+
+criterion_group!(
+    benches,
+    bench_single_table_generation,
+    bench_column_count,
+    bench_fk_generation,
+    bench_value_list_strategy,
+    bench_distribution_strategy,
+);
+criterion_main!(benches);
diff --git a/crates/seedkit-core/benches/output.rs b/crates/seedkit-core/benches/output.rs
new file mode 100644
index 0000000..5ba3ac4
--- /dev/null
+++ b/crates/seedkit-core/benches/output.rs
@@ -0,0 +1,138
@@
+//! Benchmarks for output formatters — SQL, JSON, and CSV serialization.
+//!
+//! Measures throughput of formatting pre-generated data into various output
+//! formats. Uses a black-hole writer to isolate formatter cost from I/O.
+
+use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
+use indexmap::IndexMap;
+use std::borrow::Cow;
+use std::io::Write;
+
+use seedkit_core::generate::engine::GeneratedData;
+use seedkit_core::generate::value::Value;
+use seedkit_core::output::{csv, json, sql};
+use seedkit_core::schema::types::*;
+
+/// A writer that discards all output — isolates formatter cost from I/O.
+struct NullWriter;
+
+impl Write for NullWriter {
+    fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
+        Ok(std::hint::black_box(buf).len()) // black_box marks the bytes as used so formatting work is not optimized away
+    }
+    fn flush(&mut self) -> std::io::Result<()> {
+        Ok(())
+    }
+}
+
+/// Build pre-generated data with N rows of realistic column types.
+fn make_generated_data(row_count: usize) -> GeneratedData {
+    let mut rows = Vec::with_capacity(row_count);
+    for i in 0..row_count {
+        let mut row = IndexMap::new();
+        row.insert(
+            "name".to_string(),
+            Value::String(Cow::Owned(format!("User {}", i))),
+        );
+        row.insert(
+            "email".to_string(),
+            Value::String(Cow::Owned(format!("user{}@example.com", i))),
+        );
+        row.insert("age".to_string(), Value::Int(20 + (i as i64 % 60)));
+        row.insert("price".to_string(), Value::Float(9.99 + i as f64 * 0.01));
+        row.insert("active".to_string(), Value::Bool(i % 3 != 0));
+        row.insert(
+            "created_at".to_string(),
+            Value::Timestamp(
+                chrono::NaiveDateTime::new(
+                    chrono::NaiveDate::from_ymd_opt(2025, 1, 1).unwrap(),
+                    chrono::NaiveTime::from_hms_opt(12, 0, 0).unwrap(),
+                ) + chrono::Duration::seconds(i as i64),
+            ),
+        );
+        if i % 10 == 0 {
+            row.insert("bio".to_string(), Value::Null);
+        } else {
+            row.insert(
+                "bio".to_string(),
+                Value::String(Cow::Owned(format!(
+                    "A longer description field that contains commas, \"quotes\", and other special characters for row {}.",
+                    i
+                ))),
+            );
+        }
+        rows.push(row);
+    }
+
+    let mut tables = IndexMap::new();
+    tables.insert("users".to_string(), rows);
+
+    GeneratedData {
+        tables,
+        deferred_updates: Vec::new(),
+    }
+}
+
+fn bench_sql_output(c: &mut Criterion) {
+    let mut group = c.benchmark_group("output/sql");
+    let schema = DatabaseSchema::new(DatabaseType::PostgreSQL, "bench".to_string());
+
+    for row_count in [100, 1000, 10_000] {
+        let data = make_generated_data(row_count);
+        group.throughput(Throughput::Elements(row_count as u64));
+        group.bench_with_input(
+            BenchmarkId::new("rows", row_count),
+            &data,
+            |b, data| {
+                b.iter(|| {
+                    let mut w = NullWriter;
+                    sql::write_sql(&mut w, data, &schema).unwrap();
+                });
+            },
+        );
+    }
+    group.finish();
+}
+
+fn bench_json_output(c: &mut Criterion) {
+    let mut group = c.benchmark_group("output/json");
+
+    for row_count in [100, 1000, 10_000] {
+        let data = make_generated_data(row_count);
+        group.throughput(Throughput::Elements(row_count as u64));
+        group.bench_with_input(
+            BenchmarkId::new("rows", row_count),
+            &data,
+            |b, data| {
+                b.iter(|| {
+                    let mut w = NullWriter;
+                    json::write_json(&mut w, data).unwrap();
+                });
+            },
+        );
+    }
+    group.finish();
+}
+
+fn bench_csv_output(c: &mut Criterion) {
+    let mut group = c.benchmark_group("output/csv");
+
+    for row_count in [100, 1000, 10_000] {
+        let data = make_generated_data(row_count);
+        group.throughput(Throughput::Elements(row_count as u64));
+        group.bench_with_input(
+            BenchmarkId::new("rows", row_count),
+            &data,
+            |b, data| {
+                b.iter(|| {
+                    let mut w = NullWriter;
+                    csv::write_csv(&mut w, data).unwrap();
+                });
+            },
+        );
+    }
+    group.finish();
+}
+
+criterion_group!(benches, bench_sql_output, bench_json_output, bench_csv_output);
+criterion_main!(benches);