diff --git a/parquet-variant/benches/variant_builder.rs b/parquet-variant/benches/variant_builder.rs index f69e3170c663..5e5494f0a5cc 100644 --- a/parquet-variant/benches/variant_builder.rs +++ b/parquet-variant/benches/variant_builder.rs @@ -19,7 +19,7 @@ extern crate parquet_variant; use criterion::*; -use parquet_variant::VariantBuilder; +use parquet_variant::{Variant, VariantBuilder}; use rand::{ distr::{uniform::SampleUniform, Alphanumeric}, rngs::StdRng, @@ -388,6 +388,113 @@ fn bench_object_list_partially_same_schema(c: &mut Criterion) { }); } +// Benchmark validation performance +fn bench_validation_validated_vs_unvalidated(c: &mut Criterion) { + let mut rng = StdRng::seed_from_u64(42); + let mut string_table = RandomStringGenerator::new(&mut rng, 117); + + // Pre-generate test data + let mut test_data = Vec::new(); + for _ in 0..100 { + let mut builder = VariantBuilder::new(); + let mut obj = builder.new_object(); + obj.insert("field1", string_table.next()); + obj.insert("field2", rng.random::()); + obj.insert("field3", rng.random::()); + + let mut list = obj.new_list("field4"); + for _ in 0..10 { + list.append_value(rng.random::()); + } + list.finish(); + + obj.finish().unwrap(); + test_data.push(builder.finish()); + } + + let mut group = c.benchmark_group("validation"); + + group.bench_function("validated_construction", |b| { + b.iter(|| { + for (metadata, value) in &test_data { + let variant = Variant::try_new(metadata, value).unwrap(); + hint::black_box(variant); + } + }) + }); + + group.bench_function("unvalidated_construction", |b| { + b.iter(|| { + for (metadata, value) in &test_data { + let variant = Variant::new(metadata, value); + hint::black_box(variant); + } + }) + }); + + group.bench_function("validation_cost", |b| { + // Create unvalidated variants first + let unvalidated: Vec<_> = test_data + .iter() + .map(|(metadata, value)| Variant::new(metadata, value)) + .collect(); + + b.iter(|| { + for variant in &unvalidated { + let validated = variant.clone().validate().unwrap(); + hint::black_box(validated); + } + }) + }); + + group.finish(); +} + +// Benchmark iteration performance on validated vs unvalidated variants +fn bench_iteration_performance(c: &mut Criterion) { + let mut rng = StdRng::seed_from_u64(42); + + // Create a complex nested structure + let mut builder = VariantBuilder::new(); + let mut list = builder.new_list(); + + for i in 0..1000 { + let mut obj = list.new_object(); + obj.insert(&format!("field_{i}"), rng.random::()); + obj.insert("nested_data", format!("data_{i}").as_str()); + obj.finish().unwrap(); + } + list.finish(); + + let (metadata, value) = builder.finish(); + let validated = Variant::try_new(&metadata, &value).unwrap(); + let unvalidated = Variant::new(&metadata, &value); + + let mut group = c.benchmark_group("iteration"); + + group.bench_function("validated_iteration", |b| { + b.iter(|| { + if let Some(list) = validated.as_list() { + for item in list.iter() { + hint::black_box(item); + } + } + }) + }); + + group.bench_function("unvalidated_fallible_iteration", |b| { + b.iter(|| { + if let Some(list) = unvalidated.as_list() { + for item in list.iter_try().flatten() { + hint::black_box(item); + } + } + }) + }); + + group.finish(); +} + criterion_group!( benches, bench_object_field_names_reverse_order, @@ -396,7 +503,9 @@ criterion_group!( bench_object_unknown_schema, bench_object_list_unknown_schema, bench_object_partially_same_schema, - bench_object_list_partially_same_schema + bench_object_list_partially_same_schema, + bench_validation_validated_vs_unvalidated, + bench_iteration_performance ); criterion_main!(benches); diff --git a/parquet-variant/tests/variant_interop.rs b/parquet-variant/tests/variant_interop.rs index dcf1200d3346..c95d81a3e904 100644 --- a/parquet-variant/tests/variant_interop.rs +++ b/parquet-variant/tests/variant_interop.rs @@ -26,6 +26,9 @@ use parquet_variant::{ ShortString, Variant, VariantBuilder, VariantDecimal16, VariantDecimal4, VariantDecimal8, }; +use rand::rngs::StdRng; +use rand::{Rng, SeedableRng}; + /// Returns a directory path for the parquet variant test data. /// /// The data lives in the `parquet-testing` git repository: @@ -275,3 +278,303 @@ fn variant_object_builder() { } // TODO: Add tests for object_nested and array_nested + +// +// Validation Fuzzing Tests +// +// 1. Generate valid variants using the builder +// 2. Randomly corrupt bytes in the serialized data +// 3. Test both validation pathways: +// - If validation succeeds -> verify infallible APIs don't panic +// - If validation fails -> verify fallible APIs handle errors gracefully +// + +#[test] +fn test_validation_fuzz_integration() { + let mut rng = StdRng::seed_from_u64(42); + + for _ in 0..1000 { + // Generate a random valid variant + let (metadata, value) = generate_random_variant(&mut rng); + + // Corrupt it + let (corrupted_metadata, corrupted_value) = corrupt_variant_data(&mut rng, metadata, value); + + // Test the validation workflow + test_validation_workflow(&corrupted_metadata, &corrupted_value); + } +} + +fn generate_random_variant(rng: &mut StdRng) -> (Vec, Vec) { + let mut builder = VariantBuilder::new(); + generate_random_value(rng, &mut builder, 3); // Max depth of 3 + builder.finish() +} + +fn generate_random_value(rng: &mut StdRng, builder: &mut VariantBuilder, max_depth: u32) { + if max_depth == 0 { + // Force simple values at max depth + builder.append_value(rng.random::()); + return; + } + + match rng.random_range(0..15) { + 0 => builder.append_value(()), + 1 => builder.append_value(rng.random::()), + 2 => builder.append_value(rng.random::()), + 3 => builder.append_value(rng.random::()), + 4 => builder.append_value(rng.random::()), + 5 => builder.append_value(rng.random::()), + 6 => builder.append_value(rng.random::()), + 7 => builder.append_value(rng.random::()), + 8 => { + let len = rng.random_range(0..50); + let s: String = (0..len).map(|_| rng.random::()).collect(); + builder.append_value(s.as_str()); + } + 9 => { + let len = rng.random_range(0..50); + let bytes: Vec = (0..len).map(|_| rng.random()).collect(); + builder.append_value(bytes.as_slice()); + } + 10 => { + if let Ok(decimal) = VariantDecimal4::try_new(rng.random(), rng.random_range(0..10)) { + builder.append_value(decimal); + } else { + builder.append_value(0i32); + } + } + 11 => { + if let Ok(decimal) = VariantDecimal8::try_new(rng.random(), rng.random_range(0..19)) { + builder.append_value(decimal); + } else { + builder.append_value(0i64); + } + } + 12 => { + if let Ok(decimal) = VariantDecimal16::try_new(rng.random(), rng.random_range(0..39)) { + builder.append_value(decimal); + } else { + builder.append_value(0i64); // Use i64 instead of i128 + } + } + 13 => { + // Generate a list + let mut list_builder = builder.new_list(); + let list_len = rng.random_range(0..10); + + for _ in 0..list_len { + list_builder.append_value(rng.random::()); + } + list_builder.finish(); + } + 14 => { + // Generate an object + let mut object_builder = builder.new_object(); + let obj_size = rng.random_range(0..10); + + for i in 0..obj_size { + let key = format!("field_{i}"); + object_builder.insert(&key, rng.random::()); + } + object_builder.finish().unwrap(); + } + _ => unreachable!(), + } +} + +fn corrupt_variant_data( + rng: &mut StdRng, + mut metadata: Vec, + mut value: Vec, +) -> (Vec, Vec) { + // Randomly decide what to corrupt + let corrupt_metadata = rng.random_bool(0.3); + let corrupt_value = rng.random_bool(0.7); + + if corrupt_metadata && !metadata.is_empty() { + let idx = rng.random_range(0..metadata.len()); + let bit = rng.random_range(0..8); + metadata[idx] ^= 1 << bit; + } + + if corrupt_value && !value.is_empty() { + let idx = rng.random_range(0..value.len()); + let bit = rng.random_range(0..8); + value[idx] ^= 1 << bit; + } + + (metadata, value) +} + +fn test_validation_workflow(metadata: &[u8], value: &[u8]) { + // Step 1: Try unvalidated construction - should not panic + let variant_result = std::panic::catch_unwind(|| Variant::new(metadata, value)); + + let variant = match variant_result { + Ok(v) => v, + Err(_) => return, // Construction failed, which is acceptable for corrupted data + }; + + // Step 2: Try validation + let validation_result = std::panic::catch_unwind(|| variant.clone().validate()); + + match validation_result { + Ok(Ok(validated)) => { + // Validation succeeded - infallible access should not panic + test_infallible_access(&validated); + } + Ok(Err(_)) => { + // Validation failed - fallible access should handle errors gracefully + test_fallible_access(&variant); + } + Err(_) => { + // Validation panicked - this may indicate severely corrupted data + // For now, we accept this, but it could indicate a validation bug + } + } +} + +fn test_infallible_access(variant: &Variant) { + // All these should not panic on validated variants + let _ = variant.as_null(); + let _ = variant.as_boolean(); + let _ = variant.as_int32(); + let _ = variant.as_string(); + + if let Some(obj) = variant.as_object() { + for (_, _) in obj.iter() { + // Should not panic + } + for i in 0..obj.len() { + let _ = obj.field(i); + } + } + + if let Some(list) = variant.as_list() { + for _ in list.iter() { + // Should not panic + } + for i in 0..list.len() { + let _ = list.get(i); + } + } +} + +fn test_fallible_access(variant: &Variant) { + // These should handle errors gracefully, never panic + if let Some(obj) = variant.as_object() { + for result in obj.iter_try() { + let _ = result; // May be Ok or Err, but should not panic + } + for i in 0..obj.len() { + let _ = obj.try_field(i); // May be Ok or Err, but should not panic + } + } + + if let Some(list) = variant.as_list() { + for result in list.iter_try() { + let _ = result; // May be Ok or Err, but should not panic + } + for i in 0..list.len() { + let _ = list.try_get(i); // May be Ok or Err, but should not panic + } + } +} + +#[test] +fn test_specific_validation_error_cases() { + // Test specific malformed cases that should trigger validation errors + + // Case 1: Invalid header byte + test_validation_workflow_simple(&[0x01, 0x00, 0x00], &[0xFF, 0x42]); // Invalid basic type + + // Case 2: Truncated metadata + test_validation_workflow_simple(&[0x01], &[0x05, 0x48, 0x65, 0x6C, 0x6C, 0x6F]); // Incomplete metadata + + // Case 3: Truncated value + test_validation_workflow_simple(&[0x01, 0x00, 0x00], &[0x09]); // String header but no data + + // Case 4: Invalid object with out-of-bounds field ID + test_validation_workflow_simple(&[0x01, 0x00, 0x00], &[0x0F, 0x01, 0xFF, 0x00, 0x00]); // Field ID 255 doesn't exist + + // Case 5: Invalid list with malformed offsets + test_validation_workflow_simple(&[0x01, 0x00, 0x00], &[0x13, 0x02, 0xFF, 0x00, 0x00]); + // Malformed offset array +} + +fn test_validation_workflow_simple(metadata: &[u8], value: &[u8]) { + // Simple version without randomization, always runs regardless of feature flag + + // Step 1: Try unvalidated construction - should not panic + let variant_result = std::panic::catch_unwind(|| Variant::new(metadata, value)); + + let variant = match variant_result { + Ok(v) => v, + Err(_) => return, // Construction failed, which is acceptable for corrupted data + }; + + // Step 2: Try validation + let validation_result = std::panic::catch_unwind(|| variant.clone().validate()); + + match validation_result { + Ok(Ok(validated)) => { + // Validation succeeded - infallible access should not panic + test_infallible_access_simple(&validated); + } + Ok(Err(_)) => { + // Validation failed - fallible access should handle errors gracefully + test_fallible_access_simple(&variant); + } + Err(_) => { + // Validation panicked - this may indicate severely corrupted data + } + } +} + +fn test_infallible_access_simple(variant: &Variant) { + // All these should not panic on validated variants + let _ = variant.as_null(); + let _ = variant.as_boolean(); + let _ = variant.as_int32(); + let _ = variant.as_string(); + + if let Some(obj) = variant.as_object() { + for (_, _) in obj.iter() { + // Should not panic + } + for i in 0..obj.len() { + let _ = obj.field(i); + } + } + + if let Some(list) = variant.as_list() { + for _ in list.iter() { + // Should not panic + } + for i in 0..list.len() { + let _ = list.get(i); + } + } +} + +fn test_fallible_access_simple(variant: &Variant) { + // These should handle errors gracefully, never panic + if let Some(obj) = variant.as_object() { + for result in obj.iter_try() { + let _ = result; // May be Ok or Err, but should not panic + } + for i in 0..obj.len() { + let _ = obj.try_field(i); // May be Ok or Err, but should not panic + } + } + + if let Some(list) = variant.as_list() { + for result in list.iter_try() { + let _ = result; // May be Ok or Err, but should not panic + } + for i in 0..list.len() { + let _ = list.try_get(i); // May be Ok or Err, but should not panic + } + } +}