diff --git a/microbenchmarks/.gitignore b/microbenchmarks/.gitignore new file mode 100644 index 0000000..bee8a64 --- /dev/null +++ b/microbenchmarks/.gitignore @@ -0,0 +1 @@ +__pycache__ diff --git a/microbenchmarks/README.md b/microbenchmarks/README.md index d4ddc1f..1fbfdc6 100644 --- a/microbenchmarks/README.md +++ b/microbenchmarks/README.md @@ -25,6 +25,17 @@ This directory contains microbenchmarks for comparing DataFusion and DuckDB perf The benchmarks generate synthetic data, write it to Parquet format, and then measure the execution time of various SQL functions across both DataFusion and DuckDB. Results include per-function timing comparisons and summary statistics. +Benchmarks are organized into **suites**, each focusing on a specific category of SQL functions: + +| Suite | Description | Functions | +|-------|-------------|-----------| +| `strings` | String manipulation functions (trim, lower, upper, concat, etc.) | 27 | +| `temporal` | Date/time functions (year, month, date_trunc, etc.) | 21 | +| `numeric` | Math functions (sqrt, pow, sin, cos, log, round, etc.) | 38 | +| `conditional` | Conditional logic (CASE, COALESCE, boolean ops, comparisons) | 36 | + +All benchmarks run in single-threaded mode for fair comparison between engines. 
+ ## Setup Create a virtual environment and install dependencies: @@ -41,36 +52,44 @@ pip install -r requirements.txt Run a benchmark: ```shell -python microbenchmarks.py +python microbenchmarks.py --suite strings ``` ### Options | Option | Default | Description | |--------|---------|-------------| +| `--suite` | `strings` | Benchmark suite to run (`strings`, `temporal`, `numeric`, `conditional`) | | `--rows` | `1000000` | Number of rows in the generated test data | | `--warmup` | `2` | Number of warmup iterations before timing | | `--iterations` | `5` | Number of timed iterations (results are averaged) | | `--output` | stdout | Output file path for markdown results | +| `--string-view` | `false` | Use Arrow StringView type instead of String (only affects the `strings` suite) | ### Examples -Run the benchmark with default settings: +Run the string functions benchmark (default): ```shell -python microbenchmark.py +python microbenchmarks.py ``` -Run the benchmark with 10 million rows: +Run the temporal functions benchmark: ```shell -python microbenchmarks.py --rows 10000000 +python microbenchmarks.py --suite temporal ``` -Run the benchmark and save results to a file: +Run with 10 million rows: ```shell -python microbenchmarks.py --output results.md +python microbenchmarks.py --suite strings --rows 10000000 +``` + +Run with StringView type and save results: + +```shell +python microbenchmarks.py --suite strings --string-view --output results.md ``` ## Output @@ -83,4 +102,59 @@ The benchmark outputs a markdown table comparing execution times: | lower | 8.90 | 7.50 | 1.19x | DuckDB | | ... | ... | ... | ... | ... | -A summary section shows overall statistics including how many functions each engine won and total execution times. \ No newline at end of file +A summary section shows overall statistics including how many functions each engine won and total execution times. 
+ +## Project Structure + +``` +microbenchmarks/ +├── microbenchmarks.py # Main benchmark runner +├── requirements.txt # Python dependencies +└── suites/ # Benchmark suite definitions + ├── __init__.py # Suite registry and base classes + ├── strings.py # String function benchmarks + └── ... # temporal.py, numeric.py, conditional.py (one module per suite) +``` + +## Adding New Suites + +To add a new benchmark suite: + +1. Create a new file in `suites/` (e.g., `suites/bitwise.py`) + +2. Define your functions and data generator: + +```python +from . import BenchmarkFunction, Suite +import pyarrow as pa + +FUNCTIONS = [ + BenchmarkFunction("bit_and", "{col} & 255", "{col} & 255"), + BenchmarkFunction("bit_or", "{col} | 255", "{col} | 255"), + # ... more functions +] + +def generate_data(num_rows: int, use_string_view: bool = False) -> pa.Table: + # Generate appropriate test data + return pa.table({'int_col': pa.array(...)}) + +SUITE = Suite( + name="bitwise", + description="Bitwise function benchmarks", + column_name="int_col", + functions=FUNCTIONS, + generate_data=generate_data, +) +``` + +3. Register the suite in `suites/__init__.py`: + +```python +from . import bitwise + +SUITES: dict[str, Suite] = { + 'strings': strings.SUITE, + # ... other existing suites ... + 'bitwise': bitwise.SUITE, # Add new suite here +} +``` \ No newline at end of file diff --git a/microbenchmarks/microbenchmarks.py b/microbenchmarks/microbenchmarks.py index c57483d..c904307 100755 --- a/microbenchmarks/microbenchmarks.py +++ b/microbenchmarks/microbenchmarks.py @@ -1,20 +1,20 @@ #!/usr/bin/env python3 """ Microbenchmark comparing DataFusion and DuckDB performance -for SQL string functions on Parquet files. +for various SQL functions on Parquet files. 
""" import tempfile import time import os from dataclasses import dataclass -from pathlib import Path -import pyarrow as pa import pyarrow.parquet as pq import datafusion import duckdb +from suites import get_suite, list_suites, Suite + @dataclass class BenchmarkResult: @@ -32,85 +32,6 @@ def speedup(self) -> float: return self.duckdb_time_ms / self.datafusion_time_ms -@dataclass -class StringFunction: - """Defines a string function with syntax for both engines.""" - name: str - datafusion_expr: str # Expression using {col} as placeholder for column name - duckdb_expr: str # Expression using {col} as placeholder for column name - - -# String functions to benchmark -# {col} will be replaced with the actual column name -STRING_FUNCTIONS = [ - StringFunction("trim", "trim({col})", "trim({col})"), - StringFunction("ltrim", "ltrim({col})", "ltrim({col})"), - StringFunction("rtrim", "rtrim({col})", "rtrim({col})"), - StringFunction("lower", "lower({col})", "lower({col})"), - StringFunction("upper", "upper({col})", "upper({col})"), - StringFunction("length", "length({col})", "length({col})"), - StringFunction("char_length", "char_length({col})", "length({col})"), - StringFunction("reverse", "reverse({col})", "reverse({col})"), - StringFunction("repeat_3", "repeat({col}, 3)", "repeat({col}, 3)"), - StringFunction("concat", "concat({col}, {col})", "concat({col}, {col})"), - StringFunction("concat_ws", "concat_ws('-', {col}, {col})", "concat_ws('-', {col}, {col})"), - StringFunction("substring_1_5", "substring({col}, 1, 5)", "substring({col}, 1, 5)"), - StringFunction("left_5", "left({col}, 5)", "left({col}, 5)"), - StringFunction("right_5", "right({col}, 5)", "right({col}, 5)"), - StringFunction("lpad_20", "lpad({col}, 20, '*')", "lpad({col}, 20, '*')"), - StringFunction("rpad_20", "rpad({col}, 20, '*')", "rpad({col}, 20, '*')"), - StringFunction("replace", "replace({col}, 'a', 'X')", "replace({col}, 'a', 'X')"), - StringFunction("translate", "translate({col}, 'aeiou', 
'12345')", "translate({col}, 'aeiou', '12345')"), - StringFunction("ascii", "ascii({col})", "ascii({col})"), - StringFunction("md5", "md5({col})", "md5({col})"), - StringFunction("sha256", "sha256({col})", "sha256({col})"), - StringFunction("btrim", "btrim({col}, ' ')", "trim({col}, ' ')"), - StringFunction("split_part", "split_part({col}, ' ', 1)", "split_part({col}, ' ', 1)"), - StringFunction("starts_with", "starts_with({col}, 'test')", "starts_with({col}, 'test')"), - StringFunction("ends_with", "ends_with({col}, 'data')", "ends_with({col}, 'data')"), - StringFunction("strpos", "strpos({col}, 'e')", "strpos({col}, 'e')"), - StringFunction("regexp_replace", "regexp_replace({col}, '[aeiou]', '*')", "regexp_replace({col}, '[aeiou]', '*', 'g')"), -] - - -def generate_test_data(num_rows: int = 1_000_000, use_string_view: bool = False) -> pa.Table: - """Generate test data with various string patterns.""" - import random - import string - - random.seed(42) # For reproducibility - - # Generate diverse string data - strings = [] - for i in range(num_rows): - # Mix of different string patterns - pattern_type = i % 5 - if pattern_type == 0: - # Short strings with spaces - s = f" test_{i % 1000} " - elif pattern_type == 1: - # Longer strings - s = ''.join(random.choices(string.ascii_lowercase, k=20)) - elif pattern_type == 2: - # Mixed case with numbers - s = f"TestData_{i}_Value" - elif pattern_type == 3: - # Strings with special patterns - s = f"hello world {i % 100} data" - else: - # Random length strings - length = random.randint(5, 50) - s = ''.join(random.choices(string.ascii_letters + string.digits + ' ', k=length)) - strings.append(s) - - str_type = pa.string_view() if use_string_view else pa.string() - table = pa.table({ - 'str_col': pa.array(strings, type=str_type) - }) - - return table - - def setup_datafusion(parquet_path: str) -> datafusion.SessionContext: """Create and configure DataFusion context with single thread/partition.""" config = 
datafusion.SessionConfig().with_target_partitions(1) @@ -167,20 +88,20 @@ def benchmark_duckdb(conn: duckdb.DuckDBPyConnection, expr: str, return sum(times) / len(times) -def run_benchmarks(num_rows: int = 1_000_000, +def run_benchmarks(suite: Suite, + num_rows: int = 1_000_000, warmup: int = 2, iterations: int = 5, use_string_view: bool = False) -> list[BenchmarkResult]: - """Run all benchmarks and return results.""" + """Run all benchmarks for a suite and return results.""" results = [] with tempfile.TemporaryDirectory() as tmpdir: parquet_path = os.path.join(tmpdir, 'test_data.parquet') # Generate and save test data - str_type = "StringView" if use_string_view else "String" - print(f"Generating {num_rows:,} rows of test data (type: {str_type})...") - table = generate_test_data(num_rows, use_string_view) + print(f"Generating {num_rows:,} rows of test data for '{suite.name}' suite...") + table = suite.generate_data(num_rows, use_string_view) pq.write_table(table, parquet_path) print(f"Parquet file written to: {parquet_path}") print(f"File size: {os.path.getsize(parquet_path) / 1024 / 1024:.2f} MB") @@ -195,8 +116,8 @@ def run_benchmarks(num_rows: int = 1_000_000, # Run benchmarks print(f"\nRunning benchmarks ({warmup} warmup, {iterations} iterations each)...\n") - col = 'str_col' - for func in STRING_FUNCTIONS: + col = suite.column_name + for func in suite.functions: df_expr = func.datafusion_expr.format(col=col) duck_expr = func.duckdb_expr.format(col=col) @@ -235,12 +156,15 @@ def run_benchmarks(num_rows: int = 1_000_000, return results -def format_results_markdown(results: list[BenchmarkResult], use_string_view: bool = False) -> str: +def format_results_markdown(results: list[BenchmarkResult], + suite: Suite, + use_string_view: bool = False) -> str: """Format benchmark results as a markdown table.""" str_type = "StringView" if use_string_view else "String" lines = [ - "# String Function Microbenchmarks: DataFusion vs DuckDB", + f"# {suite.description}: 
DataFusion vs DuckDB", "", + f"**Suite:** {suite.name} ", f"**DataFusion version:** {datafusion.__version__} ", f"**DuckDB version:** {duckdb.__version__} ", f"**Rows:** {results[0].rows:,} ", @@ -298,8 +222,15 @@ def format_results_markdown(results: list[BenchmarkResult], use_string_view: boo def main(): import argparse + available_suites = list_suites() + parser = argparse.ArgumentParser( - description="Benchmark string functions: DataFusion vs DuckDB" + description="Benchmark SQL functions: DataFusion vs DuckDB" + ) + parser.add_argument( + "--suite", type=str, default="strings", + choices=available_suites, + help=f"Benchmark suite to run (default: strings). Available: {', '.join(available_suites)}" ) parser.add_argument( "--rows", type=int, default=1_000_000, @@ -324,18 +255,21 @@ def main(): args = parser.parse_args() + suite = get_suite(args.suite) + print("=" * 60) - print("String Function Microbenchmarks: DataFusion vs DuckDB") + print(f"{suite.description}: DataFusion vs DuckDB") print("=" * 60) results = run_benchmarks( + suite=suite, num_rows=args.rows, warmup=args.warmup, iterations=args.iterations, use_string_view=args.string_view ) - markdown = format_results_markdown(results, use_string_view=args.string_view) + markdown = format_results_markdown(results, suite=suite, use_string_view=args.string_view) print("\n" + "=" * 60) print("RESULTS") diff --git a/microbenchmarks/suites/__init__.py b/microbenchmarks/suites/__init__.py new file mode 100644 index 0000000..329ffba --- /dev/null +++ b/microbenchmarks/suites/__init__.py @@ -0,0 +1,51 @@ +"""Benchmark suites for microbenchmarks.""" + +from dataclasses import dataclass +from typing import Callable +import pyarrow as pa + + +@dataclass +class BenchmarkFunction: + """Defines a function with syntax for both engines.""" + name: str + datafusion_expr: str # Expression using {col} as placeholder for column name + duckdb_expr: str # Expression using {col} as placeholder for column name + + +@dataclass +class 
Suite: + """Defines a benchmark suite.""" + name: str + description: str + column_name: str + functions: list[BenchmarkFunction] + generate_data: Callable[[int, bool], pa.Table] # (num_rows, use_string_view) -> Table + + +# Import suites to register them +from . import strings +from . import temporal +from . import numeric +from . import conditional + +# Registry of available suites +SUITES: dict[str, Suite] = { + 'strings': strings.SUITE, + 'temporal': temporal.SUITE, + 'numeric': numeric.SUITE, + 'conditional': conditional.SUITE, +} + + +def get_suite(name: str) -> Suite: + """Get a suite by name.""" + if name not in SUITES: + available = ', '.join(SUITES.keys()) + raise ValueError(f"Unknown suite: {name}. Available: {available}") + return SUITES[name] + + +def list_suites() -> list[str]: + """List available suite names.""" + return list(SUITES.keys()) diff --git a/microbenchmarks/suites/conditional.py b/microbenchmarks/suites/conditional.py new file mode 100644 index 0000000..0ebbf2a --- /dev/null +++ b/microbenchmarks/suites/conditional.py @@ -0,0 +1,110 @@ +"""Conditional/logic functions benchmark suite.""" + +import random + +import pyarrow as pa + +from . 
import BenchmarkFunction, Suite + + +FUNCTIONS = [ + # CASE expressions + BenchmarkFunction("case_simple", + "CASE {col} WHEN 1 THEN 'one' WHEN 2 THEN 'two' ELSE 'other' END", + "CASE {col} WHEN 1 THEN 'one' WHEN 2 THEN 'two' ELSE 'other' END"), + BenchmarkFunction("case_searched", + "CASE WHEN {col} < 0 THEN 'negative' WHEN {col} = 0 THEN 'zero' ELSE 'positive' END", + "CASE WHEN {col} < 0 THEN 'negative' WHEN {col} = 0 THEN 'zero' ELSE 'positive' END"), + BenchmarkFunction("case_many_branches", + "CASE WHEN {col} < -50 THEN 'a' WHEN {col} < -25 THEN 'b' WHEN {col} < 0 THEN 'c' WHEN {col} < 25 THEN 'd' WHEN {col} < 50 THEN 'e' ELSE 'f' END", + "CASE WHEN {col} < -50 THEN 'a' WHEN {col} < -25 THEN 'b' WHEN {col} < 0 THEN 'c' WHEN {col} < 25 THEN 'd' WHEN {col} < 50 THEN 'e' ELSE 'f' END"), + BenchmarkFunction("case_nested", + "CASE WHEN {col} > 0 THEN CASE WHEN {col} > 50 THEN 'high' ELSE 'low' END ELSE 'negative' END", + "CASE WHEN {col} > 0 THEN CASE WHEN {col} > 50 THEN 'high' ELSE 'low' END ELSE 'negative' END"), + + # NULL handling + BenchmarkFunction("coalesce_2", "COALESCE(nullable_col, 0)", "COALESCE(nullable_col, 0)"), + BenchmarkFunction("coalesce_3", "COALESCE(nullable_col, {col}, 0)", "COALESCE(nullable_col, {col}, 0)"), + BenchmarkFunction("coalesce_many", "COALESCE(nullable_col, NULL, NULL, {col}, 0)", "COALESCE(nullable_col, NULL, NULL, {col}, 0)"), + BenchmarkFunction("nullif", "NULLIF({col}, 0)", "NULLIF({col}, 0)"), + BenchmarkFunction("nullif_expr", "NULLIF({col} % 10, 5)", "NULLIF({col} % 10, 5)"), + BenchmarkFunction("ifnull", "IFNULL(nullable_col, -1)", "IFNULL(nullable_col, -1)"), + BenchmarkFunction("nvl", "NVL(nullable_col, -1)", "IFNULL(nullable_col, -1)"), + + # Comparison functions + BenchmarkFunction("greatest_2", "GREATEST({col}, {col} * -1)", "GREATEST({col}, {col} * -1)"), + BenchmarkFunction("greatest_3", "GREATEST({col}, 0, -100)", "GREATEST({col}, 0, -100)"), + BenchmarkFunction("least_2", "LEAST({col}, {col} * -1)", "LEAST({col}, 
{col} * -1)"), + BenchmarkFunction("least_3", "LEAST({col}, 0, 100)", "LEAST({col}, 0, 100)"), + + # Boolean logic + BenchmarkFunction("and_simple", "{col} > 0 AND {col} < 50", "{col} > 0 AND {col} < 50"), + BenchmarkFunction("or_simple", "{col} < -50 OR {col} > 50", "{col} < -50 OR {col} > 50"), + BenchmarkFunction("not", "NOT ({col} > 0)", "NOT ({col} > 0)"), + BenchmarkFunction("and_or_mixed", "({col} > 0 AND {col} < 50) OR {col} < -50", "({col} > 0 AND {col} < 50) OR {col} < -50"), + BenchmarkFunction("complex_bool", "({col} > 0 AND {col} < 25) OR ({col} < 0 AND {col} > -25) OR {col} = 0", + "({col} > 0 AND {col} < 25) OR ({col} < 0 AND {col} > -25) OR {col} = 0"), + + # Comparison operators + BenchmarkFunction("eq", "{col} = 0", "{col} = 0"), + BenchmarkFunction("neq", "{col} <> 0", "{col} <> 0"), + BenchmarkFunction("lt", "{col} < 0", "{col} < 0"), + BenchmarkFunction("lte", "{col} <= 0", "{col} <= 0"), + BenchmarkFunction("gt", "{col} > 0", "{col} > 0"), + BenchmarkFunction("gte", "{col} >= 0", "{col} >= 0"), + + # BETWEEN and IN + BenchmarkFunction("between", "{col} BETWEEN -50 AND 50", "{col} BETWEEN -50 AND 50"), + BenchmarkFunction("not_between", "{col} NOT BETWEEN -25 AND 25", "{col} NOT BETWEEN -25 AND 25"), + BenchmarkFunction("in_list_small", "{col} IN (1, 2, 3, 4, 5)", "{col} IN (1, 2, 3, 4, 5)"), + BenchmarkFunction("in_list_medium", "{col} IN (1, 2, 3, 4, 5, 6, 7, 8, 9, 10)", "{col} IN (1, 2, 3, 4, 5, 6, 7, 8, 9, 10)"), + BenchmarkFunction("not_in", "{col} NOT IN (1, 2, 3, 4, 5)", "{col} NOT IN (1, 2, 3, 4, 5)"), + + # NULL checks + BenchmarkFunction("is_null", "nullable_col IS NULL", "nullable_col IS NULL"), + BenchmarkFunction("is_not_null", "nullable_col IS NOT NULL", "nullable_col IS NOT NULL"), + + # IF (conditional expression) - DataFusion uses CASE, DuckDB has IF + BenchmarkFunction("if_simple", + "CASE WHEN {col} > 0 THEN 'positive' ELSE 'non-positive' END", + "IF({col} > 0, 'positive', 'non-positive')"), + BenchmarkFunction("if_numeric", 
+ "CASE WHEN {col} > 0 THEN {col} ELSE 0 END", + "IF({col} > 0, {col}, 0)"), + BenchmarkFunction("if_nested", + "CASE WHEN {col} > 0 THEN CASE WHEN {col} > 50 THEN 'high' ELSE 'low' END ELSE 'negative' END", + "IF({col} > 0, IF({col} > 50, 'high', 'low'), 'negative')"), +] + + +def generate_data(num_rows: int = 1_000_000, use_string_view: bool = False) -> pa.Table: + """Generate test data with integers and nullable values.""" + random.seed(42) + + values = [] + nullable_values = [] + + for i in range(num_rows): + # Integer values in range for various conditional tests + v = random.randint(-100, 100) + values.append(v) + + # Nullable column: ~30% nulls + if random.random() < 0.3: + nullable_values.append(None) + else: + nullable_values.append(random.randint(-100, 100)) + + return pa.table({ + 'val_col': pa.array(values, type=pa.int64()), + 'nullable_col': pa.array(nullable_values, type=pa.int64()), + }) + + +SUITE = Suite( + name="conditional", + description="Conditional/logic function benchmarks", + column_name="val_col", + functions=FUNCTIONS, + generate_data=generate_data, +) diff --git a/microbenchmarks/suites/numeric.py b/microbenchmarks/suites/numeric.py new file mode 100644 index 0000000..9ad4ac4 --- /dev/null +++ b/microbenchmarks/suites/numeric.py @@ -0,0 +1,104 @@ +"""Numeric/math functions benchmark suite.""" + +import random + +import pyarrow as pa + +from . 
import BenchmarkFunction, Suite + + +FUNCTIONS = [ + # Basic math + BenchmarkFunction("abs", "abs({col})", "abs({col})"), + BenchmarkFunction("ceil", "ceil({col})", "ceil({col})"), + BenchmarkFunction("floor", "floor({col})", "floor({col})"), + BenchmarkFunction("round", "round({col}, 2)", "round({col}, 2)"), + BenchmarkFunction("trunc", "trunc({col})", "trunc({col})"), + BenchmarkFunction("signum", "signum({col})", "sign({col})"), + + # Powers and roots + BenchmarkFunction("sqrt", "sqrt(abs({col}))", "sqrt(abs({col}))"), + BenchmarkFunction("cbrt", "cbrt({col})", "cbrt({col})"), + BenchmarkFunction("power", "power({col}, 2)", "power({col}, 2)"), + BenchmarkFunction("exp", "exp({col} / 100)", "exp({col} / 100)"), + + # Logarithms + BenchmarkFunction("ln", "ln(abs({col}) + 1)", "ln(abs({col}) + 1)"), + BenchmarkFunction("log10", "log10(abs({col}) + 1)", "log10(abs({col}) + 1)"), + BenchmarkFunction("log2", "log2(abs({col}) + 1)", "log2(abs({col}) + 1)"), + BenchmarkFunction("log", "log(2, abs({col}) + 1)", "log(2, abs({col}) + 1)"), + + # Trigonometric + BenchmarkFunction("sin", "sin({col})", "sin({col})"), + BenchmarkFunction("cos", "cos({col})", "cos({col})"), + BenchmarkFunction("tan", "tan({col})", "tan({col})"), + BenchmarkFunction("asin", "asin(sin({col}))", "asin(sin({col}))"), + BenchmarkFunction("acos", "acos(cos({col}))", "acos(cos({col}))"), + BenchmarkFunction("atan", "atan({col})", "atan({col})"), + BenchmarkFunction("atan2", "atan2({col}, {col} + 1)", "atan2({col}, {col} + 1)"), + + # Hyperbolic + BenchmarkFunction("sinh", "sinh({col} / 100)", "sinh({col} / 100)"), + BenchmarkFunction("cosh", "cosh({col} / 100)", "cosh({col} / 100)"), + BenchmarkFunction("tanh", "tanh({col})", "tanh({col})"), + + # Other math functions + BenchmarkFunction("degrees", "degrees({col})", "degrees({col})"), + BenchmarkFunction("radians", "radians({col})", "radians({col})"), + BenchmarkFunction("pi", "pi() * {col}", "pi() * {col}"), + BenchmarkFunction("mod", "CAST({col} AS 
BIGINT) % 7", "CAST({col} AS BIGINT) % 7"), + BenchmarkFunction("gcd", "gcd(CAST({col} AS BIGINT), 12)", "gcd(CAST({col} AS BIGINT), 12)"), + BenchmarkFunction("lcm", "lcm(CAST(abs({col}) AS BIGINT) % 1000 + 1, 12)", "lcm(CAST(abs({col}) AS BIGINT) % 1000 + 1, 12)"), + BenchmarkFunction("factorial", "factorial(CAST(abs({col}) AS BIGINT) % 20)", "factorial(CAST(abs({col}) AS INTEGER) % 20)"), + + # Comparison + BenchmarkFunction("greatest", "greatest({col}, {col} * 2, 0)", "greatest({col}, {col} * 2, 0)"), + BenchmarkFunction("least", "least({col}, {col} * 2, 0)", "least({col}, {col} * 2, 0)"), + + # Null handling with numeric + BenchmarkFunction("coalesce", "coalesce({col}, 0)", "coalesce({col}, 0)"), + BenchmarkFunction("nullif", "nullif({col}, 0)", "nullif({col}, 0)"), + + # Bitwise (on integers) + BenchmarkFunction("bit_and", "CAST({col} AS BIGINT) & 255", "CAST({col} AS BIGINT) & 255"), + BenchmarkFunction("bit_or", "CAST({col} AS BIGINT) | 255", "CAST({col} AS BIGINT) | 255"), + BenchmarkFunction("bit_xor", "CAST({col} AS BIGINT) ^ 255", "xor(CAST({col} AS BIGINT), 255)"), +] + + +def generate_data(num_rows: int = 1_000_000, use_string_view: bool = False) -> pa.Table: + """Generate test data with various numeric patterns.""" + random.seed(42) # For reproducibility + + values = [] + for i in range(num_rows): + pattern_type = i % 5 + if pattern_type == 0: + # Small integers + v = random.randint(-100, 100) + elif pattern_type == 1: + # Larger integers + v = random.randint(-10000, 10000) + elif pattern_type == 2: + # Floating point values + v = random.uniform(-1000, 1000) + elif pattern_type == 3: + # Small decimals + v = random.uniform(-1, 1) + else: + # Mixed range + v = random.gauss(0, 500) + values.append(v) + + return pa.table({ + 'num_col': pa.array(values, type=pa.float64()) + }) + + +SUITE = Suite( + name="numeric", + description="Numeric function benchmarks", + column_name="num_col", + functions=FUNCTIONS, + generate_data=generate_data, +) diff --git 
a/microbenchmarks/suites/strings.py b/microbenchmarks/suites/strings.py new file mode 100644 index 0000000..05f730e --- /dev/null +++ b/microbenchmarks/suites/strings.py @@ -0,0 +1,74 @@ +"""String functions benchmark suite.""" + +import random +import string + +import pyarrow as pa + +from . import BenchmarkFunction, Suite + + +FUNCTIONS = [ + BenchmarkFunction("trim", "trim({col})", "trim({col})"), + BenchmarkFunction("ltrim", "ltrim({col})", "ltrim({col})"), + BenchmarkFunction("rtrim", "rtrim({col})", "rtrim({col})"), + BenchmarkFunction("lower", "lower({col})", "lower({col})"), + BenchmarkFunction("upper", "upper({col})", "upper({col})"), + BenchmarkFunction("length", "length({col})", "length({col})"), + BenchmarkFunction("char_length", "char_length({col})", "length({col})"), + BenchmarkFunction("reverse", "reverse({col})", "reverse({col})"), + BenchmarkFunction("repeat_3", "repeat({col}, 3)", "repeat({col}, 3)"), + BenchmarkFunction("concat", "concat({col}, {col})", "concat({col}, {col})"), + BenchmarkFunction("concat_ws", "concat_ws('-', {col}, {col})", "concat_ws('-', {col}, {col})"), + BenchmarkFunction("substring_1_5", "substring({col}, 1, 5)", "substring({col}, 1, 5)"), + BenchmarkFunction("left_5", "left({col}, 5)", "left({col}, 5)"), + BenchmarkFunction("right_5", "right({col}, 5)", "right({col}, 5)"), + BenchmarkFunction("lpad_20", "lpad({col}, 20, '*')", "lpad({col}, 20, '*')"), + BenchmarkFunction("rpad_20", "rpad({col}, 20, '*')", "rpad({col}, 20, '*')"), + BenchmarkFunction("replace", "replace({col}, 'a', 'X')", "replace({col}, 'a', 'X')"), + BenchmarkFunction("translate", "translate({col}, 'aeiou', '12345')", "translate({col}, 'aeiou', '12345')"), + BenchmarkFunction("ascii", "ascii({col})", "ascii({col})"), + BenchmarkFunction("md5", "md5({col})", "md5({col})"), + BenchmarkFunction("sha256", "sha256({col})", "sha256({col})"), + BenchmarkFunction("btrim", "btrim({col}, ' ')", "trim({col}, ' ')"), + BenchmarkFunction("split_part", 
"split_part({col}, ' ', 1)", "split_part({col}, ' ', 1)"), + BenchmarkFunction("starts_with", "starts_with({col}, 'test')", "starts_with({col}, 'test')"), + BenchmarkFunction("ends_with", "ends_with({col}, 'data')", "ends_with({col}, 'data')"), + BenchmarkFunction("strpos", "strpos({col}, 'e')", "strpos({col}, 'e')"), + BenchmarkFunction("regexp_replace", "regexp_replace({col}, '[aeiou]', '*')", "regexp_replace({col}, '[aeiou]', '*', 'g')"), +] + + +def generate_data(num_rows: int = 1_000_000, use_string_view: bool = False) -> pa.Table: + """Generate test data with various string patterns.""" + random.seed(42) # For reproducibility + + strings_data = [] + for i in range(num_rows): + pattern_type = i % 5 + if pattern_type == 0: + s = f" test_{i % 1000} " + elif pattern_type == 1: + s = ''.join(random.choices(string.ascii_lowercase, k=20)) + elif pattern_type == 2: + s = f"TestData_{i}_Value" + elif pattern_type == 3: + s = f"hello world {i % 100} data" + else: + length = random.randint(5, 50) + s = ''.join(random.choices(string.ascii_letters + string.digits + ' ', k=length)) + strings_data.append(s) + + str_type = pa.string_view() if use_string_view else pa.string() + return pa.table({ + 'str_col': pa.array(strings_data, type=str_type) + }) + + +SUITE = Suite( + name="strings", + description="String function benchmarks", + column_name="str_col", + functions=FUNCTIONS, + generate_data=generate_data, +) diff --git a/microbenchmarks/suites/temporal.py b/microbenchmarks/suites/temporal.py new file mode 100644 index 0000000..3e1c02e --- /dev/null +++ b/microbenchmarks/suites/temporal.py @@ -0,0 +1,91 @@ +"""Temporal (date/time) functions benchmark suite.""" + +import random +from datetime import datetime, timedelta + +import pyarrow as pa + +from . 
import BenchmarkFunction, Suite + + +FUNCTIONS = [ + # Date extraction functions + BenchmarkFunction("year", "date_part('year', {col})", "year({col})"), + BenchmarkFunction("month", "date_part('month', {col})", "month({col})"), + BenchmarkFunction("day", "date_part('day', {col})", "day({col})"), + BenchmarkFunction("hour", "date_part('hour', {col})", "hour({col})"), + BenchmarkFunction("minute", "date_part('minute', {col})", "minute({col})"), + BenchmarkFunction("second", "date_part('second', {col})", "second({col})"), + BenchmarkFunction("week", "date_part('week', {col})", "week({col})"), + BenchmarkFunction("quarter", "date_part('quarter', {col})", "quarter({col})"), + BenchmarkFunction("day_of_week", "date_part('dow', {col})", "dayofweek({col})"), + BenchmarkFunction("day_of_year", "date_part('doy', {col})", "dayofyear({col})"), + + # Date truncation + BenchmarkFunction("date_trunc_day", "date_trunc('day', {col})", "date_trunc('day', {col})"), + BenchmarkFunction("date_trunc_month", "date_trunc('month', {col})", "date_trunc('month', {col})"), + BenchmarkFunction("date_trunc_year", "date_trunc('year', {col})", "date_trunc('year', {col})"), + BenchmarkFunction("date_trunc_hour", "date_trunc('hour', {col})", "date_trunc('hour', {col})"), + + # Date arithmetic + BenchmarkFunction("date_add_days", "{col} + interval '7 days'", "{col} + interval '7 days'"), + BenchmarkFunction("date_sub_days", "{col} - interval '7 days'", "{col} - interval '7 days'"), + BenchmarkFunction("date_add_months", "{col} + interval '1 month'", "{col} + interval '1 month'"), + + # Date formatting/parsing + BenchmarkFunction("to_char", "to_char({col}, '%Y-%m-%d')", "strftime({col}, '%Y-%m-%d')"), + + # Date parts + BenchmarkFunction("date_part_hour", "date_part('hour', {col})", "date_part('hour', {col})"), + BenchmarkFunction("date_part_minute", "date_part('minute', {col})", "date_part('minute', {col})"), + + # Current date/time comparisons + BenchmarkFunction("is_past", "{col} < now()", "{col} 
< now()"), +] + + +def generate_data(num_rows: int = 1_000_000, use_string_view: bool = False) -> pa.Table: + """Generate test data with various timestamp patterns.""" + random.seed(42) # For reproducibility + + # Generate timestamps spanning several years + base_date = datetime(2020, 1, 1) + max_days = 365 * 5 # 5 years of data + + timestamps = [] + for i in range(num_rows): + # Mix of different timestamp patterns + pattern_type = i % 4 + if pattern_type == 0: + # Random timestamp within range + days = random.randint(0, max_days) + hours = random.randint(0, 23) + minutes = random.randint(0, 59) + seconds = random.randint(0, 59) + ts = base_date + timedelta(days=days, hours=hours, minutes=minutes, seconds=seconds) + elif pattern_type == 1: + # Timestamps at midnight (common pattern) + days = random.randint(0, max_days) + ts = base_date + timedelta(days=days) + elif pattern_type == 2: + # Timestamps at specific hours (business hours) + days = random.randint(0, max_days) + hours = random.choice([9, 10, 11, 12, 13, 14, 15, 16, 17]) + ts = base_date + timedelta(days=days, hours=hours) + else: + # Sequential timestamps (time series pattern) + ts = base_date + timedelta(seconds=i) + timestamps.append(ts) + + return pa.table({ + 'ts_col': pa.array(timestamps, type=pa.timestamp('us')) + }) + + +SUITE = Suite( + name="temporal", + description="Date/time function benchmarks", + column_name="ts_col", + functions=FUNCTIONS, + generate_data=generate_data, +)