From dc71983cd9076c07e92a52597c677866bd9e2727 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Sun, 12 Apr 2026 16:00:35 +0000 Subject: [PATCH] Iteration 9: Add 21 new benchmark pairs (total 22) Add benchmark pairs for: dataframe_creation, series_arithmetic, groupby_mean, series_sort, dataframe_filter, concat, merge, rolling_mean, describe, series_value_counts, read_csv, series_string_ops, pivot_table, ewm_mean, dataframe_apply, series_fillna, dataframe_dropna, dataframe_sort, series_cumsum, series_shift, dataframe_rename. Fix playground/benchmarks.html to handle null tsb values gracefully. Update results.json with pandas timing results. Metric: 22 (previous best: 13, delta: +9) Run: https://github.com/githubnext/tsessebe/actions/runs/24310339206 --- benchmarks/pandas/bench_concat.py | 28 ++ benchmarks/pandas/bench_dataframe_apply.py | 27 ++ benchmarks/pandas/bench_dataframe_creation.py | 27 ++ benchmarks/pandas/bench_dataframe_dropna.py | 27 ++ benchmarks/pandas/bench_dataframe_filter.py | 26 ++ benchmarks/pandas/bench_dataframe_rename.py | 27 ++ benchmarks/pandas/bench_dataframe_sort.py | 28 ++ benchmarks/pandas/bench_describe.py | 27 ++ benchmarks/pandas/bench_ewm_mean.py | 26 ++ benchmarks/pandas/bench_groupby_mean.py | 27 ++ benchmarks/pandas/bench_merge.py | 29 ++ benchmarks/pandas/bench_pivot_table.py | 28 ++ benchmarks/pandas/bench_read_csv.py | 30 +++ benchmarks/pandas/bench_rolling_mean.py | 26 ++ benchmarks/pandas/bench_series_arithmetic.py | 26 ++ benchmarks/pandas/bench_series_cumsum.py | 26 ++ benchmarks/pandas/bench_series_fillna.py | 26 ++ benchmarks/pandas/bench_series_shift.py | 26 ++ benchmarks/pandas/bench_series_sort.py | 27 ++ benchmarks/pandas/bench_series_string_ops.py | 27 ++ .../pandas/bench_series_value_counts.py | 25 ++ benchmarks/results.json | 248 +++++++++++++++++- benchmarks/tsb/bench_concat.ts | 32 +++ benchmarks/tsb/bench_dataframe_apply.ts | 32 +++ benchmarks/tsb/bench_dataframe_creation.ts | 33 +++ benchmarks/tsb/bench_dataframe_dropna.ts | 31 +++ benchmarks/tsb/bench_dataframe_filter.ts | 30 +++ benchmarks/tsb/bench_dataframe_rename.ts | 31 +++ benchmarks/tsb/bench_dataframe_sort.ts | 31 +++ benchmarks/tsb/bench_describe.ts | 31 +++ benchmarks/tsb/bench_ewm_mean.ts | 30 +++ benchmarks/tsb/bench_groupby_mean.ts | 31 +++ benchmarks/tsb/bench_merge.ts | 33 +++ benchmarks/tsb/bench_pivot_table.ts | 32 +++ benchmarks/tsb/bench_read_csv.ts | 39 +++ benchmarks/tsb/bench_rolling_mean.ts | 30 +++ benchmarks/tsb/bench_series_arithmetic.ts | 30 +++ benchmarks/tsb/bench_series_cumsum.ts | 30 +++ benchmarks/tsb/bench_series_fillna.ts | 31 +++ benchmarks/tsb/bench_series_shift.ts | 30 +++ benchmarks/tsb/bench_series_sort.ts | 30 +++ benchmarks/tsb/bench_series_string_ops.ts | 32 +++ benchmarks/tsb/bench_series_value_counts.ts | 30 +++ playground/benchmarks.html | 51 ++-- 44 files changed, 1505 insertions(+), 19 deletions(-) create mode 100644 benchmarks/pandas/bench_concat.py create mode 100644 benchmarks/pandas/bench_dataframe_apply.py create mode 100644 benchmarks/pandas/bench_dataframe_creation.py create mode 100644 benchmarks/pandas/bench_dataframe_dropna.py create mode 100644 benchmarks/pandas/bench_dataframe_filter.py create mode 100644 benchmarks/pandas/bench_dataframe_rename.py create mode 100644 benchmarks/pandas/bench_dataframe_sort.py create mode 100644 benchmarks/pandas/bench_describe.py create mode 100644 benchmarks/pandas/bench_ewm_mean.py create mode 100644 benchmarks/pandas/bench_groupby_mean.py create mode 100644 benchmarks/pandas/bench_merge.py create mode 100644 benchmarks/pandas/bench_pivot_table.py create mode 100644 benchmarks/pandas/bench_read_csv.py create mode 100644 benchmarks/pandas/bench_rolling_mean.py create mode 100644 benchmarks/pandas/bench_series_arithmetic.py create mode 100644 benchmarks/pandas/bench_series_cumsum.py create mode 100644 benchmarks/pandas/bench_series_fillna.py create mode 100644 benchmarks/pandas/bench_series_shift.py create mode 100644 benchmarks/pandas/bench_series_sort.py create mode 100644 benchmarks/pandas/bench_series_string_ops.py create mode 100644 benchmarks/pandas/bench_series_value_counts.py create mode 100644 benchmarks/tsb/bench_concat.ts create mode 100644 benchmarks/tsb/bench_dataframe_apply.ts create mode 100644 benchmarks/tsb/bench_dataframe_creation.ts create mode 100644 benchmarks/tsb/bench_dataframe_dropna.ts create mode 100644 benchmarks/tsb/bench_dataframe_filter.ts create mode 100644 benchmarks/tsb/bench_dataframe_rename.ts create mode 100644 benchmarks/tsb/bench_dataframe_sort.ts create mode 100644 benchmarks/tsb/bench_describe.ts create mode 100644 benchmarks/tsb/bench_ewm_mean.ts create mode 100644 benchmarks/tsb/bench_groupby_mean.ts create mode 100644 benchmarks/tsb/bench_merge.ts create mode 100644 benchmarks/tsb/bench_pivot_table.ts create mode 100644 benchmarks/tsb/bench_read_csv.ts create mode 100644 benchmarks/tsb/bench_rolling_mean.ts create mode 100644 benchmarks/tsb/bench_series_arithmetic.ts create mode 100644 benchmarks/tsb/bench_series_cumsum.ts create mode 100644 benchmarks/tsb/bench_series_fillna.ts create mode 100644 benchmarks/tsb/bench_series_shift.ts create mode 100644 benchmarks/tsb/bench_series_sort.ts create mode 100644 benchmarks/tsb/bench_series_string_ops.ts create mode 100644 benchmarks/tsb/bench_series_value_counts.ts diff --git a/benchmarks/pandas/bench_concat.py b/benchmarks/pandas/bench_concat.py new file mode 100644 index 00000000..3533109e --- /dev/null +++ b/benchmarks/pandas/bench_concat.py @@ -0,0 +1,28 @@ +"""Benchmark: concat — concatenate two 50k-row DataFrames""" +import json, time +import numpy as np +import pandas as pd + +ROWS = 50_000 +WARMUP = 5 +ITERATIONS = 20 + +vals1 = np.arange(ROWS, dtype=np.float64) +vals2 = np.arange(ROWS, dtype=np.float64) * 2.0 +df1 = pd.DataFrame({"value": vals1}) +df2 = pd.DataFrame({"value": vals2}) + +for _ in range(WARMUP): + pd.concat([df1, df2], ignore_index=True) + +start = time.perf_counter() +for _ in range(ITERATIONS): + pd.concat([df1, df2], ignore_index=True) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "concat", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_dataframe_apply.py b/benchmarks/pandas/bench_dataframe_apply.py new file mode 100644 index 00000000..6788d422 --- /dev/null +++ b/benchmarks/pandas/bench_dataframe_apply.py @@ -0,0 +1,27 @@ +"""Benchmark: dataframe_apply — apply a function across rows of a 10k-row DataFrame""" +import json, time +import numpy as np +import pandas as pd + +ROWS = 10_000 +WARMUP = 3 +ITERATIONS = 10 + +a = np.arange(ROWS, dtype=np.float64) +b = np.arange(ROWS, dtype=np.float64) * 2.0 +df = pd.DataFrame({"a": a, "b": b}) + +for _ in range(WARMUP): + df.apply(lambda row: row["a"] + row["b"], axis=1) + +start = time.perf_counter() +for _ in range(ITERATIONS): + df.apply(lambda row: row["a"] + row["b"], axis=1) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "dataframe_apply", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_dataframe_creation.py b/benchmarks/pandas/bench_dataframe_creation.py new file mode 100644 index 00000000..706c8b13 --- /dev/null +++ b/benchmarks/pandas/bench_dataframe_creation.py @@ -0,0 +1,27 @@ +"""Benchmark: DataFrame creation from arrays (pandas equivalent)""" +import json, time +import numpy as np +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 + +nums1 = np.arange(ROWS, dtype=np.float64) * 1.1 +nums2 = np.arange(ROWS, dtype=np.float64) * 2.2 +strs = [f"label_{i % 100}" for i in range(ROWS)] + +for _ in range(WARMUP): + pd.DataFrame({"a": nums1, "b": nums2, "c": strs}) + +start = time.perf_counter() +for _ in range(ITERATIONS): + pd.DataFrame({"a": nums1, "b": nums2, "c": strs}) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "dataframe_creation", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_dataframe_dropna.py b/benchmarks/pandas/bench_dataframe_dropna.py new file mode 100644 index 00000000..08a11895 --- /dev/null +++ b/benchmarks/pandas/bench_dataframe_dropna.py @@ -0,0 +1,27 @@ +"""Benchmark: dataframe_dropna — drop rows with NaN values from 100k-row DataFrame""" +import json, time +import numpy as np +import pandas as pd + +ROWS = 100_000 +WARMUP = 5 +ITERATIONS = 20 + +a = np.where(np.arange(ROWS) % 10 == 0, np.nan, np.arange(ROWS) * 1.1) +b = np.where(np.arange(ROWS) % 7 == 0, np.nan, np.arange(ROWS) * 2.2) +df = pd.DataFrame({"a": a, "b": b}) + +for _ in range(WARMUP): + df.dropna() + +start = time.perf_counter() +for _ in range(ITERATIONS): + df.dropna() +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "dataframe_dropna", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_dataframe_filter.py b/benchmarks/pandas/bench_dataframe_filter.py new file mode 100644 index 00000000..112384f8 --- /dev/null +++ b/benchmarks/pandas/bench_dataframe_filter.py @@ -0,0 +1,26 @@ +"""Benchmark: DataFrame filter (boolean mask on 100k-row DataFrame)""" +import json, time +import numpy as np +import pandas as pd + +ROWS = 100_000 +WARMUP = 5 +ITERATIONS = 20 + +vals = np.arange(ROWS, dtype=np.float64) * 0.1 +df = pd.DataFrame({"value": vals}) + +for _ in range(WARMUP): + df[df["value"] > 5000] + +start = time.perf_counter() +for _ in range(ITERATIONS): + df[df["value"] > 5000] +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "dataframe_filter", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_dataframe_rename.py b/benchmarks/pandas/bench_dataframe_rename.py new file mode 100644 index 00000000..65e44626 --- /dev/null +++ b/benchmarks/pandas/bench_dataframe_rename.py @@ -0,0 +1,27 @@ +"""Benchmark: dataframe_rename — rename columns in a 100k-row DataFrame""" +import json, time +import numpy as np +import pandas as pd + +ROWS = 100_000 +WARMUP = 5 +ITERATIONS = 20 + +a = np.arange(ROWS, dtype=np.float64) * 1.1 +b = np.arange(ROWS, dtype=np.float64) * 2.2 +df = pd.DataFrame({"old_a": a, "old_b": b}) + +for _ in range(WARMUP): + df.rename(columns={"old_a": "new_a", "old_b": "new_b"}) + +start = time.perf_counter() +for _ in range(ITERATIONS): + df.rename(columns={"old_a": "new_a", "old_b": "new_b"}) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "dataframe_rename", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_dataframe_sort.py b/benchmarks/pandas/bench_dataframe_sort.py new file mode 100644 index 00000000..6ef3c84d --- /dev/null +++ b/benchmarks/pandas/bench_dataframe_sort.py @@ -0,0 +1,28 @@ +"""Benchmark: dataframe_sort — sort a 100k-row DataFrame by two columns""" +import json, time +import numpy as np +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 + +rng = np.random.default_rng(42) +a = [f"group_{i % 100}" for i in range(ROWS)] +b = rng.random(ROWS) * 1000 +df = pd.DataFrame({"a": a, "b": b}) + +for _ in range(WARMUP): + df.sort_values(["a", "b"]) + +start = time.perf_counter() +for _ in range(ITERATIONS): + df.sort_values(["a", "b"]) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "dataframe_sort", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_describe.py b/benchmarks/pandas/bench_describe.py new file mode 100644 index 00000000..b9e84dcc --- /dev/null +++ b/benchmarks/pandas/bench_describe.py @@ -0,0 +1,27 @@ +"""Benchmark: describe — summary statistics on a 100k-row DataFrame""" +import json, time +import numpy as np +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 + +a = np.arange(ROWS, dtype=np.float64) * 1.1 +b = np.sqrt(np.arange(1, ROWS + 1, dtype=np.float64)) +df = pd.DataFrame({"a": a, "b": b}) + +for _ in range(WARMUP): + df.describe() + +start = time.perf_counter() +for _ in range(ITERATIONS): + df.describe() +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "describe", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_ewm_mean.py b/benchmarks/pandas/bench_ewm_mean.py new file mode 100644 index 00000000..4e6cbadd --- /dev/null +++ b/benchmarks/pandas/bench_ewm_mean.py @@ -0,0 +1,26 @@ +"""Benchmark: ewm_mean — exponentially weighted mean on 100k-element Series""" +import json, time +import numpy as np +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 + +data = np.sin(np.arange(ROWS) * 0.05) +s = pd.Series(data) + +for _ in range(WARMUP): + s.ewm(span=20).mean() + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.ewm(span=20).mean() +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "ewm_mean", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_groupby_mean.py b/benchmarks/pandas/bench_groupby_mean.py new file mode 100644 index 00000000..050959af --- /dev/null +++ b/benchmarks/pandas/bench_groupby_mean.py @@ -0,0 +1,27 @@ +"""Benchmark: GroupBy mean on 100k-row DataFrame""" +import json, time +import numpy as np +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 + +keys = [f"group_{i % 100}" for i in range(ROWS)] +vals = np.arange(ROWS, dtype=np.float64) * 0.1 +df = pd.DataFrame({"key": keys, "value": vals}) + +for _ in range(WARMUP): + df.groupby("key")["value"].mean() + +start = time.perf_counter() +for _ in range(ITERATIONS): + df.groupby("key")["value"].mean() +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "groupby_mean", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_merge.py b/benchmarks/pandas/bench_merge.py new file mode 100644 index 00000000..9775f4a2 --- /dev/null +++ b/benchmarks/pandas/bench_merge.py @@ -0,0 +1,29 @@ +"""Benchmark: merge — inner join two 50k-row DataFrames on a key column""" +import json, time +import numpy as np +import pandas as pd + +ROWS = 50_000 +WARMUP = 3 +ITERATIONS = 10 + +keys = np.arange(ROWS) % 1000 +vals1 = np.arange(ROWS, dtype=np.float64) +vals2 = np.arange(ROWS, dtype=np.float64) * 2.0 +df1 = pd.DataFrame({"key": keys, "val1": vals1}) +df2 = pd.DataFrame({"key": keys, "val2": vals2}) + +for _ in range(WARMUP): + pd.merge(df1, df2, on="key", how="inner") + +start = time.perf_counter() +for _ in range(ITERATIONS): + pd.merge(df1, df2, on="key", how="inner") +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "merge", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_pivot_table.py b/benchmarks/pandas/bench_pivot_table.py new file mode 100644 index 00000000..f65f9321 --- /dev/null +++ b/benchmarks/pandas/bench_pivot_table.py @@ -0,0 +1,28 @@ +"""Benchmark: pivot_table — pivot aggregation on 100k-row DataFrame""" +import json, time +import numpy as np +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 + +rows = [f"row_{i % 100}" for i in range(ROWS)] +cols = [f"col_{i % 50}" for i in range(ROWS)] +vals = np.arange(ROWS, dtype=np.float64) * 0.1 +df = pd.DataFrame({"row": rows, "col": cols, "value": vals}) + +for _ in range(WARMUP): + df.pivot_table(values="value", index="row", columns="col", aggfunc="mean") + +start = time.perf_counter() +for _ in range(ITERATIONS): + df.pivot_table(values="value", index="row", columns="col", aggfunc="mean") +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "pivot_table", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_read_csv.py b/benchmarks/pandas/bench_read_csv.py new file mode 100644 index 00000000..d6aa816a --- /dev/null +++ b/benchmarks/pandas/bench_read_csv.py @@ -0,0 +1,30 @@ +"""Benchmark: read_csv — parse a 100k-row CSV file""" +import json, time, os, tempfile +import numpy as np +import pandas as pd + +ROWS = 100_000 +WARMUP = 2 +ITERATIONS = 5 + +# Build CSV file +tmp_path = "/tmp/gh-aw/agent/bench_read_csv.csv" +with open(tmp_path, "w") as f: + f.write("id,value,label\n") + for i in range(ROWS): + f.write(f"{i},{i * 1.1:.4f},cat_{i % 50}\n") + +for _ in range(WARMUP): + pd.read_csv(tmp_path) + +start = time.perf_counter() +for _ in range(ITERATIONS): + pd.read_csv(tmp_path) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "read_csv", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_rolling_mean.py b/benchmarks/pandas/bench_rolling_mean.py new file mode 100644 index 00000000..5258fca4 --- /dev/null +++ b/benchmarks/pandas/bench_rolling_mean.py @@ -0,0 +1,26 @@ +"""Benchmark: rolling mean with window=100 on 100k-element Series""" +import json, time +import numpy as np +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 + +data = np.sin(np.arange(ROWS) * 0.01) +s = pd.Series(data) + +for _ in range(WARMUP): + s.rolling(100).mean() + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.rolling(100).mean() +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "rolling_mean", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_series_arithmetic.py b/benchmarks/pandas/bench_series_arithmetic.py new file mode 100644 index 00000000..4f0325b0 --- /dev/null +++ b/benchmarks/pandas/bench_series_arithmetic.py @@ -0,0 +1,26 @@ +"""Benchmark: Series arithmetic (add + multiply on 100k-element Series)""" +import json, time +import numpy as np +import pandas as pd + +ROWS = 100_000 +WARMUP = 5 +ITERATIONS = 20 + +data = np.arange(ROWS, dtype=np.float64) * 0.5 +s = pd.Series(data) + +for _ in range(WARMUP): + (s + 2.0) * 0.5 + +start = time.perf_counter() +for _ in range(ITERATIONS): + (s + 2.0) * 0.5 +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "series_arithmetic", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_series_cumsum.py b/benchmarks/pandas/bench_series_cumsum.py new file mode 100644 index 00000000..556e3ebd --- /dev/null +++ b/benchmarks/pandas/bench_series_cumsum.py @@ -0,0 +1,26 @@ +"""Benchmark: series_cumsum — cumulative sum on 100k-element Series""" +import json, time +import numpy as np +import pandas as pd + +ROWS = 100_000 +WARMUP = 5 +ITERATIONS = 20 + +data = np.arange(ROWS, dtype=np.float64) * 0.001 +s = pd.Series(data) + +for _ in range(WARMUP): + s.cumsum() + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.cumsum() +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "series_cumsum", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_series_fillna.py b/benchmarks/pandas/bench_series_fillna.py new file mode 100644 index 00000000..6b62f6ad --- /dev/null +++ b/benchmarks/pandas/bench_series_fillna.py @@ -0,0 +1,26 @@ +"""Benchmark: series_fillna — fill NaN values in a 100k-element Series""" +import json, time +import numpy as np +import pandas as pd + +ROWS = 100_000 +WARMUP = 5 +ITERATIONS = 20 + +data = np.where(np.arange(ROWS) % 5 == 0, np.nan, np.arange(ROWS) * 1.1) +s = pd.Series(data) + +for _ in range(WARMUP): + s.fillna(0.0) + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.fillna(0.0) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "series_fillna", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_series_shift.py b/benchmarks/pandas/bench_series_shift.py new file mode 100644 index 00000000..0b294485 --- /dev/null +++ b/benchmarks/pandas/bench_series_shift.py @@ -0,0 +1,26 @@ +"""Benchmark: series_shift — shift values by 1 position in a 100k-element Series""" +import json, time +import numpy as np +import pandas as pd + +ROWS = 100_000 +WARMUP = 5 +ITERATIONS = 20 + +data = np.arange(ROWS, dtype=np.float64) +s = pd.Series(data) + +for _ in range(WARMUP): + s.shift(1) + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.shift(1) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "series_shift", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_series_sort.py b/benchmarks/pandas/bench_series_sort.py new file mode 100644 index 00000000..c31de4aa --- /dev/null +++ b/benchmarks/pandas/bench_series_sort.py @@ -0,0 +1,27 @@ +"""Benchmark: Series sort (sort_values on 100k-element numeric Series)""" +import json, time +import numpy as np +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 + +rng = np.random.default_rng(42) +data = rng.random(ROWS) * 1000 +s = pd.Series(data) + +for _ in range(WARMUP): + s.sort_values() + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.sort_values() +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "series_sort", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_series_string_ops.py b/benchmarks/pandas/bench_series_string_ops.py new file mode 100644 index 00000000..8744ddcc --- /dev/null +++ b/benchmarks/pandas/bench_series_string_ops.py @@ -0,0 +1,27 @@ +"""Benchmark: series_string_ops — str.upper and str.contains on 100k strings""" +import json, time +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 + +data = [f"hello_world_{i % 200}" for i in range(ROWS)] +s = pd.Series(data) + +for _ in range(WARMUP): + s.str.upper() + s.str.contains("world") + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.str.upper() + s.str.contains("world") +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "series_string_ops", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_series_value_counts.py b/benchmarks/pandas/bench_series_value_counts.py new file mode 100644 index 00000000..c156a1eb --- /dev/null +++ b/benchmarks/pandas/bench_series_value_counts.py @@ -0,0 +1,25 @@ +"""Benchmark: value_counts on a 100k-element Series with 100 distinct values""" +import json, time +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 + +data = [f"cat_{i % 100}" for i in range(ROWS)] +s = pd.Series(data) + +for _ in range(WARMUP): + s.value_counts() + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.value_counts() +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "series_value_counts", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/results.json b/benchmarks/results.json index 7d1fa6ec..c883f334 100644 --- a/benchmarks/results.json +++ b/benchmarks/results.json @@ -1 +1,247 @@ -{ "benchmarks": [], "timestamp": null } +{ + "benchmarks": [ + { + "function": "concat", + "tsb": null, + "pandas": { + "function": "concat", + "mean_ms": 0.11375509999993483, + "iterations": 20, + "total_ms": 2.2751019999986966 + }, + "ratio": null + }, + { + "function": "dataframe_apply", + "tsb": null, + "pandas": { + "function": "dataframe_apply", + "mean_ms": 47.161531699998704, + "iterations": 10, + "total_ms": 471.61531699998704 + }, + "ratio": null + }, + { + "function": "dataframe_creation", + "tsb": null, + "pandas": { + "function": "dataframe_creation", + "mean_ms": 5.148059900000135, + "iterations": 10, + "total_ms": 51.48059900000135 + }, + "ratio": null + }, + { + "function": "dataframe_dropna", + "tsb": null, + "pandas": { + "function": "dataframe_dropna", + "mean_ms": 2.42739894999886, + "iterations": 20, + "total_ms": 48.547978999977204 + }, + "ratio": null + }, + { + "function": "dataframe_filter", + "tsb": null, + "pandas": { + "function": "dataframe_filter", + "mean_ms": 0.4964389500003108, + "iterations": 20, + "total_ms": 9.928779000006216 + }, + "ratio": null + }, + { + "function": "dataframe_rename", + "tsb": null, + "pandas": { + "function": "dataframe_rename", + "mean_ms": 0.17103454999869427, + "iterations": 20, + "total_ms": 3.4206909999738855 + }, + "ratio": null + }, + { + "function": "dataframe_sort", + "tsb": null, + "pandas": { + "function": "dataframe_sort", + "mean_ms": 33.301584399998774, + "iterations": 10, + "total_ms": 333.01584399998774 + }, + "ratio": null + }, + { + "function": "describe", + "tsb": null, + "pandas": { + "function": "describe", + "mean_ms": 5.521558600003118, + "iterations": 10, + "total_ms": 55.21558600003118 + }, + "ratio": null + }, + { + "function": "ewm_mean", + "tsb": null, + "pandas": { + "function": "ewm_mean", + "mean_ms": 1.7652839999982461, + "iterations": 10, + "total_ms": 17.65283999998246 + }, + "ratio": null + }, + { + "function": "groupby_mean", + "tsb": null, + "pandas": { + "function": "groupby_mean", + "mean_ms": 8.079756900002621, + "iterations": 10, + "total_ms": 80.79756900002621 + }, + "ratio": null + }, + { + "function": "merge", + "tsb": null, + "pandas": { + "function": "merge", + "mean_ms": 60.42320619999941, + "iterations": 10, + "total_ms": 604.2320619999941 + }, + "ratio": null + }, + { + "function": "pivot_table", + "tsb": null, + "pandas": { + "function": "pivot_table", + "mean_ms": 22.500251999997545, + "iterations": 10, + "total_ms": 225.00251999997545 + }, + "ratio": null + }, + { + "function": "read_csv", + "tsb": null, + "pandas": { + "function": "read_csv", + "mean_ms": 29.951929399999244, + "iterations": 5, + "total_ms": 149.75964699999622 + }, + "ratio": null + }, + { + "function": "rolling_mean", + "tsb": null, + "pandas": { + "function": "rolling_mean", + "mean_ms": 1.71982609999759, + "iterations": 10, + "total_ms": 17.1982609999759 + }, + "ratio": null + }, + { + "function": "series_arithmetic", + "tsb": null, + "pandas": { + "function": "series_arithmetic", + "mean_ms": 0.764571400000591, + "iterations": 20, + "total_ms": 15.29142800001182 + }, + "ratio": null + }, + { + "function": "series_creation", + "tsb": null, + "pandas": { + "function": "series_creation", + "mean_ms": 7.607, + "iterations": 50, + "total_ms": 380.349 + }, + "ratio": null + }, + { + "function": "series_cumsum", + "tsb": null, + "pandas": { + "function": "series_cumsum", + "mean_ms": 1.1250383499998406, + "iterations": 20, + "total_ms": 22.500766999996813 + }, + "ratio": null + }, + { + "function": "series_fillna", + "tsb": null, + "pandas": { + "function": "series_fillna", + "mean_ms": 0.18527670000025864, + "iterations": 20, + "total_ms": 3.705534000005173 + }, + "ratio": null + }, + { + "function": "series_shift", + "tsb": null, + "pandas": { + "function": "series_shift", + "mean_ms": 0.07249699999931636, + "iterations": 20, + "total_ms": 1.4499399999863272 + }, + "ratio": null + }, + { + "function": "series_sort", + "tsb": null, + "pandas": { + "function": "series_sort", + "mean_ms": 5.127767300001551, + "iterations": 10, + "total_ms": 51.27767300001551 + }, + "ratio": null + }, + { + "function": "series_string_ops", + "tsb": null, + "pandas": { + "function": "series_string_ops", + "mean_ms": 34.08206670000027, + "iterations": 10, + "total_ms": 340.8206670000027 + }, + "ratio": null + }, + { + "function": "series_value_counts", + "tsb": null, + "pandas": { + "function": "series_value_counts", + "mean_ms": 9.212644899997713, + "iterations": 10, + "total_ms": 92.12644899997713 + }, + "ratio": null + } + ], + "timestamp": "2026-04-12T15:46:00Z" +} \ No newline at end of file diff --git a/benchmarks/tsb/bench_concat.ts b/benchmarks/tsb/bench_concat.ts new file mode 100644 index 00000000..7a72f777 --- /dev/null +++ b/benchmarks/tsb/bench_concat.ts @@ -0,0 +1,32 @@ +/** + * Benchmark: concat — concatenate two 50k-row DataFrames + */ +import { DataFrame, concat } from "../../src/index.js"; + +const ROWS = 50_000; +const WARMUP = 5; +const ITERATIONS = 20; + +const vals1 = Float64Array.from({ length: ROWS }, (_, i) => i * 1.0); +const vals2 = Float64Array.from({ length: ROWS }, (_, i) => i * 2.0); +const df1 = new DataFrame({ value: vals1 }); +const df2 = new DataFrame({ value: vals2 }); + +for (let i = 0; i < WARMUP; i++) { + concat([df1, df2]); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + concat([df1, df2]); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "concat", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_dataframe_apply.ts b/benchmarks/tsb/bench_dataframe_apply.ts new file mode 100644 index 00000000..32a99a68 --- /dev/null +++ b/benchmarks/tsb/bench_dataframe_apply.ts @@ -0,0 +1,32 @@ +/** + * Benchmark: dataframe_apply — apply a function across rows of a 10k-row DataFrame + * (reduced size due to JS per-row overhead) + */ +import { DataFrame } from "../../src/index.js"; + +const ROWS = 10_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const a = Float64Array.from({ length: ROWS }, (_, i) => i * 1.0); +const b = Float64Array.from({ length: ROWS }, (_, i) => i * 2.0); +const df = new DataFrame({ a, b }); + +for (let i = 0; i < WARMUP; i++) { + df.apply((row) => (row["a"] as number) + (row["b"] as number), { axis: 1 }); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + df.apply((row) => (row["a"] as number) + (row["b"] as number), { axis: 1 }); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "dataframe_apply", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_dataframe_creation.ts b/benchmarks/tsb/bench_dataframe_creation.ts new file mode 100644 index 00000000..2eb8fd56 --- /dev/null +++ b/benchmarks/tsb/bench_dataframe_creation.ts @@ -0,0 +1,33 @@ +/** + * Benchmark: DataFrame creation from arrays + * Creates a 3-column (2 numeric + 1 string) 100k-row DataFrame + */ +import { DataFrame } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const nums1 = Float64Array.from({ length: ROWS }, (_, i) => i * 1.1); +const nums2 = Float64Array.from({ length: ROWS }, (_, i) => i * 2.2); +const strs = Array.from({ length: ROWS }, (_, i) => `label_${i % 100}`); + +// Warm up +for (let i = 0; i < WARMUP; i++) { + new DataFrame({ a: nums1, b: nums2, c: strs }); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + new DataFrame({ a: nums1, b: nums2, c: strs }); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "dataframe_creation", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_dataframe_dropna.ts b/benchmarks/tsb/bench_dataframe_dropna.ts new file mode 100644 index 00000000..e4fef46b --- /dev/null +++ b/benchmarks/tsb/bench_dataframe_dropna.ts @@ -0,0 +1,31 @@ +/** + * Benchmark: dataframe_dropna — drop rows with NaN values from 100k-row DataFrame + */ +import { DataFrame } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 5; +const ITERATIONS = 20; + +const a = Float64Array.from({ length: ROWS }, (_, i) => (i % 10 === 0 ? NaN : i * 1.1)); +const b = Float64Array.from({ length: ROWS }, (_, i) => (i % 7 === 0 ? NaN : i * 2.2)); +const df = new DataFrame({ a, b }); + +for (let i = 0; i < WARMUP; i++) { + df.dropna(); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + df.dropna(); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "dataframe_dropna", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_dataframe_filter.ts b/benchmarks/tsb/bench_dataframe_filter.ts new file mode 100644 index 00000000..57d78bd7 --- /dev/null +++ b/benchmarks/tsb/bench_dataframe_filter.ts @@ -0,0 +1,30 @@ +/** + * Benchmark: DataFrame filter (boolean mask on 100k-row DataFrame) + */ +import { DataFrame } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 5; +const ITERATIONS = 20; + +const vals = Float64Array.from({ length: ROWS }, (_, i) => i * 0.1); +const df = new DataFrame({ value: vals }); + +for (let i = 0; i < WARMUP; i++) { + df.filter((row) => (row["value"] as number) > 5000); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + df.filter((row) => (row["value"] as number) > 5000); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "dataframe_filter", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_dataframe_rename.ts b/benchmarks/tsb/bench_dataframe_rename.ts new file mode 100644 index 00000000..807b63c9 --- /dev/null +++ b/benchmarks/tsb/bench_dataframe_rename.ts @@ -0,0 +1,31 @@ +/** + * Benchmark: dataframe_rename — rename columns in a 100k-row DataFrame + */ +import { DataFrame } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 5; +const ITERATIONS = 20; + +const a = Float64Array.from({ length: ROWS }, (_, i) => i * 1.1); +const b = Float64Array.from({ length: ROWS }, (_, i) => i * 2.2); +const df = new DataFrame({ old_a: a, old_b: b }); + +for (let i = 0; i < WARMUP; i++) { + df.rename({ old_a: "new_a", old_b: "new_b" }); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + df.rename({ old_a: "new_a", old_b: "new_b" }); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "dataframe_rename", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_dataframe_sort.ts b/benchmarks/tsb/bench_dataframe_sort.ts new file mode 100644 index 00000000..707e4ecf --- /dev/null +++ b/benchmarks/tsb/bench_dataframe_sort.ts @@ -0,0 +1,31 @@ +/** + * Benchmark: dataframe_sort — sort a 100k-row DataFrame by two columns + */ +import { DataFrame } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const a = Array.from({ length: ROWS }, (_, i) => `group_${i % 100}`); +const b = Float64Array.from({ length: ROWS }, () => Math.random() * 1000); +const df = new DataFrame({ a, b }); + +for (let i = 0; i < WARMUP; i++) { + df.sort_values(["a", "b"]); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + df.sort_values(["a", "b"]); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "dataframe_sort", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_describe.ts b/benchmarks/tsb/bench_describe.ts new file mode 100644 index 00000000..368156a3 --- /dev/null +++ b/benchmarks/tsb/bench_describe.ts @@ -0,0 +1,31 @@ +/** + * Benchmark: describe — summary statistics on a 100k-row DataFrame + */ +import { DataFrame } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const a = Float64Array.from({ length: ROWS }, (_, i) => i * 1.1); +const b = Float64Array.from({ length: ROWS }, (_, i) => Math.sqrt(i + 1)); +const df = new DataFrame({ a, b }); + +for (let i = 0; i < WARMUP; i++) { + df.describe(); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + df.describe(); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "describe", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_ewm_mean.ts b/benchmarks/tsb/bench_ewm_mean.ts new file mode 100644 index 00000000..8e6597f7 --- /dev/null +++ b/benchmarks/tsb/bench_ewm_mean.ts @@ -0,0 +1,30 @@ +/** + * Benchmark: ewm_mean — exponentially weighted mean on 100k-element Series + */ +import { Series } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const data = Float64Array.from({ length: ROWS }, (_, i) => Math.sin(i * 0.05)); +const s = new Series(data); + +for (let i = 0; i < WARMUP; i++) { + s.ewm({ span: 20 }).mean(); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + s.ewm({ span: 20 }).mean(); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "ewm_mean", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_groupby_mean.ts b/benchmarks/tsb/bench_groupby_mean.ts new file mode 100644 index 00000000..efecfddb --- /dev/null +++ b/benchmarks/tsb/bench_groupby_mean.ts @@ -0,0 +1,31 @@ +/** + * Benchmark: GroupBy mean on 100k-row DataFrame + */ +import { DataFrame } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const keys = Array.from({ length: ROWS }, (_, i) => `group_${i % 100}`); +const vals = Float64Array.from({ length: ROWS }, (_, i) => i * 0.1); +const df = new DataFrame({ key: keys, value: vals }); + +for (let i = 0; i < WARMUP; i++) { + df.groupby("key").mean(); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + df.groupby("key").mean(); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "groupby_mean", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_merge.ts b/benchmarks/tsb/bench_merge.ts new file mode 100644 index 00000000..da68b52b --- /dev/null +++ b/benchmarks/tsb/bench_merge.ts @@ -0,0 +1,33 @@ +/** + * Benchmark: merge — inner join two 50k-row DataFrames on a key column + */ +import { DataFrame, merge } from "../../src/index.js"; + +const ROWS = 50_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const keys = Array.from({ length: ROWS }, (_, i) => i % 1000); +const vals1 = Float64Array.from({ length: ROWS }, (_, i) => i * 1.0); +const vals2 = Float64Array.from({ length: ROWS }, (_, i) => i * 2.0); +const df1 = new DataFrame({ key: keys, val1: vals1 }); +const df2 = new DataFrame({ key: keys, val2: vals2 }); + +for (let i = 0; i < WARMUP; i++) { + merge(df1, df2, { on: "key", how: "inner" }); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + merge(df1, df2, { on: "key", how: "inner" }); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "merge", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_pivot_table.ts b/benchmarks/tsb/bench_pivot_table.ts new file mode 100644 index 00000000..78b94702 --- /dev/null +++ b/benchmarks/tsb/bench_pivot_table.ts @@ -0,0 +1,32 @@ +/** + * Benchmark: pivot_table — pivot aggregation on 100k-row DataFrame + */ +import { DataFrame } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const rows = Array.from({ length: ROWS }, (_, i) => `row_${i % 100}`); +const cols = Array.from({ length: ROWS }, (_, i) => `col_${i % 50}`); +const vals = Float64Array.from({ length: ROWS }, (_, i) => i * 0.1); +const df = new DataFrame({ row: rows, col: cols, value: vals }); + +for (let i = 0; i < WARMUP; i++) { + df.pivot_table({ values: "value", index: "row", columns: "col", aggfunc: "mean" }); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + df.pivot_table({ values: "value", index: "row", columns: "col", aggfunc: "mean" }); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "pivot_table", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_read_csv.ts b/benchmarks/tsb/bench_read_csv.ts new file mode 100644 index 00000000..0d9462bf --- /dev/null +++ b/benchmarks/tsb/bench_read_csv.ts @@ -0,0 +1,39 @@ +/** + * Benchmark: read_csv — parse a 100k-row CSV string + */ +import { read_csv } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 2; +const ITERATIONS = 5; + +// Build CSV string +const lines = ["id,value,label"]; +for (let i = 0; i < ROWS; i++) { + lines.push(`${i},${(i * 1.1).toFixed(4)},cat_${i % 50}`); +} +const csvContent = lines.join("\n"); + +// Write to a temp file +import { writeFileSync } from "node:fs"; +const tmpPath = "/tmp/gh-aw/agent/bench_read_csv.csv"; +writeFileSync(tmpPath, csvContent, "utf8"); + +for (let i = 0; i < WARMUP; i++) { + read_csv(tmpPath); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + read_csv(tmpPath); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "read_csv", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_rolling_mean.ts b/benchmarks/tsb/bench_rolling_mean.ts new file mode 100644 index 00000000..646d3100 --- /dev/null +++ b/benchmarks/tsb/bench_rolling_mean.ts @@ -0,0 +1,30 @@ +/** + * Benchmark: rolling mean with window=100 on 100k-element Series + */ +import { Series } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const data = Float64Array.from({ length: ROWS }, (_, i) => Math.sin(i * 0.01)); +const s = new Series(data); + +for (let i = 0; i < WARMUP; i++) { + s.rolling(100).mean(); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + s.rolling(100).mean(); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "rolling_mean", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_series_arithmetic.ts b/benchmarks/tsb/bench_series_arithmetic.ts new file mode 100644 index 00000000..552be2ca --- /dev/null +++ b/benchmarks/tsb/bench_series_arithmetic.ts @@ -0,0 +1,30 @@ +/** + * Benchmark: Series arithmetic (add + multiply on 100k-element Series) + */ +import { Series } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 5; +const ITERATIONS = 20; + +const data = Float64Array.from({ length: ROWS }, (_, i) => i * 0.5); +const s = new Series(data); + +for (let i = 0; i < WARMUP; i++) { + s.add(2.0).mul(0.5); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + s.add(2.0).mul(0.5); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "series_arithmetic", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_series_cumsum.ts b/benchmarks/tsb/bench_series_cumsum.ts new file mode 100644 index 00000000..3eeba5b0 --- /dev/null +++ b/benchmarks/tsb/bench_series_cumsum.ts @@ -0,0 +1,30 @@ +/** + * Benchmark: series_cumsum — cumulative sum on 100k-element Series + */ +import { Series } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 5; +const ITERATIONS = 20; + +const data = Float64Array.from({ length: ROWS }, (_, i) => i * 0.001); +const s = new Series(data); + +for (let i = 0; i < WARMUP; i++) { + s.cumsum(); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + s.cumsum(); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "series_cumsum", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_series_fillna.ts b/benchmarks/tsb/bench_series_fillna.ts new file mode 100644 index 00000000..3e658b01 --- /dev/null +++ b/benchmarks/tsb/bench_series_fillna.ts @@ -0,0 +1,31 @@ +/** + * Benchmark: series_fillna — fill NaN/null values in a 100k-element Series + */ +import { Series } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 5; +const ITERATIONS = 20; + +// Create series with every 5th value as NaN +const data = Float64Array.from({ length: ROWS }, (_, i) => (i % 5 === 0 ? NaN : i * 1.1)); +const s = new Series(data); + +for (let i = 0; i < WARMUP; i++) { + s.fillna(0.0); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + s.fillna(0.0); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "series_fillna", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_series_shift.ts b/benchmarks/tsb/bench_series_shift.ts new file mode 100644 index 00000000..46e79d19 --- /dev/null +++ b/benchmarks/tsb/bench_series_shift.ts @@ -0,0 +1,30 @@ +/** + * Benchmark: series_shift — shift values by 1 position in a 100k-element Series + */ +import { Series } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 5; +const ITERATIONS = 20; + +const data = Float64Array.from({ length: ROWS }, (_, i) => i * 1.0); +const s = new Series(data); + +for (let i = 0; i < WARMUP; i++) { + s.shift(1); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + s.shift(1); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "series_shift", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_series_sort.ts b/benchmarks/tsb/bench_series_sort.ts new file mode 100644 index 00000000..c6aedb93 --- /dev/null +++ b/benchmarks/tsb/bench_series_sort.ts @@ -0,0 +1,30 @@ +/** + * Benchmark: Series sort (argsort on 100k-element numeric Series) + */ +import { Series } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const data = Float64Array.from({ length: ROWS }, () => Math.random() * 1000); +const s = new Series(data); + +for (let i = 0; i < WARMUP; i++) { + s.sort_values(); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + s.sort_values(); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "series_sort", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_series_string_ops.ts b/benchmarks/tsb/bench_series_string_ops.ts new file mode 100644 index 00000000..c44cdefe --- /dev/null +++ b/benchmarks/tsb/bench_series_string_ops.ts @@ -0,0 +1,32 @@ +/** + * Benchmark: series_string_ops — str.upper and str.contains on 100k strings + */ +import { Series } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const data = Array.from({ length: ROWS }, (_, i) => `hello_world_${i % 200}`); +const s = new Series(data); + +for (let i = 0; i < WARMUP; i++) { + s.str.upper(); + s.str.contains("world"); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + s.str.upper(); + s.str.contains("world"); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "series_string_ops", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_series_value_counts.ts b/benchmarks/tsb/bench_series_value_counts.ts new file mode 100644 index 00000000..b5352f54 --- /dev/null +++ b/benchmarks/tsb/bench_series_value_counts.ts @@ -0,0 +1,30 @@ +/** + * Benchmark: value_counts on a 100k-element Series with 100 distinct values + */ +import { Series } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const data = Array.from({ length: ROWS }, (_, i) => `cat_${i % 100}`); +const s = new Series(data); + +for (let i = 0; i < WARMUP; i++) { + s.value_counts(); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + s.value_counts(); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "series_value_counts", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/playground/benchmarks.html b/playground/benchmarks.html index 6b5dde65..c4a74f9f 100644 --- a/playground/benchmarks.html +++ b/playground/benchmarks.html @@ -300,43 +300,58 @@

🤖 About

// Find max time for scaling bars let maxTime = 0; for (const b of benchmarks) { - maxTime = Math.max(maxTime, b.tsb.mean_ms, b.pandas.mean_ms); + if (b.tsb != null) maxTime = Math.max(maxTime, b.tsb.mean_ms); + if (b.pandas != null) maxTime = Math.max(maxTime, b.pandas.mean_ms); } // Render bar chart for (const b of benchmarks) { const label = b.function.replace(/_/g, " "); - const tsPct = (b.tsb.mean_ms / maxTime) * 100; - const pyPct = (b.pandas.mean_ms / maxTime) * 100; + const pyPct = b.pandas != null ? (b.pandas.mean_ms / maxTime) * 100 : 0; + const tsPct = b.tsb != null ? (b.tsb.mean_ms / maxTime) * 100 : 0; + + const tsBar = b.tsb != null + ? '
' + b.tsb.mean_ms.toFixed(3) + ' ms
' + : '
pending
'; + const pyBar = b.pandas != null + ? '
' + b.pandas.mean_ms.toFixed(3) + ' ms
' + : '
pending
'; const row = document.createElement("div"); row.className = "bar-row"; row.innerHTML = '
' + label + '
' + - '
' + - '
' + b.tsb.mean_ms + ' ms
' + - '
' + b.pandas.mean_ms + ' ms
' + - '
'; + '
' + tsBar + pyBar + '
'; barChart.appendChild(row); } // Render table for (const b of benchmarks) { - const ratio = b.ratio; - const faster = ratio < 1 ? "tsb" : "pandas"; - const badgeClass = ratio < 1 ? "fast" : "slow"; - const fasterClass = ratio < 1 ? "faster-tsb" : "faster-pandas"; - const displayRatio = ratio < 1 - ? (1 / ratio).toFixed(2) + "x faster" - : ratio.toFixed(2) + "x slower"; + const ratio = (b.tsb != null && b.pandas != null && b.pandas.mean_ms > 0) + ? b.tsb.mean_ms / b.pandas.mean_ms + : null; + const faster = ratio != null ? (ratio < 1 ? "tsb" : "pandas") : "—"; + const badgeClass = ratio != null ? (ratio < 1 ? "fast" : "slow") : ""; + const fasterClass = ratio != null ? (ratio < 1 ? "faster-tsb" : "faster-pandas") : ""; + const ratioDisplay = ratio != null + ? '' + ratio.toFixed(3) + "x" + : "—"; + const displayRatio = ratio != null + ? (ratio < 1 + ? (1 / ratio).toFixed(2) + "x faster" + : ratio.toFixed(2) + "x slower") + : ""; + const fasterDisplay = ratio != null ? faster + " (" + displayRatio + ")" : "—"; + const tsMsDisplay = b.tsb != null ? b.tsb.mean_ms.toFixed(3) : "—"; + const pyMsDisplay = b.pandas != null ? b.pandas.mean_ms.toFixed(3) : "—"; const tr = document.createElement("tr"); tr.innerHTML = "" + b.function.replace(/_/g, " ") + "" + - "" + b.tsb.mean_ms + "" + - "" + b.pandas.mean_ms + "" + - '' + ratio + "x" + - '' + faster + " (" + displayRatio + ")"; + "" + tsMsDisplay + "" + + "" + pyMsDisplay + "" + + "" + ratioDisplay + "" + + '' + fasterDisplay + ""; benchTbody.appendChild(tr); } })();