githubnext · mrjf · Apr 17, 2026 · Apr 17, 2026 · Apr 17, 2026 · Apr 17, 2026
diff --git a/benchmarks/pandas/bench_applySeries_fn.py b/benchmarks/pandas/bench_applySeries_fn.py
@@ -0,0 +1,34 @@
+"""
+Benchmark: pandas Series.apply() with (value) lambda — 100k-element Series.
+Mirrors tsb's applySeries (stats/apply.ts) behavior.
+Outputs JSON: {"function": "applySeries_fn", "mean_ms": ..., "iterations": ..., "total_ms": ...}
+"""
+import json
+import time
+import pandas as pd
+
+SIZE = 100_000  # number of elements in the benchmarked Series
+WARMUP = 5  # untimed calls made before measurement starts
+ITERATIONS = 30  # timed calls that feed the reported mean
+
+s = pd.Series([i * 0.5 for i in range(SIZE)])  # values 0.0, 0.5, 1.0, ...
+
+fn = lambda v: v * 2 + 1  # noqa: E731
+
+for _ in range(WARMUP):
+    s.apply(fn)  # result discarded; warm-up only
+
+times = []  # one wall-clock duration per timed iteration, in milliseconds
+for _ in range(ITERATIONS):
+    t0 = time.perf_counter()
+    s.apply(fn)  # the only statement inside the timed region
+    times.append((time.perf_counter() - t0) * 1000)  # seconds -> milliseconds
+
+total_ms = sum(times)
+mean_ms = total_ms / ITERATIONS
+print(json.dumps({  # emit a single JSON object on stdout
+    "function": "applySeries_fn",
+    "mean_ms": mean_ms,
+    "iterations": ITERATIONS,
+    "total_ms": total_ms,
+}))
diff --git a/benchmarks/pandas/bench_categorical_index_modify.py b/benchmarks/pandas/bench_categorical_index_modify.py
@@ -0,0 +1,46 @@
+"""
+Benchmark: pandas CategoricalIndex modification — rename_categories, reorder_categories, remove_categories,
+set_categories, remove_unused_categories, as_ordered, as_unordered on a 10k-element index.
+Outputs JSON: {"function": "categorical_index_modify", "mean_ms": ..., "iterations": ..., "total_ms": ...}
+"""
+import json
+import time
+import pandas as pd
+
+SIZE = 10_000  # number of labels in the CategoricalIndex
+WARMUP = 5  # untimed passes made before measurement starts
+ITERATIONS = 50  # timed passes that feed the reported mean
+
+CATS = ["alpha", "beta", "gamma", "delta", "epsilon"]
+labels = [CATS[i % len(CATS)] for i in range(SIZE)]  # cycles through CATS in order
+ci = pd.CategoricalIndex(labels)
+
+for _ in range(WARMUP):  # same seven calls as the timed loop, results discarded
+    ci.rename_categories(["A", "B", "C", "D", "E"])
+    ci.reorder_categories(["epsilon", "delta", "gamma", "beta", "alpha"])
+    ci.remove_categories(["epsilon"])
+    ci.set_categories(["alpha", "beta", "gamma"])
+    ci.remove_unused_categories()
+    ci.as_ordered()
+    ci.as_unordered()
+
+times = []  # one wall-clock duration per timed pass, in milliseconds
+for _ in range(ITERATIONS):
+    t0 = time.perf_counter()
+    ci.rename_categories(["A", "B", "C", "D", "E"])  # each call starts from the same `ci`; return values are discarded
+    ci.reorder_categories(["epsilon", "delta", "gamma", "beta", "alpha"])
+    ci.remove_categories(["epsilon"])
+    ci.set_categories(["alpha", "beta", "gamma"])
+    ci.remove_unused_categories()
+    ci.as_ordered()
+    ci.as_unordered()
+    times.append((time.perf_counter() - t0) * 1000)  # all seven calls are timed together
+
+total_ms = sum(times)
+mean_ms = total_ms / ITERATIONS  # mean per pass, not per individual method call
+print(json.dumps({  # emit a single JSON object on stdout
+    "function": "categorical_index_modify",
+    "mean_ms": mean_ms,
+    "iterations": ITERATIONS,
+    "total_ms": total_ms,
+}))
diff --git a/benchmarks/pandas/bench_clip_dataframe_with_bounds.py b/benchmarks/pandas/bench_clip_dataframe_with_bounds.py
@@ -0,0 +1,39 @@
+"""
+Benchmark: pandas DataFrame.clip with Series bounds (axis=0) on 100k-row DataFrame.
+Outputs JSON: {"function": "clip_dataframe_with_bounds", "mean_ms": ..., "iterations": ..., "total_ms": ...}
+"""
+import json
+import time
+import numpy as np
+import pandas as pd
+
+SIZE = 100_000  # number of rows in the DataFrame and in each bounds Series
+WARMUP = 5  # untimed calls made before measurement starts
+ITERATIONS = 30  # timed calls that feed the reported mean
+
+df = pd.DataFrame({
+    "a": [(i % 200) - 100 for i in range(SIZE)],  # cycles through -100..99
+    "b": [(i % 150) - 75 for i in range(SIZE)],  # cycles through -75..74
+    "c": [(i % 100) - 50 for i in range(SIZE)],  # cycles through -50..49
+})
+
+lower_bounds = pd.Series([(i % 40) - 20 for i in range(SIZE)])  # cycles through -20..19
+upper_bounds = pd.Series([(i % 40) + 20 for i in range(SIZE)])  # cycles through 20..59, always lower + 40
+
+for _ in range(WARMUP):
+    df.clip(lower=lower_bounds, upper=upper_bounds, axis=0)  # result discarded; warm-up only
+
+times = []  # one wall-clock duration per timed iteration, in milliseconds
+for _ in range(ITERATIONS):
+    t0 = time.perf_counter()
+    df.clip(lower=lower_bounds, upper=upper_bounds, axis=0)  # the only statement inside the timed region
+    times.append((time.perf_counter() - t0) * 1000)  # seconds -> milliseconds
+
+total_ms = sum(times)
+mean_ms = total_ms / ITERATIONS
+print(json.dumps({  # emit a single JSON object on stdout
+    "function": "clip_dataframe_with_bounds",
+    "mean_ms": mean_ms,
+    "iterations": ITERATIONS,
+    "total_ms": total_ms,
+}))
diff --git a/benchmarks/pandas/bench_clip_series_with_bounds.py b/benchmarks/pandas/bench_clip_series_with_bounds.py
@@ -0,0 +1,35 @@
+"""
+Benchmark: pandas Series.clip with per-element Series bounds on 100k values.
+Outputs JSON: {"function": "clip_series_with_bounds", "mean_ms": ..., "iterations": ..., "total_ms": ...}
+"""
+import json
+import time
+import numpy as np
+import pandas as pd
+
+SIZE = 100_000  # number of values in the Series and in each bounds Series
+WARMUP = 5  # untimed calls made before measurement starts
+ITERATIONS = 30  # timed calls that feed the reported mean
+
+data = [(i % 200) - 100 for i in range(SIZE)]  # cycles through -100..99
+lower = pd.Series([(i % 50) - 30 for i in range(SIZE)])  # cycles through -30..19
+upper = pd.Series([(i % 50) + 20 for i in range(SIZE)])  # cycles through 20..69, always lower + 50
+series = pd.Series(data)
+
+for _ in range(WARMUP):
+    series.clip(lower=lower, upper=upper)  # result discarded; warm-up only
+
+times = []  # one wall-clock duration per timed iteration, in milliseconds
+for _ in range(ITERATIONS):
+    t0 = time.perf_counter()
+    series.clip(lower=lower, upper=upper)  # the only statement inside the timed region
+    times.append((time.perf_counter() - t0) * 1000)  # seconds -> milliseconds
+
+total_ms = sum(times)
+mean_ms = total_ms / ITERATIONS
+print(json.dumps({  # emit a single JSON object on stdout
+    "function": "clip_series_with_bounds",
+    "mean_ms": mean_ms,
+    "iterations": ITERATIONS,
+    "total_ms": total_ms,
+}))
diff --git a/benchmarks/pandas/bench_concat_series_axis0.py b/benchmarks/pandas/bench_concat_series_axis0.py
@@ -0,0 +1,30 @@
+"""Benchmark: pd.concat of multiple Series along axis=0 — vertical stacking
+of 5 Series of 20k elements each."""
+import json, time
+import numpy as np
+import pandas as pd
+
+CHUNK = 20_000  # number of elements in each of the five input Series
+WARMUP = 5  # untimed calls made before measurement starts
+ITERATIONS = 30  # timed calls that feed the reported mean
+
+s1 = pd.Series(np.arange(CHUNK, dtype=float) * 1.0)  # s1..s5 hold 0..CHUNK-1 scaled by 1.0 through 5.0
+s2 = pd.Series(np.arange(CHUNK, dtype=float) * 2.0)
+s3 = pd.Series(np.arange(CHUNK, dtype=float) * 3.0)
+s4 = pd.Series(np.arange(CHUNK, dtype=float) * 4.0)
+s5 = pd.Series(np.arange(CHUNK, dtype=float) * 5.0)
+
+for _ in range(WARMUP):
+    pd.concat([s1, s2, s3, s4, s5])  # result discarded; warm-up only
+
+start = time.perf_counter()  # a single timer spans the whole loop, so loop overhead is included
+for _ in range(ITERATIONS):
+    pd.concat([s1, s2, s3, s4, s5])  # the input list is rebuilt on every iteration
+total = (time.perf_counter() - start) * 1000  # seconds -> milliseconds
+
+print(json.dumps({  # emit a single JSON object on stdout
+    "function": "concat_series_axis0",
+    "mean_ms": total / ITERATIONS,
+    "iterations": ITERATIONS,
+    "total_ms": total,
+}))
diff --git a/benchmarks/pandas/bench_dataframe_apply_stats.py b/benchmarks/pandas/bench_dataframe_apply_stats.py
@@ -0,0 +1,41 @@
+"""
+Benchmark: pandas DataFrame.apply() — apply fn to each column (axis=0) and row (axis=1).
+Mirrors tsb's dataFrameApply (stats/apply.ts) behavior.
+Outputs JSON: {"function": "dataframe_apply_stats", "mean_ms": ..., "iterations": ..., "total_ms": ...}
+"""
+import json
+import time
+import pandas as pd
+import numpy as np
+
+SIZE = 10_000  # number of rows in the benchmarked DataFrame
+WARMUP = 3  # untimed passes made before measurement starts
+ITERATIONS = 20  # timed passes that feed the reported mean
+
+df = pd.DataFrame({
+    "a": (np.arange(SIZE) * 1.0),
+    "b": (np.arange(SIZE) * 2.0),
+    "c": (np.arange(SIZE) * 3.0),
+})
+
+mean_fn = lambda col: col.mean()  # noqa: E731
+
+for _ in range(WARMUP):
+    df.apply(mean_fn, axis=0)  # results discarded; warm-up only
+    df.apply(mean_fn, axis=1)
+
+times = []  # one wall-clock duration per timed pass, in milliseconds
+for _ in range(ITERATIONS):
+    t0 = time.perf_counter()
+    df.apply(mean_fn, axis=0)  # mean_fn receives each column
+    df.apply(mean_fn, axis=1)  # mean_fn receives each row
+    times.append((time.perf_counter() - t0) * 1000)  # both apply calls are timed together
+
+total_ms = sum(times)
+mean_ms = total_ms / ITERATIONS  # mean per pass (axis=0 plus axis=1)
+print(json.dumps({  # emit a single JSON object on stdout
+    "function": "dataframe_apply_stats",
+    "mean_ms": mean_ms,
+    "iterations": ITERATIONS,
+    "total_ms": total_ms,
+}))
diff --git a/benchmarks/pandas/bench_dataframe_from_columns.py b/benchmarks/pandas/bench_dataframe_from_columns.py
@@ -0,0 +1,36 @@
+"""
+Benchmark: pandas DataFrame() construction — create 100k-row DataFrame from column arrays.
+Mirrors tsb's DataFrame.fromColumns() behavior.
+Outputs JSON: {"function": "dataframe_from_columns", "mean_ms": ..., "iterations": ..., "total_ms": ...}
+"""
+import json
+import time
+import pandas as pd
+import numpy as np
+
+SIZE = 100_000  # number of rows in the constructed DataFrame
+WARMUP = 5  # untimed constructions made before measurement starts
+ITERATIONS = 30  # timed constructions that feed the reported mean
+
+col_a = np.arange(SIZE, dtype=float)  # column arrays are built once, outside the timed region
+col_b = np.arange(SIZE, dtype=float) * 2.5
+col_c = np.arange(SIZE) % 1000  # cycles through 0..999
+col_d = np.sin(np.arange(SIZE) * 0.001)
+
+for _ in range(WARMUP):
+    pd.DataFrame({"a": col_a, "b": col_b, "c": col_c, "d": col_d})  # result discarded; warm-up only
+
+times = []  # one wall-clock duration per timed iteration, in milliseconds
+for _ in range(ITERATIONS):
+    t0 = time.perf_counter()
+    pd.DataFrame({"a": col_a, "b": col_b, "c": col_c, "d": col_d})  # the dict literal is built inside the timed region
+    times.append((time.perf_counter() - t0) * 1000)  # seconds -> milliseconds
+
+total_ms = sum(times)
+mean_ms = total_ms / ITERATIONS
+print(json.dumps({  # emit a single JSON object on stdout
+    "function": "dataframe_from_columns",
+    "mean_ms": mean_ms,
+    "iterations": ITERATIONS,
+    "total_ms": total_ms,
+}))
diff --git a/benchmarks/pandas/bench_dataframe_has_col_get.py b/benchmarks/pandas/bench_dataframe_has_col_get.py
@@ -0,0 +1,25 @@
+"""Benchmark: DataFrame column presence and access (in df.columns, df[...], df.get(...)) on 100k-row DataFrame."""
+import json, time
+import pandas as pd
+
+SIZE = 100_000  # number of rows in the benchmarked DataFrame
+WARMUP = 10  # untimed passes made before measurement starts
+ITERATIONS = 100  # timed passes that feed the reported mean
+
+df = pd.DataFrame({"a": list(range(SIZE)), "b": [i * 2.0 for i in range(SIZE)], "c": [str(i) for i in range(SIZE)]})
+
+for _ in range(WARMUP):  # same three lookups as the timed loop, results discarded
+    "a" in df.columns
+    df["b"]
+    df.get("c")
+
+times = []  # one wall-clock duration per timed pass, in milliseconds
+for _ in range(ITERATIONS):
+    t0 = time.perf_counter()
+    "a" in df.columns  # membership test on the column index
+    df["b"]  # column access via indexing
+    df.get("c")  # column access via .get()
+    times.append((time.perf_counter() - t0) * 1000)  # all three lookups are timed together
+
+total = sum(times)  # reported unrounded: rounding ms to 3 decimals would erase most of a microsecond-scale result
+print(json.dumps({"function": "dataframe_has_col_get", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total}))
diff --git a/benchmarks/pandas/bench_dataframe_median_method.py b/benchmarks/pandas/bench_dataframe_median_method.py
@@ -0,0 +1,20 @@
+"""Benchmark: DataFrame.median() — column-wise median on 100k-row DataFrame."""
+import json, time
+import pandas as pd
+
+SIZE = 100_000  # number of rows in the benchmarked DataFrame
+WARMUP = 5  # untimed calls made before measurement starts
+ITERATIONS = 50  # timed calls that feed the reported mean
+
+df = pd.DataFrame({"a": [i * 1.1 for i in range(SIZE)], "b": [i * 2.2 for i in range(SIZE)], "c": [i * 3.3 for i in range(SIZE)]})
+
+for _ in range(WARMUP): df.median()  # result discarded; warm-up only
+
+times = []  # one wall-clock duration per timed iteration, in milliseconds
+for _ in range(ITERATIONS):
+    t0 = time.perf_counter()
+    df.median()  # the only statement inside the timed region
+    times.append((time.perf_counter() - t0) * 1000)  # seconds -> milliseconds
+
+total = sum(times)  # both reported values below are rounded to 3 decimal places
+print(json.dumps({"function": "dataframe_median_method", "mean_ms": round(total / ITERATIONS, 3), "iterations": ITERATIONS, "total_ms": round(total, 3)}))
diff --git a/benchmarks/pandas/bench_dataframe_pipe_to.py b/benchmarks/pandas/bench_dataframe_pipe_to.py
@@ -0,0 +1,42 @@
+"""
+Benchmark: pandas DataFrame.pipe with positional target argument on 100k-row DataFrame.
+Mirrors tsb's dataFramePipeTo — inserting the DataFrame at a specific arg position.
+Outputs JSON: {"function": "dataframe_pipe_to", "mean_ms": ..., "iterations": ..., "total_ms": ...}
+"""
+import json
+import time
+import numpy as np
+import pandas as pd
+
+SIZE = 100_000  # number of rows in the benchmarked DataFrame
+WARMUP = 5  # untimed calls made before measurement starts
+ITERATIONS = 50  # timed calls that feed the reported mean
+
+
+def filter_above(threshold: float, df: pd.DataFrame) -> pd.DataFrame:  # the DataFrame is deliberately NOT the first parameter
+    return df[df["val"] > threshold]  # keep rows whose "val" is strictly greater than threshold
+
+
+left = pd.DataFrame({
+    "key": [i % 1000 for i in range(SIZE)],  # cycles through 0..999
+    "val": [i * 1.5 for i in range(SIZE)],
+})
+
+for _ in range(WARMUP):
+    # Tuple form (callable, data_keyword): pipe passes `left` to filter_above as the `df` keyword argument.
+    left.pipe((filter_above, "df"), 50_000)
+
+times = []  # one wall-clock duration per timed iteration, in milliseconds
+for _ in range(ITERATIONS):
+    t0 = time.perf_counter()
+    left.pipe((filter_above, "df"), 50_000)  # equivalent to filter_above(50_000, df=left)
+    times.append((time.perf_counter() - t0) * 1000)  # seconds -> milliseconds
+
+total_ms = sum(times)
+mean_ms = total_ms / ITERATIONS
+print(json.dumps({  # emit a single JSON object on stdout
+    "function": "dataframe_pipe_to",
+    "mean_ms": mean_ms,
+    "iterations": ITERATIONS,
+    "total_ms": total_ms,
+}))
diff --git a/benchmarks/pandas/bench_dataframe_var_method.py b/benchmarks/pandas/bench_dataframe_var_method.py
@@ -0,0 +1,20 @@
+"""Benchmark: DataFrame.var() — column-wise variance on 100k-row DataFrame."""
+import json, time
+import pandas as pd
+
+SIZE = 100_000  # number of rows in the benchmarked DataFrame
+WARMUP = 10  # untimed calls made before measurement starts
+ITERATIONS = 100  # timed calls that feed the reported mean
+
+df = pd.DataFrame({"a": [i * 1.1 for i in range(SIZE)], "b": [i * 2.2 for i in range(SIZE)], "c": [i * 3.3 for i in range(SIZE)]})
+
+for _ in range(WARMUP): df.var()  # result discarded; warm-up only
+
+times = []  # one wall-clock duration per timed iteration, in milliseconds
+for _ in range(ITERATIONS):
+    t0 = time.perf_counter()
+    df.var()  # the only statement inside the timed region
+    times.append((time.perf_counter() - t0) * 1000)  # seconds -> milliseconds
+
+total = sum(times)  # both reported values below are rounded to 3 decimal places
+print(json.dumps({"function": "dataframe_var_method", "mean_ms": round(total / ITERATIONS, 3), "iterations": ITERATIONS, "total_ms": round(total, 3)}))