Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
46f4b4c
Iteration 135: Add 8 benchmark pairs (388 total, +8 vs best 380)
github-actions[bot] Apr 16, 2026
9191c69
Iteration 136: Add 8 benchmark pairs (396 total, +8 vs 388)
github-actions[bot] Apr 16, 2026
3165c28
Iteration 137: Add 8 benchmark pairs (404 total, +8 vs 396)
github-actions[bot] Apr 16, 2026
26437c0
Iteration 138: Add 8 benchmark pairs (412 total, +8 vs 404)
github-actions[bot] Apr 16, 2026
bae766a
Iteration 139: Add 8 benchmark pairs (420 total, +8 vs 412)
github-actions[bot] Apr 17, 2026
081cb30
Iteration 142: Add 9 benchmark pairs (429 total, +9 vs 420)
github-actions[bot] Apr 17, 2026
a84aca2
Iteration 143: Add 8 benchmark pairs (437 total, +8 vs 429)
github-actions[bot] Apr 17, 2026
5899214
Iteration 144: Add 8 benchmark pairs (445 total, +8 vs best 437)
github-actions[bot] Apr 17, 2026
8568c4a
Iteration 145: Add 9 benchmark pairs (454 total, +9 vs best 445)
github-actions[bot] Apr 17, 2026
b854980
Iteration 147: Add 8 benchmark pairs (462 total, +2 vs best 460)
github-actions[bot] Apr 17, 2026
e3c731c
Iteration 148: Add 6 benchmark pairs (468 total, +6 vs best 462)
github-actions[bot] Apr 17, 2026
e4521f9
Iteration 150: Add 5 benchmark pairs (473 total, +5 vs best 468)
github-actions[bot] Apr 17, 2026
952d479
Iteration 151: Add 5 benchmark pairs (478 total, +5 vs best 473)
github-actions[bot] Apr 17, 2026
3e89725
Iteration 153: Add 5 benchmark pairs (483 total, +5 vs best 478)
github-actions[bot] Apr 17, 2026
86d054b
Iteration 154: Add 5 benchmark pairs (488 total, +5 vs best 483)
github-actions[bot] Apr 17, 2026
313b4f3
Iteration 155: DataFrameExpanding std/var/sum/count/median/apply + TZ…
github-actions[bot] Apr 17, 2026
df6dab9
Iteration 156: 5 new benchmark pairs
github-actions[bot] Apr 17, 2026
ba235be
Iteration 157: 5 new benchmark pairs (503 total, +5 vs best 498)
github-actions[bot] Apr 17, 2026
5b7ea6d
Iteration 158: 5 new benchmark pairs (508 total, +5 vs best 503)
github-actions[bot] Apr 17, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 43 additions & 0 deletions benchmarks/pandas/bench_align_dataframe.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
"""
Benchmark: DataFrame.align — align two 10k-row DataFrames on inner/outer/left join.
Outputs JSON: {"function": "align_dataframe", "mean_ms": ..., "iterations": ..., "total_ms": ...}
"""
import json
import time
import numpy as np
import pandas as pd

SIZE = 10_000
WARMUP = 5
ITERATIONS = 30

JOINS = ("inner", "outer", "left")

# Two frames with partially overlapping integer indexes (multiples of 2 vs 3)
# and partially overlapping column sets (shared "y"/"z").
frame_left = pd.DataFrame(
    {
        "x": [float(i) for i in range(SIZE)],
        "y": [2.0 * i for i in range(SIZE)],
        "z": [3.0 * i for i in range(SIZE)],
    },
    index=[2 * i for i in range(SIZE)],
)
frame_right = pd.DataFrame(
    {
        "y": [10.0 * i for i in range(SIZE)],
        "z": [20.0 * i for i in range(SIZE)],
        "w": [30.0 * i for i in range(SIZE)],
    },
    index=[3 * i for i in range(SIZE)],
)


def _one_pass() -> None:
    """Run one align per join strategy."""
    for how in JOINS:
        frame_left.align(frame_right, join=how)


for _ in range(WARMUP):
    _one_pass()

start = time.perf_counter()
for _ in range(ITERATIONS):
    _one_pass()
total = (time.perf_counter() - start) * 1000

print(json.dumps({
    "function": "align_dataframe",
    "mean_ms": total / ITERATIONS,
    "iterations": ITERATIONS,
    "total_ms": total,
}))
36 changes: 36 additions & 0 deletions benchmarks/pandas/bench_align_series.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
"""
Benchmark: Series.align — align two 50k-element Series on inner/outer/left join.
Outputs JSON: {"function": "align_series", "mean_ms": ..., "iterations": ..., "total_ms": ...}
"""
import json
import time
import numpy as np
import pandas as pd

SIZE = 50_000
WARMUP = 5
ITERATIONS = 30

# Partially overlapping integer indexes: multiples of 2 vs multiples of 3.
series_left = pd.Series([float(i) for i in range(SIZE)], index=[2 * i for i in range(SIZE)])
series_right = pd.Series([2.0 * i for i in range(SIZE)], index=[3 * i for i in range(SIZE)])


def _one_pass() -> None:
    """Align once per join strategy."""
    for how in ("inner", "outer", "left"):
        series_left.align(series_right, join=how)


for _ in range(WARMUP):
    _one_pass()

start = time.perf_counter()
for _ in range(ITERATIONS):
    _one_pass()
total = (time.perf_counter() - start) * 1000

print(json.dumps({
    "function": "align_series",
    "mean_ms": total / ITERATIONS,
    "iterations": ITERATIONS,
    "total_ms": total,
}))
24 changes: 24 additions & 0 deletions benchmarks/pandas/bench_argsort_scalars.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
"""Benchmark: np.argsort / np.searchsorted — sort/search utilities on 100k-element arrays."""
import json
import time
import numpy as np

SIZE = 100_000
WARMUP = 3
ITERATIONS = 20

# Oscillating data so argsort sees a non-trivial ordering.
arr = np.sin(np.arange(SIZE) * 0.001) * SIZE
sorted_arr = np.sort(arr)
# 1000 query points spanning [-SIZE, SIZE).
queries = (np.arange(1000) - 500) * SIZE / 500


def _one_pass() -> None:
    """One full argsort plus a batch searchsorted lookup."""
    np.argsort(arr)
    np.searchsorted(sorted_arr, queries)


for _ in range(WARMUP):
    _one_pass()

t0 = time.perf_counter()
for _ in range(ITERATIONS):
    _one_pass()
total = (time.perf_counter() - t0) * 1000

print(json.dumps({"function": "argsort_scalars", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total}))
25 changes: 25 additions & 0 deletions benchmarks/pandas/bench_bdate_range.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
"""
Benchmark: pd.bdate_range — generate business-day DatetimeIndex with 1000 periods.
Outputs JSON: {"function": "bdate_range", "mean_ms": ..., "iterations": ..., "total_ms": ...}
"""
import json
import time
import pandas as pd

WARMUP = 5
ITERATIONS = 100
START = "2020-01-01"
PERIODS = 1000

for _ in range(WARMUP):
    pd.bdate_range(start=START, periods=PERIODS)

t0 = time.perf_counter()
for _ in range(ITERATIONS):
    pd.bdate_range(start=START, periods=PERIODS)
total = (time.perf_counter() - t0) * 1000

print(json.dumps({
    "function": "bdate_range",
    "mean_ms": total / ITERATIONS,
    "iterations": ITERATIONS,
    "total_ms": total,
}))
36 changes: 36 additions & 0 deletions benchmarks/pandas/bench_cast_scalar.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
"""
Benchmark: Python type coercion equivalents — int(), float(), str(), bool() conversions.
Outputs JSON: {"function": "cast_scalar", "mean_ms": ..., "iterations": ..., "total_ms": ...}
"""
import json
import time

SIZE = 100_000
WARMUP = 5
ITERATIONS = 50

int_values = [i % 1000 for i in range(SIZE)]
float_values = [i * 0.5 for i in range(SIZE)]
str_values = [str(i % 1000) for i in range(SIZE)]
bool_values = [i % 2 == 0 for i in range(SIZE)]


def _one_pass() -> None:
    """Apply all four scalar casts to every element once."""
    for fv, iv, sv, bv in zip(float_values, int_values, str_values, bool_values):
        int(fv)
        float(iv)
        int(sv)
        int(bv)


for _ in range(WARMUP):
    _one_pass()

samples = []
for _ in range(ITERATIONS):
    t0 = time.perf_counter()
    _one_pass()
    samples.append((time.perf_counter() - t0) * 1000)

total_ms = sum(samples)
mean_ms = total_ms / ITERATIONS
print(json.dumps({"function": "cast_scalar", "mean_ms": round(mean_ms, 3), "iterations": ITERATIONS, "total_ms": round(total_ms, 3)}))
42 changes: 42 additions & 0 deletions benchmarks/pandas/bench_cat_codes_accessor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
"""
Benchmark: pd.Categorical.codes / categories / ordered — category accessor properties
on a 100k-element categorical Series.
Outputs JSON: {"function": "cat_codes_accessor", "mean_ms": ..., "iterations": ..., "total_ms": ...}
"""
import json
import time
import numpy as np
import pandas as pd

SIZE = 100_000
CATS = 50
WARMUP = 5
ITERATIONS = 30

category_labels = [f"cat_{i}" for i in range(CATS)]
values = [category_labels[i % CATS] for i in range(SIZE)]
categorical = pd.Categorical(values, categories=category_labels)
series = pd.Series(categorical)


def _touch_accessors() -> None:
    """Read each .cat accessor property once."""
    _ = series.cat.codes
    _ = series.cat.categories
    _ = series.cat.ordered
    _ = len(series.cat.categories)


for _ in range(WARMUP):
    _touch_accessors()

samples = []
for _ in range(ITERATIONS):
    t0 = time.perf_counter()
    _touch_accessors()
    samples.append((time.perf_counter() - t0) * 1000)

total_ms = sum(samples)
print(json.dumps({
    "function": "cat_codes_accessor",
    "mean_ms": round(total_ms / ITERATIONS, 3),
    "iterations": ITERATIONS,
    "total_ms": round(total_ms, 3),
}))
41 changes: 41 additions & 0 deletions benchmarks/pandas/bench_categorical_index.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
"""
Benchmark: pandas.CategoricalIndex — creation, get_loc, add_categories, set operations on 100k elements.
Outputs JSON: {"function": "categorical_index", "mean_ms": ..., "iterations": ..., "total_ms": ...}
"""
import json
import time
import numpy as np
import pandas as pd

SIZE = 100_000
WARMUP = 5
ITERATIONS = 30

CATS = ["alpha", "beta", "gamma", "delta", "epsilon"]
n_cats = len(CATS)
labels = [CATS[i % n_cats] for i in range(SIZE)]
ci = pd.CategoricalIndex(labels)
# Second index is half-sized and phase-shifted so set ops have real work to do.
labels2 = [CATS[(i + 2) % n_cats] for i in range(SIZE // 2)]
ci2 = pd.CategoricalIndex(labels2)


def _one_pass() -> None:
    """Exercise construction, label lookup, category mutation, and set ops."""
    pd.CategoricalIndex(labels)
    ci.get_loc("beta")
    ci.add_categories(["zeta"])
    ci.union(ci2)
    ci.intersection(ci2)


for _ in range(WARMUP):
    _one_pass()

t0 = time.perf_counter()
for _ in range(ITERATIONS):
    _one_pass()
total = (time.perf_counter() - t0) * 1000

print(json.dumps({
    "function": "categorical_index",
    "mean_ms": total / ITERATIONS,
    "iterations": ITERATIONS,
    "total_ms": total,
}))
41 changes: 41 additions & 0 deletions benchmarks/pandas/bench_clip_series_bounds.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
"""
Benchmark: Series.clip(lower=, upper=) / DataFrame.clip(lower=, upper=) — element-wise clip bounds.
Outputs JSON: {"function": "clip_series_bounds", "mean_ms": ..., "iterations": ..., "total_ms": ...}
"""
import json
import time
import numpy as np
import pandas as pd

SIZE = 100_000
WARMUP = 5
ITERATIONS = 50

# Centered ramp so values fall on both sides of the clip bounds.
series = pd.Series(np.arange(SIZE) - SIZE / 2)
series_lower = pd.Series(np.full(SIZE, -10000.0))
series_upper = pd.Series(np.full(SIZE, 10000.0))

frame = pd.DataFrame({
    "a": np.arange(SIZE) - SIZE / 2,
    "b": np.sin(np.arange(SIZE) * 0.01) * 100,
})
frame_lower = pd.DataFrame({"a": np.full(SIZE, -10000.0), "b": np.full(SIZE, -50.0)})
frame_upper = pd.DataFrame({"a": np.full(SIZE, 10000.0), "b": np.full(SIZE, 50.0)})


def _one_pass() -> None:
    """Clip the Series and the DataFrame against element-wise bounds."""
    series.clip(lower=series_lower, upper=series_upper)
    frame.clip(lower=frame_lower, upper=frame_upper)


for _ in range(WARMUP):
    _one_pass()

t0 = time.perf_counter()
for _ in range(ITERATIONS):
    _one_pass()
total = (time.perf_counter() - t0) * 1000

print(json.dumps({
    "function": "clip_series_bounds",
    "mean_ms": total / ITERATIONS,
    "iterations": ITERATIONS,
    "total_ms": total,
}))
36 changes: 36 additions & 0 deletions benchmarks/pandas/bench_combine_first_dataframe.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
"""Benchmark: DataFrame.combine_first — fill NaN values from another DataFrame (union of indexes).
Mirrors tsb bench_combine_first_dataframe.ts.
"""
import json
import time
import numpy as np
import pandas as pd

SIZE = 5_000
WARMUP = 5
ITERATIONS = 30

# Primary frame: every 3rd "a" and every 5th "b" is missing (None -> NaN).
df_primary = pd.DataFrame(
    {
        "a": [None if i % 3 == 0 else i * 1.5 for i in range(SIZE)],
        "b": [None if i % 5 == 0 else i * 0.5 for i in range(SIZE)],
    },
    index=list(range(SIZE)),
)

# Fallback frame: fully populated, longer index, plus an extra column "c",
# so combine_first must both fill NaNs and extend rows/columns.
n_fallback = SIZE + 500
df_fallback = pd.DataFrame(
    {
        "a": [i * 2.0 for i in range(n_fallback)],
        "b": [i * 1.0 for i in range(n_fallback)],
        "c": [i * 0.1 for i in range(n_fallback)],
    },
    index=list(range(n_fallback)),
)

for _ in range(WARMUP):
    df_primary.combine_first(df_fallback)

t0 = time.perf_counter()
for _ in range(ITERATIONS):
    df_primary.combine_first(df_fallback)
total = (time.perf_counter() - t0) * 1000

print(json.dumps({
    "function": "combine_first_dataframe",
    "mean_ms": round(total / ITERATIONS, 3),
    "iterations": ITERATIONS,
    "total_ms": round(total, 3),
}))
37 changes: 37 additions & 0 deletions benchmarks/pandas/bench_concat_options.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
"""
Benchmark: pandas concat with join="inner" and ignore_index=True options.
Outputs JSON: {"function": "concat_options", "mean_ms": ..., "iterations": ..., "total_ms": ...}
"""
import json
import time
import pandas as pd

ROWS = 50_000
WARMUP = 5
ITERATIONS = 20

# Frames share columns "a"/"b"; "c" and "d" are exclusive to one side,
# so inner vs outer joins produce different column sets.
frame_left = pd.DataFrame({
    "a": [i * 1.0 for i in range(ROWS)],
    "b": [i * 2.0 for i in range(ROWS)],
    "c": [i * 3.0 for i in range(ROWS)],
})
frame_right = pd.DataFrame({
    "a": [i * 1.5 for i in range(ROWS)],
    "b": [i * 2.5 for i in range(ROWS)],
    "d": [i * 4.0 for i in range(ROWS)],
})


def _one_pass() -> None:
    """Concatenate with both join strategies, resetting the index."""
    pd.concat([frame_left, frame_right], join="inner", ignore_index=True)
    pd.concat([frame_left, frame_right], join="outer", ignore_index=True)


for _ in range(WARMUP):
    _one_pass()

samples = []
for _ in range(ITERATIONS):
    t0 = time.perf_counter()
    _one_pass()
    samples.append((time.perf_counter() - t0) * 1000)

total_ms = sum(samples)
mean_ms = total_ms / ITERATIONS
print(json.dumps({"function": "concat_options", "mean_ms": round(mean_ms, 3), "iterations": ITERATIONS, "total_ms": round(total_ms, 3)}))
24 changes: 24 additions & 0 deletions benchmarks/pandas/bench_cut_interval_index.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
"""Benchmark: cutIntervalIndex / qcutIntervalIndex — pd.cut/qcut returning IntervalIndex on 100k-element Series."""
import json
import time
import numpy as np
import pandas as pd

SIZE = 100_000
WARMUP = 5
ITERATIONS = 30

# Repeating 0..99.9 ramp so both equal-width and quantile bins are well-populated.
values = pd.Series((np.arange(SIZE) % 1000) * 0.1)


def _one_pass() -> None:
    """Equal-width cut into 20 bins plus decile qcut (dropping duplicate edges)."""
    pd.cut(values, 20, retbins=False)
    pd.qcut(values, 10, duplicates="drop")


for _ in range(WARMUP):
    _one_pass()

t0 = time.perf_counter()
for _ in range(ITERATIONS):
    _one_pass()
total = (time.perf_counter() - t0) * 1000

print(json.dumps({"function": "cut_interval_index", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total}))
Loading
Loading