Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 34 additions & 0 deletions benchmarks/pandas/bench_applySeries_fn.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
"""
Benchmark: pandas Series.apply() with a (value) lambda — 100k-element Series.
Mirrors tsb's applySeries (stats/apply.ts) behavior.
Outputs JSON: {"function": "applySeries_fn", "mean_ms": ..., "iterations": ..., "total_ms": ...}
"""
import json
import time
import pandas as pd

SIZE = 100_000
WARMUP = 5
ITERATIONS = 30

# Input values are 0.0, 0.5, 1.0, ... with SIZE elements in total.
s = pd.Series([i * 0.5 for i in range(SIZE)])


def fn(v):
    """Element-wise callback passed to Series.apply(): maps v to v * 2 + 1."""
    return v * 2 + 1


def _timed_apply_ms():
    """Run s.apply(fn) once and return the elapsed wall-clock time in ms."""
    started = time.perf_counter()
    s.apply(fn)
    return (time.perf_counter() - started) * 1000


# Warm-up passes: executed but never timed.
for _ in range(WARMUP):
    s.apply(fn)

# One timing sample per measured iteration.
times = [_timed_apply_ms() for _ in range(ITERATIONS)]

total_ms = sum(times)
mean_ms = total_ms / ITERATIONS
report = {
    "function": "applySeries_fn",
    "mean_ms": mean_ms,
    "iterations": ITERATIONS,
    "total_ms": total_ms,
}
print(json.dumps(report))
46 changes: 46 additions & 0 deletions benchmarks/pandas/bench_categorical_index_modify.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
"""
Benchmark: pandas CategoricalIndex modification — rename_categories, reorder_categories,
remove_categories, set_categories, remove_unused_categories, as_ordered and as_unordered
on a 10k-element index.
Outputs JSON: {"function": "categorical_index_modify", "mean_ms": ..., "iterations": ..., "total_ms": ...}
"""
import json
import time
import pandas as pd

SIZE = 10_000
WARMUP = 5
ITERATIONS = 50

CATS = ["alpha", "beta", "gamma", "delta", "epsilon"]
# Cycle through CATS so that every category occurs in the index.
labels = [CATS[pos % len(CATS)] for pos in range(SIZE)]
ci = pd.CategoricalIndex(labels)


def _one_pass_ms():
    """Run each category operation once on ci; return the elapsed time in ms."""
    began = time.perf_counter()
    ci.rename_categories(["A", "B", "C", "D", "E"])
    ci.reorder_categories(["epsilon", "delta", "gamma", "beta", "alpha"])
    ci.remove_categories(["epsilon"])
    ci.set_categories(["alpha", "beta", "gamma"])
    ci.remove_unused_categories()
    ci.as_ordered()
    ci.as_unordered()
    return (time.perf_counter() - began) * 1000


# Warm-up: same work as a measured pass, timing thrown away.
for _ in range(WARMUP):
    _one_pass_ms()

times = [_one_pass_ms() for _ in range(ITERATIONS)]

total_ms = sum(times)
mean_ms = total_ms / ITERATIONS
summary = {
    "function": "categorical_index_modify",
    "mean_ms": mean_ms,
    "iterations": ITERATIONS,
    "total_ms": total_ms,
}
print(json.dumps(summary))
39 changes: 39 additions & 0 deletions benchmarks/pandas/bench_clip_dataframe_with_bounds.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
"""
Benchmark: pandas DataFrame.clip with Series bounds (axis=0) on 100k-row DataFrame.
Outputs JSON: {"function": "clip_dataframe_with_bounds", "mean_ms": ..., "iterations": ..., "total_ms": ...}
"""
import json
import time
import numpy as np
import pandas as pd

SIZE = 100_000
WARMUP = 5
ITERATIONS = 30

# Integer columns cycling through [-100, 99], [-75, 74] and [-50, 49].
df = pd.DataFrame({
    name: [(i % period) - period // 2 for i in range(SIZE)]
    for name, period in (("a", 200), ("b", 150), ("c", 100))
})

# Per-row bounds: lower cycles through [-20, 19], upper through [20, 59].
lower_bounds = pd.Series([(i % 40) - 20 for i in range(SIZE)])
upper_bounds = pd.Series([(i % 40) + 20 for i in range(SIZE)])

# Untimed warm-up calls.
for _warm in range(WARMUP):
    df.clip(lower=lower_bounds, upper=upper_bounds, axis=0)

# Measured calls: one millisecond sample per iteration.
times = []
for _run in range(ITERATIONS):
    tick = time.perf_counter()
    df.clip(lower=lower_bounds, upper=upper_bounds, axis=0)
    tock = time.perf_counter()
    times.append((tock - tick) * 1000)

total_ms = sum(times)
mean_ms = total_ms / ITERATIONS
payload = {
    "function": "clip_dataframe_with_bounds",
    "mean_ms": mean_ms,
    "iterations": ITERATIONS,
    "total_ms": total_ms,
}
print(json.dumps(payload))
35 changes: 35 additions & 0 deletions benchmarks/pandas/bench_clip_series_with_bounds.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
"""
Benchmark: pandas Series.clip with per-element Series bounds on 100k values.
Outputs JSON: {"function": "clip_series_with_bounds", "mean_ms": ..., "iterations": ..., "total_ms": ...}
"""
import json
import time
import numpy as np
import pandas as pd

SIZE = 100_000
WARMUP = 5
ITERATIONS = 30

# Values cycle through [-100, 99]; the per-element bounds cycle through
# [-30, 19] (lower) and [20, 69] (upper).
series = pd.Series([(i % 200) - 100 for i in range(SIZE)])
lower = pd.Series([(i % 50) - 30 for i in range(SIZE)])
upper = pd.Series([(i % 50) + 20 for i in range(SIZE)])


def _timed_clip_ms():
    """Clip series against lower/upper once; return the elapsed time in ms."""
    began = time.perf_counter()
    series.clip(lower=lower, upper=upper)
    return (time.perf_counter() - began) * 1000


# Untimed warm-up calls.
for _ in range(WARMUP):
    series.clip(lower=lower, upper=upper)

times = [_timed_clip_ms() for _ in range(ITERATIONS)]

total_ms = sum(times)
mean_ms = total_ms / ITERATIONS
result = {
    "function": "clip_series_with_bounds",
    "mean_ms": mean_ms,
    "iterations": ITERATIONS,
    "total_ms": total_ms,
}
print(json.dumps(result))
30 changes: 30 additions & 0 deletions benchmarks/pandas/bench_concat_series_axis0.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
"""Benchmark: pd.concat of multiple Series along axis=0 β€” vertical stacking
of 5 Series of 20k elements each."""
import json
import time
import numpy as np
import pandas as pd

CHUNK = 20_000
WARMUP = 5
ITERATIONS = 30

# Five float Series, each arange(CHUNK) scaled by 1.0 ... 5.0.
s1, s2, s3, s4, s5 = (
    pd.Series(np.arange(CHUNK, dtype=float) * factor)
    for factor in (1.0, 2.0, 3.0, 4.0, 5.0)
)

# Untimed warm-up calls.
for _ in range(WARMUP):
    pd.concat([s1, s2, s3, s4, s5])

# Collect one sample per iteration, like the other benchmarks in this suite,
# instead of timing the whole loop as a single span (which also counts the
# loop bookkeeping as benchmark time).
times = []
for _ in range(ITERATIONS):
    t0 = time.perf_counter()
    pd.concat([s1, s2, s3, s4, s5])
    times.append((time.perf_counter() - t0) * 1000)

total_ms = sum(times)
mean_ms = total_ms / ITERATIONS
print(json.dumps({
    "function": "concat_series_axis0",
    "mean_ms": mean_ms,
    "iterations": ITERATIONS,
    "total_ms": total_ms,
}))
41 changes: 41 additions & 0 deletions benchmarks/pandas/bench_dataframe_apply_stats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
"""
Benchmark: pandas DataFrame.apply() — apply fn to each column (axis=0) and row (axis=1).
Mirrors tsb's dataFrameApply (stats/apply.ts) behavior.
Outputs JSON: {"function": "dataframe_apply_stats", "mean_ms": ..., "iterations": ..., "total_ms": ...}
"""
import json
import time
import pandas as pd
import numpy as np

SIZE = 10_000
WARMUP = 3
ITERATIONS = 20

# Three float columns: arange(SIZE) scaled by 1.0, 2.0 and 3.0.
df = pd.DataFrame({
    "a": np.arange(SIZE) * 1.0,
    "b": np.arange(SIZE) * 2.0,
    "c": np.arange(SIZE) * 3.0,
})


def mean_fn(values):
    """Reducer handed to DataFrame.apply(): the mean of one column or one row.

    Previously bound to the name ``sum_fn`` although it has always computed
    a mean; only the name changed, the benchmarked work is the same.
    """
    return values.mean()


# Untimed warm-up: one column-wise and one row-wise apply per pass.
for _ in range(WARMUP):
    df.apply(mean_fn, axis=0)
    df.apply(mean_fn, axis=1)

# Each sample covers both the axis=0 and the axis=1 call.
times = []
for _ in range(ITERATIONS):
    t0 = time.perf_counter()
    df.apply(mean_fn, axis=0)
    df.apply(mean_fn, axis=1)
    times.append((time.perf_counter() - t0) * 1000)

total_ms = sum(times)
mean_ms = total_ms / ITERATIONS
print(json.dumps({
    "function": "dataframe_apply_stats",
    "mean_ms": mean_ms,
    "iterations": ITERATIONS,
    "total_ms": total_ms,
}))
36 changes: 36 additions & 0 deletions benchmarks/pandas/bench_dataframe_from_columns.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
"""
Benchmark: pandas DataFrame() construction — create 100k-row DataFrame from column arrays.
Mirrors tsb's DataFrame.fromColumns() behavior.
Outputs JSON: {"function": "dataframe_from_columns", "mean_ms": ..., "iterations": ..., "total_ms": ...}
"""
import json
import time
import pandas as pd
import numpy as np

SIZE = 100_000
WARMUP = 5
ITERATIONS = 30

# Source arrays are built once, outside the timed region.
_positions = np.arange(SIZE)
col_a = np.arange(SIZE, dtype=float)
col_b = col_a * 2.5
col_c = _positions % 1000
col_d = np.sin(_positions * 0.001)


def _timed_build_ms():
    """Construct the four-column DataFrame once; return the elapsed time in ms."""
    began = time.perf_counter()
    pd.DataFrame({"a": col_a, "b": col_b, "c": col_c, "d": col_d})
    return (time.perf_counter() - began) * 1000


# Untimed warm-up constructions.
for _ in range(WARMUP):
    pd.DataFrame({"a": col_a, "b": col_b, "c": col_c, "d": col_d})

times = [_timed_build_ms() for _ in range(ITERATIONS)]

total_ms = sum(times)
mean_ms = total_ms / ITERATIONS
outcome = {
    "function": "dataframe_from_columns",
    "mean_ms": mean_ms,
    "iterations": ITERATIONS,
    "total_ms": total_ms,
}
print(json.dumps(outcome))
25 changes: 25 additions & 0 deletions benchmarks/pandas/bench_dataframe_has_col_get.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
"""Benchmark: DataFrame column presence and access (.keys(), [], __getitem__) on 100k-row DataFrame."""
import json
import time
import pandas as pd

SIZE = 100_000
WARMUP = 10
ITERATIONS = 100

# One int, one float and one string column, SIZE rows each.
df = pd.DataFrame({
    "a": list(range(SIZE)),
    "b": [i * 2.0 for i in range(SIZE)],
    "c": [str(i) for i in range(SIZE)],
})

# Untimed warm-up: membership test, [] lookup and .get() lookup.
for _ in range(WARMUP):
    "a" in df.columns  # result deliberately discarded
    df["b"]
    df.get("c")

times = []
for _ in range(ITERATIONS):
    t0 = time.perf_counter()
    "a" in df.columns  # result deliberately discarded
    df["b"]
    df.get("c")
    times.append((time.perf_counter() - t0) * 1000)

total_ms = sum(times)
mean_ms = total_ms / ITERATIONS
# Report the raw floats. These lookups are far below a millisecond, so
# rounding to three decimals would discard most of the significant digits
# (and the sibling benchmarks emit unrounded values as well).
report = {
    "function": "dataframe_has_col_get",
    "mean_ms": mean_ms,
    "iterations": ITERATIONS,
    "total_ms": total_ms,
}
print(json.dumps(report))
20 changes: 20 additions & 0 deletions benchmarks/pandas/bench_dataframe_median_method.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
"""Benchmark: DataFrame.median() β€” column-wise median on 100k-row DataFrame."""
import json
import time
import pandas as pd

SIZE = 100_000
WARMUP = 5
ITERATIONS = 50

# Three float columns: i * 1.1, i * 2.2 and i * 3.3 for i in range(SIZE).
df = pd.DataFrame({
    name: [i * step for i in range(SIZE)]
    for name, step in (("a", 1.1), ("b", 2.2), ("c", 3.3))
})


def _timed_median_ms():
    """Call df.median() once and return the elapsed time in milliseconds."""
    began = time.perf_counter()
    df.median()
    return (time.perf_counter() - began) * 1000


# Untimed warm-up calls.
for _ in range(WARMUP):
    df.median()

times = [_timed_median_ms() for _ in range(ITERATIONS)]

total = sum(times)
# Both timing fields are rounded to three decimals before printing.
summary = {
    "function": "dataframe_median_method",
    "mean_ms": round(total / ITERATIONS, 3),
    "iterations": ITERATIONS,
    "total_ms": round(total, 3),
}
print(json.dumps(summary))
42 changes: 42 additions & 0 deletions benchmarks/pandas/bench_dataframe_pipe_to.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
"""
Benchmark: pandas DataFrame.pipe with positional target argument on 100k-row DataFrame.
Mirrors tsb's dataFramePipeTo — inserting the DataFrame at a specific arg position.
Outputs JSON: {"function": "dataframe_pipe_to", "mean_ms": ..., "iterations": ..., "total_ms": ...}
"""
import json
import time
import numpy as np
import pandas as pd

SIZE = 100_000  # number of rows in the benchmark DataFrame
WARMUP = 5  # untimed pipe calls made before measuring
ITERATIONS = 50  # timed pipe calls; also the divisor used for mean_ms


def filter_above(threshold: float, df: pd.DataFrame) -> pd.DataFrame:
    """Return the rows of *df* whose ``"val"`` entry is strictly above *threshold*.

    The DataFrame is deliberately the second positional parameter.
    """
    keep = df["val"].gt(threshold)
    return df[keep]


# SIZE rows: "key" cycles through 0..999, "val" is 1.5 times the row number.
left = pd.DataFrame({
    "key": [i % 1000 for i in range(SIZE)],
    "val": [i * 1.5 for i in range(SIZE)],
})


def _timed_pipe_ms():
    """Pipe left through filter_above once; return the elapsed time in ms."""
    began = time.perf_counter()
    left.pipe(lambda df: filter_above(50_000, df))
    return (time.perf_counter() - began) * 1000


# pandas pipe also has a tuple form, (fn, 'positional_kwarg'); this benchmark
# uses a lambda wrapper to put the DataFrame in the second argument slot.
for _ in range(WARMUP):
    left.pipe(lambda df: filter_above(50_000, df))

times = [_timed_pipe_ms() for _ in range(ITERATIONS)]

total_ms = sum(times)
mean_ms = total_ms / ITERATIONS
report = {
    "function": "dataframe_pipe_to",
    "mean_ms": mean_ms,
    "iterations": ITERATIONS,
    "total_ms": total_ms,
}
print(json.dumps(report))
20 changes: 20 additions & 0 deletions benchmarks/pandas/bench_dataframe_var_method.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
"""Benchmark: DataFrame.var() β€” column-wise variance on 100k-row DataFrame."""
import json
import time
import pandas as pd

SIZE = 100_000
WARMUP = 10
ITERATIONS = 100

# Three float columns: i * 1.1, i * 2.2 and i * 3.3 for i in range(SIZE).
_row_ids = range(SIZE)
df = pd.DataFrame({
    "a": [i * 1.1 for i in _row_ids],
    "b": [i * 2.2 for i in _row_ids],
    "c": [i * 3.3 for i in _row_ids],
})

# Untimed warm-up calls.
for _warm in range(WARMUP):
    df.var()

# Measured calls: one millisecond sample per iteration.
times = []
for _run in range(ITERATIONS):
    tick = time.perf_counter()
    df.var()
    tock = time.perf_counter()
    times.append((tock - tick) * 1000)

total = sum(times)
# Both timing fields are rounded to three decimals before printing.
payload = {
    "function": "dataframe_var_method",
    "mean_ms": round(total / ITERATIONS, 3),
    "iterations": ITERATIONS,
    "total_ms": round(total, 3),
}
print(json.dumps(payload))
Loading
Loading