Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 36 additions & 0 deletions benchmarks/pandas/bench_any_all.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
"""Benchmark: any_all — Series.any / all and DataFrame.any / all on 100k rows."""
import json, time
import numpy as np
import pandas as pd

SIZE = 100_000
WARMUP = 5
ITERATIONS = 50

s = pd.Series(np.arange(SIZE) % 2 == 0)
df = pd.DataFrame({
"a": np.arange(SIZE) % 3 != 0,
"b": np.arange(SIZE) > 0,
"c": np.ones(SIZE, dtype=bool),
})

for _ in range(WARMUP):
s.any()
s.all()
df.any()
df.all()

start = time.perf_counter()
for _ in range(ITERATIONS):
s.any()
s.all()
df.any()
df.all()
total = (time.perf_counter() - start) * 1000

print(json.dumps({
"function": "any_all",
"mean_ms": total / ITERATIONS,
"iterations": ITERATIONS,
"total_ms": total,
}))
31 changes: 31 additions & 0 deletions benchmarks/pandas/bench_astype_df_fn.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
"""Benchmark: astype standalone — DataFrame.astype with per-column and uniform dtype on 100k-row DataFrame."""
import json, time
import pandas as pd
import numpy as np

SIZE = 100_000
WARMUP = 5
ITERATIONS = 50

df = pd.DataFrame({
"a": np.arange(SIZE, dtype=np.float64),
"b": np.arange(SIZE, dtype=np.int64),
"c": np.where(np.arange(SIZE) % 2 == 0, 1, 0).astype(np.int64),
})

for _ in range(WARMUP):
df.astype({"a": "float32", "b": "int32"})
df.astype("float64")

start = time.perf_counter()
for _ in range(ITERATIONS):
df.astype({"a": "float32", "b": "int32"})
df.astype("float64")
total = (time.perf_counter() - start) * 1000

print(json.dumps({
"function": "astype_df_fn",
"mean_ms": round(total / ITERATIONS, 3),
"iterations": ITERATIONS,
"total_ms": round(total, 3),
}))
39 changes: 39 additions & 0 deletions benchmarks/pandas/bench_cat_freq_crosstab.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
"""
Benchmark: pd.Series.value_counts (freq table) and pd.crosstab for categorical data on 100k elements.
Outputs JSON: {"function": "cat_freq_crosstab", "mean_ms": ..., "iterations": ..., "total_ms": ...}
"""
import json
import time
import pandas as pd

SIZE = 100_000
WARMUP = 5
ITERATIONS = 20

cats_a = ["alpha", "beta", "gamma", "delta", "epsilon"]
cats_b = ["north", "south", "east", "west"]
data_a = pd.Categorical([cats_a[i % len(cats_a)] for i in range(SIZE)], categories=cats_a)
data_b = pd.Categorical([cats_b[i % len(cats_b)] for i in range(SIZE)], categories=cats_b)
s_a = pd.Series(data_a)
s_b = pd.Series(data_b)

for _ in range(WARMUP):
s_a.value_counts(sort=False)
pd.crosstab(s_a, s_b)
pd.crosstab(s_a, s_b, normalize=True)

times = []
for _ in range(ITERATIONS):
t0 = time.perf_counter()
s_a.value_counts(sort=False)
pd.crosstab(s_a, s_b)
pd.crosstab(s_a, s_b, normalize=True)
times.append((time.perf_counter() - t0) * 1000)

total_ms = sum(times)
print(json.dumps({
"function": "cat_freq_crosstab",
"mean_ms": total_ms / ITERATIONS,
"iterations": ITERATIONS,
"total_ms": total_ms,
}))
53 changes: 53 additions & 0 deletions benchmarks/pandas/bench_cat_intersect_diff.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
"""
Benchmark: pandas category set operations — intersection and difference of
categorical Series categories (100k-element, 20 categories each).
Mirrors tsb's catIntersectCategories / catDiffCategories.
Outputs JSON: {"function": "cat_intersect_diff", "mean_ms": ..., "iterations": ..., "total_ms": ...}
"""
import json
import time
import pandas as pd

SIZE = 100_000
WARMUP = 5
ITERATIONS = 30

cats_a = [f"cat_a_{i}" for i in range(20)]
cats_b = [f"cat_{'a' if i < 10 else 'b'}_{i}" for i in range(20)]

data_a = [cats_a[i % len(cats_a)] for i in range(SIZE)]
data_b = [cats_b[i % len(cats_b)] for i in range(SIZE)]

s_a = pd.Categorical(data_a, categories=cats_a)
s_b = pd.Categorical(data_b, categories=cats_b)

def cat_intersect(a, b):
    """Recode *a* with categories = a.categories ∩ b.categories (a's order kept).

    Values whose category is dropped become NaN in the result.
    """
    other = set(b.categories)
    kept = list(filter(other.__contains__, a.categories))
    return pd.Categorical(a, categories=kept)

def cat_diff(a, b):
    """Recode *a* with categories = a.categories minus b.categories (a's order kept).

    Values whose category is removed become NaN in the result.
    """
    other = set(b.categories)
    kept = list(filter(lambda c: c not in other, a.categories))
    return pd.Categorical(a, categories=kept)

# Warm up so one-time costs (imports, caches) stay out of the measurement.
for _ in range(WARMUP):
    cat_intersect(s_a, s_b)
    cat_diff(s_a, s_b)

samples = []
for _ in range(ITERATIONS):
    began = time.perf_counter()
    cat_intersect(s_a, s_b)
    cat_diff(s_a, s_b)
    samples.append((time.perf_counter() - began) * 1000)

total_ms = sum(samples)
payload = {
    "function": "cat_intersect_diff",
    "mean_ms": total_ms / ITERATIONS,
    "iterations": ITERATIONS,
    "total_ms": total_ms,
}
print(json.dumps(payload))
49 changes: 49 additions & 0 deletions benchmarks/pandas/bench_cat_ops_from_codes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
"""
Benchmark: pd.Categorical.from_codes, reorder_categories by freq, ordered categorical on 100k elements.
Outputs JSON: {"function": "cat_ops_from_codes", "mean_ms": ..., "iterations": ..., "total_ms": ...}
"""
import json
import time
import pandas as pd
import numpy as np

SIZE = 100_000
WARMUP = 5
ITERATIONS = 20

categories = ["alpha", "beta", "gamma", "delta", "epsilon"]
codes = [i % len(categories) for i in range(SIZE)]
order = ["epsilon", "delta", "gamma", "beta", "alpha"]

def cat_from_codes():
    """Build a Categorical straight from integer codes (no value hashing)."""
    return pd.Categorical.from_codes(codes=codes, categories=categories)

def cat_sort_by_freq(c):
s = pd.Series(c)
freq_order = s.value_counts().index.tolist()
return s.astype(pd.CategoricalDtype(categories=freq_order, ordered=False))

def cat_to_ordinal(c):
    """Recast *c* as an ordered categorical using the module-level `order`."""
    dtype = pd.CategoricalDtype(categories=order, ordered=True)
    return pd.Series(c).astype(dtype)

def _measured_round():
    """One measured pass: construct from codes, then both recasts."""
    built = cat_from_codes()
    cat_sort_by_freq(built)
    cat_to_ordinal(built)

for _ in range(WARMUP):
    _measured_round()

samples = []
for _ in range(ITERATIONS):
    began = time.perf_counter()
    _measured_round()
    samples.append((time.perf_counter() - began) * 1000)

total_ms = sum(samples)
print(json.dumps({
    "function": "cat_ops_from_codes",
    "mean_ms": total_ms / ITERATIONS,
    "iterations": ITERATIONS,
    "total_ms": total_ms,
}))
51 changes: 51 additions & 0 deletions benchmarks/pandas/bench_cat_ops_setops.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
"""
Benchmark: categorical union/intersect/diff categories on 100k element Series.
Outputs JSON: {"function": "cat_ops_setops", "mean_ms": ..., "iterations": ..., "total_ms": ...}
"""
import json
import time
import pandas as pd

SIZE = 100_000
WARMUP = 5
ITERATIONS = 20

cats_a = ["alpha", "beta", "gamma", "delta"]
cats_b = ["gamma", "delta", "epsilon", "zeta"]
data_a = [cats_a[i % len(cats_a)] for i in range(SIZE)]
data_b = [cats_b[i % len(cats_b)] for i in range(SIZE)]
s_a = pd.Series(data_a, dtype="category")
s_b = pd.Series(data_b, dtype="category")

def cat_union(a, b):
    """Recode *a* with categories = a's categories followed by b's extras.

    Order: a.cat.categories first, then any b.cat.categories not already
    present, in b's order. The underlying values are unchanged.
    """
    base = list(a.cat.categories)
    seen = set(base)  # O(1) membership; avoids per-element Index scans
    # `base` and the filtered extras are each unique and disjoint, so no
    # extra dedup pass (the original wrapped this in a redundant
    # dict.fromkeys) is needed.
    cats = base + [c for c in b.cat.categories if c not in seen]
    return a.astype(pd.CategoricalDtype(categories=cats))

def cat_intersect(a, b):
    """Recode *a* keeping only categories shared with *b* (a's order kept).

    Values whose category is dropped become NaN in the result.
    """
    # Build the membership set once; the original rebuilt set(b.cat.categories)
    # inside the comprehension condition, i.e. once per element (O(n*m)).
    b_cats = set(b.cat.categories)
    cats = [c for c in a.cat.categories if c in b_cats]
    return a.astype(pd.CategoricalDtype(categories=cats))

def cat_diff(a, b):
    """Recode *a* keeping only categories NOT present in *b* (a's order kept).

    Values whose category is removed become NaN in the result.
    """
    # Build the membership set once; the original rebuilt set(b.cat.categories)
    # inside the comprehension condition, i.e. once per element (O(n*m)).
    b_cats = set(b.cat.categories)
    cats = [c for c in a.cat.categories if c not in b_cats]
    return a.astype(pd.CategoricalDtype(categories=cats))

def _measured_round():
    """One measured pass over all three category set operations."""
    cat_union(s_a, s_b)
    cat_intersect(s_a, s_b)
    cat_diff(s_a, s_b)

for _ in range(WARMUP):
    _measured_round()

samples = []
for _ in range(ITERATIONS):
    began = time.perf_counter()
    _measured_round()
    samples.append((time.perf_counter() - began) * 1000)

total_ms = sum(samples)
print(json.dumps({
    "function": "cat_ops_setops",
    "mean_ms": total_ms / ITERATIONS,
    "iterations": ITERATIONS,
    "total_ms": total_ms,
}))
29 changes: 29 additions & 0 deletions benchmarks/pandas/bench_combine_first_fn.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
"""Benchmark: combineFirstSeries standalone — pd.Series.combine_first() on 50k-element Series with 30% NaN."""
import json, time
import pandas as pd
import numpy as np

SIZE = 50_000
WARMUP = 5
ITERATIONS = 30

rng = np.random.default_rng(42)
data1 = rng.standard_normal(SIZE)
data1[::3] = float("nan") # ~30% nulls
s1 = pd.Series(data1)
s2 = pd.Series(np.arange(SIZE, dtype=np.float64) * 2.0)

for _ in range(WARMUP):
s1.combine_first(s2)

start = time.perf_counter()
for _ in range(ITERATIONS):
s1.combine_first(s2)
total = (time.perf_counter() - start) * 1000

print(json.dumps({
"function": "combine_first_fn",
"mean_ms": round(total / ITERATIONS, 3),
"iterations": ITERATIONS,
"total_ms": round(total, 3),
}))
33 changes: 33 additions & 0 deletions benchmarks/pandas/bench_combine_first_series.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
"""Benchmark: Series.combine_first (standalone equivalent) — fill missing values from another Series.
Mirrors tsb bench_combine_first_series.ts for pandas.
"""
import json, time
import pandas as pd
import numpy as np

SIZE = 10_000
WARMUP = 5
ITERATIONS = 50

data1 = [None if i % 3 == 0 else i * 0.5 for i in range(SIZE)]
data2 = [i * 0.1 for i in range(SIZE)]
s1 = pd.Series(data1)
s2 = pd.Series(data2)

for _ in range(WARMUP):
s1.combine_first(s2)

times = []
for _ in range(ITERATIONS):
t0 = time.perf_counter()
s1.combine_first(s2)
times.append((time.perf_counter() - t0) * 1000)

total = sum(times)
mean = total / ITERATIONS
print(json.dumps({
"function": "combine_first_series",
"mean_ms": round(mean, 3),
"iterations": ITERATIONS,
"total_ms": round(total, 3),
}))
41 changes: 41 additions & 0 deletions benchmarks/pandas/bench_combine_first_series_fn.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
"""
Benchmark: Series.combine_first() — fill NaN values from another Series (union of indexes).
Mirrors tsb bench_combine_first_series_fn.ts (standalone combineFirstSeries fn).
Outputs JSON: {"function": "combine_first_series_fn", "mean_ms": ..., "iterations": ..., "total_ms": ...}
"""
import json
import time
import numpy as np
import pandas as pd

SIZE = 100_000
WARMUP = 5
ITERATIONS = 30

rng = np.random.default_rng(42)
raw = rng.uniform(0, 10, SIZE)
mask = rng.integers(0, 4, SIZE) == 0 # ~25% nulls
d1 = pd.array(raw, dtype="Float64")
for idx in range(SIZE):
if mask[idx]:
d1[idx] = pd.NA

s1 = pd.Series(d1, dtype="Float64")
s2 = pd.Series(rng.uniform(0, 10, SIZE))

for _ in range(WARMUP):
s1.combine_first(s2)

times = []
for _ in range(ITERATIONS):
t0 = time.perf_counter()
s1.combine_first(s2)
times.append((time.perf_counter() - t0) * 1000)

total_ms = sum(times)
print(json.dumps({
"function": "combine_first_series_fn",
"mean_ms": round(total_ms / ITERATIONS, 3),
"iterations": ITERATIONS,
"total_ms": round(total_ms, 3),
}))
Loading
Loading