"""Benchmarks: any_all and astype_df_fn (originally two standalone scripts).

Each benchmark builds its fixtures, runs WARMUP untimed rounds, times
ITERATIONS rounds, and prints one JSON result line of the form
{"function": ..., "mean_ms": ..., "iterations": ..., "total_ms": ...}.
Wrapped in functions with a __main__ guard so importing this module does
not execute the benchmarks.
"""
import json
import time

import numpy as np
import pandas as pd

SIZE = 100_000
WARMUP = 5
ITERATIONS = 50


def build_any_all_inputs():
    """Boolean Series and 3-column boolean DataFrame, SIZE rows each."""
    s = pd.Series(np.arange(SIZE) % 2 == 0)
    df = pd.DataFrame({
        "a": np.arange(SIZE) % 3 != 0,
        "b": np.arange(SIZE) > 0,
        "c": np.ones(SIZE, dtype=bool),
    })
    return s, df


def run_any_all_once(s, df):
    """One timed round: Series.any/all and DataFrame.any/all reductions."""
    s.any()
    s.all()
    df.any()
    df.all()


def bench_any_all():
    """Time any/all reductions and print the JSON result line."""
    s, df = build_any_all_inputs()
    for _ in range(WARMUP):
        run_any_all_once(s, df)
    start = time.perf_counter()
    for _ in range(ITERATIONS):
        run_any_all_once(s, df)
    total = (time.perf_counter() - start) * 1000
    print(json.dumps({
        "function": "any_all",
        "mean_ms": total / ITERATIONS,
        "iterations": ITERATIONS,
        "total_ms": total,
    }))


def build_astype_df():
    """100k-row DataFrame with float64/int64 columns for astype casts."""
    return pd.DataFrame({
        "a": np.arange(SIZE, dtype=np.float64),
        "b": np.arange(SIZE, dtype=np.int64),
        "c": np.where(np.arange(SIZE) % 2 == 0, 1, 0).astype(np.int64),
    })


def run_astype_once(df):
    """One timed round: per-column dict cast plus uniform-dtype cast."""
    df.astype({"a": "float32", "b": "int32"})
    df.astype("float64")


def bench_astype_df_fn():
    """Time DataFrame.astype (dict and uniform dtype) and print JSON."""
    df = build_astype_df()
    for _ in range(WARMUP):
        run_astype_once(df)
    start = time.perf_counter()
    for _ in range(ITERATIONS):
        run_astype_once(df)
    total = (time.perf_counter() - start) * 1000
    print(json.dumps({
        "function": "astype_df_fn",
        "mean_ms": round(total / ITERATIONS, 3),
        "iterations": ITERATIONS,
        "total_ms": round(total, 3),
    }))


if __name__ == "__main__":
    bench_any_all()
    bench_astype_df_fn()
"""Benchmark: cat_freq_crosstab — Series.value_counts (frequency table) and
pd.crosstab (raw and normalized) on 100k-element categorical data.

Prints one JSON line:
{"function": "cat_freq_crosstab", "mean_ms": ..., "iterations": ..., "total_ms": ...}
"""
import json
import time

import pandas as pd

SIZE = 100_000
WARMUP = 5
ITERATIONS = 20

CATS_A = ["alpha", "beta", "gamma", "delta", "epsilon"]
CATS_B = ["north", "south", "east", "west"]


def build_freq_series():
    """Two categorical Series cycling through CATS_A / CATS_B."""
    data_a = pd.Categorical(
        [CATS_A[i % len(CATS_A)] for i in range(SIZE)], categories=CATS_A
    )
    data_b = pd.Categorical(
        [CATS_B[i % len(CATS_B)] for i in range(SIZE)], categories=CATS_B
    )
    return pd.Series(data_a), pd.Series(data_b)


def run_freq_once(s_a, s_b):
    """One timed round: frequency table plus raw and normalized crosstabs."""
    s_a.value_counts(sort=False)
    pd.crosstab(s_a, s_b)
    pd.crosstab(s_a, s_b, normalize=True)


def main():
    """Warm up, time ITERATIONS rounds, and print the JSON summary."""
    s_a, s_b = build_freq_series()
    for _ in range(WARMUP):
        run_freq_once(s_a, s_b)
    times = []
    for _ in range(ITERATIONS):
        t0 = time.perf_counter()
        run_freq_once(s_a, s_b)
        times.append((time.perf_counter() - t0) * 1000)
    total_ms = sum(times)
    print(json.dumps({
        "function": "cat_freq_crosstab",
        "mean_ms": total_ms / ITERATIONS,
        "iterations": ITERATIONS,
        "total_ms": total_ms,
    }))


if __name__ == "__main__":
    main()
"""Benchmark: cat_intersect_diff — intersection and difference of
Categorical category sets on 100k-element data (20 categories each,
10 overlapping).  Mirrors tsb's catIntersectCategories / catDiffCategories.

Prints one JSON line:
{"function": "cat_intersect_diff", "mean_ms": ..., "iterations": ..., "total_ms": ...}
"""
import json
import time

import pandas as pd

SIZE = 100_000
WARMUP = 5
ITERATIONS = 30


def cat_intersect(a, b):
    """New Categorical over a's values whose categories are a ∩ b (a's order)."""
    b_set = set(b.categories)
    kept = [c for c in a.categories if c in b_set]
    return pd.Categorical(a, categories=kept)


def cat_diff(a, b):
    """New Categorical over a's values whose categories are a − b (a's order)."""
    b_set = set(b.categories)
    kept = [c for c in a.categories if c not in b_set]
    return pd.Categorical(a, categories=kept)


def build_overlap_categoricals():
    """Two 100k Categoricals whose 20-label category sets share 10 labels."""
    cats_a = [f"cat_a_{i}" for i in range(20)]
    cats_b = [f"cat_{'a' if i < 10 else 'b'}_{i}" for i in range(20)]
    s_a = pd.Categorical([cats_a[i % len(cats_a)] for i in range(SIZE)], categories=cats_a)
    s_b = pd.Categorical([cats_b[i % len(cats_b)] for i in range(SIZE)], categories=cats_b)
    return s_a, s_b


def main():
    """Warm up, time ITERATIONS rounds of intersect + diff, print JSON."""
    s_a, s_b = build_overlap_categoricals()
    for _ in range(WARMUP):
        cat_intersect(s_a, s_b)
        cat_diff(s_a, s_b)
    times = []
    for _ in range(ITERATIONS):
        t0 = time.perf_counter()
        cat_intersect(s_a, s_b)
        cat_diff(s_a, s_b)
        times.append((time.perf_counter() - t0) * 1000)
    total_ms = sum(times)
    print(json.dumps({
        "function": "cat_intersect_diff",
        "mean_ms": total_ms / ITERATIONS,
        "iterations": ITERATIONS,
        "total_ms": total_ms,
    }))


if __name__ == "__main__":
    main()
+Outputs JSON: {"function": "cat_ops_from_codes", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import pandas as pd +import numpy as np + +SIZE = 100_000 +WARMUP = 5 +ITERATIONS = 20 + +categories = ["alpha", "beta", "gamma", "delta", "epsilon"] +codes = [i % len(categories) for i in range(SIZE)] +order = ["epsilon", "delta", "gamma", "beta", "alpha"] + +def cat_from_codes(): + return pd.Categorical.from_codes(codes, categories=categories) + +def cat_sort_by_freq(c): + s = pd.Series(c) + freq_order = s.value_counts().index.tolist() + return s.astype(pd.CategoricalDtype(categories=freq_order, ordered=False)) + +def cat_to_ordinal(c): + s = pd.Series(c) + return s.astype(pd.CategoricalDtype(categories=order, ordered=True)) + +for _ in range(WARMUP): + c = cat_from_codes() + cat_sort_by_freq(c) + cat_to_ordinal(c) + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + c = cat_from_codes() + cat_sort_by_freq(c) + cat_to_ordinal(c) + times.append((time.perf_counter() - t0) * 1000) + +total_ms = sum(times) +print(json.dumps({ + "function": "cat_ops_from_codes", + "mean_ms": total_ms / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total_ms, +})) diff --git a/benchmarks/pandas/bench_cat_ops_setops.py b/benchmarks/pandas/bench_cat_ops_setops.py new file mode 100644 index 00000000..eaff873f --- /dev/null +++ b/benchmarks/pandas/bench_cat_ops_setops.py @@ -0,0 +1,51 @@ +""" +Benchmark: categorical union/intersect/diff categories on 100k element Series. 
"""Benchmark: cat_ops_setops — union / intersection / difference of category
sets on 100k-element categorical Series.

Prints one JSON line:
{"function": "cat_ops_setops", "mean_ms": ..., "iterations": ..., "total_ms": ...}
"""
import json
import time

import pandas as pd

SIZE = 100_000
WARMUP = 5
ITERATIONS = 20

CATS_A = ["alpha", "beta", "gamma", "delta"]
CATS_B = ["gamma", "delta", "epsilon", "zeta"]


def cat_union(a, b):
    """Recast a with categories = a.categories ∪ b.categories (a-first order)."""
    cats = list(dict.fromkeys(
        list(a.cat.categories)
        + [c for c in b.cat.categories if c not in a.cat.categories]
    ))
    return a.astype(pd.CategoricalDtype(categories=cats))


def cat_intersect(a, b):
    """Recast a with categories = a.categories ∩ b.categories."""
    cats = [c for c in a.cat.categories if c in set(b.cat.categories)]
    return a.astype(pd.CategoricalDtype(categories=cats))


def cat_diff(a, b):
    """Recast a with categories = a.categories − b.categories."""
    cats = [c for c in a.cat.categories if c not in set(b.cat.categories)]
    return a.astype(pd.CategoricalDtype(categories=cats))


def build_setops_series():
    """Two categorical Series cycling through CATS_A / CATS_B."""
    s_a = pd.Series([CATS_A[i % len(CATS_A)] for i in range(SIZE)], dtype="category")
    s_b = pd.Series([CATS_B[i % len(CATS_B)] for i in range(SIZE)], dtype="category")
    return s_a, s_b


def main():
    """Warm up, time ITERATIONS rounds of the three set ops, print JSON."""
    s_a, s_b = build_setops_series()
    for _ in range(WARMUP):
        cat_union(s_a, s_b)
        cat_intersect(s_a, s_b)
        cat_diff(s_a, s_b)
    times = []
    for _ in range(ITERATIONS):
        t0 = time.perf_counter()
        cat_union(s_a, s_b)
        cat_intersect(s_a, s_b)
        cat_diff(s_a, s_b)
        times.append((time.perf_counter() - t0) * 1000)
    total_ms = sum(times)
    print(json.dumps({
        "function": "cat_ops_setops",
        "mean_ms": total_ms / ITERATIONS,
        "iterations": ITERATIONS,
        "total_ms": total_ms,
    }))


if __name__ == "__main__":
    main()
"""Benchmarks: combine_first_fn and combine_first_series (originally two
standalone scripts) — Series.combine_first filling NaN/None gaps from a
second Series.  Each prints one JSON result line.
"""
import json
import time

import numpy as np
import pandas as pd


def bench_combine_first_fn():
    """50k float Series with NaN every 3rd slot (~33%), filled from a dense Series."""
    size, warmup, iterations = 50_000, 5, 30
    rng = np.random.default_rng(42)
    data1 = rng.standard_normal(size)
    data1[::3] = float("nan")  # every third element missing (~33% nulls)
    s1 = pd.Series(data1)
    s2 = pd.Series(np.arange(size, dtype=np.float64) * 2.0)
    for _ in range(warmup):
        s1.combine_first(s2)
    start = time.perf_counter()
    for _ in range(iterations):
        s1.combine_first(s2)
    total = (time.perf_counter() - start) * 1000
    print(json.dumps({
        "function": "combine_first_fn",
        "mean_ms": round(total / iterations, 3),
        "iterations": iterations,
        "total_ms": round(total, 3),
    }))


def bench_combine_first_series():
    """10k Series with None every 3rd slot; mirrors tsb bench_combine_first_series.ts."""
    size, warmup, iterations = 10_000, 5, 50
    s1 = pd.Series([None if i % 3 == 0 else i * 0.5 for i in range(size)])
    s2 = pd.Series([i * 0.1 for i in range(size)])
    for _ in range(warmup):
        s1.combine_first(s2)
    times = []
    for _ in range(iterations):
        t0 = time.perf_counter()
        s1.combine_first(s2)
        times.append((time.perf_counter() - t0) * 1000)
    total = sum(times)
    print(json.dumps({
        "function": "combine_first_series",
        "mean_ms": round(total / iterations, 3),
        "iterations": iterations,
        "total_ms": round(total, 3),
    }))


if __name__ == "__main__":
    bench_combine_first_fn()
    bench_combine_first_series()
"""Benchmarks: combine_first_series_fn, crosstab_normalize and
cummax_cummin_str (originally three standalone scripts).
Each prints one JSON result line.
"""
import json
import time

import numpy as np
import pandas as pd


def bench_combine_first_series_fn():
    """100k nullable-Float64 Series (~25% <NA>) filled from a dense Series."""
    size, warmup, iterations = 100_000, 5, 30
    rng = np.random.default_rng(42)
    raw = rng.uniform(0, 10, size)
    mask = rng.integers(0, 4, size) == 0  # ~25% nulls
    d1 = pd.array(raw, dtype="Float64")
    # Vectorized masking; replaces the original per-element Python loop
    # (identical resulting array, O(n) numpy instead of 100k item sets).
    d1[mask] = pd.NA
    s1 = pd.Series(d1, dtype="Float64")
    s2 = pd.Series(rng.uniform(0, 10, size))
    for _ in range(warmup):
        s1.combine_first(s2)
    times = []
    for _ in range(iterations):
        t0 = time.perf_counter()
        s1.combine_first(s2)
        times.append((time.perf_counter() - t0) * 1000)
    total_ms = sum(times)
    print(json.dumps({
        "function": "combine_first_series_fn",
        "mean_ms": round(total_ms / iterations, 3),
        "iterations": iterations,
        "total_ms": round(total_ms, 3),
    }))


def bench_crosstab_normalize():
    """pd.crosstab with normalize=True / 'index' / 'columns' on 50k label pairs."""
    size, warmup, iterations = 50_000, 5, 30
    rng = np.random.default_rng(99)
    a = pd.Series(np.array(["north", "south", "east", "west"])[rng.integers(0, 4, size)])
    b = pd.Series(np.array(["red", "green", "blue"])[rng.integers(0, 3, size)])

    def variants():
        pd.crosstab(a, b, normalize=True)
        pd.crosstab(a, b, normalize="index")
        pd.crosstab(a, b, normalize="columns")

    for _ in range(warmup):
        variants()
    times = []
    for _ in range(iterations):
        t0 = time.perf_counter()
        variants()
        times.append((time.perf_counter() - t0) * 1000)
    total_ms = sum(times)
    print(json.dumps({
        "function": "crosstab_normalize",
        "mean_ms": round(total_ms / iterations, 3),
        "iterations": iterations,
        "total_ms": round(total_ms, 3),
    }))


def bench_cummax_cummin_str():
    """Series.cummax / cummin over a 10k-element string Series."""
    size, warmup, iterations = 10_000, 5, 50
    words = ["apple", "banana", "cherry", "date", "elderberry", "fig", "grape", "honeydew"]
    s = pd.Series([words[i % len(words)] for i in range(size)])
    for _ in range(warmup):
        s.cummax()
        s.cummin()
    start = time.perf_counter()
    for _ in range(iterations):
        s.cummax()
        s.cummin()
    total = (time.perf_counter() - start) * 1000
    print(json.dumps({"function": "cummax_cummin_str", "mean_ms": total / iterations, "iterations": iterations, "total_ms": total}))


if __name__ == "__main__":
    bench_combine_first_series_fn()
    bench_crosstab_normalize()
    bench_cummax_cummin_str()
"""Benchmark: cumops_skipna — Series.cumsum / cumprod with skipna=False on a
100k-element Series containing NaN every 20th slot (NaN poisons the scan).

Prints one JSON line:
{"function": "cumops_skipna", "mean_ms": ..., "iterations": ..., "total_ms": ...}
"""
import json
import time

import pandas as pd

SIZE = 100_000
WARMUP = 5
ITERATIONS = 20


def build_skipna_series():
    """Floats near 1.0 with NaN every 20th element."""
    data = [(i % 100) * 0.001 + 1 if i % 20 != 0 else float("nan") for i in range(SIZE)]
    return pd.Series(data)


def run_cumops(s):
    """One timed round: both cumulative ops with skipna disabled."""
    s.cumsum(skipna=False)
    s.cumprod(skipna=False)


def main():
    """Warm up, time ITERATIONS rounds, print the JSON summary line."""
    s = build_skipna_series()
    for _ in range(WARMUP):
        run_cumops(s)
    start = time.perf_counter()
    for _ in range(ITERATIONS):
        run_cumops(s)
    total = (time.perf_counter() - start) * 1000
    print(json.dumps({"function": "cumops_skipna", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total}))


if __name__ == "__main__":
    main()
"""Benchmarks: dataframe_abs_fn and dataframe_cov_options (originally two
standalone scripts).  Each prints one JSON result line.
"""
import json
import time

import numpy as np
import pandas as pd


def bench_dataframe_abs_fn():
    """df.abs() on a 100k-row × 4-column mixed-sign DataFrame."""
    size, warmup, iterations = 100_000, 5, 30
    df = pd.DataFrame({
        "a": [(i % 200) - 100 for i in range(size)],
        "b": [np.sin(i * 0.01) * 100 for i in range(size)],
        "c": [-i * 0.5 for i in range(size)],
        "d": [(i % 50) - 25 for i in range(size)],
    })
    for _ in range(warmup):
        df.abs()
    start = time.perf_counter()
    for _ in range(iterations):
        df.abs()
    total = (time.perf_counter() - start) * 1000
    print(json.dumps({
        "function": "dataframe_abs_fn",
        "mean_ms": total / iterations,
        "iterations": iterations,
        "total_ms": total,
    }))


def bench_dataframe_cov_options():
    """DataFrame.cov / corr with ddof and min_periods options on 20k×3 data.

    The original script seeded an unused numpy Generator; the data is fully
    deterministic, so the dead local was dropped.
    """
    size, warmup, iterations = 20_000, 3, 20
    idx = np.arange(size)
    df = pd.DataFrame({
        "a": idx * 0.5 + np.sin(idx * 0.01),
        "b": idx * 0.3 - np.cos(idx * 0.02),
        "c": (idx % 100) * 1.5,
    })

    def variants():
        df.cov(ddof=0)
        df.cov(ddof=1, min_periods=100)
        df.corr(min_periods=50)

    for _ in range(warmup):
        variants()
    start = time.perf_counter()
    for _ in range(iterations):
        variants()
    total = (time.perf_counter() - start) * 1000
    print(json.dumps({
        "function": "dataframe_cov_options",
        "mean_ms": total / iterations,
        "iterations": iterations,
        "total_ms": total,
    }))


if __name__ == "__main__":
    bench_dataframe_abs_fn()
    bench_dataframe_cov_options()
"""Benchmark: dataframe_cumops_axis1 — row-wise (axis=1) cumsum / cumprod on
a 10k × 8 DataFrame.

Prints one JSON line:
{"function": "dataframe_cumops_axis1", "mean_ms": ..., "iterations": ..., "total_ms": ...}
"""
import json
import time

import numpy as np
import pandas as pd

ROWS = 10_000
COLS = 8
WARMUP = 3
ITERATIONS = 20


def build_cumops_df():
    """ROWS×COLS DataFrame of values in [1.0, 1.9] so cumprod stays bounded."""
    return pd.DataFrame(
        {f"col{c}": ((np.arange(ROWS) + c) % 10) * 0.1 + 1 for c in range(COLS)}
    )


def run_axis1_once(df):
    """One timed round: cumulative sum and product across columns."""
    df.cumsum(axis=1)
    df.cumprod(axis=1)


def main():
    """Warm up, time ITERATIONS rounds, print the JSON summary line."""
    df = build_cumops_df()
    for _ in range(WARMUP):
        run_axis1_once(df)
    start = time.perf_counter()
    for _ in range(ITERATIONS):
        run_axis1_once(df)
    total = (time.perf_counter() - start) * 1000
    print(json.dumps({"function": "dataframe_cumops_axis1", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total}))


if __name__ == "__main__":
    main()
"""Benchmarks: dataframe_numeric_pipeline (abs → round → sign chain) and
dataframe_reflected_arith (radd / rsub / rmul / rdiv), originally two
standalone scripts.  Each prints one JSON result line.
"""
import json
import math
import time

import numpy as np
import pandas as pd


def bench_dataframe_numeric_pipeline():
    """abs → round(1) → np.sign over a 100k × 3 DataFrame."""
    size, warmup, iterations = 100_000, 5, 20
    df = pd.DataFrame({
        "a": [math.sin(i * 0.01) * 150 - 20 for i in range(size)],
        "b": [math.cos(i * 0.02) * 80 for i in range(size)],
        "c": [(i % 1000) * 0.123 - 50 for i in range(size)],
    })

    def pipeline():
        magnitudes = df.abs()
        rounded = magnitudes.round(1)
        np.sign(rounded)

    for _ in range(warmup):
        pipeline()
    start = time.perf_counter()
    for _ in range(iterations):
        pipeline()
    total = (time.perf_counter() - start) * 1000
    print(json.dumps({
        "function": "dataframe_numeric_pipeline",
        "mean_ms": total / iterations,
        "iterations": iterations,
        "total_ms": total,
    }))


def bench_dataframe_reflected_arith():
    """Reflected scalar arithmetic (radd/rsub/rmul/rdiv) on a 10k × 3 DataFrame."""
    rows, warmup, iterations = 10_000, 5, 30
    df = pd.DataFrame({
        "a": np.arange(rows) * 1.5,
        "b": (np.arange(rows) % 100) + 1.0,
        "c": np.arange(rows) * 0.25,
    })

    def variants():
        df.radd(10)
        df.rsub(1000)
        df.rmul(3)
        df.rdiv(100)

    for _ in range(warmup):
        variants()
    start = time.perf_counter()
    for _ in range(iterations):
        variants()
    total = (time.perf_counter() - start) * 1000
    print(json.dumps({
        "function": "dataframe_reflected_arith",
        "mean_ms": total / iterations,
        "iterations": iterations,
        "total_ms": total,
    }))


if __name__ == "__main__":
    bench_dataframe_numeric_pipeline()
    bench_dataframe_reflected_arith()
"""Benchmark: dataframe_rolling_apply_fn — rolling(10).apply with a custom
max−min "range" function (raw ndarray windows) on a 5k × 3 DataFrame.

Prints one JSON line:
{"function": "dataframe_rolling_apply_fn", "mean_ms": ..., "iterations": ..., "total_ms": ...}
"""
import json
import time

import numpy as np
import pandas as pd

ROWS = 5_000
WINDOW = 10
WARMUP = 3
ITERATIONS = 10


def window_range(w):
    """Range (max − min) of one raw ndarray window.

    Named function instead of the original ``lambda`` assigned to a name
    (PEP 8); passed with raw=True so pandas hands it plain ndarrays.
    """
    return np.max(w) - np.min(w)


def build_rolling_df():
    """Three smooth/cyclic float columns of length ROWS."""
    idx = np.arange(ROWS)
    return pd.DataFrame({
        "a": np.sin(idx * 0.01),
        "b": np.cos(idx * 0.02),
        "c": (idx % 100) * 0.5,
    })


def main():
    """Warm up, time ITERATIONS rolling-apply passes, print JSON."""
    df = build_rolling_df()
    for _ in range(WARMUP):
        df.rolling(WINDOW).apply(window_range, raw=True)
    start = time.perf_counter()
    for _ in range(ITERATIONS):
        df.rolling(WINDOW).apply(window_range, raw=True)
    total = (time.perf_counter() - start) * 1000
    print(json.dumps({
        "function": "dataframe_rolling_apply_fn",
        "mean_ms": total / ITERATIONS,
        "iterations": ITERATIONS,
        "total_ms": total,
    }))


if __name__ == "__main__":
    main()
"""Benchmark: dataframe_round_fn — DataFrame.round(2) on a 100k-row ×
4-column float DataFrame (mirrors bench_dataframe_round_fn.ts).

Prints one JSON line:
{"function": "dataframe_round_fn", "mean_ms": ..., "iterations": ..., "total_ms": ...}
"""
import json
import math
import time

import pandas as pd

SIZE = 100_000
WARMUP = 5
ITERATIONS = 30


def build_round_df():
    """Four float columns with many decimal digits to round away."""
    return pd.DataFrame({
        "a": [i * 0.123456 for i in range(SIZE)],
        "b": [math.sin(i * 0.01) * 99.9 for i in range(SIZE)],
        "c": [-i * 0.987654 for i in range(SIZE)],
        "d": [(i % 1000) * 3.14159 for i in range(SIZE)],
    })


def main():
    """Warm up, time ITERATIONS rounds of df.round(2), print JSON."""
    df = build_round_df()
    for _ in range(WARMUP):
        df.round(2)
    start = time.perf_counter()
    for _ in range(ITERATIONS):
        df.round(2)
    total = (time.perf_counter() - start) * 1000
    print(json.dumps({
        "function": "dataframe_round_fn",
        "mean_ms": total / ITERATIONS,
        "iterations": ITERATIONS,
        "total_ms": total,
    }))


if __name__ == "__main__":
    main()
+""" +import json, time +from datetime import timedelta +import pandas as pd +from pandas.tseries.offsets import Hour, Second + +SIZE = 5_000 +WARMUP = 5 +ITERATIONS = 50 + +hour = Hour(3) +second = Second(90) +base = pd.Timestamp("2020-01-15 10:00:00", tz="UTC") +dates = [base + timedelta(minutes=i) for i in range(SIZE)] + +for _ in range(WARMUP): + for d in dates[:100]: + d + hour + d + second + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + for d in dates: + d + hour + d + second + times.append((time.perf_counter() - t0) * 1000) + +total = sum(times) +mean = total / ITERATIONS +print(json.dumps({ + "function": "date_offset_hour_second", + "mean_ms": round(mean, 3), + "iterations": ITERATIONS, + "total_ms": round(total, 3), +})) diff --git a/benchmarks/pandas/bench_digitize_fn.py b/benchmarks/pandas/bench_digitize_fn.py new file mode 100644 index 00000000..7dedaa46 --- /dev/null +++ b/benchmarks/pandas/bench_digitize_fn.py @@ -0,0 +1,35 @@ +"""Benchmark: numpy.digitize (standalone) — bin 50k values into 10 bins. +Mirrors tsb bench_digitize_fn.ts for numpy/pandas. 
+""" +import json, time +import numpy as np + +SIZE = 50_000 +WARMUP = 5 +ITERATIONS = 50 + +rng = np.random.default_rng(42) +values = np.where( + np.arange(SIZE) % 20 == 0, + np.nan, + (np.arange(SIZE) % 100) * 0.1, +).tolist() +bins = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] + +for _ in range(WARMUP): + np.digitize(values, bins) + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + np.digitize(values, bins) + times.append((time.perf_counter() - t0) * 1000) + +total = sum(times) +mean = total / ITERATIONS +print(json.dumps({ + "function": "digitize_fn", + "mean_ms": round(mean, 3), + "iterations": ITERATIONS, + "total_ms": round(total, 3), +})) diff --git a/benchmarks/pandas/bench_dropna_thresh_subset.py b/benchmarks/pandas/bench_dropna_thresh_subset.py new file mode 100644 index 00000000..4a250cd5 --- /dev/null +++ b/benchmarks/pandas/bench_dropna_thresh_subset.py @@ -0,0 +1,35 @@ +"""Benchmark: DataFrame.dropna with thresh and subset options.""" +import json, time +import numpy as np +import pandas as pd + +SIZE = 100_000 +WARMUP = 5 +ITERATIONS = 30 + +a = [None if i % 5 == 0 else float(i) for i in range(SIZE)] +b = [None if i % 7 == 0 else float(i * 2) for i in range(SIZE)] +c = [None if i % 11 == 0 else float(i * 3) for i in range(SIZE)] +d = [None if i % 3 == 0 else f"label_{i % 20}" for i in range(SIZE)] +df = pd.DataFrame({"a": a, "b": b, "c": c, "d": d}) + +for _ in range(WARMUP): + df.dropna(how="any") + df.dropna(how="all") + df.dropna(thresh=3) + df.dropna(subset=["a", "b"]) + +start = time.perf_counter() +for _ in range(ITERATIONS): + df.dropna(how="any") + df.dropna(how="all") + df.dropna(thresh=3) + df.dropna(subset=["a", "b"]) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "dropna_thresh_subset", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_dt_strftime.py b/benchmarks/pandas/bench_dt_strftime.py new file mode 100644 index 
"""Benchmarks: dt_strftime (datetime formatting via .dt.strftime) and
explode_dataframe (DataFrame.explode of a list column), originally two
standalone scripts.  Each prints one JSON result line.
"""
import json
import time

import pandas as pd


def bench_dt_strftime():
    """dt.strftime with date and time formats over 100k minute-spaced stamps."""
    rows, warmup, iterations = 100_000, 3, 10
    s = pd.Series(pd.date_range("2024-01-01", periods=rows, freq="1min"))
    for _ in range(warmup):
        s.dt.strftime("%Y-%m-%d")
        s.dt.strftime("%H:%M:%S")
    start = time.perf_counter()
    for _ in range(iterations):
        s.dt.strftime("%Y-%m-%d")
        s.dt.strftime("%H:%M:%S")
    total = (time.perf_counter() - start) * 1000
    print(json.dumps({
        "function": "dt_strftime",
        "mean_ms": total / iterations,
        "iterations": iterations,
        "total_ms": total,
    }))


def bench_explode_dataframe():
    """DataFrame.explode of a 3-element list column over 10k rows."""
    rows, warmup, iterations = 10_000, 5, 30
    df = pd.DataFrame({
        "vals": [[i, i + 1, i + 2] for i in range(rows)],
        "labels": [f"cat_{i % 100}" for i in range(rows)],
    })
    for _ in range(warmup):
        df.explode("vals")
    times = []
    for _ in range(iterations):
        t0 = time.perf_counter()
        df.explode("vals")
        times.append((time.perf_counter() - t0) * 1000)
    total_ms = sum(times)
    print(json.dumps({"function": "explode_dataframe", "mean_ms": round(total_ms / iterations, 3), "iterations": iterations, "total_ms": round(total_ms, 3)}))


if __name__ == "__main__":
    bench_dt_strftime()
    bench_explode_dataframe()
"""Benchmark: fillna_col_map — DataFrame.fillna with a per-column value map
on a 50k × 3 DataFrame with ~20% NaN per column.

Prints one JSON line:
{"function": "fillna_col_map", "mean_ms": ..., "iterations": ..., "total_ms": ...}
"""
import json
import random
import time

import numpy as np
import pandas as pd

ROWS = 50_000
WARMUP = 5
ITERATIONS = 30

FILL_MAP = {"a": 0, "b": -1, "c": 99}


def build_gappy_df():
    """Three float columns where roughly 20% of entries are NaN."""
    # random.seed kept from the original script; the numpy Generator
    # actually drives the data.
    random.seed(42)
    rng = np.random.default_rng(42)
    return pd.DataFrame({
        "a": np.where(rng.random(ROWS) < 0.2, np.nan, rng.random(ROWS) * 100),
        "b": np.where(rng.random(ROWS) < 0.2, np.nan, rng.random(ROWS) * 50),
        "c": np.where(rng.random(ROWS) < 0.2, np.nan, rng.random(ROWS) * 200),
    })


def main():
    """Warm up, time ITERATIONS rounds of fillna(FILL_MAP), print JSON."""
    df = build_gappy_df()
    for _ in range(WARMUP):
        df.fillna(FILL_MAP)
    times = []
    for _ in range(ITERATIONS):
        t0 = time.perf_counter()
        df.fillna(FILL_MAP)
        times.append((time.perf_counter() - t0) * 1000)
    total_ms = sum(times)
    print(json.dumps({
        "function": "fillna_col_map",
        "mean_ms": round(total_ms / ITERATIONS, 3),
        "iterations": ITERATIONS,
        "total_ms": round(total_ms, 3),
    }))


if __name__ == "__main__":
    main()
"""Benchmark: formatter_factories — float / percent / currency formatter
closures (mirroring tsb's makeFloatFormatter etc.) applied with Series.map
over a 100k-element Series.

Prints one JSON line:
{"function": "formatter_factories", "mean_ms": ..., "iterations": ..., "total_ms": ...}
"""
import json
import time

import pandas as pd

SIZE = 100_000
WARMUP = 5
ITERATIONS = 30


def make_float_formatter(decimals=3):
    """Closure rendering numbers with a fixed number of decimals; non-numbers via str()."""
    return lambda v: f"{v:.{decimals}f}" if isinstance(v, (int, float)) else str(v)


def make_percent_formatter(decimals=1):
    """Closure rendering fractions as percentages; non-numbers via str()."""
    return lambda v: f"{v * 100:.{decimals}f}%" if isinstance(v, (int, float)) else str(v)


def make_currency_formatter(symbol="€", decimals=2):
    """Closure rendering numbers as thousands-grouped currency; non-numbers via str()."""
    return lambda v: f"{symbol}{v:,.{decimals}f}" if isinstance(v, (int, float)) else str(v)


def main():
    """Warm up, time ITERATIONS rounds of the three map passes, print JSON."""
    s = pd.Series([i * 0.0001234 for i in range(SIZE)])
    float_fmt = make_float_formatter(3)
    pct_fmt = make_percent_formatter(1)
    curr_fmt = make_currency_formatter("€", 2)
    for _ in range(WARMUP):
        s.map(float_fmt)
        s.map(pct_fmt)
        s.map(curr_fmt)
    times = []
    for _ in range(ITERATIONS):
        t0 = time.perf_counter()
        s.map(float_fmt)
        s.map(pct_fmt)
        s.map(curr_fmt)
        times.append((time.perf_counter() - t0) * 1000)
    total_ms = sum(times)
    print(json.dumps({
        "function": "formatter_factories",
        "mean_ms": total_ms / ITERATIONS,
        "iterations": ITERATIONS,
        "total_ms": total_ms,
    }))


if __name__ == "__main__":
    main()
"""Benchmarks: get_dummies_drop_first (one-hot encoding options) and
groupby_agg_no_index (groupby with as_index=False), originally two
standalone scripts.  Each prints one JSON result line.
"""
import json
import time

import numpy as np
import pandas as pd


def bench_get_dummies_drop_first():
    """pd.get_dummies with drop_first / prefix options on 50k categorical labels."""
    rows, warmup, iterations = 50_000, 5, 30
    cat_data = [f"cat_{i % 10}" for i in range(rows)]
    s = pd.Categorical(cat_data)
    df = pd.DataFrame({
        "category": cat_data,
        "value": np.arange(rows, dtype=np.float64) * 0.1,
    })

    def variants():
        pd.get_dummies(s, drop_first=True)
        pd.get_dummies(s, prefix="grp", prefix_sep="_")
        pd.get_dummies(df, columns=["category"], drop_first=True)

    for _ in range(warmup):
        variants()
    times = []
    for _ in range(iterations):
        t0 = time.perf_counter()
        variants()
        times.append((time.perf_counter() - t0) * 1000)
    total_ms = sum(times)
    print(json.dumps({"function": "get_dummies_drop_first", "mean_ms": round(total_ms / iterations, 3), "iterations": iterations, "total_ms": round(total_ms, 3)}))


def bench_groupby_agg_no_index():
    """groupby(as_index=False).agg with per-column reducers on 100k rows."""
    size, warmup, iterations = 100_000, 3, 20
    rng = np.random.default_rng(42)
    groups = np.array(["alpha", "beta", "gamma", "delta", "epsilon"])
    df = pd.DataFrame({
        "group": groups[rng.integers(0, 5, size)],
        "x": rng.random(size) * 100,
        "y": rng.random(size) * 50,
    })
    for _ in range(warmup):
        df.groupby("group", as_index=False).agg({"x": "mean", "y": "sum"})
    times = []
    for _ in range(iterations):
        t0 = time.perf_counter()
        df.groupby("group", as_index=False).agg({"x": "mean", "y": "sum"})
        times.append((time.perf_counter() - t0) * 1000)
    total_ms = sum(times)
    print(json.dumps({
        "function": "groupby_agg_no_index",
        "mean_ms": round(total_ms / iterations, 3),
        "iterations": iterations,
        "total_ms": round(total_ms, 3),
    }))


if __name__ == "__main__":
    bench_get_dummies_drop_first()
    bench_groupby_agg_no_index()
+Outputs JSON: {"function": "histogram_bin_edges", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import numpy as np + +SIZE = 100_000 +WARMUP = 5 +ITERATIONS = 50 + +data = np.array([(i % 1000) * 0.1 for i in range(SIZE)]) +bin_edges = np.array([i * 5.0 for i in range(21)]) # 20 bins covering [0, 100) + +for _ in range(WARMUP): + np.histogram(data, bins=bin_edges) + np.histogram(data, bins=20) + +start = time.perf_counter() +for _ in range(ITERATIONS): + np.histogram(data, bins=bin_edges) + np.histogram(data, bins=20) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({"function": "histogram_bin_edges", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_insert_pop.py b/benchmarks/pandas/bench_insert_pop.py new file mode 100644 index 00000000..a00e05b5 --- /dev/null +++ b/benchmarks/pandas/bench_insert_pop.py @@ -0,0 +1,37 @@ +"""Benchmark: DataFrame.insert() and DataFrame.pop() on a 10k-row DataFrame. + +Mirrors tsb's insertColumn, popColumn, reorderColumns, moveColumn benchmarks. 
+""" +import json, time +import numpy as np +import pandas as pd + +ROWS = 10_000 +WARMUP = 3 +ITERATIONS = 10 + +data = np.arange(ROWS) +df = pd.DataFrame({"a": data, "b": data, "c": data, "d": data}) +extra_col = data * 2 + +def run(): + df2 = df.copy() + df2.insert(2, "x", extra_col) + df2.pop("x") + df[["d", "c", "b", "a"]] # reorderColumns + df[["c", "a", "b", "d"]] # moveColumn equivalent + +for _ in range(WARMUP): + run() + +start = time.perf_counter() +for _ in range(ITERATIONS): + run() +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "insert_pop", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_interpolate_methods.py b/benchmarks/pandas/bench_interpolate_methods.py new file mode 100644 index 00000000..e5db873c --- /dev/null +++ b/benchmarks/pandas/bench_interpolate_methods.py @@ -0,0 +1,31 @@ +"""Benchmark: interpolateSeries with linear, ffill, bfill, nearest, zero methods.""" +import json, time +import pandas as pd +import numpy as np + +SIZE = 50_000 +WARMUP = 5 +ITERATIONS = 30 + +data = [float(i) * 0.1 if i % 5 != 0 else None for i in range(SIZE)] +s = pd.Series(data, dtype=float) + +for _ in range(WARMUP): + s.interpolate(method="linear") + s.ffill() + s.bfill() + s.interpolate(method="nearest") + s.interpolate(method="zero") + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + s.interpolate(method="linear") + s.ffill() + s.bfill() + s.interpolate(method="nearest") + s.interpolate(method="zero") + times.append((time.perf_counter() - t0) * 1000) + +total_ms = sum(times) +print(json.dumps({"function": "interpolate_methods", "mean_ms": round(total_ms / ITERATIONS, 3), "iterations": ITERATIONS, "total_ms": round(total_ms, 3)})) diff --git a/benchmarks/pandas/bench_interpolate_zero_nearest.py b/benchmarks/pandas/bench_interpolate_zero_nearest.py new file mode 100644 index 00000000..bc8798b4 --- /dev/null +++ 
b/benchmarks/pandas/bench_interpolate_zero_nearest.py @@ -0,0 +1,32 @@ +"""Benchmark: Series.interpolate with zero and nearest methods.""" +import json, time +import numpy as np +import pandas as pd + +SIZE = 50_000 +WARMUP = 5 +ITERATIONS = 30 + +data = [None if i % 7 in (0, 1, 2) else np.sin(i * 0.01) * 100 for i in range(SIZE)] +s = pd.Series(data, dtype="float64") + +for _ in range(WARMUP): + s.interpolate(method="zero") + s.interpolate(method="nearest") + s.interpolate(method="linear", limit=2) + s.interpolate(method="pad", limit=5) + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.interpolate(method="zero") + s.interpolate(method="nearest") + s.interpolate(method="linear", limit=2) + s.interpolate(method="pad", limit=5) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "interpolate_zero_nearest", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_is_named_agg_spec.py b/benchmarks/pandas/bench_is_named_agg_spec.py new file mode 100644 index 00000000..348c7e03 --- /dev/null +++ b/benchmarks/pandas/bench_is_named_agg_spec.py @@ -0,0 +1,46 @@ +""" +Benchmark: is_named_agg_spec equivalent — check whether all values in a dict +are of a given type (mirrors tsb's isNamedAggSpec guard). +In pandas the equivalent is isinstance-checking NamedAgg namedtuples. +Outputs JSON: {"function": "is_named_agg_spec", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +from collections import namedtuple + +WARMUP = 5 +ITERATIONS = 100 + +NamedAgg = namedtuple("NamedAgg", ["column", "aggfunc"]) + + +def is_named_agg_spec(spec: dict) -> bool: + """Return True if every value is a NamedAgg instance.""" + return all(isinstance(v, NamedAgg) for v in spec.values()) + + +# A valid spec — all NamedAgg instances (200 entries). 
+valid_spec = {f"col_{i}": NamedAgg(f"src_{i % 10}", "sum") for i in range(200)} + +# An invalid spec — plain string values. +invalid_spec = {f"col_{i}": "sum" for i in range(200)} + +for _ in range(WARMUP): + is_named_agg_spec(valid_spec) + is_named_agg_spec(invalid_spec) + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + for _ in range(500): + is_named_agg_spec(valid_spec) + is_named_agg_spec(invalid_spec) + times.append((time.perf_counter() - t0) * 1000) + +total_ms = sum(times) +print(json.dumps({ + "function": "is_named_agg_spec", + "mean_ms": round(total_ms / ITERATIONS, 3), + "iterations": ITERATIONS, + "total_ms": round(total_ms, 3), +})) diff --git a/benchmarks/pandas/bench_isin_series_fn.py b/benchmarks/pandas/bench_isin_series_fn.py new file mode 100644 index 00000000..24e299f4 --- /dev/null +++ b/benchmarks/pandas/bench_isin_series_fn.py @@ -0,0 +1,28 @@ +"""Benchmark: isin standalone — pd.Series.isin with large and small value sets on 100k-element Series.""" +import json, time +import pandas as pd + +SIZE = 100_000 +WARMUP = 5 +ITERATIONS = 50 + +s = pd.Series([i % 5000 for i in range(SIZE)]) +test_set = list(range(2500)) +test_set2 = [100, 200, 300, 400, 500] + +for _ in range(WARMUP): + s.isin(test_set) + s.isin(test_set2) + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.isin(test_set) + s.isin(test_set2) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "isin_series_fn", + "mean_ms": round(total / ITERATIONS, 3), + "iterations": ITERATIONS, + "total_ms": round(total, 3), +})) diff --git a/benchmarks/pandas/bench_json_normalize_meta.py b/benchmarks/pandas/bench_json_normalize_meta.py new file mode 100644 index 00000000..383c44514 --- /dev/null +++ b/benchmarks/pandas/bench_json_normalize_meta.py @@ -0,0 +1,45 @@ +"""Benchmark: pd.json_normalize with record_path, meta fields, and nested data.""" +import json, time +import pandas as pd + +SIZE = 2_000 +WARMUP = 3 +ITERATIONS = 20 + 
+records = [ + { + "id": i, + "dept": f"dept_{i % 10}", + "location": {"city": f"city_{i % 20}", "country": "US"}, + "employees": [ + {"name": f"emp_{i}_{j}", "salary": (i * 3 + j) * 1000, "active": j % 2 == 0} + for j in range(3) + ], + } + for i in range(SIZE) +] + +for _ in range(WARMUP): + pd.json_normalize( + records, + record_path="employees", + meta=["id", "dept"], + meta_prefix="company_", + ) + +start = time.perf_counter() +for _ in range(ITERATIONS): + pd.json_normalize( + records, + record_path="employees", + meta=["id", "dept"], + meta_prefix="company_", + ) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "json_normalize_meta", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_named_agg_class.py b/benchmarks/pandas/bench_named_agg_class.py new file mode 100644 index 00000000..92c17770 --- /dev/null +++ b/benchmarks/pandas/bench_named_agg_class.py @@ -0,0 +1,45 @@ +"""Benchmark: pd.NamedAgg class construction and isinstance validation — 100 specs × 1000 iters. +Mirrors tsb bench_named_agg_class.ts for pandas. 
+""" +import json, time +import pandas as pd + +WARMUP = 5 +ITERATIONS = 1_000 +N = 100 + +sample_spec = { + "total": pd.NamedAgg(column="salary", aggfunc="sum"), + "avg": pd.NamedAgg(column="salary", aggfunc="mean"), + "max": pd.NamedAgg(column="salary", aggfunc="max"), + "cnt": pd.NamedAgg(column="headcount", aggfunc="count"), +} + +def is_named_agg_spec(spec): + return isinstance(spec, dict) and all(isinstance(v, pd.NamedAgg) for v in spec.values()) + +for _ in range(WARMUP): + for _ in range(N): + pd.NamedAgg(column="salary", aggfunc="sum") + pd.NamedAgg(column="score", aggfunc="mean") + is_named_agg_spec(sample_spec) + is_named_agg_spec({"x": "not-namedagg"}) + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + for _ in range(N): + pd.NamedAgg(column="salary", aggfunc="sum") + pd.NamedAgg(column="score", aggfunc="mean") + is_named_agg_spec(sample_spec) + is_named_agg_spec({"x": "not-namedagg"}) + times.append((time.perf_counter() - t0) * 1000) + +total = sum(times) +mean = total / ITERATIONS +print(json.dumps({ + "function": "named_agg_class", + "mean_ms": round(mean, 3), + "iterations": ITERATIONS, + "total_ms": round(total, 3), +})) diff --git a/benchmarks/pandas/bench_nan_sum_mean_std.py b/benchmarks/pandas/bench_nan_sum_mean_std.py new file mode 100644 index 00000000..d1cb5f0d --- /dev/null +++ b/benchmarks/pandas/bench_nan_sum_mean_std.py @@ -0,0 +1,29 @@ +""" +Benchmark: np.nansum / np.nanmean / np.nanstd — nan-ignoring aggregates on 100k-element arrays. 
+Outputs JSON: {"function": "nan_sum_mean_std", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import math +import time +import numpy as np + +SIZE = 100_000 +WARMUP = 5 +ITERATIONS = 50 + +# Array with ~10% NaN values +data = np.array([float("nan") if i % 10 == 0 else math.sin(i * 0.01) * 100 + 50 for i in range(SIZE)]) + +for _ in range(WARMUP): + np.nansum(data) + np.nanmean(data) + np.nanstd(data) + +start = time.perf_counter() +for _ in range(ITERATIONS): + np.nansum(data) + np.nanmean(data) + np.nanstd(data) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({"function": "nan_sum_mean_std", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_nan_var_min_max.py b/benchmarks/pandas/bench_nan_var_min_max.py new file mode 100644 index 00000000..1834e1e1 --- /dev/null +++ b/benchmarks/pandas/bench_nan_var_min_max.py @@ -0,0 +1,28 @@ +""" +Benchmark: np.nanvar / np.nanmin / np.nanmax — nan-ignoring aggregates on 100k-element arrays. 
+Outputs JSON: {"function": "nan_var_min_max", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import numpy as np + +SIZE = 100_000 +WARMUP = 5 +ITERATIONS = 50 + +# Array with ~10% NaN values +data = np.array([float("nan") if i % 10 == 0 else (i % 1000) * 0.1 - 50 for i in range(SIZE)]) + +for _ in range(WARMUP): + np.nanvar(data) + np.nanmin(data) + np.nanmax(data) + +start = time.perf_counter() +for _ in range(ITERATIONS): + np.nanvar(data) + np.nanmin(data) + np.nanmax(data) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({"function": "nan_var_min_max", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_nancumops_extra.py b/benchmarks/pandas/bench_nancumops_extra.py new file mode 100644 index 00000000..4ba72073 --- /dev/null +++ b/benchmarks/pandas/bench_nancumops_extra.py @@ -0,0 +1,29 @@ +""" +Benchmark: np.nanmedian / nancount / np.nanprod — additional nan-ignoring aggregates on 100k array. 
+Outputs JSON: {"function": "nancumops_extra", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import math +import time +import numpy as np + +SIZE = 100_000 +WARMUP = 5 +ITERATIONS = 50 + +# Array with ~10% NaN values +data = np.array([float("nan") if i % 10 == 0 else math.sin(i * 0.01) * 100 + 50 for i in range(SIZE)]) + +for _ in range(WARMUP): + np.nanmedian(data) + np.count_nonzero(~np.isnan(data)) + np.nanprod(data[:100]) # limit to 100 to avoid overflow + +start = time.perf_counter() +for _ in range(ITERATIONS): + np.nanmedian(data) + np.count_nonzero(~np.isnan(data)) + np.nanprod(data[:100]) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({"function": "nancumops_extra", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_natsort.py b/benchmarks/pandas/bench_natsort.py new file mode 100644 index 00000000..052f1fca --- /dev/null +++ b/benchmarks/pandas/bench_natsort.py @@ -0,0 +1,43 @@ +"""Benchmark: natsort — natural-order sorting of 10k strings with numeric suffixes. + +Mirrors tsb's natSorted / natCompare / natSortKey / natArgSort using the +Python `natsort` package (falls back to a manual key if natsort not installed). 
+""" +import json, time + +N = 10_000 +WARMUP = 3 +ITERATIONS = 10 + +# Build the same dataset as the TS benchmark +items = [f"item{N - i}" for i in range(N)] + +try: + from natsort import natsorted, natsort_keygen + nat_key = natsort_keygen() + def run(): + natsorted(items) + nat_key("file42") +except ImportError: + # Fallback: manual digit-aware key (equivalent logic) + import re + def _nat_key(s): + return [int(t) if t.isdigit() else t for t in re.split(r"(\d+)", s)] + def run(): + sorted(items, key=_nat_key) + _nat_key("file42") + +for _ in range(WARMUP): + run() + +start = time.perf_counter() +for _ in range(ITERATIONS): + run() +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "natsort", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_natsort_ops.py b/benchmarks/pandas/bench_natsort_ops.py new file mode 100644 index 00000000..03c8f1de --- /dev/null +++ b/benchmarks/pandas/bench_natsort_ops.py @@ -0,0 +1,56 @@ +""" +Benchmark: natsort.natsorted and natsort.index_natsorted on filename-like strings. 
+Outputs JSON: {"function": "natsort_ops", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time + +SIZE = 10_000 +WARMUP = 5 +ITERATIONS = 20 + +filenames = [f"file{i % 100}_chunk{i // 100}.txt" for i in range(SIZE)] + +def nat_compare(a, b): + """Natural comparison: return -1/0/1 by tokenizing digit runs.""" + import re + def tokenize(s): + parts = re.split(r'(\d+)', s) + return [int(p) if p.isdigit() else p for p in parts] + ta, tb = tokenize(a), tokenize(b) + return (ta > tb) - (ta < tb) + +def nat_sorted(arr): + import re + def key(s): + parts = re.split(r'(\d+)', s) + return [int(p) if p.isdigit() else p for p in parts] + return sorted(arr, key=key) + +def nat_argsort(arr): + import re + def key(s): + parts = re.split(r'(\d+)', s) + return [int(p) if p.isdigit() else p for p in parts] + return [i for i, _ in sorted(enumerate(arr), key=lambda x: key(x[1]))] + +for _ in range(WARMUP): + nat_compare("file10.txt", "file9.txt") + nat_sorted(filenames) + nat_argsort(filenames) + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + nat_compare("file10.txt", "file9.txt") + nat_sorted(filenames) + nat_argsort(filenames) + times.append((time.perf_counter() - t0) * 1000) + +total_ms = sum(times) +print(json.dumps({ + "function": "natsort_ops", + "mean_ms": total_ms / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total_ms, +})) diff --git a/benchmarks/pandas/bench_nlargest_dataframe.py b/benchmarks/pandas/bench_nlargest_dataframe.py new file mode 100644 index 00000000..c0430dd4 --- /dev/null +++ b/benchmarks/pandas/bench_nlargest_dataframe.py @@ -0,0 +1,30 @@ +"""Benchmark: DataFrame.nlargest / nsmallest — top-N rows by column.""" +import json, time +import pandas as pd +import numpy as np + +ROWS = 100_000 +N = 100 +WARMUP = 5 +ITERATIONS = 30 + +rng = np.random.default_rng(42) +df = pd.DataFrame({ + "a": rng.random(ROWS) * 1000, + "b": rng.random(ROWS) * 500, + "c": rng.random(ROWS) * 100, +}) + +for _ in 
range(WARMUP): + df.nlargest(N, "a") + df.nsmallest(N, "b") + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + df.nlargest(N, "a") + df.nsmallest(N, "b") + times.append((time.perf_counter() - t0) * 1000) + +total_ms = sum(times) +print(json.dumps({"function": "nlargest_dataframe", "mean_ms": round(total_ms / ITERATIONS, 3), "iterations": ITERATIONS, "total_ms": round(total_ms, 3)})) diff --git a/benchmarks/pandas/bench_numeric_stats_ext.py b/benchmarks/pandas/bench_numeric_stats_ext.py new file mode 100644 index 00000000..49f65dfd --- /dev/null +++ b/benchmarks/pandas/bench_numeric_stats_ext.py @@ -0,0 +1,51 @@ +""" +Benchmark: scipy percentileofscore, min-max normalization, coefficient of variation on 100k elements. +Outputs JSON: {"function": "numeric_stats_ext", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import math +import pandas as pd +import numpy as np + +SIZE = 100_000 +WARMUP = 5 +ITERATIONS = 20 + +data = [math.sin(i * 0.001) * 100 + 50 for i in range(SIZE)] +s = pd.Series(data) + +def percentile_of_score(arr, score): + """Compute percentile rank of score (rank method).""" + n = len(arr) + below = sum(1 for v in arr if v < score) + equal = sum(1 for v in arr if v == score) + return (below + 0.5 * equal) / n * 100 + +def min_max_normalize(series): + mn, mx = series.min(), series.max() + return (series - mn) / (mx - mn) + +def coeff_of_variation(series): + return series.std(ddof=1) / series.mean() + +for _ in range(WARMUP): + percentile_of_score(data, 50) + min_max_normalize(s) + coeff_of_variation(s) + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + percentile_of_score(data, 50) + min_max_normalize(s) + coeff_of_variation(s) + times.append((time.perf_counter() - t0) * 1000) + +total_ms = sum(times) +print(json.dumps({ + "function": "numeric_stats_ext", + "mean_ms": total_ms / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total_ms, +})) diff --git 
a/benchmarks/pandas/bench_pct_change_fill_method.py b/benchmarks/pandas/bench_pct_change_fill_method.py new file mode 100644 index 00000000..d048edb7 --- /dev/null +++ b/benchmarks/pandas/bench_pct_change_fill_method.py @@ -0,0 +1,37 @@ +"""Benchmark: Series.pct_change / DataFrame.pct_change with fill_method options.""" +import json, time +import numpy as np +import pandas as pd + +SIZE = 50_000 +WARMUP = 5 +ITERATIONS = 30 + +data = [None if i % 20 == 0 else np.sin(i * 0.01) * 100 + 100 for i in range(SIZE)] +s = pd.Series(data, dtype="float64") + +df = pd.DataFrame({ + "a": data, + "b": [None if i % 15 == 0 else np.cos(i * 0.02) * 50 + 50 for i in range(SIZE)], +}) + +for _ in range(WARMUP): + s.pct_change(fill_method="pad") + s.pct_change(fill_method="bfill") + s.pct_change(fill_method=None) + df.pct_change(fill_method="pad", periods=2) + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.pct_change(fill_method="pad") + s.pct_change(fill_method="bfill") + s.pct_change(fill_method=None) + df.pct_change(fill_method="pad", periods=2) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "pct_change_fill_method", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_pct_change_periods.py b/benchmarks/pandas/bench_pct_change_periods.py new file mode 100644 index 00000000..ec09db18 --- /dev/null +++ b/benchmarks/pandas/bench_pct_change_periods.py @@ -0,0 +1,39 @@ +"""Benchmark: Series.pct_change() / DataFrame.pct_change() with various periods.""" +import json, time +import pandas as pd +import numpy as np + +ROWS = 100_000 +WARMUP = 5 +ITERATIONS = 30 + +rng = np.random.default_rng(7) +data = rng.random(ROWS) * 100 + 10 + +series = pd.Series(data) +df = pd.DataFrame({ + "a": data, + "b": data * 1.5, + "c": data * 0.8, +}) + +for _ in range(WARMUP): + series.pct_change(periods=1) + series.pct_change(periods=7) + df.pct_change(periods=5) + +times = [] +for _ in 
range(ITERATIONS): + t0 = time.perf_counter() + series.pct_change(periods=1) + series.pct_change(periods=7) + df.pct_change(periods=5) + times.append((time.perf_counter() - t0) * 1000) + +total_ms = sum(times) +print(json.dumps({ + "function": "pct_change_periods", + "mean_ms": round(total_ms / ITERATIONS, 3), + "iterations": ITERATIONS, + "total_ms": round(total_ms, 3), +})) diff --git a/benchmarks/pandas/bench_pivot_fn.py b/benchmarks/pandas/bench_pivot_fn.py new file mode 100644 index 00000000..deb0437c --- /dev/null +++ b/benchmarks/pandas/bench_pivot_fn.py @@ -0,0 +1,35 @@ +"""Benchmark: pivot standalone — pd.pivot() standalone function on a 100×20 grid DataFrame.""" +import json, time +import pandas as pd +import numpy as np + +ROWS = 100 +COLS = 20 +WARMUP = 5 +ITERATIONS = 50 + +row_arr = [] +col_arr = [] +val_arr = [] +for r in range(ROWS): + for c in range(COLS): + row_arr.append(r) + col_arr.append(c) + val_arr.append(r * COLS + c + 0.5) + +df = pd.DataFrame({"row": row_arr, "col": col_arr, "val": val_arr}) + +for _ in range(WARMUP): + pd.pivot(df, index="row", columns="col", values="val") + +start = time.perf_counter() +for _ in range(ITERATIONS): + pd.pivot(df, index="row", columns="col", values="val") +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "pivot_fn", + "mean_ms": round(total / ITERATIONS, 3), + "iterations": ITERATIONS, + "total_ms": round(total, 3), +})) diff --git a/benchmarks/pandas/bench_pivot_table_aggfunc_variants.py b/benchmarks/pandas/bench_pivot_table_aggfunc_variants.py new file mode 100644 index 00000000..e7d7ebfb --- /dev/null +++ b/benchmarks/pandas/bench_pivot_table_aggfunc_variants.py @@ -0,0 +1,36 @@ +""" +Benchmark: pd.pivot_table with multiple aggfuncs (sum, count, min, max) on 50k-row DataFrame. 
+Outputs JSON: {"function": "pivot_table_aggfunc_variants", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import pandas as pd + +ROWS = 50_000 +WARMUP = 3 +ITERATIONS = 20 + +regions = ["North", "South", "East", "West"] +categories = ["A", "B", "C", "D", "E"] + +df = pd.DataFrame({ + "region": [regions[i % len(regions)] for i in range(ROWS)], + "category": [categories[i % len(categories)] for i in range(ROWS)], + "sales": [(i % 1000) * 1.5 + 10 for i in range(ROWS)], +}) + +for _ in range(WARMUP): + pd.pivot_table(df, values="sales", index="region", columns="category", aggfunc="sum") + pd.pivot_table(df, values="sales", index="region", columns="category", aggfunc="count") + pd.pivot_table(df, values="sales", index="region", columns="category", aggfunc="min") + pd.pivot_table(df, values="sales", index="region", columns="category", aggfunc="max") + +start = time.perf_counter() +for _ in range(ITERATIONS): + pd.pivot_table(df, values="sales", index="region", columns="category", aggfunc="sum") + pd.pivot_table(df, values="sales", index="region", columns="category", aggfunc="count") + pd.pivot_table(df, values="sales", index="region", columns="category", aggfunc="min") + pd.pivot_table(df, values="sales", index="region", columns="category", aggfunc="max") +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({"function": "pivot_table_aggfunc_variants", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_pivot_table_fill_value.py b/benchmarks/pandas/bench_pivot_table_fill_value.py new file mode 100644 index 00000000..6aed06de --- /dev/null +++ b/benchmarks/pandas/bench_pivot_table_fill_value.py @@ -0,0 +1,28 @@ +"""Benchmark: pivot_table with fill_value=0 — fills missing cells with 0.""" +import json, time +import numpy as np +import pandas as pd + +ROWS = 50_000 +WARMUP = 3 +ITERATIONS = 10 + +rows = [f"row_{i % 50}" for i in range(ROWS)] +cols = [f"col_{i % 
30}" for i in range(ROWS)] +vals = np.arange(ROWS, dtype=np.float64) * 0.1 +df = pd.DataFrame({"row": rows, "col": cols, "value": vals}) + +for _ in range(WARMUP): + df.pivot_table(values="value", index="row", columns="col", aggfunc="sum", fill_value=0) + +start = time.perf_counter() +for _ in range(ITERATIONS): + df.pivot_table(values="value", index="row", columns="col", aggfunc="sum", fill_value=0) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "pivot_table_fill_value", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_read_json_all_orients.py b/benchmarks/pandas/bench_read_json_all_orients.py new file mode 100644 index 00000000..e5ac9567 --- /dev/null +++ b/benchmarks/pandas/bench_read_json_all_orients.py @@ -0,0 +1,42 @@ +"""Benchmark: pd.read_json with all orient options (records, split, columns, index, values).""" +import json, time +import numpy as np +import pandas as pd + +SIZE = 5_000 +WARMUP = 3 +ITERATIONS = 20 + +ids = list(range(SIZE)) +values = [i * 1.1 for i in range(SIZE)] +labels = [f"cat_{i % 10}" for i in range(SIZE)] +df = pd.DataFrame({"id": ids, "value": values, "label": labels}) + +records_json = df.to_json(orient="records") +split_json = df.to_json(orient="split") +columns_json = df.to_json(orient="columns") +values_json = df.to_json(orient="values") +index_json = df.to_json(orient="index") + +for _ in range(WARMUP): + pd.read_json(records_json, orient="records") + pd.read_json(split_json, orient="split") + pd.read_json(columns_json, orient="columns") + pd.read_json(values_json, orient="values") + pd.read_json(index_json, orient="index") + +start = time.perf_counter() +for _ in range(ITERATIONS): + pd.read_json(records_json, orient="records") + pd.read_json(split_json, orient="split") + pd.read_json(columns_json, orient="columns") + pd.read_json(values_json, orient="values") + pd.read_json(index_json, orient="index") +total = 
(time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "read_json_all_orients", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_reindex_fill.py b/benchmarks/pandas/bench_reindex_fill.py new file mode 100644 index 00000000..98fa5653 --- /dev/null +++ b/benchmarks/pandas/bench_reindex_fill.py @@ -0,0 +1,40 @@ +""" +Benchmark: pandas Series.reindex() with ffill / bfill fill methods. +Mirrors tsb's reindexSeries with method="ffill"/"bfill". +Outputs JSON: {"function": "reindex_fill", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import pandas as pd +import numpy as np + +SIZE = 50_000 +WARMUP = 5 +ITERATIONS = 30 + +# Sparse original index: every other position +orig_index = [i * 2 for i in range(SIZE)] +data = [np.sin(i * 0.01) for i in range(SIZE)] +s = pd.Series(data, index=orig_index) + +# Dense new index: fills in the gaps +new_index = list(range(SIZE * 2)) + +for _ in range(WARMUP): + s.reindex(new_index, method="ffill") + s.reindex(new_index, method="bfill") + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + s.reindex(new_index, method="ffill") + s.reindex(new_index, method="bfill") + times.append((time.perf_counter() - t0) * 1000) + +total_ms = sum(times) +print(json.dumps({ + "function": "reindex_fill", + "mean_ms": total_ms / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total_ms, +})) diff --git a/benchmarks/pandas/bench_reindex_fill_methods.py b/benchmarks/pandas/bench_reindex_fill_methods.py new file mode 100644 index 00000000..c5381d7a --- /dev/null +++ b/benchmarks/pandas/bench_reindex_fill_methods.py @@ -0,0 +1,37 @@ +"""Benchmark: Series.reindex / DataFrame.reindex with fill methods (ffill, bfill, nearest).""" +import json, time +import numpy as np +import pandas as pd + +SIZE = 20_000 +WARMUP = 3 +ITERATIONS = 20 + +orig_labels = list(range(0, SIZE * 2, 2)) # even numbers +data = [i * 1.5 for 
i in range(SIZE)] +s = pd.Series(data, index=orig_labels) + +new_index = list(range(SIZE * 2)) # all numbers 0..SIZE*2-1 + +df = pd.DataFrame({"a": data, "b": [v * 2 for v in data]}, index=orig_labels) + +for _ in range(WARMUP): + s.reindex(new_index, method="ffill") + s.reindex(new_index, method="bfill") + s.reindex(new_index, method="nearest") + df.reindex(new_index, method="ffill") + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.reindex(new_index, method="ffill") + s.reindex(new_index, method="bfill") + s.reindex(new_index, method="nearest") + df.reindex(new_index, method="ffill") +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "reindex_fill_methods", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_sample_weighted.py b/benchmarks/pandas/bench_sample_weighted.py new file mode 100644 index 00000000..2420747f --- /dev/null +++ b/benchmarks/pandas/bench_sample_weighted.py @@ -0,0 +1,36 @@ +""" +Benchmark: pandas Series.sample() with weights — weighted random sampling. +Mirrors tsb's sampleSeries with weights option. 
+Outputs JSON: {"function": "sample_weighted", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import pandas as pd + +SIZE = 100_000 +N_SAMPLE = 1_000 +WARMUP = 5 +ITERATIONS = 30 + +data = [i * 0.5 for i in range(SIZE)] +# Weights: higher values get more weight (triangular distribution) +weights = [(i + 1) / SIZE for i in range(SIZE)] + +s = pd.Series(data) + +for _ in range(WARMUP): + s.sample(n=N_SAMPLE, weights=weights, replace=False) + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + s.sample(n=N_SAMPLE, weights=weights, replace=False) + times.append((time.perf_counter() - t0) * 1000) + +total_ms = sum(times) +print(json.dumps({ + "function": "sample_weighted", + "mean_ms": total_ms / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total_ms, +})) diff --git a/benchmarks/pandas/bench_sample_weights.py b/benchmarks/pandas/bench_sample_weights.py new file mode 100644 index 00000000..fe15b449 --- /dev/null +++ b/benchmarks/pandas/bench_sample_weights.py @@ -0,0 +1,32 @@ +""" +Benchmark: DataFrame.sample / Series.sample with weights option on 100k rows. 
+Outputs JSON: {"function": "sample_weights", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import math +import time +import pandas as pd +import numpy as np + +SIZE = 100_000 +WARMUP = 3 +ITERATIONS = 20 + +data = list(range(SIZE)) +weights = np.array([math.exp((i / SIZE) * 3) for i in range(SIZE)]) +weights_normalized = weights / weights.sum() + +s = pd.Series(data) +df = pd.DataFrame({"a": data, "b": [i * 2.0 for i in range(SIZE)], "c": [i * 3.0 for i in range(SIZE)]}) + +for _ in range(WARMUP): + s.sample(n=1000, weights=weights_normalized) + df.sample(n=1000, weights=weights_normalized) + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.sample(n=1000, weights=weights_normalized) + df.sample(n=1000, weights=weights_normalized) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({"function": "sample_weights", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_select_dtypes_options.py b/benchmarks/pandas/bench_select_dtypes_options.py new file mode 100644 index 00000000..acd2010a --- /dev/null +++ b/benchmarks/pandas/bench_select_dtypes_options.py @@ -0,0 +1,32 @@ +"""Benchmark: DataFrame.select_dtypes() — filter columns by dtype (include/exclude).""" +import json, time +import pandas as pd +import numpy as np + +ROWS = 50_000 +WARMUP = 5 +ITERATIONS = 30 + +rng = np.random.default_rng(42) +df = pd.DataFrame({ + "intCol": np.arange(ROWS, dtype=np.int32), + "floatCol": np.arange(ROWS, dtype=np.float64) * 1.5, + "boolCol": np.arange(ROWS) % 2 == 0, + "strCol": [f"s_{i % 100}" for i in range(ROWS)], +}) + +for _ in range(WARMUP): + df.select_dtypes(include="number") + df.select_dtypes(exclude="number") + df.select_dtypes(include=["int", "float"]) + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + df.select_dtypes(include="number") + df.select_dtypes(exclude="number") + df.select_dtypes(include=["int", "float"]) + 
times.append((time.perf_counter() - t0) * 1000) + +total_ms = sum(times) +print(json.dumps({"function": "select_dtypes_options", "mean_ms": round(total_ms / ITERATIONS, 3), "iterations": ITERATIONS, "total_ms": round(total_ms, 3)})) diff --git a/benchmarks/pandas/bench_series_cumops_nan.py b/benchmarks/pandas/bench_series_cumops_nan.py new file mode 100644 index 00000000..f7b50f20 --- /dev/null +++ b/benchmarks/pandas/bench_series_cumops_nan.py @@ -0,0 +1,32 @@ +""" +Benchmark: cumsum / cumprod / cummax / cummin on 100k Series with NaN values. +Outputs JSON: {"function": "series_cumops_nan", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import math +import time +import numpy as np +import pandas as pd + +SIZE = 100_000 +WARMUP = 5 +ITERATIONS = 20 + +raw = [float("nan") if i % 10 == 0 else math.sin(i * 0.01) * 50 + 100 for i in range(SIZE)] +s = pd.Series(raw) + +for _ in range(WARMUP): + s.cumsum() + s.cumprod() + s.cummax() + s.cummin() + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.cumsum() + s.cumprod() + s.cummax() + s.cummin() +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({"function": "series_cumops_nan", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_series_numeric_pipeline.py b/benchmarks/pandas/bench_series_numeric_pipeline.py new file mode 100644 index 00000000..098598f4 --- /dev/null +++ b/benchmarks/pandas/bench_series_numeric_pipeline.py @@ -0,0 +1,38 @@ +""" +Benchmark: Series numeric pipeline — chain abs → round → clip on a 100k-element Series. +Mirrors bench_series_numeric_pipeline.ts. 
+Outputs JSON: {"function": "series_numeric_pipeline", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import math +import pandas as pd + +SIZE = 100_000 +WARMUP = 5 +ITERATIONS = 30 + +s = pd.Series([math.sin(i * 0.01) * 150 - 20 for i in range(SIZE)]) + +for _ in range(WARMUP): + a = s.abs() + b = a.round(2) + b.clip(lower=0, upper=100) + +start = time.perf_counter() +for _ in range(ITERATIONS): + a = s.abs() + b = a.round(2) + b.clip(lower=0, upper=100) +total = (time.perf_counter() - start) * 1000 + +print( + json.dumps( + { + "function": "series_numeric_pipeline", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, + } + ) +) diff --git a/benchmarks/pandas/bench_series_reflected_arith.py b/benchmarks/pandas/bench_series_reflected_arith.py new file mode 100644 index 00000000..12a80e22 --- /dev/null +++ b/benchmarks/pandas/bench_series_reflected_arith.py @@ -0,0 +1,32 @@ +"""Benchmark: series_reflected_arith — Series.radd / rsub / rmul / rdiv.""" +import json, time +import numpy as np +import pandas as pd + +SIZE = 100_000 +WARMUP = 5 +ITERATIONS = 50 + +a = pd.Series(np.arange(SIZE) * 1.5) +b = pd.Series((np.arange(SIZE) % 1000) + 1.0) + +for _ in range(WARMUP): + a.radd(10) + a.rsub(1000) + a.rmul(3) + b.rdiv(100) + +start = time.perf_counter() +for _ in range(ITERATIONS): + a.radd(10) + a.rsub(1000) + a.rmul(3) + b.rdiv(100) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "series_reflected_arith", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_series_shift_fn.py b/benchmarks/pandas/bench_series_shift_fn.py new file mode 100644 index 00000000..a2d1ed77 --- /dev/null +++ b/benchmarks/pandas/bench_series_shift_fn.py @@ -0,0 +1,35 @@ +""" +Benchmark: pandas Series.shift() — shift a 100k-element Series by 1, 3, +and -2 periods. Mirrors tsb's shiftSeries standalone function. 
+Outputs JSON: {"function": "series_shift_fn", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import pandas as pd + +SIZE = 100_000 +WARMUP = 5 +ITERATIONS = 30 + +s = pd.Series([i * 0.5 for i in range(SIZE)]) + +for _ in range(WARMUP): + s.shift(1) + s.shift(3) + s.shift(-2) + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + s.shift(1) + s.shift(3) + s.shift(-2) + times.append((time.perf_counter() - t0) * 1000) + +total_ms = sum(times) +print(json.dumps({ + "function": "series_shift_fn", + "mean_ms": total_ms / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total_ms, +})) diff --git a/benchmarks/pandas/bench_shift_series_fn.py b/benchmarks/pandas/bench_shift_series_fn.py new file mode 100644 index 00000000..45a1506a --- /dev/null +++ b/benchmarks/pandas/bench_shift_series_fn.py @@ -0,0 +1,29 @@ +"""Benchmark: shiftSeries (standalone) — shift values by 1/−2/5 positions in a 100k-element Series.""" +import json, time +import pandas as pd +import numpy as np + +SIZE = 100_000 +WARMUP = 5 +ITERATIONS = 50 + +s = pd.Series(np.arange(SIZE, dtype=np.float64)) + +for _ in range(WARMUP): + s.shift(1) + s.shift(-2) + s.shift(5) + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.shift(1) + s.shift(-2) + s.shift(5) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "shift_series_fn", + "mean_ms": round(total / ITERATIONS, 3), + "iterations": ITERATIONS, + "total_ms": round(total, 3), +})) diff --git a/benchmarks/pandas/bench_str_contains.py b/benchmarks/pandas/bench_str_contains.py new file mode 100644 index 00000000..1378a539 --- /dev/null +++ b/benchmarks/pandas/bench_str_contains.py @@ -0,0 +1,29 @@ +"""Benchmark: pd.Series.str.contains() — regex and literal substring matching on 100k strings.""" +import json, time +import pandas as pd + +ROWS = 100_000 +WARMUP = 5 +ITERATIONS = 30 + +data = [f"item_{i % 500}_value_{i % 7}_end" for i in range(ROWS)] +s = 
pd.Series(data) + +for _ in range(WARMUP): + s.str.contains("value", regex=False) + s.str.contains(r"_[0-9]+_", regex=True) + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + s.str.contains("value", regex=False) + s.str.contains(r"_[0-9]+_", regex=True) + times.append((time.perf_counter() - t0) * 1000) + +total_ms = sum(times) +print(json.dumps({ + "function": "str_contains", + "mean_ms": round(total_ms / ITERATIONS, 3), + "iterations": ITERATIONS, + "total_ms": round(total_ms, 3), +})) diff --git a/benchmarks/pandas/bench_str_swapcase_capitalize.py b/benchmarks/pandas/bench_str_swapcase_capitalize.py new file mode 100644 index 00000000..482fd93b --- /dev/null +++ b/benchmarks/pandas/bench_str_swapcase_capitalize.py @@ -0,0 +1,27 @@ +"""Benchmark: str_swapcase_capitalize — str.swapcase and str.capitalize on 100k strings.""" +import json, time +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 + +data = [f"Hello World {i % 500} EXAMPLE" for i in range(ROWS)] +s = pd.Series(data) + +for _ in range(WARMUP): + s.str.swapcase() + s.str.capitalize() + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.str.swapcase() + s.str.capitalize() +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "str_swapcase_capitalize", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_to_numeric_dispatch.py b/benchmarks/pandas/bench_to_numeric_dispatch.py new file mode 100644 index 00000000..cff45bea --- /dev/null +++ b/benchmarks/pandas/bench_to_numeric_dispatch.py @@ -0,0 +1,29 @@ +"""Benchmark: toNumeric generic — pd.to_numeric() with array, Series, and scalar inputs.""" +import json, time +import pandas as pd + +SIZE = 50_000 +WARMUP = 5 +ITERATIONS = 30 + +str_nums = [str(i * 1.5) for i in range(SIZE)] +s = pd.Series(str_nums) + +for _ in range(WARMUP): + pd.to_numeric(str_nums, errors="coerce") + pd.to_numeric(s, errors="coerce") + 
pd.to_numeric("42.7") + +start = time.perf_counter() +for _ in range(ITERATIONS): + pd.to_numeric(str_nums, errors="coerce") + pd.to_numeric(s, errors="coerce") + pd.to_numeric("42.7") +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "to_numeric_dispatch", + "mean_ms": round(total / ITERATIONS, 3), + "iterations": ITERATIONS, + "total_ms": round(total, 3), +})) diff --git a/benchmarks/pandas/bench_to_numeric_generic.py b/benchmarks/pandas/bench_to_numeric_generic.py new file mode 100644 index 00000000..5e0c8330 --- /dev/null +++ b/benchmarks/pandas/bench_to_numeric_generic.py @@ -0,0 +1,34 @@ +"""Benchmark: pd.to_numeric generic dispatcher — coerce scalars, lists, and Series. +Mirrors tsb bench_to_numeric_generic.ts for pandas. +""" +import json, time +import pandas as pd + +SIZE = 10_000 +WARMUP = 5 +ITERATIONS = 50 + +str_nums = [str(i * 0.1) for i in range(SIZE)] +series = pd.Series(str_nums) + +for _ in range(WARMUP): + pd.to_numeric("3.14") + pd.to_numeric(str_nums[:100], errors="coerce") + pd.to_numeric(series, errors="coerce") + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + pd.to_numeric("3.14") + pd.to_numeric(str_nums, errors="coerce") + pd.to_numeric(series, errors="coerce") + times.append((time.perf_counter() - t0) * 1000) + +total = sum(times) +mean = total / ITERATIONS +print(json.dumps({ + "function": "to_numeric_generic", + "mean_ms": round(mean, 3), + "iterations": ITERATIONS, + "total_ms": round(total, 3), +})) diff --git a/benchmarks/pandas/bench_unstack_fn.py b/benchmarks/pandas/bench_unstack_fn.py new file mode 100644 index 00000000..fa49c040 --- /dev/null +++ b/benchmarks/pandas/bench_unstack_fn.py @@ -0,0 +1,42 @@ +""" +Benchmark: unstack standalone — pivot innermost MultiIndex level to columns using s.unstack(). +Mirrors bench_unstack_fn.ts. 
+Outputs JSON: {"function": "unstack_fn", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import pandas as pd + +ROWS = 500 +COLS = 10 +WARMUP = 5 +ITERATIONS = 50 + +data = [i * 1.0 for i in range(ROWS * COLS)] +index = pd.MultiIndex.from_tuples( + [(i // COLS, i % COLS) for i in range(ROWS * COLS)] +) +s = pd.Series(data, index=index) + +for _ in range(WARMUP): + s.unstack() + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + s.unstack() + times.append((time.perf_counter() - t0) * 1000) + +total_ms = sum(times) +mean_ms = total_ms / len(times) + +print( + json.dumps( + { + "function": "unstack_fn", + "mean_ms": mean_ms, + "iterations": ITERATIONS, + "total_ms": total_ms, + } + ) +) diff --git a/benchmarks/pandas/bench_wide_to_long_sep_suffix.py b/benchmarks/pandas/bench_wide_to_long_sep_suffix.py new file mode 100644 index 00000000..ec0e1323 --- /dev/null +++ b/benchmarks/pandas/bench_wide_to_long_sep_suffix.py @@ -0,0 +1,43 @@ +"""Benchmark: pd.wide_to_long with sep and suffix options.""" +import json, time +import pandas as pd + +ROWS = 5_000 +WARMUP = 3 +ITERATIONS = 20 + +ids = list(range(ROWS)) +df1 = pd.DataFrame({ + "id": ids, + "A_1": [i * 1.0 for i in ids], + "A_2": [i * 1.1 for i in ids], + "A_3": [i * 1.2 for i in ids], + "B_1": [i * 2.0 for i in ids], + "B_2": [i * 2.1 for i in ids], + "B_3": [i * 2.2 for i in ids], +}) + +students = [f"s{i}" for i in ids] +df2 = pd.DataFrame({ + "student": students, + "score_Q1": [i + 10 for i in ids], + "score_Q2": [i + 20 for i in ids], + "score_Q3": [i + 30 for i in ids], +}) + +for _ in range(WARMUP): + pd.wide_to_long(df1, stubnames=["A", "B"], i="id", j="period", sep="_") + pd.wide_to_long(df2, stubnames="score", i="student", j="quarter", sep="_", suffix=r"Q\d+") + +start = time.perf_counter() +for _ in range(ITERATIONS): + pd.wide_to_long(df1, stubnames=["A", "B"], i="id", j="period", sep="_") + pd.wide_to_long(df2, stubnames="score", i="student", 
j="quarter", sep="_", suffix=r"Q\d+") +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "wide_to_long_sep_suffix", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/tsb/bench_any_all.ts b/benchmarks/tsb/bench_any_all.ts new file mode 100644 index 00000000..1e262925 --- /dev/null +++ b/benchmarks/tsb/bench_any_all.ts @@ -0,0 +1,41 @@ +/** + * Benchmark: any_all — anySeries / allSeries / anyDataFrame / allDataFrame on 100k rows. + * Outputs JSON: {"function": "any_all", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series, DataFrame, anySeries, allSeries, anyDataFrame, allDataFrame } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const s = new Series({ data: Array.from({ length: SIZE }, (_, i) => i % 2 === 0) }); +const df = DataFrame.fromColumns({ + a: Array.from({ length: SIZE }, (_, i) => i % 3 !== 0), + b: Array.from({ length: SIZE }, (_, i) => i > 0), + c: Array.from({ length: SIZE }, () => true), +}); + +for (let i = 0; i < WARMUP; i++) { + anySeries(s); + allSeries(s); + anyDataFrame(df); + allDataFrame(df); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + anySeries(s); + allSeries(s); + anyDataFrame(df); + allDataFrame(df); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "any_all", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_astype_df_fn.ts b/benchmarks/tsb/bench_astype_df_fn.ts new file mode 100644 index 00000000..fa9d73af --- /dev/null +++ b/benchmarks/tsb/bench_astype_df_fn.ts @@ -0,0 +1,37 @@ +/** + * Benchmark: astype (standalone DataFrame) — exported astype(df, dtype) function on 100k-row DataFrame. + * Mirrors pandas DataFrame.astype() called via standalone function. 
+ * Outputs JSON: {"function": "astype_df_fn", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { DataFrame, astype } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const df = new DataFrame({ + a: Array.from({ length: SIZE }, (_, i) => i * 1.0), + b: Array.from({ length: SIZE }, (_, i) => i), + c: Array.from({ length: SIZE }, (_, i) => (i % 2 === 0 ? 1 : 0)), +}); + +for (let i = 0; i < WARMUP; i++) { + astype(df, { a: "float32", b: "int32" }); + astype(df, "float64"); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + astype(df, { a: "float32", b: "int32" }); + astype(df, "float64"); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "astype_df_fn", + mean_ms: Math.round((total / ITERATIONS) * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(total * 1000) / 1000, + }), +); diff --git a/benchmarks/tsb/bench_cat_freq_crosstab.ts b/benchmarks/tsb/bench_cat_freq_crosstab.ts new file mode 100644 index 00000000..572766d1 --- /dev/null +++ b/benchmarks/tsb/bench_cat_freq_crosstab.ts @@ -0,0 +1,41 @@ +/** + * Benchmark: catFreqTable and catCrossTab on 100k elements. 
+ * Outputs JSON: {"function": "cat_freq_crosstab", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { catFromCodes, catFreqTable, catCrossTab } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 20; + +const catsA = ["alpha", "beta", "gamma", "delta", "epsilon"]; +const catsB = ["north", "south", "east", "west"]; +const codesA = Array.from({ length: SIZE }, (_, i) => i % catsA.length); +const codesB = Array.from({ length: SIZE }, (_, i) => i % catsB.length); +const csA = catFromCodes(codesA, catsA); +const csB = catFromCodes(codesB, catsB); + +for (let i = 0; i < WARMUP; i++) { + catFreqTable(csA); + catCrossTab(csA, csB); + catCrossTab(csA, csB, { normalize: true }); +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + catFreqTable(csA); + catCrossTab(csA, csB); + catCrossTab(csA, csB, { normalize: true }); + times.push(performance.now() - t0); +} + +const total = times.reduce((a, b) => a + b, 0); +console.log( + JSON.stringify({ + function: "cat_freq_crosstab", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_cat_intersect_diff.ts b/benchmarks/tsb/bench_cat_intersect_diff.ts new file mode 100644 index 00000000..b2b10024 --- /dev/null +++ b/benchmarks/tsb/bench_cat_intersect_diff.ts @@ -0,0 +1,43 @@ +/** + * Benchmark: catIntersectCategories / catDiffCategories — set operations on + * categorical Series categories (100k-element Series with 20 categories each). 
+ * Outputs JSON: {"function": "cat_intersect_diff", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series, catIntersectCategories, catDiffCategories } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 30; + +// Build two categorical Series with overlapping but not identical category sets +const catsA = Array.from({ length: 20 }, (_, i) => `cat_a_${i}`); +const catsB = Array.from({ length: 20 }, (_, i) => `cat_${i < 10 ? "a" : "b"}_${i}`); + +const dataA = Array.from({ length: SIZE }, (_, i) => catsA[i % catsA.length]); +const dataB = Array.from({ length: SIZE }, (_, i) => catsB[i % catsB.length]); + +const sA = new Series({ data: dataA }).cat.setCategories(catsA); +const sB = new Series({ data: dataB }).cat.setCategories(catsB); + +for (let i = 0; i < WARMUP; i++) { + catIntersectCategories(sA, sB); + catDiffCategories(sA, sB); +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + catIntersectCategories(sA, sB); + catDiffCategories(sA, sB); + times.push(performance.now() - t0); +} + +const total = times.reduce((a, b) => a + b, 0); +console.log( + JSON.stringify({ + function: "cat_intersect_diff", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_cat_ops_from_codes.ts b/benchmarks/tsb/bench_cat_ops_from_codes.ts new file mode 100644 index 00000000..ceffe6ea --- /dev/null +++ b/benchmarks/tsb/bench_cat_ops_from_codes.ts @@ -0,0 +1,38 @@ +/** + * Benchmark: catFromCodes, catSortByFreq, catToOrdinal on 100k elements. 
+ * Outputs JSON: {"function": "cat_ops_from_codes", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { catFromCodes, catSortByFreq, catToOrdinal } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 20; + +const categories = ["alpha", "beta", "gamma", "delta", "epsilon"]; +const codes = Array.from({ length: SIZE }, (_, i) => i % categories.length); +const order = ["epsilon", "delta", "gamma", "beta", "alpha"]; + +for (let i = 0; i < WARMUP; i++) { + const cs = catFromCodes(codes, categories); + catSortByFreq(cs); + catToOrdinal(cs, order); +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + const cs = catFromCodes(codes, categories); + catSortByFreq(cs); + catToOrdinal(cs, order); + times.push(performance.now() - t0); +} + +const total = times.reduce((a, b) => a + b, 0); +console.log( + JSON.stringify({ + function: "cat_ops_from_codes", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_cat_ops_setops.ts b/benchmarks/tsb/bench_cat_ops_setops.ts new file mode 100644 index 00000000..b97be665 --- /dev/null +++ b/benchmarks/tsb/bench_cat_ops_setops.ts @@ -0,0 +1,41 @@ +/** + * Benchmark: catUnionCategories, catIntersectCategories, catDiffCategories on 100k elements. 
+ * Outputs JSON: {"function": "cat_ops_setops", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { catFromCodes, catUnionCategories, catIntersectCategories, catDiffCategories } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 20; + +const catsA = ["alpha", "beta", "gamma", "delta"]; +const catsB = ["gamma", "delta", "epsilon", "zeta"]; +const codesA = Array.from({ length: SIZE }, (_, i) => i % catsA.length); +const codesB = Array.from({ length: SIZE }, (_, i) => i % catsB.length); +const csA = catFromCodes(codesA, catsA); +const csB = catFromCodes(codesB, catsB); + +for (let i = 0; i < WARMUP; i++) { + catUnionCategories(csA, csB); + catIntersectCategories(csA, csB); + catDiffCategories(csA, csB); +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + catUnionCategories(csA, csB); + catIntersectCategories(csA, csB); + catDiffCategories(csA, csB); + times.push(performance.now() - t0); +} + +const total = times.reduce((a, b) => a + b, 0); +console.log( + JSON.stringify({ + function: "cat_ops_setops", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_combine_first_fn.ts b/benchmarks/tsb/bench_combine_first_fn.ts new file mode 100644 index 00000000..6e3c1c93 --- /dev/null +++ b/benchmarks/tsb/bench_combine_first_fn.ts @@ -0,0 +1,37 @@ +/** + * Benchmark: combineFirstSeries standalone — exported combineFirstSeries(s1, s2) function. + * Mirrors pandas Series.combine_first(). + * Outputs JSON: {"function": "combine_first_fn", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series, combineFirstSeries } from "../../src/index.ts"; + +const SIZE = 50_000; +const WARMUP = 5; +const ITERATIONS = 30; + +// s1: 50k elements with ~30% nulls +const data1 = Array.from({ length: SIZE }, (_, i) => (i % 3 === 0 ? 
null : i * 1.5)); +// s2: 50k elements, fills in the nulls +const data2 = Array.from({ length: SIZE }, (_, i) => i * 2.0); + +const s1 = new Series({ data: data1 }); +const s2 = new Series({ data: data2 }); + +for (let i = 0; i < WARMUP; i++) { + combineFirstSeries(s1, s2); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + combineFirstSeries(s1, s2); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "combine_first_fn", + mean_ms: Math.round((total / ITERATIONS) * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(total * 1000) / 1000, + }), +); diff --git a/benchmarks/tsb/bench_combine_first_series.ts b/benchmarks/tsb/bench_combine_first_series.ts new file mode 100644 index 00000000..d2a95b7f --- /dev/null +++ b/benchmarks/tsb/bench_combine_first_series.ts @@ -0,0 +1,38 @@ +/** + * Benchmark: combineFirstSeries (standalone) — fill missing values from another Series. + * Outputs JSON: {"function": "combine_first_series", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { combineFirstSeries, Series } from "../../src/index.ts"; + +const SIZE = 10_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const data1: (number | null)[] = Array.from({ length: SIZE }, (_, i) => + i % 3 === 0 ? 
null : i * 0.5, +); +const data2 = Array.from({ length: SIZE }, (_, i) => i * 0.1); +const s1 = new Series({ data: data1 }); +const s2 = new Series({ data: data2 }); + +for (let i = 0; i < WARMUP; i++) { + combineFirstSeries(s1, s2); +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const start = performance.now(); + combineFirstSeries(s1, s2); + times.push(performance.now() - start); +} + +const totalMs = times.reduce((a, b) => a + b, 0); +const meanMs = totalMs / ITERATIONS; +console.log( + JSON.stringify({ + function: "combine_first_series", + mean_ms: Math.round(meanMs * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(totalMs * 1000) / 1000, + }), +); diff --git a/benchmarks/tsb/bench_combine_first_series_fn.ts b/benchmarks/tsb/bench_combine_first_series_fn.ts new file mode 100644 index 00000000..068451e4 --- /dev/null +++ b/benchmarks/tsb/bench_combine_first_series_fn.ts @@ -0,0 +1,46 @@ +/** + * Benchmark: combineFirstSeries (standalone fn) — fill NaN values from another Series (union of indexes). + * Uses the exported `combineFirstSeries` function rather than the `Series.combineFirst()` method. + * Mirrors bench_combine_first.ts but exercises the standalone export. + * Outputs JSON: {"function": "combine_first_series_fn", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series, combineFirstSeries } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 30; + +const rng = (seed: number) => { + let s = seed; + return () => { + s = (s * 1664525 + 1013904223) & 0xffffffff; + return ((s >>> 0) / 0xffffffff) * 10; + }; +}; +const rand = rng(42); + +const d1: (number | null)[] = Array.from({ length: SIZE }, (_, i) => (i % 4 === 0 ? 
null : rand())); +const d2 = Array.from({ length: SIZE }, () => rand()); +const s1 = new Series(d1); +const s2 = new Series(d2); + +for (let i = 0; i < WARMUP; i++) { + combineFirstSeries(s1, s2); +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + combineFirstSeries(s1, s2); + times.push(performance.now() - t0); +} + +const total = times.reduce((a, b) => a + b, 0); +console.log( + JSON.stringify({ + function: "combine_first_series_fn", + mean_ms: Math.round((total / ITERATIONS) * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(total * 1000) / 1000, + }), +); diff --git a/benchmarks/tsb/bench_crosstab_normalize.ts b/benchmarks/tsb/bench_crosstab_normalize.ts new file mode 100644 index 00000000..023b7af0 --- /dev/null +++ b/benchmarks/tsb/bench_crosstab_normalize.ts @@ -0,0 +1,46 @@ +/** + * Benchmark: crosstab() with normalize options — proportions by row/col/all. + * Outputs JSON: {"function": "crosstab_normalize", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series, crosstab } from "../../src/index.ts"; + +const SIZE = 50_000; +const WARMUP = 5; +const ITERATIONS = 30; + +let seed = 99; +const rand = () => { + seed = (seed * 1664525 + 1013904223) & 0x7fffffff; + return seed; +}; + +const choices_a = ["north", "south", "east", "west"]; +const choices_b = ["red", "green", "blue"]; + +const a = new Series({ data: Array.from({ length: SIZE }, () => choices_a[rand() % 4]) }); +const b = new Series({ data: Array.from({ length: SIZE }, () => choices_b[rand() % 3]) }); + +for (let i = 0; i < WARMUP; i++) { + crosstab(a, b, { normalize: true }); + crosstab(a, b, { normalize: "index" }); + crosstab(a, b, { normalize: "columns" }); +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + crosstab(a, b, { normalize: true }); + crosstab(a, b, { normalize: "index" }); + crosstab(a, b, { normalize: "columns" }); + 
times.push(performance.now() - t0); +} + +const total_ms = times.reduce((a, b) => a + b, 0); +console.log( + JSON.stringify({ + function: "crosstab_normalize", + mean_ms: Math.round((total_ms / ITERATIONS) * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(total_ms * 1000) / 1000, + }), +); diff --git a/benchmarks/tsb/bench_cummax_cummin_str.ts b/benchmarks/tsb/bench_cummax_cummin_str.ts new file mode 100644 index 00000000..084ed153 --- /dev/null +++ b/benchmarks/tsb/bench_cummax_cummin_str.ts @@ -0,0 +1,27 @@ +/** + * Benchmark: cummax / cummin on string Series of 10k elements. + * Outputs JSON: {"function": "cummax_cummin_str", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series, cummax, cummin } from "../../src/index.ts"; + +const SIZE = 10_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const words = ["apple", "banana", "cherry", "date", "elderberry", "fig", "grape", "honeydew"]; +const data: string[] = Array.from({ length: SIZE }, (_, i) => words[i % words.length]); +const s = new Series({ data }); + +for (let i = 0; i < WARMUP; i++) { + cummax(s); + cummin(s); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + cummax(s); + cummin(s); +} +const total = performance.now() - start; + +console.log(JSON.stringify({ function: "cummax_cummin_str", mean_ms: total / ITERATIONS, iterations: ITERATIONS, total_ms: total })); diff --git a/benchmarks/tsb/bench_cumops_skipna.ts b/benchmarks/tsb/bench_cumops_skipna.ts new file mode 100644 index 00000000..066fee31 --- /dev/null +++ b/benchmarks/tsb/bench_cumops_skipna.ts @@ -0,0 +1,29 @@ +/** + * Benchmark: cumsum / cumprod with skipna=false on 100k-element Series. 
+ * Outputs JSON: {"function": "cumops_skipna", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series, cumsum, cumprod } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 20; + +// Series with ~5% NaN values +const data: (number | null)[] = Array.from({ length: SIZE }, (_, i) => + i % 20 === 0 ? null : (i % 100) * 0.001 + 1, +); +const s = new Series({ data }); + +for (let i = 0; i < WARMUP; i++) { + cumsum(s, { skipna: false }); + cumprod(s, { skipna: false }); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + cumsum(s, { skipna: false }); + cumprod(s, { skipna: false }); +} +const total = performance.now() - start; + +console.log(JSON.stringify({ function: "cumops_skipna", mean_ms: total / ITERATIONS, iterations: ITERATIONS, total_ms: total })); diff --git a/benchmarks/tsb/bench_dataframe_abs_fn.ts b/benchmarks/tsb/bench_dataframe_abs_fn.ts new file mode 100644 index 00000000..920c86d1 --- /dev/null +++ b/benchmarks/tsb/bench_dataframe_abs_fn.ts @@ -0,0 +1,36 @@ +/** + * Benchmark: dataFrameAbs standalone — absolute value on a 100k-row × 4-column DataFrame. + * Uses the exported dataFrameAbs function (not the .abs() method). 
+ * Outputs JSON: {"function": "dataframe_abs_fn", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { DataFrame, dataFrameAbs } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 30; + +const df = DataFrame.fromColumns({ + a: Array.from({ length: SIZE }, (_, i) => (i % 200) - 100), + b: Array.from({ length: SIZE }, (_, i) => Math.sin(i * 0.01) * 100), + c: Array.from({ length: SIZE }, (_, i) => -i * 0.5), + d: Array.from({ length: SIZE }, (_, i) => (i % 50) - 25), +}); + +for (let i = 0; i < WARMUP; i++) { + dataFrameAbs(df); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + dataFrameAbs(df); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "dataframe_abs_fn", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_dataframe_cov_options.ts b/benchmarks/tsb/bench_dataframe_cov_options.ts new file mode 100644 index 00000000..1d6e5340 --- /dev/null +++ b/benchmarks/tsb/bench_dataframe_cov_options.ts @@ -0,0 +1,38 @@ +/** + * Benchmark: dataFrameCov / dataFrameCorr with options (ddof, minPeriods). 
+ * Outputs JSON: {"function": "dataframe_cov_options", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { DataFrame, dataFrameCov, dataFrameCorr } from "../../src/index.ts"; + +const SIZE = 20_000; +const WARMUP = 3; +const ITERATIONS = 20; + +const df = DataFrame.fromColumns({ + a: Array.from({ length: SIZE }, (_, i) => i * 0.5 + Math.sin(i * 0.01)), + b: Array.from({ length: SIZE }, (_, i) => i * 0.3 - Math.cos(i * 0.02)), + c: Array.from({ length: SIZE }, (_, i) => (i % 100) * 1.5), +}); + +for (let i = 0; i < WARMUP; i++) { + dataFrameCov(df, { ddof: 0 }); + dataFrameCov(df, { ddof: 1, minPeriods: 100 }); + dataFrameCorr(df, { minPeriods: 50 }); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + dataFrameCov(df, { ddof: 0 }); + dataFrameCov(df, { ddof: 1, minPeriods: 100 }); + dataFrameCorr(df, { minPeriods: 50 }); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "dataframe_cov_options", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_dataframe_cumops_axis1.ts b/benchmarks/tsb/bench_dataframe_cumops_axis1.ts new file mode 100644 index 00000000..10b6418b --- /dev/null +++ b/benchmarks/tsb/bench_dataframe_cumops_axis1.ts @@ -0,0 +1,30 @@ +/** + * Benchmark: dataFrameCumsum / dataFrameCumprod with axis=1 (row-wise) on 10k x 8 DataFrame. 
+ * Outputs JSON: {"function": "dataframe_cumops_axis1", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { DataFrame, dataFrameCumsum, dataFrameCumprod } from "../../src/index.ts"; + +const ROWS = 10_000; +const COLS = 8; +const WARMUP = 3; +const ITERATIONS = 20; + +const data: Record = {}; +for (let c = 0; c < COLS; c++) { + data[`col${c}`] = Array.from({ length: ROWS }, (_, i) => ((i + c) % 10) * 0.1 + 1); +} +const df = new DataFrame(data); + +for (let i = 0; i < WARMUP; i++) { + dataFrameCumsum(df, { axis: 1 }); + dataFrameCumprod(df, { axis: 1 }); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + dataFrameCumsum(df, { axis: 1 }); + dataFrameCumprod(df, { axis: 1 }); +} +const total = performance.now() - start; + +console.log(JSON.stringify({ function: "dataframe_cumops_axis1", mean_ms: total / ITERATIONS, iterations: ITERATIONS, total_ms: total })); diff --git a/benchmarks/tsb/bench_dataframe_numeric_pipeline.ts b/benchmarks/tsb/bench_dataframe_numeric_pipeline.ts new file mode 100644 index 00000000..6c76834e --- /dev/null +++ b/benchmarks/tsb/bench_dataframe_numeric_pipeline.ts @@ -0,0 +1,39 @@ +/** + * Benchmark: DataFrame numeric pipeline — chain abs → round → sign on a 100k-row × 3-column DataFrame. + * Tests a realistic sequence of standalone DataFrame numeric operations. 
+ * Outputs JSON: {"function": "dataframe_numeric_pipeline", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { DataFrame, dataFrameAbs, dataFrameRound, dataFrameSign } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 20; + +const df = DataFrame.fromColumns({ + a: Array.from({ length: SIZE }, (_, i) => Math.sin(i * 0.01) * 150 - 20), + b: Array.from({ length: SIZE }, (_, i) => Math.cos(i * 0.02) * 80), + c: Array.from({ length: SIZE }, (_, i) => (i % 1000) * 0.123 - 50), +}); + +for (let i = 0; i < WARMUP; i++) { + const a = dataFrameAbs(df); + const b = dataFrameRound(a, { decimals: 1 }); + dataFrameSign(b); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + const a = dataFrameAbs(df); + const b = dataFrameRound(a, { decimals: 1 }); + dataFrameSign(b); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "dataframe_numeric_pipeline", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_dataframe_reflected_arith.ts b/benchmarks/tsb/bench_dataframe_reflected_arith.ts new file mode 100644 index 00000000..64931a36 --- /dev/null +++ b/benchmarks/tsb/bench_dataframe_reflected_arith.ts @@ -0,0 +1,40 @@ +/** + * Benchmark: dataframe_reflected_arith — dataFrameRadd / dataFrameRsub / dataFrameRmul / dataFrameRdiv. 
+ * Outputs JSON: {"function": "dataframe_reflected_arith", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { DataFrame, dataFrameRadd, dataFrameRsub, dataFrameRmul, dataFrameRdiv } from "../../src/index.ts"; + +const SIZE = 50_000; +const WARMUP = 5; +const ITERATIONS = 30; + +const df = DataFrame.fromColumns({ + a: Array.from({ length: SIZE }, (_, i) => i * 1.5), + b: Array.from({ length: SIZE }, (_, i) => (i % 100) + 1), + c: Array.from({ length: SIZE }, (_, i) => i * 0.25), +}); + +for (let i = 0; i < WARMUP; i++) { + dataFrameRadd(df, 10); + dataFrameRsub(df, 1000); + dataFrameRmul(df, 3); + dataFrameRdiv(df, 100); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + dataFrameRadd(df, 10); + dataFrameRsub(df, 1000); + dataFrameRmul(df, 3); + dataFrameRdiv(df, 100); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "dataframe_reflected_arith", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_dataframe_rolling_apply_fn.ts b/benchmarks/tsb/bench_dataframe_rolling_apply_fn.ts new file mode 100644 index 00000000..ecfec96a --- /dev/null +++ b/benchmarks/tsb/bench_dataframe_rolling_apply_fn.ts @@ -0,0 +1,45 @@ +/** + * Benchmark: standalone dataFrameRollingApply — apply a custom function over each column's rolling window. + * Outputs JSON: {"function": "dataframe_rolling_apply_fn", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { DataFrame, dataFrameRollingApply } from "../../src/index.ts"; + +const ROWS = 5_000; +const WINDOW = 10; +const WARMUP = 3; +const ITERATIONS = 10; + +const df = DataFrame.fromColumns({ + a: Array.from({ length: ROWS }, (_, i) => Math.sin(i * 0.01)), + b: Array.from({ length: ROWS }, (_, i) => Math.cos(i * 0.02)), + c: Array.from({ length: ROWS }, (_, i) => (i % 100) * 0.5), +}); + +const rangeFn = (vals: readonly number[]) => { + let mn = vals[0] ?? 
0; + let mx = vals[0] ?? 0; + for (const v of vals) { + if (v < mn) mn = v; + if (v > mx) mx = v; + } + return mx - mn; +}; + +for (let i = 0; i < WARMUP; i++) { + dataFrameRollingApply(df, WINDOW, rangeFn); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + dataFrameRollingApply(df, WINDOW, rangeFn); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "dataframe_rolling_apply_fn", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_dataframe_round_fn.ts b/benchmarks/tsb/bench_dataframe_round_fn.ts new file mode 100644 index 00000000..8e25625c --- /dev/null +++ b/benchmarks/tsb/bench_dataframe_round_fn.ts @@ -0,0 +1,36 @@ +/** + * Benchmark: dataFrameRound standalone — round a 100k-row × 4-column DataFrame to 2 decimals. + * Uses the exported dataFrameRound function (not the .round() method). + * Outputs JSON: {"function": "dataframe_round_fn", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { DataFrame, dataFrameRound } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 30; + +const df = DataFrame.fromColumns({ + a: Array.from({ length: SIZE }, (_, i) => i * 0.123456), + b: Array.from({ length: SIZE }, (_, i) => Math.sin(i * 0.01) * 99.9), + c: Array.from({ length: SIZE }, (_, i) => -i * 0.987654), + d: Array.from({ length: SIZE }, (_, i) => (i % 1000) * 3.14159), +}); + +for (let i = 0; i < WARMUP; i++) { + dataFrameRound(df, { decimals: 2 }); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + dataFrameRound(df, { decimals: 2 }); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "dataframe_round_fn", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_date_offset_hour_second.ts b/benchmarks/tsb/bench_date_offset_hour_second.ts new 
file mode 100644 index 00000000..17c7e09c --- /dev/null +++ b/benchmarks/tsb/bench_date_offset_hour_second.ts @@ -0,0 +1,42 @@ +/** + * Benchmark: DateOffset Hour and Second — apply operations on 5k dates. + * Outputs JSON: {"function": "date_offset_hour_second", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Hour, Second } from "../../src/index.ts"; + +const SIZE = 5_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const hour = new Hour(3); +const second = new Second(90); +const base = new Date(Date.UTC(2020, 0, 15, 10, 0, 0)); +const dates = Array.from({ length: SIZE }, (_, i) => new Date(base.getTime() + i * 60_000)); + +for (let i = 0; i < WARMUP; i++) { + for (const d of dates.slice(0, 100)) { + hour.apply(d); + second.apply(d); + } +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const start = performance.now(); + for (const d of dates) { + hour.apply(d); + second.apply(d); + } + times.push(performance.now() - start); +} + +const totalMs = times.reduce((a, b) => a + b, 0); +const meanMs = totalMs / ITERATIONS; +console.log( + JSON.stringify({ + function: "date_offset_hour_second", + mean_ms: Math.round(meanMs * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(totalMs * 1000) / 1000, + }), +); diff --git a/benchmarks/tsb/bench_digitize_fn.ts b/benchmarks/tsb/bench_digitize_fn.ts new file mode 100644 index 00000000..07a2bffe --- /dev/null +++ b/benchmarks/tsb/bench_digitize_fn.ts @@ -0,0 +1,36 @@ +/** + * Benchmark: digitize (standalone) — bin 50k values into 10 bins. + * Outputs JSON: {"function": "digitize_fn", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { digitize } from "../../src/index.ts"; + +const SIZE = 50_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const values: (number | null)[] = Array.from({ length: SIZE }, (_, i) => + i % 20 === 0 ? 
null : (i % 100) * 0.1, +); +const bins = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]; + +for (let i = 0; i < WARMUP; i++) { + digitize(values, bins); +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const start = performance.now(); + digitize(values, bins); + times.push(performance.now() - start); +} + +const totalMs = times.reduce((a, b) => a + b, 0); +const meanMs = totalMs / ITERATIONS; +console.log( + JSON.stringify({ + function: "digitize_fn", + mean_ms: Math.round(meanMs * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(totalMs * 1000) / 1000, + }), +); diff --git a/benchmarks/tsb/bench_dropna_thresh_subset.ts b/benchmarks/tsb/bench_dropna_thresh_subset.ts new file mode 100644 index 00000000..696c8aa1 --- /dev/null +++ b/benchmarks/tsb/bench_dropna_thresh_subset.ts @@ -0,0 +1,41 @@ +/** + * Benchmark: dropna with thresh and subset options on a DataFrame. + * Outputs JSON: {"function": "dropna_thresh_subset", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { DataFrame, dropnaDataFrame } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 30; + +const df = DataFrame.fromColumns({ + a: Array.from({ length: SIZE }, (_, i) => (i % 5 === 0 ? null : i * 1.0)), + b: Array.from({ length: SIZE }, (_, i) => (i % 7 === 0 ? null : i * 2.0)), + c: Array.from({ length: SIZE }, (_, i) => (i % 11 === 0 ? null : i * 3.0)), + d: Array.from({ length: SIZE }, (_, i) => (i % 3 === 0 ? 
null : `label_${i % 20}`)), +}); + +for (let i = 0; i < WARMUP; i++) { + dropnaDataFrame(df, { how: "any" }); + dropnaDataFrame(df, { how: "all" }); + dropnaDataFrame(df, { thresh: 3 }); + dropnaDataFrame(df, { subset: ["a", "b"] }); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + dropnaDataFrame(df, { how: "any" }); + dropnaDataFrame(df, { how: "all" }); + dropnaDataFrame(df, { thresh: 3 }); + dropnaDataFrame(df, { subset: ["a", "b"] }); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "dropna_thresh_subset", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_dt_strftime.ts b/benchmarks/tsb/bench_dt_strftime.ts new file mode 100644 index 00000000..44230e45 --- /dev/null +++ b/benchmarks/tsb/bench_dt_strftime.ts @@ -0,0 +1,34 @@ +/** + * Benchmark: dt_strftime — dt.strftime formatting on 100k datetime values. + * Outputs JSON: {"function": "dt_strftime", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series } from "../../src/index.ts"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const now = Date.now(); +const data = Array.from({ length: ROWS }, (_, i) => new Date(now + i * 60_000)); +const s = new Series({ data }); + +for (let i = 0; i < WARMUP; i++) { + s.dt.strftime("%Y-%m-%d"); + s.dt.strftime("%H:%M:%S"); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + s.dt.strftime("%Y-%m-%d"); + s.dt.strftime("%H:%M:%S"); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "dt_strftime", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_explode_dataframe.ts b/benchmarks/tsb/bench_explode_dataframe.ts new file mode 100644 index 00000000..5084ffeb --- /dev/null +++ b/benchmarks/tsb/bench_explode_dataframe.ts @@ -0,0 +1,36 @@ +/** + * Benchmark: 
explodeDataFrame — explode list-column into rows. + * Outputs JSON: {"function": "explode_dataframe", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { DataFrame, Series, explodeDataFrame } from "../../src/index.ts"; + +const ROWS = 10_000; +const WARMUP = 5; +const ITERATIONS = 30; + +// Each row has a list of 3-5 elements in column "vals" +const vals = Array.from({ length: ROWS }, (_, i) => [i, i + 1, i + 2]); +const labels = Array.from({ length: ROWS }, (_, i) => `cat_${i % 100}`); +const df = DataFrame.fromColumns({ vals, labels }); + +for (let i = 0; i < WARMUP; i++) { + explodeDataFrame(df, "vals"); +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const start = performance.now(); + explodeDataFrame(df, "vals"); + times.push(performance.now() - start); +} + +const totalMs = times.reduce((a, b) => a + b, 0); +const meanMs = totalMs / ITERATIONS; +console.log( + JSON.stringify({ + function: "explode_dataframe", + mean_ms: Math.round(meanMs * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(totalMs * 1000) / 1000, + }), +); diff --git a/benchmarks/tsb/bench_fillna_col_map.ts b/benchmarks/tsb/bench_fillna_col_map.ts new file mode 100644 index 00000000..003e0a36 --- /dev/null +++ b/benchmarks/tsb/bench_fillna_col_map.ts @@ -0,0 +1,50 @@ +/** + * Benchmark: fillnaDataFrame with ColumnFillMap — per-column fill values. + * Outputs JSON: {"function": "fillna_col_map", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { DataFrame, Series, fillnaDataFrame } from "../../src/index.ts"; + +const ROWS = 50_000; +const WARMUP = 5; +const ITERATIONS = 30; + +function seededRand(seed: number) { + let s = seed; + return () => { + s = (s * 1664525 + 1013904223) & 0x7fffffff; + return s / 0x7fffffff; + }; +} + +const rand = seededRand(42); + +// Build a DataFrame with ~20% NaN in each column +const colA = Array.from({ length: ROWS }, () => (rand() < 0.2 ? 
null : rand() * 100)); +const colB = Array.from({ length: ROWS }, () => (rand() < 0.2 ? null : rand() * 50)); +const colC = Array.from({ length: ROWS }, () => (rand() < 0.2 ? null : rand() * 200)); + +const df = new DataFrame({ a: colA, b: colB, c: colC }); + +// Per-column fill values +const fillMap: Record = { a: 0, b: -1, c: 99 }; + +for (let i = 0; i < WARMUP; i++) { + fillnaDataFrame(df, { value: fillMap }); +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + fillnaDataFrame(df, { value: fillMap }); + times.push(performance.now() - t0); +} + +const total_ms = times.reduce((a, b) => a + b, 0); +console.log( + JSON.stringify({ + function: "fillna_col_map", + mean_ms: Math.round((total_ms / ITERATIONS) * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(total_ms * 1000) / 1000, + }), +); diff --git a/benchmarks/tsb/bench_formatter_factories.ts b/benchmarks/tsb/bench_formatter_factories.ts new file mode 100644 index 00000000..7421bc31 --- /dev/null +++ b/benchmarks/tsb/bench_formatter_factories.ts @@ -0,0 +1,48 @@ +/** + * Benchmark: makeFloatFormatter / makePercentFormatter / makeCurrencyFormatter + * — create formatter functions and apply each to a 100k-element Series. 
+ * Outputs JSON: {"function": "formatter_factories", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { + Series, + makeFloatFormatter, + makePercentFormatter, + makeCurrencyFormatter, + applySeriesFormatter, +} from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 30; + +const data = Array.from({ length: SIZE }, (_, i) => i * 0.0001234); +const s = new Series({ data }); + +const floatFmt = makeFloatFormatter(3); +const pctFmt = makePercentFormatter(1); +const currFmt = makeCurrencyFormatter("€", 2); + +for (let i = 0; i < WARMUP; i++) { + applySeriesFormatter(s, floatFmt); + applySeriesFormatter(s, pctFmt); + applySeriesFormatter(s, currFmt); +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + applySeriesFormatter(s, floatFmt); + applySeriesFormatter(s, pctFmt); + applySeriesFormatter(s, currFmt); + times.push(performance.now() - t0); +} + +const total = times.reduce((a, b) => a + b, 0); +console.log( + JSON.stringify({ + function: "formatter_factories", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_get_dummies_drop_first.ts b/benchmarks/tsb/bench_get_dummies_drop_first.ts new file mode 100644 index 00000000..ef1f4c9d --- /dev/null +++ b/benchmarks/tsb/bench_get_dummies_drop_first.ts @@ -0,0 +1,43 @@ +/** + * Benchmark: getDummies / dataFrameGetDummies with drop_first and prefix options. 
+ * Outputs JSON: {"function": "get_dummies_drop_first", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series, DataFrame, getDummies, dataFrameGetDummies } from "../../src/index.ts"; + +const ROWS = 50_000; +const WARMUP = 5; +const ITERATIONS = 30; + +// Categorical series with 10 distinct values +const catData = Array.from({ length: ROWS }, (_, i) => `cat_${i % 10}`); +const s = new Series({ data: catData }); +const df = DataFrame.fromColumns({ + category: catData, + value: Float64Array.from({ length: ROWS }, (_, i) => i * 0.1), +}); + +for (let i = 0; i < WARMUP; i++) { + getDummies(s, { dropFirst: true }); + getDummies(s, { prefix: "grp", prefixSep: "_" }); + dataFrameGetDummies(df, { columns: ["category"], dropFirst: true }); +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const start = performance.now(); + getDummies(s, { dropFirst: true }); + getDummies(s, { prefix: "grp", prefixSep: "_" }); + dataFrameGetDummies(df, { columns: ["category"], dropFirst: true }); + times.push(performance.now() - start); +} + +const totalMs = times.reduce((a, b) => a + b, 0); +const meanMs = totalMs / ITERATIONS; +console.log( + JSON.stringify({ + function: "get_dummies_drop_first", + mean_ms: Math.round(meanMs * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(totalMs * 1000) / 1000, + }), +); diff --git a/benchmarks/tsb/bench_groupby_agg_no_index.ts b/benchmarks/tsb/bench_groupby_agg_no_index.ts new file mode 100644 index 00000000..f287ec16 --- /dev/null +++ b/benchmarks/tsb/bench_groupby_agg_no_index.ts @@ -0,0 +1,43 @@ +/** + * Benchmark: DataFrameGroupBy.agg() with asIndex=false — group key as column. 
+ * Outputs JSON: {"function": "groupby_agg_no_index", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { DataFrame } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 3; +const ITERATIONS = 20; + +let s = 42; +const rand = () => { + s = (s * 1664525 + 1013904223) & 0x7fffffff; + return s / 0x7fffffff; +}; + +const groups = ["alpha", "beta", "gamma", "delta", "epsilon"]; +const df = new DataFrame({ + group: Array.from({ length: SIZE }, () => groups[Math.floor(rand() * 5)]), + x: Array.from({ length: SIZE }, () => rand() * 100), + y: Array.from({ length: SIZE }, () => rand() * 50), +}); + +for (let i = 0; i < WARMUP; i++) { + df.groupby("group").agg({ x: "mean", y: "sum" }, false); +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + df.groupby("group").agg({ x: "mean", y: "sum" }, false); + times.push(performance.now() - t0); +} + +const total_ms = times.reduce((a, b) => a + b, 0); +console.log( + JSON.stringify({ + function: "groupby_agg_no_index", + mean_ms: Math.round((total_ms / ITERATIONS) * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(total_ms * 1000) / 1000, + }), +); diff --git a/benchmarks/tsb/bench_histogram_bin_edges.ts b/benchmarks/tsb/bench_histogram_bin_edges.ts new file mode 100644 index 00000000..cfd2d4d8 --- /dev/null +++ b/benchmarks/tsb/bench_histogram_bin_edges.ts @@ -0,0 +1,35 @@ +/** + * Benchmark: histogram with custom binEdges option on 100k-element array. 
+ * Outputs JSON: {"function": "histogram_bin_edges", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { histogram } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const data = Array.from({ length: SIZE }, (_, i) => (i % 1000) * 0.1); + +// Custom bin edges: 20 edges covering [0, 100) in equal-width steps of 5 +const binEdges: number[] = Array.from({ length: 21 }, (_, i) => i * 5); + +for (let i = 0; i < WARMUP; i++) { + histogram(data, { binEdges }); + histogram(data, { bins: 20 }); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + histogram(data, { binEdges }); + histogram(data, { bins: 20 }); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "histogram_bin_edges", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_insert_pop.ts b/benchmarks/tsb/bench_insert_pop.ts new file mode 100644 index 00000000..663753a8 --- /dev/null +++ b/benchmarks/tsb/bench_insert_pop.ts @@ -0,0 +1,39 @@ +/** + * Benchmark: insertColumn, popColumn, reorderColumns, moveColumn on a 10k-row DataFrame + * + * Mirrors pandas DataFrame.insert() and DataFrame.pop() operations. 
+ */ +import { DataFrame, insertColumn, popColumn, reorderColumns, moveColumn } from "../../src/index.js"; + +const ROWS = 10_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const data = Array.from({ length: ROWS }, (_, i) => i); +const df = DataFrame.fromColumns({ a: data, b: data, c: data, d: data }); +const extraCol = Array.from({ length: ROWS }, (_, i) => i * 2); + +for (let i = 0; i < WARMUP; i++) { + const df2 = insertColumn(df, 2, "x", extraCol); + popColumn(df2, "x"); + reorderColumns(df, ["d", "c", "b", "a"]); + moveColumn(df, "c", 0); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + const df2 = insertColumn(df, 2, "x", extraCol); + popColumn(df2, "x"); + reorderColumns(df, ["d", "c", "b", "a"]); + moveColumn(df, "c", 0); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "insert_pop", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_interpolate_methods.ts b/benchmarks/tsb/bench_interpolate_methods.ts new file mode 100644 index 00000000..18ce7691 --- /dev/null +++ b/benchmarks/tsb/bench_interpolate_methods.ts @@ -0,0 +1,45 @@ +/** + * Benchmark: interpolateSeries with linear, ffill, bfill, nearest, zero methods. + * Outputs JSON: {"function": "interpolate_methods", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series, interpolateSeries } from "../../src/index.ts"; + +const SIZE = 50_000; +const WARMUP = 5; +const ITERATIONS = 30; + +// Build a series with ~20% NaN scattered +const data: (number | null)[] = Array.from({ length: SIZE }, (_, i) => + i % 5 === 0 ? 
null : i * 0.1, +); +const s = new Series({ data }); + +for (let i = 0; i < WARMUP; i++) { + interpolateSeries(s, { method: "linear" }); + interpolateSeries(s, { method: "ffill" }); + interpolateSeries(s, { method: "bfill" }); + interpolateSeries(s, { method: "nearest" }); + interpolateSeries(s, { method: "zero" }); +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const start = performance.now(); + interpolateSeries(s, { method: "linear" }); + interpolateSeries(s, { method: "ffill" }); + interpolateSeries(s, { method: "bfill" }); + interpolateSeries(s, { method: "nearest" }); + interpolateSeries(s, { method: "zero" }); + times.push(performance.now() - start); +} + +const totalMs = times.reduce((a, b) => a + b, 0); +const meanMs = totalMs / ITERATIONS; +console.log( + JSON.stringify({ + function: "interpolate_methods", + mean_ms: Math.round(meanMs * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(totalMs * 1000) / 1000, + }), +); diff --git a/benchmarks/tsb/bench_interpolate_zero_nearest.ts b/benchmarks/tsb/bench_interpolate_zero_nearest.ts new file mode 100644 index 00000000..7de85df1 --- /dev/null +++ b/benchmarks/tsb/bench_interpolate_zero_nearest.ts @@ -0,0 +1,42 @@ +/** + * Benchmark: interpolateSeries with zero and nearest methods. 
+ * Outputs JSON: {"function": "interpolate_zero_nearest", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series, interpolateSeries } from "../../src/index.ts"; + +const SIZE = 50_000; +const WARMUP = 5; +const ITERATIONS = 30; + +// ~15% null values with consecutive gaps +const data: (number | null)[] = Array.from({ length: SIZE }, (_, i) => { + const mod = i % 7; + if (mod === 0 || mod === 1 || mod === 2) return null; + return Math.sin(i * 0.01) * 100; +}); +const s = new Series({ data }); + +for (let i = 0; i < WARMUP; i++) { + interpolateSeries(s, { method: "zero" }); + interpolateSeries(s, { method: "nearest" }); + interpolateSeries(s, { method: "linear", limit: 2 }); + interpolateSeries(s, { method: "ffill", limit: 5 }); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + interpolateSeries(s, { method: "zero" }); + interpolateSeries(s, { method: "nearest" }); + interpolateSeries(s, { method: "linear", limit: 2 }); + interpolateSeries(s, { method: "ffill", limit: 5 }); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "interpolate_zero_nearest", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_is_named_agg_spec.ts b/benchmarks/tsb/bench_is_named_agg_spec.ts new file mode 100644 index 00000000..5a3cf3fc --- /dev/null +++ b/benchmarks/tsb/bench_is_named_agg_spec.ts @@ -0,0 +1,48 @@ +/** + * Benchmark: isNamedAggSpec — type-guard that checks whether a spec object + * consists entirely of NamedAgg instances. Used by DataFrameGroupBy.agg() + * to distinguish NamedAggSpec from plain AggSpec dicts. + * Outputs JSON: {"function": "is_named_agg_spec", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { isNamedAggSpec, namedAgg } from "../../src/index.ts"; + +const WARMUP = 5; +const ITERATIONS = 100; + +// A large spec dict that IS a NamedAggSpec — all values are NamedAgg instances. 
+const validSpec = Object.fromEntries( + Array.from({ length: 200 }, (_, i) => [ + `col_${i}`, + namedAgg(`src_${i % 10}`, "sum"), + ]), +); + +// A dict that is NOT a NamedAggSpec — plain string values. +const invalidSpec: Record = Object.fromEntries( + Array.from({ length: 200 }, (_, i) => [`col_${i}`, "sum"]), +); + +for (let i = 0; i < WARMUP; i++) { + isNamedAggSpec(validSpec); + isNamedAggSpec(invalidSpec); +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + for (let j = 0; j < 500; j++) { + isNamedAggSpec(validSpec); + isNamedAggSpec(invalidSpec); + } + times.push(performance.now() - t0); +} + +const total = times.reduce((a, b) => a + b, 0); +console.log( + JSON.stringify({ + function: "is_named_agg_spec", + mean_ms: Math.round((total / ITERATIONS) * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(total * 1000) / 1000, + }), +); diff --git a/benchmarks/tsb/bench_isin_series_fn.ts b/benchmarks/tsb/bench_isin_series_fn.ts new file mode 100644 index 00000000..813599df --- /dev/null +++ b/benchmarks/tsb/bench_isin_series_fn.ts @@ -0,0 +1,35 @@ +/** + * Benchmark: isin standalone — exported isin(series, values) function on 100k-element Series. + * Mirrors pandas Series.isin() called as a standalone function. 
+ * Outputs JSON: {"function": "isin_series_fn", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series, isin } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const s = new Series({ data: Array.from({ length: SIZE }, (_, i) => i % 5000) }); +const testSet = Array.from({ length: 2500 }, (_, i) => i); +const testSet2 = [100, 200, 300, 400, 500]; + +for (let i = 0; i < WARMUP; i++) { + isin(s, testSet); + isin(s, testSet2); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + isin(s, testSet); + isin(s, testSet2); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "isin_series_fn", + mean_ms: Math.round((total / ITERATIONS) * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(total * 1000) / 1000, + }), +); diff --git a/benchmarks/tsb/bench_json_normalize_meta.ts b/benchmarks/tsb/bench_json_normalize_meta.ts new file mode 100644 index 00000000..33cf97b4 --- /dev/null +++ b/benchmarks/tsb/bench_json_normalize_meta.ts @@ -0,0 +1,49 @@ +/** + * Benchmark: jsonNormalize with recordPath, meta fields, and nested data. 
+ * Outputs JSON: {"function": "json_normalize_meta", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { jsonNormalize } from "../../src/index.ts"; + +const SIZE = 2_000; +const WARMUP = 3; +const ITERATIONS = 20; + +// Nested records with meta fields +const records = Array.from({ length: SIZE }, (_, i) => ({ + id: i, + dept: `dept_${i % 10}`, + location: { city: `city_${i % 20}`, country: "US" }, + employees: Array.from({ length: 3 }, (_, j) => ({ + name: `emp_${i}_${j}`, + salary: (i * 3 + j) * 1000, + active: j % 2 === 0, + })), +})); + +for (let i = 0; i < WARMUP; i++) { + // Normalize with recordPath into employees array, keeping dept and location as meta + jsonNormalize(records, { + recordPath: "employees", + meta: ["id", "dept"], + metaPrefix: "company_", + }); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + jsonNormalize(records, { + recordPath: "employees", + meta: ["id", "dept"], + metaPrefix: "company_", + }); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "json_normalize_meta", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_named_agg_class.ts b/benchmarks/tsb/bench_named_agg_class.ts new file mode 100644 index 00000000..9f2542d9 --- /dev/null +++ b/benchmarks/tsb/bench_named_agg_class.ts @@ -0,0 +1,48 @@ +/** + * Benchmark: NamedAgg class, namedAgg factory, isNamedAggSpec — construct and validate 10k specs. 
+ * Outputs JSON: {"function": "named_agg_class", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { NamedAgg, namedAgg, isNamedAggSpec } from "../../src/index.ts"; + +const WARMUP = 5; +const ITERATIONS = 1_000; +const N = 100; + +const sampleSpec = { + total: namedAgg("salary", "sum"), + avg: namedAgg("salary", "mean"), + max: namedAgg("salary", "max"), + cnt: namedAgg("headcount", "count"), +}; + +for (let i = 0; i < WARMUP; i++) { + for (let j = 0; j < N; j++) { + new NamedAgg("salary", "sum"); + namedAgg("score", "mean"); + isNamedAggSpec(sampleSpec); + isNamedAggSpec({ x: "not-namedagg" }); + } +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const start = performance.now(); + for (let j = 0; j < N; j++) { + new NamedAgg("salary", "sum"); + namedAgg("score", "mean"); + isNamedAggSpec(sampleSpec); + isNamedAggSpec({ x: "not-namedagg" }); + } + times.push(performance.now() - start); +} + +const totalMs = times.reduce((a, b) => a + b, 0); +const meanMs = totalMs / ITERATIONS; +console.log( + JSON.stringify({ + function: "named_agg_class", + mean_ms: Math.round(meanMs * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(totalMs * 1000) / 1000, + }), +); diff --git a/benchmarks/tsb/bench_nan_sum_mean_std.ts b/benchmarks/tsb/bench_nan_sum_mean_std.ts new file mode 100644 index 00000000..18d2d445 --- /dev/null +++ b/benchmarks/tsb/bench_nan_sum_mean_std.ts @@ -0,0 +1,37 @@ +/** + * Benchmark: nansum / nanmean / nanstd — nan-ignoring aggregates on 100k-element arrays. + * Outputs JSON: {"function": "nan_sum_mean_std", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { nansum, nanmean, nanstd } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 50; + +// Array with ~10% null values +const data: (number | null)[] = Array.from({ length: SIZE }, (_, i) => + i % 10 === 0 ? 
null : Math.sin(i * 0.01) * 100 + 50, +); + +for (let i = 0; i < WARMUP; i++) { + nansum(data); + nanmean(data); + nanstd(data); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + nansum(data); + nanmean(data); + nanstd(data); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "nan_sum_mean_std", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_nan_var_min_max.ts b/benchmarks/tsb/bench_nan_var_min_max.ts new file mode 100644 index 00000000..00c15f22 --- /dev/null +++ b/benchmarks/tsb/bench_nan_var_min_max.ts @@ -0,0 +1,37 @@ +/** + * Benchmark: nanvar / nanmin / nanmax — nan-ignoring aggregates on 100k-element arrays. + * Outputs JSON: {"function": "nan_var_min_max", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { nanvar, nanmin, nanmax } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 50; + +// Array with ~10% null values +const data: (number | null)[] = Array.from({ length: SIZE }, (_, i) => + i % 10 === 0 ? null : (i % 1000) * 0.1 - 50, +); + +for (let i = 0; i < WARMUP; i++) { + nanvar(data); + nanmin(data); + nanmax(data); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + nanvar(data); + nanmin(data); + nanmax(data); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "nan_var_min_max", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_nancumops_extra.ts b/benchmarks/tsb/bench_nancumops_extra.ts new file mode 100644 index 00000000..36442c2f --- /dev/null +++ b/benchmarks/tsb/bench_nancumops_extra.ts @@ -0,0 +1,30 @@ +/** + * Benchmark: nanmedian / nancount / nanprod — additional nan-ignoring aggregates on 100k array. 
+ * Outputs JSON: {"function": "nancumops_extra", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { nanmedian, nancount, nanprod } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 50; + +// Array with ~10% NaN values +const data: (number | null)[] = Array.from({ length: SIZE }, (_, i) => + i % 10 === 0 ? null : Math.sin(i * 0.01) * 100 + 50, +); + +for (let i = 0; i < WARMUP; i++) { + nanmedian(data); + nancount(data); + nanprod(data); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + nanmedian(data); + nancount(data); + nanprod(data); +} +const total = performance.now() - start; + +console.log(JSON.stringify({ function: "nancumops_extra", mean_ms: total / ITERATIONS, iterations: ITERATIONS, total_ms: total })); diff --git a/benchmarks/tsb/bench_natsort.ts b/benchmarks/tsb/bench_natsort.ts new file mode 100644 index 00000000..0880d04a --- /dev/null +++ b/benchmarks/tsb/bench_natsort.ts @@ -0,0 +1,40 @@ +/** + * Benchmark: natSorted, natCompare, natSortKey, natArgSort on 10k strings + * + * Mirrors Python `natsort` package usage: natural-order sorting of strings + * with embedded numeric tokens (e.g. "file10" sorts after "file9"). 
 + */ +import { natSorted, natCompare, natSortKey, natArgSort } from "../../src/index.ts"; + +const N = 10_000; +const WARMUP = 3; +const ITERATIONS = 10; + +// Build an array of strings with numeric suffixes (out of natural order) +const items = Array.from({ length: N }, (_, i) => `item${N - i}`); + +// Warm-up +for (let i = 0; i < WARMUP; i++) { + natSorted(items); + natCompare("file10", "file9"); + natSortKey("file42"); + natArgSort(items); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + natSorted(items); + natCompare("file10", "file9"); + natSortKey("file42"); + natArgSort(items); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "natsort", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_natsort_ops.ts b/benchmarks/tsb/bench_natsort_ops.ts new file mode 100644 index 00000000..642dbf55 --- /dev/null +++ b/benchmarks/tsb/bench_natsort_ops.ts @@ -0,0 +1,36 @@ +/** + * Benchmark: natCompare, natSorted, natArgSort on arrays of filename-like strings. 
+ * Outputs JSON: {"function": "natsort_ops", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { natCompare, natSorted, natArgSort } from "../../src/index.ts"; + +const SIZE = 10_000; +const WARMUP = 5; +const ITERATIONS = 20; + +const filenames = Array.from({ length: SIZE }, (_, i) => `file${i % 100}_chunk${Math.floor(i / 100)}.txt`); + +for (let i = 0; i < WARMUP; i++) { + natCompare("file10.txt", "file9.txt"); + natSorted(filenames); + natArgSort(filenames); +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + natCompare("file10.txt", "file9.txt"); + natSorted(filenames); + natArgSort(filenames); + times.push(performance.now() - t0); +} + +const total = times.reduce((a, b) => a + b, 0); +console.log( + JSON.stringify({ + function: "natsort_ops", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_nlargest_dataframe.ts b/benchmarks/tsb/bench_nlargest_dataframe.ts new file mode 100644 index 00000000..ba3802b0 --- /dev/null +++ b/benchmarks/tsb/bench_nlargest_dataframe.ts @@ -0,0 +1,39 @@ +/** + * Benchmark: nlargestDataFrame / nsmallestDataFrame — top-N rows by multiple columns. 
+ * Outputs JSON: {"function": "nlargest_dataframe", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { DataFrame, Series, nlargestDataFrame, nsmallestDataFrame } from "../../src/index.ts"; + +const ROWS = 100_000; +const N = 100; +const WARMUP = 5; +const ITERATIONS = 30; + +const a = new Series({ data: Float64Array.from({ length: ROWS }, () => Math.random() * 1000) }); +const b = new Series({ data: Float64Array.from({ length: ROWS }, () => Math.random() * 500) }); +const c = new Series({ data: Float64Array.from({ length: ROWS }, () => Math.random() * 100) }); +const df = DataFrame.fromColumns({ a, b, c }); + +for (let i = 0; i < WARMUP; i++) { + nlargestDataFrame(df, N, { columns: ["a"] }); + nsmallestDataFrame(df, N, { columns: ["b"] }); +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const start = performance.now(); + nlargestDataFrame(df, N, { columns: ["a"] }); + nsmallestDataFrame(df, N, { columns: ["b"] }); + times.push(performance.now() - start); +} + +const totalMs = times.reduce((a, b) => a + b, 0); +const meanMs = totalMs / ITERATIONS; +console.log( + JSON.stringify({ + function: "nlargest_dataframe", + mean_ms: Math.round(meanMs * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(totalMs * 1000) / 1000, + }), +); diff --git a/benchmarks/tsb/bench_numeric_stats_ext.ts b/benchmarks/tsb/bench_numeric_stats_ext.ts new file mode 100644 index 00000000..4beb34d0 --- /dev/null +++ b/benchmarks/tsb/bench_numeric_stats_ext.ts @@ -0,0 +1,37 @@ +/** + * Benchmark: percentileOfScore, minMaxNormalize, coefficientOfVariation on 100k elements. 
+ * Outputs JSON: {"function": "numeric_stats_ext", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series, percentileOfScore, minMaxNormalize, coefficientOfVariation } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 20; + +const data = Array.from({ length: SIZE }, (_, i) => Math.sin(i * 0.001) * 100 + 50); +const s = new Series({ data }); + +for (let i = 0; i < WARMUP; i++) { + percentileOfScore(data, 50, "rank"); + minMaxNormalize(s); + coefficientOfVariation(s); +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + percentileOfScore(data, 50, "rank"); + minMaxNormalize(s); + coefficientOfVariation(s); + times.push(performance.now() - t0); +} + +const total = times.reduce((a, b) => a + b, 0); +console.log( + JSON.stringify({ + function: "numeric_stats_ext", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_pct_change_fill_method.ts b/benchmarks/tsb/bench_pct_change_fill_method.ts new file mode 100644 index 00000000..8e495ebb --- /dev/null +++ b/benchmarks/tsb/bench_pct_change_fill_method.ts @@ -0,0 +1,45 @@ +/** + * Benchmark: pctChangeSeries / pctChangeDataFrame with fillMethod options. + * Outputs JSON: {"function": "pct_change_fill_method", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series, DataFrame, pctChangeSeries, pctChangeDataFrame } from "../../src/index.ts"; + +const SIZE = 50_000; +const WARMUP = 5; +const ITERATIONS = 30; + +// Series with some nulls so fillMethod has effect +const data: (number | null)[] = Array.from({ length: SIZE }, (_, i) => + i % 20 === 0 ? null : Math.sin(i * 0.01) * 100 + 100, +); +const s = new Series({ data }); + +const df = DataFrame.fromColumns({ + a: data, + b: Array.from({ length: SIZE }, (_, i) => (i % 15 === 0 ? 
null : Math.cos(i * 0.02) * 50 + 50)), +}); + +for (let i = 0; i < WARMUP; i++) { + pctChangeSeries(s, { fillMethod: "pad" }); + pctChangeSeries(s, { fillMethod: "bfill" }); + pctChangeSeries(s, { fillMethod: null }); + pctChangeDataFrame(df, { fillMethod: "pad", periods: 2 }); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + pctChangeSeries(s, { fillMethod: "pad" }); + pctChangeSeries(s, { fillMethod: "bfill" }); + pctChangeSeries(s, { fillMethod: null }); + pctChangeDataFrame(df, { fillMethod: "pad", periods: 2 }); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "pct_change_fill_method", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_pct_change_periods.ts b/benchmarks/tsb/bench_pct_change_periods.ts new file mode 100644 index 00000000..897db47e --- /dev/null +++ b/benchmarks/tsb/bench_pct_change_periods.ts @@ -0,0 +1,49 @@ +/** + * Benchmark: pctChangeSeries / pctChangeDataFrame with various period values. 
+ * Outputs JSON: {"function": "pct_change_periods", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series, DataFrame, pctChangeSeries, pctChangeDataFrame } from "../../src/index.ts"; + +const ROWS = 100_000; +const WARMUP = 5; +const ITERATIONS = 30; + +let s = 7; +const rand = () => { + s = (s * 1664525 + 1013904223) & 0x7fffffff; + return s / 0x7fffffff; +}; + +const data = Array.from({ length: ROWS }, () => rand() * 100 + 10); +const series = new Series({ data }); + +const df = new DataFrame({ + a: data, + b: data.map((v) => v * 1.5), + c: data.map((v) => v * 0.8), +}); + +for (let i = 0; i < WARMUP; i++) { + pctChangeSeries(series, { periods: 1 }); + pctChangeSeries(series, { periods: 7 }); + pctChangeDataFrame(df, { periods: 5 }); +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + pctChangeSeries(series, { periods: 1 }); + pctChangeSeries(series, { periods: 7 }); + pctChangeDataFrame(df, { periods: 5 }); + times.push(performance.now() - t0); +} + +const total_ms = times.reduce((a, b) => a + b, 0); +console.log( + JSON.stringify({ + function: "pct_change_periods", + mean_ms: Math.round((total_ms / ITERATIONS) * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(total_ms * 1000) / 1000, + }), +); diff --git a/benchmarks/tsb/bench_pivot_fn.ts b/benchmarks/tsb/bench_pivot_fn.ts new file mode 100644 index 00000000..4214855d --- /dev/null +++ b/benchmarks/tsb/bench_pivot_fn.ts @@ -0,0 +1,43 @@ +/** + * Benchmark: pivot standalone — exported pivot(df, options) function on a DataFrame. + * Mirrors pandas pd.pivot() standalone function. 
+ * Outputs JSON: {"function": "pivot_fn", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { DataFrame, pivot } from "../../src/index.ts"; + +const ROWS = 100; +const COLS = 20; +const WARMUP = 5; +const ITERATIONS = 50; + +// Build a ROWS×COLS grid of (row, col, val) triples +const rowArr: number[] = []; +const colArr: number[] = []; +const valArr: number[] = []; +for (let r = 0; r < ROWS; r++) { + for (let c = 0; c < COLS; c++) { + rowArr.push(r); + colArr.push(c); + valArr.push(r * COLS + c + 0.5); + } +} +const df = new DataFrame({ row: rowArr, col: colArr, val: valArr }); + +for (let i = 0; i < WARMUP; i++) { + pivot(df, { index: "row", columns: "col", values: "val" }); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + pivot(df, { index: "row", columns: "col", values: "val" }); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "pivot_fn", + mean_ms: Math.round((total / ITERATIONS) * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(total * 1000) / 1000, + }), +); diff --git a/benchmarks/tsb/bench_pivot_table_aggfunc_variants.ts b/benchmarks/tsb/bench_pivot_table_aggfunc_variants.ts new file mode 100644 index 00000000..5977e72c --- /dev/null +++ b/benchmarks/tsb/bench_pivot_table_aggfunc_variants.ts @@ -0,0 +1,43 @@ +/** + * Benchmark: pivotTable with multiple aggfuncs (sum, count, min, max) on 50k-row DataFrame. 
+ * Outputs JSON: {"function": "pivot_table_aggfunc_variants", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { DataFrame, pivotTable } from "../../src/index.ts"; + +const ROWS = 50_000; +const WARMUP = 3; +const ITERATIONS = 20; + +const regions = ["North", "South", "East", "West"]; +const categories = ["A", "B", "C", "D", "E"]; + +const region = Array.from({ length: ROWS }, (_, i) => regions[i % regions.length] as string); +const category = Array.from({ length: ROWS }, (_, i) => categories[i % categories.length] as string); +const sales = Array.from({ length: ROWS }, (_, i) => (i % 1000) * 1.5 + 10); + +const df = DataFrame.fromColumns({ region, category, sales }); + +for (let i = 0; i < WARMUP; i++) { + pivotTable(df, { values: "sales", index: "region", columns: "category", aggfunc: "sum" }); + pivotTable(df, { values: "sales", index: "region", columns: "category", aggfunc: "count" }); + pivotTable(df, { values: "sales", index: "region", columns: "category", aggfunc: "min" }); + pivotTable(df, { values: "sales", index: "region", columns: "category", aggfunc: "max" }); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + pivotTable(df, { values: "sales", index: "region", columns: "category", aggfunc: "sum" }); + pivotTable(df, { values: "sales", index: "region", columns: "category", aggfunc: "count" }); + pivotTable(df, { values: "sales", index: "region", columns: "category", aggfunc: "min" }); + pivotTable(df, { values: "sales", index: "region", columns: "category", aggfunc: "max" }); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "pivot_table_aggfunc_variants", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_pivot_table_fill_value.ts b/benchmarks/tsb/bench_pivot_table_fill_value.ts new file mode 100644 index 00000000..63a3d945 --- /dev/null +++ b/benchmarks/tsb/bench_pivot_table_fill_value.ts @@ -0,0 
+1,34 @@ +/** + * Benchmark: pivotTable with fill_value — fills missing cells with 0 instead of null. + * Outputs JSON: {"function": "pivot_table_fill_value", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { DataFrame, pivotTable } from "../../src/index.ts"; + +const ROWS = 50_000; +const WARMUP = 3; +const ITERATIONS = 10; + +// Sparse data — not all (row, col) combos exist, so fill_value matters +const rows = Array.from({ length: ROWS }, (_, i) => `row_${i % 50}`); +const cols = Array.from({ length: ROWS }, (_, i) => `col_${i % 30}`); +const vals = Array.from({ length: ROWS }, (_, i) => i * 0.1); +const df = DataFrame.fromColumns({ row: rows, col: cols, value: vals }); + +for (let i = 0; i < WARMUP; i++) { + pivotTable(df, { values: "value", index: "row", columns: "col", aggfunc: "sum", fill_value: 0 }); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + pivotTable(df, { values: "value", index: "row", columns: "col", aggfunc: "sum", fill_value: 0 }); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "pivot_table_fill_value", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_read_json_all_orients.ts b/benchmarks/tsb/bench_read_json_all_orients.ts new file mode 100644 index 00000000..8011a802 --- /dev/null +++ b/benchmarks/tsb/bench_read_json_all_orients.ts @@ -0,0 +1,48 @@ +/** + * Benchmark: readJson with all orient options (records, split, columns, index, values). 
+ * Outputs JSON: {"function": "read_json_all_orients", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { DataFrame, readJson, toJson } from "../../src/index.ts"; + +const SIZE = 5_000; +const WARMUP = 3; +const ITERATIONS = 20; + +const df = DataFrame.fromColumns({ + id: Array.from({ length: SIZE }, (_, i) => i), + value: Array.from({ length: SIZE }, (_, i) => i * 1.1), + label: Array.from({ length: SIZE }, (_, i) => `cat_${i % 10}`), +}); + +const recordsJson = toJson(df, { orient: "records" }); +const splitJson = toJson(df, { orient: "split" }); +const columnsJson = toJson(df, { orient: "columns" }); +const valuesJson = toJson(df, { orient: "values" }); +const indexJson = toJson(df, { orient: "index" }); + +for (let i = 0; i < WARMUP; i++) { + readJson(recordsJson, { orient: "records" }); + readJson(splitJson, { orient: "split" }); + readJson(columnsJson, { orient: "columns" }); + readJson(valuesJson, { orient: "values" }); + readJson(indexJson, { orient: "index" }); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + readJson(recordsJson, { orient: "records" }); + readJson(splitJson, { orient: "split" }); + readJson(columnsJson, { orient: "columns" }); + readJson(valuesJson, { orient: "values" }); + readJson(indexJson, { orient: "index" }); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "read_json_all_orients", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_reindex_fill.ts b/benchmarks/tsb/bench_reindex_fill.ts new file mode 100644 index 00000000..a60c7647 --- /dev/null +++ b/benchmarks/tsb/bench_reindex_fill.ts @@ -0,0 +1,42 @@ +/** + * Benchmark: reindexSeries with fill methods (ffill / bfill) — realign a + * 100k-element Series to a larger index using forward-fill and backward-fill. + * Extends bench_reindex which only tests the no-fill case. 
+ * Outputs JSON: {"function": "reindex_fill", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series, Index, reindexSeries } from "../../src/index.ts"; + +const SIZE = 50_000; +const WARMUP = 5; +const ITERATIONS = 30; + +// Sparse original index: every other position +const origLabels = Array.from({ length: SIZE }, (_, i) => i * 2); +const data = Array.from({ length: SIZE }, (_, i) => Math.sin(i * 0.01)); +const s = new Series({ data, index: new Index(origLabels) }); + +// Dense new index: fills in the gaps +const newIndex = Array.from({ length: SIZE * 2 }, (_, i) => i); + +for (let i = 0; i < WARMUP; i++) { + reindexSeries(s, newIndex, { method: "ffill" }); + reindexSeries(s, newIndex, { method: "bfill" }); +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + reindexSeries(s, newIndex, { method: "ffill" }); + reindexSeries(s, newIndex, { method: "bfill" }); + times.push(performance.now() - t0); +} + +const total = times.reduce((a, b) => a + b, 0); +console.log( + JSON.stringify({ + function: "reindex_fill", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_reindex_fill_methods.ts b/benchmarks/tsb/bench_reindex_fill_methods.ts new file mode 100644 index 00000000..c919750b --- /dev/null +++ b/benchmarks/tsb/bench_reindex_fill_methods.ts @@ -0,0 +1,47 @@ +/** + * Benchmark: reindexSeries / reindexDataFrame with fill methods (ffill, bfill, nearest). 
+ * Outputs JSON: {"function": "reindex_fill_methods", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series, DataFrame, Index, reindexSeries, reindexDataFrame } from "../../src/index.ts"; + +const SIZE = 20_000; +const WARMUP = 3; +const ITERATIONS = 20; + +// Original: even indices +const origLabels = Array.from({ length: SIZE }, (_, i) => i * 2); +const data = Array.from({ length: SIZE }, (_, i) => i * 1.5); +const s = new Series({ data, index: new Index(origLabels) }); + +// New index: 0..SIZE*2 (includes odd indices that need filling) +const newIndex = Array.from({ length: SIZE * 2 }, (_, i) => i); + +const df = DataFrame.fromColumns( + { a: data, b: data.map((v) => v * 2) }, + new Index(origLabels), +); + +for (let i = 0; i < WARMUP; i++) { + reindexSeries(s, newIndex, { method: "ffill" }); + reindexSeries(s, newIndex, { method: "bfill" }); + reindexSeries(s, newIndex, { method: "nearest" }); + reindexDataFrame(df, { index: newIndex, method: "ffill" }); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + reindexSeries(s, newIndex, { method: "ffill" }); + reindexSeries(s, newIndex, { method: "bfill" }); + reindexSeries(s, newIndex, { method: "nearest" }); + reindexDataFrame(df, { index: newIndex, method: "ffill" }); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "reindex_fill_methods", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_sample_weighted.ts b/benchmarks/tsb/bench_sample_weighted.ts new file mode 100644 index 00000000..e41c8fbc --- /dev/null +++ b/benchmarks/tsb/bench_sample_weighted.ts @@ -0,0 +1,38 @@ +/** + * Benchmark: sampleSeries with weights — weighted random sampling from a + * 100k-element Series. Extends bench_sample_fn which tests unweighted sampling. 
+ * Outputs JSON: {"function": "sample_weighted", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series, sampleSeries } from "../../src/index.ts"; + +const SIZE = 100_000; +const N_SAMPLE = 1_000; +const WARMUP = 5; +const ITERATIONS = 30; + +const data = Array.from({ length: SIZE }, (_, i) => i * 0.5); +// Weights: higher values get more weight (triangular distribution) +const weights = Array.from({ length: SIZE }, (_, i) => (i + 1) / SIZE); + +const s = new Series({ data }); + +for (let i = 0; i < WARMUP; i++) { + sampleSeries(s, { n: N_SAMPLE, weights }); +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + sampleSeries(s, { n: N_SAMPLE, weights }); + times.push(performance.now() - t0); +} + +const total = times.reduce((a, b) => a + b, 0); +console.log( + JSON.stringify({ + function: "sample_weighted", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_sample_weights.ts b/benchmarks/tsb/bench_sample_weights.ts new file mode 100644 index 00000000..cc3d05f5 --- /dev/null +++ b/benchmarks/tsb/bench_sample_weights.ts @@ -0,0 +1,42 @@ +/** + * Benchmark: sampleSeries / sampleDataFrame with weights option on 100k rows. 
+ * Outputs JSON: {"function": "sample_weights", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series, DataFrame, sampleSeries, sampleDataFrame } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 3; +const ITERATIONS = 20; + +const data = Array.from({ length: SIZE }, (_, i) => i * 1.0); +// Exponentially increasing weights so later rows are more likely to be picked +const weights = Array.from({ length: SIZE }, (_, i) => Math.exp((i / SIZE) * 3)); + +const s = new Series({ data }); + +const df = DataFrame.fromColumns({ + a: data, + b: Array.from({ length: SIZE }, (_, i) => i * 2.0), + c: Array.from({ length: SIZE }, (_, i) => i * 3.0), +}); + +for (let i = 0; i < WARMUP; i++) { + sampleSeries(s, { n: 1000, weights }); + sampleDataFrame(df, { n: 1000, weights }); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + sampleSeries(s, { n: 1000, weights }); + sampleDataFrame(df, { n: 1000, weights }); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "sample_weights", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_select_dtypes_options.ts b/benchmarks/tsb/bench_select_dtypes_options.ts new file mode 100644 index 00000000..f2c190ec --- /dev/null +++ b/benchmarks/tsb/bench_select_dtypes_options.ts @@ -0,0 +1,42 @@ +/** + * Benchmark: selectDtypes — filter DataFrame columns by dtype (include/exclude). 
+ * Outputs JSON: {"function": "select_dtypes_options", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { DataFrame, Series, selectDtypes } from "../../src/index.ts"; + +const ROWS = 50_000; +const WARMUP = 5; +const ITERATIONS = 30; + +// Build a mixed-dtype DataFrame +const intCol = new Series({ data: Int32Array.from({ length: ROWS }, (_, i) => i) }); +const floatCol = new Series({ data: Float64Array.from({ length: ROWS }, (_, i) => i * 1.5) }); +const boolCol = new Series({ data: Array.from({ length: ROWS }, (_, i) => i % 2 === 0) }); +const strCol = new Series({ data: Array.from({ length: ROWS }, (_, i) => `s_${i % 100}`) }); +const df = DataFrame.fromColumns({ intCol, floatCol, boolCol, strCol }); + +for (let i = 0; i < WARMUP; i++) { + selectDtypes(df, { include: "number" }); + selectDtypes(df, { exclude: "number" }); + selectDtypes(df, { include: ["integer", "float"] }); +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const start = performance.now(); + selectDtypes(df, { include: "number" }); + selectDtypes(df, { exclude: "number" }); + selectDtypes(df, { include: ["integer", "float"] }); + times.push(performance.now() - start); +} + +const totalMs = times.reduce((a, b) => a + b, 0); +const meanMs = totalMs / ITERATIONS; +console.log( + JSON.stringify({ + function: "select_dtypes_options", + mean_ms: Math.round(meanMs * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(totalMs * 1000) / 1000, + }), +); diff --git a/benchmarks/tsb/bench_series_cumops_nan.ts b/benchmarks/tsb/bench_series_cumops_nan.ts new file mode 100644 index 00000000..c6931514 --- /dev/null +++ b/benchmarks/tsb/bench_series_cumops_nan.ts @@ -0,0 +1,33 @@ +/** + * Benchmark: cumsum / cumprod / cummax / cummin on 100k-element Series with NaN values (skipna=true). 
+ * Outputs JSON: {"function": "series_cumops_nan", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series, cumsum, cumprod, cummax, cummin } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 20; + +// ~10% NaN values +const data: (number | null)[] = Array.from({ length: SIZE }, (_, i) => + i % 10 === 0 ? null : Math.sin(i * 0.01) * 50 + 100, +); +const s = new Series({ data }); + +for (let i = 0; i < WARMUP; i++) { + cumsum(s); + cumprod(s); + cummax(s); + cummin(s); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + cumsum(s); + cumprod(s); + cummax(s); + cummin(s); +} +const total = performance.now() - start; + +console.log(JSON.stringify({ function: "series_cumops_nan", mean_ms: total / ITERATIONS, iterations: ITERATIONS, total_ms: total })); diff --git a/benchmarks/tsb/bench_series_numeric_pipeline.ts b/benchmarks/tsb/bench_series_numeric_pipeline.ts new file mode 100644 index 00000000..8b3acdd9 --- /dev/null +++ b/benchmarks/tsb/bench_series_numeric_pipeline.ts @@ -0,0 +1,37 @@ +/** + * Benchmark: Series numeric pipeline — chain abs → round → clip on a 100k-element Series. + * Tests a realistic sequence of standalone numeric operations. 
+ * Outputs JSON: {"function": "series_numeric_pipeline", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series, seriesAbs, seriesRound, clipSeriesWithBounds } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 30; + +const s = new Series({ + data: Array.from({ length: SIZE }, (_, i) => Math.sin(i * 0.01) * 150 - 20), +}); + +for (let i = 0; i < WARMUP; i++) { + const a = seriesAbs(s); + const b = seriesRound(a, { decimals: 2 }); + clipSeriesWithBounds(b, { lower: 0, upper: 100 }); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + const a = seriesAbs(s); + const b = seriesRound(a, { decimals: 2 }); + clipSeriesWithBounds(b, { lower: 0, upper: 100 }); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "series_numeric_pipeline", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_series_reflected_arith.ts b/benchmarks/tsb/bench_series_reflected_arith.ts new file mode 100644 index 00000000..a5bab682 --- /dev/null +++ b/benchmarks/tsb/bench_series_reflected_arith.ts @@ -0,0 +1,37 @@ +/** + * Benchmark: series_reflected_arith — seriesRadd / seriesRsub / seriesRmul / seriesRdiv. 
+ * Outputs JSON: {"function": "series_reflected_arith", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series, seriesRadd, seriesRsub, seriesRmul, seriesRdiv } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const a = new Series({ data: Array.from({ length: SIZE }, (_, i) => i * 1.5) }); +const b = new Series({ data: Array.from({ length: SIZE }, (_, i) => (i % 1000) + 1) }); + +for (let i = 0; i < WARMUP; i++) { + seriesRadd(a, 10); + seriesRsub(a, 1000); + seriesRmul(a, 3); + seriesRdiv(b, 100); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + seriesRadd(a, 10); + seriesRsub(a, 1000); + seriesRmul(a, 3); + seriesRdiv(b, 100); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "series_reflected_arith", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_series_shift_fn.ts b/benchmarks/tsb/bench_series_shift_fn.ts new file mode 100644 index 00000000..aec6797c --- /dev/null +++ b/benchmarks/tsb/bench_series_shift_fn.ts @@ -0,0 +1,38 @@ +/** + * Benchmark: shiftSeries (standalone export from stats/shift_diff.ts) — shift + * a 100k-element Series by 1 and 3 periods. Uses the exported shiftSeries + * function (distinct from the earlier manual-impl bench_series_shift). 
+ * Outputs JSON: {"function": "series_shift_fn", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series, shiftSeries } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 30; + +const s = new Series({ data: Array.from({ length: SIZE }, (_, i) => i * 0.5) }); + +for (let i = 0; i < WARMUP; i++) { + shiftSeries(s, 1); + shiftSeries(s, 3); + shiftSeries(s, -2); +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + shiftSeries(s, 1); + shiftSeries(s, 3); + shiftSeries(s, -2); + times.push(performance.now() - t0); +} + +const total = times.reduce((a, b) => a + b, 0); +console.log( + JSON.stringify({ + function: "series_shift_fn", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_shift_series_fn.ts b/benchmarks/tsb/bench_shift_series_fn.ts new file mode 100644 index 00000000..e1ca368f --- /dev/null +++ b/benchmarks/tsb/bench_shift_series_fn.ts @@ -0,0 +1,35 @@ +/** + * Benchmark: shiftSeries — standalone exported shiftSeries function on 100k-element Series. + * Mirrors pandas Series.shift(). 
+ * Outputs JSON: {"function": "shift_series_fn", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series, shiftSeries } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const s = new Series({ data: Array.from({ length: SIZE }, (_, i) => i * 1.0) }); + +for (let i = 0; i < WARMUP; i++) { + shiftSeries(s, 1); + shiftSeries(s, -2); + shiftSeries(s, 5); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + shiftSeries(s, 1); + shiftSeries(s, -2); + shiftSeries(s, 5); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "shift_series_fn", + mean_ms: Math.round((total / ITERATIONS) * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(total * 1000) / 1000, + }), +); diff --git a/benchmarks/tsb/bench_str_contains.ts b/benchmarks/tsb/bench_str_contains.ts new file mode 100644 index 00000000..4eb0de8b --- /dev/null +++ b/benchmarks/tsb/bench_str_contains.ts @@ -0,0 +1,35 @@ +/** + * Benchmark: str.contains() — regex and literal substring matching on 100k strings. 
+ * Outputs JSON: {"function": "str_contains", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series } from "../../src/index.ts"; + +const ROWS = 100_000; +const WARMUP = 5; +const ITERATIONS = 30; + +const data = Array.from({ length: ROWS }, (_, i) => `item_${i % 500}_value_${i % 7}_end`); +const s = new Series({ data }); + +for (let i = 0; i < WARMUP; i++) { + s.str.contains("value", false); + s.str.contains("_[0-9]+_", true); +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + s.str.contains("value", false); + s.str.contains("_[0-9]+_", true); + times.push(performance.now() - t0); +} + +const total_ms = times.reduce((a, b) => a + b, 0); +console.log( + JSON.stringify({ + function: "str_contains", + mean_ms: Math.round((total_ms / ITERATIONS) * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(total_ms * 1000) / 1000, + }), +); diff --git a/benchmarks/tsb/bench_str_swapcase_capitalize.ts b/benchmarks/tsb/bench_str_swapcase_capitalize.ts new file mode 100644 index 00000000..8ffda40f --- /dev/null +++ b/benchmarks/tsb/bench_str_swapcase_capitalize.ts @@ -0,0 +1,33 @@ +/** + * Benchmark: str_swapcase_capitalize — str.swapcase and str.capitalize on 100k strings. 
+ * Outputs JSON: {"function": "str_swapcase_capitalize", "mean_ms": ..., "iterations": ..., "total_ms": ...}
+ */
+import { Series } from "../../src/index.ts";
+
+const ROWS = 100_000;
+const WARMUP = 3;
+const ITERATIONS = 10;
+
+const data = Array.from({ length: ROWS }, (_, i) => `Hello World ${i % 500} EXAMPLE`);
+const s = new Series({ data });
+
+// Warm up before timing.
+for (let i = 0; i < WARMUP; i++) {
+  s.str.swapcase();
+  s.str.capitalize();
+}
+
+const start = performance.now();
+for (let i = 0; i < ITERATIONS; i++) {
+  s.str.swapcase();
+  s.str.capitalize();
+}
+const total = performance.now() - start;
+
+console.log(
+  JSON.stringify({
+    function: "str_swapcase_capitalize",
+    // Round to 3 decimals so the JSON output matches the other tsb benchmarks.
+    mean_ms: Math.round((total / ITERATIONS) * 1000) / 1000,
+    iterations: ITERATIONS,
+    total_ms: Math.round(total * 1000) / 1000,
+  }),
+);
diff --git a/benchmarks/tsb/bench_to_numeric_dispatch.ts b/benchmarks/tsb/bench_to_numeric_dispatch.ts
new file mode 100644
index 00000000..cd00c0e1
--- /dev/null
+++ b/benchmarks/tsb/bench_to_numeric_dispatch.ts
@@ -0,0 +1,36 @@
+/**
+ * Benchmark: toNumeric generic dispatcher — exported toNumeric(value) dispatches to Series/array/scalar paths.
+ * Mirrors pandas pd.to_numeric() with multiple input types.
+ * Outputs JSON: {"function": "to_numeric_dispatch", "mean_ms": ..., "iterations": ..., "total_ms": ...}
+ */
+import { Series, toNumeric } from "../../src/index.ts";
+
+const SIZE = 50_000;
+const WARMUP = 5;
+const ITERATIONS = 30;
+
+const strNums = Array.from({ length: SIZE }, (_, i) => String(i * 1.5));
+const s = new Series({ data: strNums });
+
+// Warm up the array, Series, and scalar dispatch paths.
+for (let i = 0; i < WARMUP; i++) {
+  toNumeric(strNums, { errors: "coerce" });
+  toNumeric(s, { errors: "coerce" });
+  toNumeric("42.7");
+}
+
+const start = performance.now();
+for (let i = 0; i < ITERATIONS; i++) {
+  toNumeric(strNums, { errors: "coerce" });
+  toNumeric(s, { errors: "coerce" });
+  toNumeric("42.7");
+}
+const total = performance.now() - start;
+
+console.log(
+  JSON.stringify({
+    function: "to_numeric_dispatch",
+    mean_ms: Math.round((total / ITERATIONS) * 1000) / 1000,
+    iterations: ITERATIONS,
+    total_ms: Math.round(total * 1000) / 1000,
+  }),
+);
diff --git a/benchmarks/tsb/bench_to_numeric_generic.ts b/benchmarks/tsb/bench_to_numeric_generic.ts
new file mode 100644
index 00000000..e95e2954
--- /dev/null
+++ b/benchmarks/tsb/bench_to_numeric_generic.ts
@@ -0,0 +1,38 @@
+/**
+ * Benchmark: toNumeric (generic dispatcher) — coerce scalars, arrays, and Series.
+ * Outputs JSON: {"function": "to_numeric_generic", "mean_ms": ..., "iterations": ..., "total_ms": ...}
+ */
+import { toNumeric, Series } from "../../src/index.ts";
+
+const SIZE = 10_000;
+const WARMUP = 5;
+const ITERATIONS = 50;
+
+const strNums = Array.from({ length: SIZE }, (_, i) => String(i * 0.1));
+const series = new Series({ data: strNums });
+
+// Warm up (array path uses a small slice; timed loop coerces the full array).
+for (let i = 0; i < WARMUP; i++) {
+  toNumeric("3.14");
+  toNumeric(strNums.slice(0, 100), { errors: "coerce" });
+  toNumeric(series, { errors: "coerce" });
+}
+
+const times: number[] = [];
+for (let i = 0; i < ITERATIONS; i++) {
+  const start = performance.now();
+  toNumeric("3.14");
+  toNumeric(strNums, { errors: "coerce" });
+  toNumeric(series, { errors: "coerce" });
+  times.push(performance.now() - start);
+}
+
+const totalMs = times.reduce((a, b) => a + b, 0);
+const meanMs = totalMs / ITERATIONS;
+console.log(
+  JSON.stringify({
+    function: "to_numeric_generic",
+    mean_ms: Math.round(meanMs * 1000) / 1000,
+    iterations: ITERATIONS,
+    total_ms: Math.round(totalMs * 1000) / 1000,
+  }),
+);
diff --git a/benchmarks/tsb/bench_unstack_fn.ts b/benchmarks/tsb/bench_unstack_fn.ts
new file mode 100644
index 00000000..a7090712
--- /dev/null
+++ b/benchmarks/tsb/bench_unstack_fn.ts
@@ -0,0 +1,41 @@
+/**
+ * Benchmark: unstack standalone — pivot innermost MultiIndex level to columns using exported unstack().
+ * Uses the standalone unstack(series) function (not the .unstack() method).
+ * Outputs JSON: {"function": "unstack_fn", "mean_ms": ..., "iterations": ..., "total_ms": ...}
+ */
+import { Series, unstack } from "../../src/index.ts";
+
+const ROWS = 500;
+const COLS = 10;
+const WARMUP = 5;
+const ITERATIONS = 50;
+
+const data = Array.from({ length: ROWS * COLS }, (_, i) => i * 1.0);
+// MultiIndex tuples: (row, col) with col the innermost level to be pivoted out.
+const index = Array.from(
+  { length: ROWS * COLS },
+  (_, i) => [Math.floor(i / COLS), i % COLS] as [number, number],
+);
+const s = new Series({ data, index });
+
+for (let i = 0; i < WARMUP; i++) {
+  unstack(s);
+}
+
+const times: number[] = [];
+for (let i = 0; i < ITERATIONS; i++) {
+  const start = performance.now();
+  unstack(s);
+  times.push(performance.now() - start);
+}
+
+const totalMs = times.reduce((a, b) => a + b, 0);
+const meanMs = totalMs / times.length;
+
+console.log(
+  JSON.stringify({
+    function: "unstack_fn",
+    // Round to 3 decimals so the JSON output matches the other tsb benchmarks.
+    mean_ms: Math.round(meanMs * 1000) / 1000,
+    iterations: ITERATIONS,
+    total_ms: Math.round(totalMs * 1000) / 1000,
+  }),
+);
diff --git a/benchmarks/tsb/bench_wide_to_long_sep_suffix.ts b/benchmarks/tsb/bench_wide_to_long_sep_suffix.ts
new file mode 100644
index 00000000..7f3d9ee5
--- /dev/null
+++ b/benchmarks/tsb/bench_wide_to_long_sep_suffix.ts
@@ -0,0 +1,50 @@
+/**
+ * Benchmark: wideToLong with sep and suffix options — different column naming patterns.
+ * Outputs JSON: {"function": "wide_to_long_sep_suffix", "mean_ms": ..., "iterations": ..., "total_ms": ...}
+ */
+import { DataFrame, wideToLong } from "../../src/index.ts";
+
+const ROWS = 5_000;
+const WARMUP = 3;
+const ITERATIONS = 20;
+
+// Dataset 1: underscore-separated columns (A_1, A_2, B_1, B_2)
+const ids = Array.from({ length: ROWS }, (_, i) => i);
+const df1 = DataFrame.fromColumns({
+  id: ids,
+  A_1: ids.map((i) => i * 1.0),
+  A_2: ids.map((i) => i * 1.1),
+  A_3: ids.map((i) => i * 1.2),
+  B_1: ids.map((i) => i * 2.0),
+  B_2: ids.map((i) => i * 2.1),
+  B_3: ids.map((i) => i * 2.2),
+});
+
+// Dataset 2: string suffix pattern (score_Q1, score_Q2, score_Q3)
+const df2 = DataFrame.fromColumns({
+  student: ids.map((i) => `s${i}`),
+  score_Q1: ids.map((i) => i + 10),
+  score_Q2: ids.map((i) => i + 20),
+  score_Q3: ids.map((i) => i + 30),
+});
+
+for (let i = 0; i < WARMUP; i++) {
+  wideToLong(df1, ["A", "B"], "id", "period", { sep: "_" });
+  wideToLong(df2, "score", "student", "quarter", { sep: "_", suffix: /Q\d+/ });
+}
+
+const start = performance.now();
+for (let i = 0; i < ITERATIONS; i++) {
+  wideToLong(df1, ["A", "B"], "id", "period", { sep: "_" });
+  wideToLong(df2, "score", "student", "quarter", { sep: "_", suffix: /Q\d+/ });
+}
+const total = performance.now() - start;
+
+console.log(
+  JSON.stringify({
+    function: "wide_to_long_sep_suffix",
+    // Round to 3 decimals so the JSON output matches the other tsb benchmarks.
+    mean_ms: Math.round((total / ITERATIONS) * 1000) / 1000,
+    iterations: ITERATIONS,
+    total_ms: Math.round(total * 1000) / 1000,
+  }),
+);