diff --git a/benchmarks/pandas/bench_apply_dataframe_formatter.py b/benchmarks/pandas/bench_apply_dataframe_formatter.py new file mode 100644 index 00000000..958d0b0f --- /dev/null +++ b/benchmarks/pandas/bench_apply_dataframe_formatter.py @@ -0,0 +1,17 @@ +"""Benchmark: DataFrame.map formatter on 10k-row DataFrame""" +import json, time +import pandas as pd + +ROWS = 10_000 +WARMUP = 3 +ITERATIONS = 10 +df = pd.DataFrame({"a": [i * 1.234 for i in range(ROWS)], "b": [i * 5.678 for i in range(ROWS)]}) + +for _ in range(WARMUP): + df.map(lambda v: f"{v:.2f}") + +start = time.perf_counter() +for _ in range(ITERATIONS): + df.map(lambda v: f"{v:.2f}") +total = (time.perf_counter() - start) * 1000 +print(json.dumps({"function": "apply_dataframe_formatter", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_apply_series_formatter.py b/benchmarks/pandas/bench_apply_series_formatter.py new file mode 100644 index 00000000..ac69f451 --- /dev/null +++ b/benchmarks/pandas/bench_apply_series_formatter.py @@ -0,0 +1,17 @@ +"""Benchmark: apply formatter to 100k-element pandas Series""" +import json, time +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 +s = pd.Series([i * 1.234 for i in range(ROWS)]) + +for _ in range(WARMUP): + s.map(lambda v: f"{v:.2f}") + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.map(lambda v: f"{v:.2f}") +total = (time.perf_counter() - start) * 1000 +print(json.dumps({"function": "apply_series_formatter", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_arange_linspace.py b/benchmarks/pandas/bench_arange_linspace.py new file mode 100644 index 00000000..828a3294 --- /dev/null +++ b/benchmarks/pandas/bench_arange_linspace.py @@ -0,0 +1,18 @@ +"""Benchmark: np.arange and np.linspace generating 100k-element arrays""" +import json, time +import numpy as np + +N = 100_000 +WARMUP = 3 +ITERATIONS = 10 
+ +for _ in range(WARMUP): + np.arange(0, N, 1) + np.linspace(0, 1, N) + +start = time.perf_counter() +for _ in range(ITERATIONS): + np.arange(0, N, 1) + np.linspace(0, 1, N) +total = (time.perf_counter() - start) * 1000 +print(json.dumps({"function": "arange_linspace", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_astype_series.py b/benchmarks/pandas/bench_astype_series.py new file mode 100644 index 00000000..6e00e51d --- /dev/null +++ b/benchmarks/pandas/bench_astype_series.py @@ -0,0 +1,26 @@ +"""Benchmark: Series.astype() — cast Series dtype.""" +import json, time +import pandas as pd + +SIZE = 100_000 +WARMUP = 5 +ITERATIONS = 50 + +float_series = pd.Series([i * 1.5 for i in range(SIZE)]) +int_series = pd.Series([i for i in range(SIZE)]) + +for _ in range(WARMUP): + float_series.astype("int32") + int_series.astype("float64") + int_series.astype("str") + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + float_series.astype("int32") + int_series.astype("float64") + int_series.astype("str") + times.append((time.perf_counter() - t0) * 1000) + +total_ms = sum(times) +print(json.dumps({"function":"astype_series","mean_ms":round(total_ms/ITERATIONS,3),"iterations":ITERATIONS,"total_ms":round(total_ms,3)})) diff --git a/benchmarks/pandas/bench_attrs_advanced.py b/benchmarks/pandas/bench_attrs_advanced.py new file mode 100644 index 00000000..b9249386 --- /dev/null +++ b/benchmarks/pandas/bench_attrs_advanced.py @@ -0,0 +1,30 @@ +"""Benchmark: pandas Series attrs advanced — individual attr get/set/delete/copy/merge""" +import json, time +import pandas as pd + +WARMUP = 3 +ITERATIONS = 1_000 + +s = pd.Series(range(1_000)) +s2 = pd.Series(range(1_000)) + +for _ in range(WARMUP): + s.attrs["unit"] = "meters" + _ = s.attrs.get("unit") + _ = bool(s.attrs) + s2.attrs.update(dict(s.attrs)) + s.attrs.update({"version": 1}) + s.attrs.pop("unit", None) + s.attrs.clear() + +start = 
time.perf_counter() +for i in range(ITERATIONS): + s.attrs["unit"] = "meters" + _ = s.attrs.get("unit") + _ = bool(s.attrs) + s2.attrs.update(dict(s.attrs)) + s.attrs.update({"version": i}) + s.attrs.pop("unit", None) + s.attrs.clear() +total = (time.perf_counter() - start) * 1000 +print(json.dumps({"function": "attrs_advanced", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_attrs_count_keys.py b/benchmarks/pandas/bench_attrs_count_keys.py new file mode 100644 index 00000000..1546c8a4 --- /dev/null +++ b/benchmarks/pandas/bench_attrs_count_keys.py @@ -0,0 +1,15 @@ +import pandas as pd, time, json +N = 100_000 +s = pd.Series(range(N)) +s.attrs = {"a": 1, "b": 2, "c": 3, "d": 4, "e": 5, "f": 6, "g": 7, "h": 8} +WARMUP = 3 +ITERS = 10_000 +for _ in range(WARMUP): + _ = len(s.attrs) + _ = list(s.attrs.keys()) +t0 = time.perf_counter() +for _ in range(ITERS): + _ = len(s.attrs) + _ = list(s.attrs.keys()) +total = (time.perf_counter() - t0) * 1000 +print(json.dumps({"function": "attrs_count_keys", "mean_ms": total / ITERS, "iterations": ITERS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_attrs_ops.py b/benchmarks/pandas/bench_attrs_ops.py new file mode 100644 index 00000000..db5eb946 --- /dev/null +++ b/benchmarks/pandas/bench_attrs_ops.py @@ -0,0 +1,21 @@ +import pandas as pd, time, json +N = 10_000 +s = pd.Series(range(N)) +attrs_data = {"unit": "meters", "created": "2024-01-01", "source": "sensor-1", "version": 2} +WARMUP = 3 +ITERS = 100 +for _ in range(WARMUP): + s.attrs.update(attrs_data) + _ = dict(s.attrs) + s.attrs["version"] = 99 + s2 = s.copy() + s2.attrs.update({"extra": "x"}) +t0 = time.perf_counter() +for i in range(ITERS): + s.attrs.update(attrs_data) + _ = dict(s.attrs) + s.attrs["version"] = i + s2 = s.copy() + s2.attrs.update({"extra": "x"}) +total = (time.perf_counter() - t0) * 1000 +print(json.dumps({"function": "attrs_ops", "mean_ms": total / ITERS, "iterations": ITERS, 
"total_ms": total})) diff --git a/benchmarks/pandas/bench_between.py b/benchmarks/pandas/bench_between.py new file mode 100644 index 00000000..7ddfd202 --- /dev/null +++ b/benchmarks/pandas/bench_between.py @@ -0,0 +1,21 @@ +"""Benchmark: Series.between() — element-wise range check.""" +import json, time +import pandas as pd + +SIZE = 100_000 +WARMUP = 5 +ITERATIONS = 50 + +s = pd.Series([float(i) for i in range(SIZE)]) + +for _ in range(WARMUP): + s.between(25000.0, 75000.0) + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + s.between(25000.0, 75000.0) + times.append((time.perf_counter() - t0) * 1000) + +total_ms = sum(times) +print(json.dumps({"function":"between","mean_ms":round(total_ms/ITERATIONS,3),"iterations":ITERATIONS,"total_ms":round(total_ms,3)})) diff --git a/benchmarks/pandas/bench_cat_add_remove_categories.py b/benchmarks/pandas/bench_cat_add_remove_categories.py new file mode 100644 index 00000000..e45bc727 --- /dev/null +++ b/benchmarks/pandas/bench_cat_add_remove_categories.py @@ -0,0 +1,23 @@ +"""Benchmark: cat_add_remove_categories — pandas CategoricalIndex add_categories/remove_categories on 100k-element Series""" +import json +import time +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 + +cats = ["a", "b", "c", "d"] +s = pd.Categorical([cats[i % len(cats)] for i in range(ROWS)], categories=cats) + +for _ in range(WARMUP): + _ = s.add_categories(["e", "f"]) + _ = s.remove_categories(["d"]) + +start = time.perf_counter() +for _ in range(ITERATIONS): + _ = s.add_categories(["e", "f"]) + _ = s.remove_categories(["d"]) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({"function": "cat_add_remove_categories", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_cat_cross_tab.py b/benchmarks/pandas/bench_cat_cross_tab.py new file mode 100644 index 00000000..7bdaff2c --- /dev/null +++ b/benchmarks/pandas/bench_cat_cross_tab.py @@ 
-0,0 +1,20 @@ +"""Benchmark: pd.crosstab on two 100k-element categorical Series""" +import json, time +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 +cats1 = ["a", "b", "c", "d"] +cats2 = ["x", "y", "z"] +s1 = pd.Series([cats1[i % 4] for i in range(ROWS)]) +s2 = pd.Series([cats2[i % 3] for i in range(ROWS)]) + +for _ in range(WARMUP): + pd.crosstab(s1, s2) + +start = time.perf_counter() +for _ in range(ITERATIONS): + pd.crosstab(s1, s2) +total = (time.perf_counter() - start) * 1000 +print(json.dumps({"function": "cat_cross_tab", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_cat_equal_categories.py b/benchmarks/pandas/bench_cat_equal_categories.py new file mode 100644 index 00000000..2ac527da --- /dev/null +++ b/benchmarks/pandas/bench_cat_equal_categories.py @@ -0,0 +1,22 @@ +"""Benchmark: compare categorical categories equality (10k iterations)""" +import json, time +import pandas as pd + +WARMUP = 3 +ITERATIONS = 10 +cats1 = ["cat_0", "cat_1", "cat_2"] +cats2 = ["cat_0", "cat_1", "cat_2"] +c1 = pd.CategoricalDtype(categories=cats1) +c2 = pd.CategoricalDtype(categories=cats2) +REPS = 10_000 + +for _ in range(WARMUP): + for _ in range(REPS): + set(c1.categories) == set(c2.categories) + +start = time.perf_counter() +for _ in range(ITERATIONS): + for _ in range(REPS): + set(c1.categories) == set(c2.categories) +total = (time.perf_counter() - start) * 1000 +print(json.dumps({"function": "cat_equal_categories", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_cat_freq_table.py b/benchmarks/pandas/bench_cat_freq_table.py new file mode 100644 index 00000000..9d79a6f2 --- /dev/null +++ b/benchmarks/pandas/bench_cat_freq_table.py @@ -0,0 +1,18 @@ +"""Benchmark: value_counts on 100k-element categorical Series""" +import json, time +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 +cats = ["low", "med", 
"high", "ultra"] +s = pd.Series([cats[i % 4] for i in range(ROWS)]) + +for _ in range(WARMUP): + s.value_counts() + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.value_counts() +total = (time.perf_counter() - start) * 1000 +print(json.dumps({"function": "cat_freq_table", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_cat_recode.py b/benchmarks/pandas/bench_cat_recode.py new file mode 100644 index 00000000..b7df1d1a --- /dev/null +++ b/benchmarks/pandas/bench_cat_recode.py @@ -0,0 +1,20 @@ +"""Benchmark: catRecode on 100k-element categorical Series""" +import json, time +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 +cats = ["a", "b", "c"] +data = [cats[i % 3] for i in range(ROWS)] +s = pd.Series(pd.Categorical(data)) +rmap = {"a": "x", "b": "y", "c": "z"} + +for _ in range(WARMUP): + s.cat.rename_categories(rmap) + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.cat.rename_categories(rmap) +total = (time.perf_counter() - start) * 1000 +print(json.dumps({"function": "cat_recode", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_cat_remove_unused.py b/benchmarks/pandas/bench_cat_remove_unused.py new file mode 100644 index 00000000..3e739887 --- /dev/null +++ b/benchmarks/pandas/bench_cat_remove_unused.py @@ -0,0 +1,29 @@ +"""Benchmark: cat_remove_unused — pd.Categorical.remove_unused_categories() on 100k-element Series""" +import json, time +import numpy as np +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 + +cats = ["a", "b", "c"] +data = [cats[i % len(cats)] for i in range(ROWS)] +# Add unused categories +cat_type = pd.CategoricalDtype(categories=["a", "b", "c", "x", "y", "z"]) +s = pd.Series(data, dtype=cat_type) + +for _ in range(WARMUP): + s.cat.remove_unused_categories() + +start = time.perf_counter() +for _ in range(ITERATIONS): + 
s.cat.remove_unused_categories() +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "cat_remove_unused", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_cat_rename_set_categories.py b/benchmarks/pandas/bench_cat_rename_set_categories.py new file mode 100644 index 00000000..962978a0 --- /dev/null +++ b/benchmarks/pandas/bench_cat_rename_set_categories.py @@ -0,0 +1,23 @@ +"""Benchmark: cat_rename_set_categories — pandas Categorical rename_categories/set_categories on 100k-element Series""" +import json +import time +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 + +cats = ["a", "b", "c", "d"] +s = pd.Categorical([cats[i % len(cats)] for i in range(ROWS)], categories=cats) + +for _ in range(WARMUP): + _ = s.rename_categories({"a": "alpha", "b": "beta"}) + _ = s.set_categories(["a", "b", "c", "d", "e"]) + +start = time.perf_counter() +for _ in range(ITERATIONS): + _ = s.rename_categories({"a": "alpha", "b": "beta"}) + _ = s.set_categories(["a", "b", "c", "d", "e"]) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({"function": "cat_rename_set_categories", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_cat_reorder_as_ordered.py b/benchmarks/pandas/bench_cat_reorder_as_ordered.py new file mode 100644 index 00000000..dbf45791 --- /dev/null +++ b/benchmarks/pandas/bench_cat_reorder_as_ordered.py @@ -0,0 +1,25 @@ +"""Benchmark: cat_reorder_as_ordered — pandas Categorical reorder_categories/as_ordered/as_unordered on 100k-element Series""" +import json +import time +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 + +cats = ["a", "b", "c", "d"] +s = pd.Categorical([cats[i % len(cats)] for i in range(ROWS)], categories=cats) + +for _ in range(WARMUP): + _ = s.reorder_categories(["d", "c", "b", "a"]) + _ = s.as_ordered() + _ = s.as_unordered() + 
+start = time.perf_counter() +for _ in range(ITERATIONS): + _ = s.reorder_categories(["d", "c", "b", "a"]) + _ = s.as_ordered() + _ = s.as_unordered() +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({"function": "cat_reorder_as_ordered", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_cat_set_ops.py b/benchmarks/pandas/bench_cat_set_ops.py new file mode 100644 index 00000000..29e04cc0 --- /dev/null +++ b/benchmarks/pandas/bench_cat_set_ops.py @@ -0,0 +1,24 @@ +"""Benchmark: categorical set operations (union, intersect, diff)""" +import json, time +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 +cats1 = [f"cat_{i}" for i in range(500)] +cats2 = [f"cat_{i+250}" for i in range(500)] +c1 = pd.CategoricalDtype(categories=cats1) +c2 = pd.CategoricalDtype(categories=cats2) + +for _ in range(WARMUP): + set(c1.categories) | set(c2.categories) + set(c1.categories) & set(c2.categories) + set(c1.categories) - set(c2.categories) + +start = time.perf_counter() +for _ in range(ITERATIONS): + set(c1.categories) | set(c2.categories) + set(c1.categories) & set(c2.categories) + set(c1.categories) - set(c2.categories) +total = (time.perf_counter() - start) * 1000 +print(json.dumps({"function": "cat_set_ops", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_cat_sort_by_freq.py b/benchmarks/pandas/bench_cat_sort_by_freq.py new file mode 100644 index 00000000..41f65f0f --- /dev/null +++ b/benchmarks/pandas/bench_cat_sort_by_freq.py @@ -0,0 +1,24 @@ +"""Benchmark: sort categories by frequency on 100k-element categorical Series""" +import json, time +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 +cats = ["rare", "common", "very_common", "ultra_common"] +data = [] +for i in range(ROWS): + r = i % 51 + data.append(cats[0] if r < 1 else cats[1] if r < 6 else cats[2] if r < 21 else cats[3]) +s = 
pd.Series(data) + +for _ in range(WARMUP): + order = s.value_counts().index.tolist() + s.astype(pd.CategoricalDtype(categories=order, ordered=True)) + +start = time.perf_counter() +for _ in range(ITERATIONS): + order = s.value_counts().index.tolist() + s.astype(pd.CategoricalDtype(categories=order, ordered=True)) +total = (time.perf_counter() - start) * 1000 +print(json.dumps({"function": "cat_sort_by_freq", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_cat_to_ordinal.py b/benchmarks/pandas/bench_cat_to_ordinal.py new file mode 100644 index 00000000..497230b6 --- /dev/null +++ b/benchmarks/pandas/bench_cat_to_ordinal.py @@ -0,0 +1,19 @@ +"""Benchmark: catToOrdinal on 100k-element categorical Series""" +import json, time +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 +cats = ["low", "med", "high"] +data = [cats[i % 3] for i in range(ROWS)] +s = pd.Series(pd.Categorical(data)) + +for _ in range(WARMUP): + s.astype(pd.CategoricalDtype(categories=cats, ordered=True)) + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.astype(pd.CategoricalDtype(categories=cats, ordered=True)) +total = (time.perf_counter() - start) * 1000 +print(json.dumps({"function": "cat_to_ordinal", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_cat_union_intersect_diff.py b/benchmarks/pandas/bench_cat_union_intersect_diff.py new file mode 100644 index 00000000..bef28368 --- /dev/null +++ b/benchmarks/pandas/bench_cat_union_intersect_diff.py @@ -0,0 +1,19 @@ +import pandas as pd, time, json +N = 50_000 +cats1 = ["A", "B", "C", "D"] +cats2 = ["C", "D", "E", "F"] +s1 = pd.Categorical([cats1[i % len(cats1)] for i in range(N)], categories=cats1) +s2 = pd.Categorical([cats2[i % len(cats2)] for i in range(N)], categories=cats2) +WARMUP = 3 +ITERS = 20 +for _ in range(WARMUP): + _ = s1.set_categories(s1.categories.union(s2.categories)) + _ 
= s1.set_categories(s1.categories.intersection(s2.categories)) + _ = s1.set_categories(s1.categories.difference(s2.categories)) +t0 = time.perf_counter() +for _ in range(ITERS): + _ = s1.set_categories(s1.categories.union(s2.categories)) + _ = s1.set_categories(s1.categories.intersection(s2.categories)) + _ = s1.set_categories(s1.categories.difference(s2.categories)) +total = (time.perf_counter() - t0) * 1000 +print(json.dumps({"function": "cat_union_intersect_diff", "mean_ms": total / ITERS, "iterations": ITERS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_cat_value_counts.py b/benchmarks/pandas/bench_cat_value_counts.py new file mode 100644 index 00000000..7181c182 --- /dev/null +++ b/benchmarks/pandas/bench_cat_value_counts.py @@ -0,0 +1,21 @@ +"""Benchmark: cat_value_counts — pandas Categorical value_counts on 100k-element Series""" +import json +import time +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 + +cats = ["a", "b", "c", "d", "e"] +s = pd.Categorical([cats[i % len(cats)] for i in range(ROWS)], categories=cats) + +for _ in range(WARMUP): + _ = pd.Series(s).value_counts() + +start = time.perf_counter() +for _ in range(ITERATIONS): + _ = pd.Series(s).value_counts() +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({"function": "cat_value_counts", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_clip.py b/benchmarks/pandas/bench_clip.py new file mode 100644 index 00000000..30be9d0b --- /dev/null +++ b/benchmarks/pandas/bench_clip.py @@ -0,0 +1,21 @@ +"""Benchmark: Series.clip() — clip values to a range.""" +import json, time +import pandas as pd + +SIZE = 100_000 +WARMUP = 5 +ITERATIONS = 50 + +s = pd.Series([float(i) for i in range(SIZE)]) + +for _ in range(WARMUP): + s.clip(lower=10000.0, upper=90000.0) + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + s.clip(lower=10000.0, upper=90000.0) + 
times.append((time.perf_counter() - t0) * 1000) + +total_ms = sum(times) +print(json.dumps({"function":"clip","mean_ms":round(total_ms/ITERATIONS,3),"iterations":ITERATIONS,"total_ms":round(total_ms,3)})) diff --git a/benchmarks/pandas/bench_clip_advanced.py b/benchmarks/pandas/bench_clip_advanced.py new file mode 100644 index 00000000..32de1cd0 --- /dev/null +++ b/benchmarks/pandas/bench_clip_advanced.py @@ -0,0 +1,33 @@ +""" +Benchmark: Series.clip(lower_arr, upper_arr) / DataFrame.clip() — per-element clipping with array bounds. +Outputs JSON: {"function": "clip_advanced", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import math +import time +import pandas as pd +import numpy as np + +ROWS = 50_000 +WARMUP = 5 +ITERATIONS = 30 + +data = np.array([math.sin(i * 0.01) * 200 for i in range(ROWS)]) +lower = np.full(ROWS, -50.0) +upper = np.full(ROWS, 50.0) +s = pd.Series(data) + +df_data = {f"col{c}": np.array([math.sin((i + c) * 0.01) * 200 for i in range(ROWS)]) for c in range(5)} +df = pd.DataFrame(df_data) + +for _ in range(WARMUP): + s.clip(lower=lower, upper=upper) + df.clip(lower=-50, upper=50) + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.clip(lower=lower, upper=upper) + df.clip(lower=-50, upper=50) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({"function": "clip_advanced", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_coefficient_of_variation.py b/benchmarks/pandas/bench_coefficient_of_variation.py new file mode 100644 index 00000000..e1bcae94 --- /dev/null +++ b/benchmarks/pandas/bench_coefficient_of_variation.py @@ -0,0 +1,20 @@ +"""Benchmark: coefficient of variation on 100k-element Series""" +import json, time +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 +s = pd.Series([i * 0.1 + 1 for i in range(ROWS)]) + +def cv(x): + return x.std() / x.mean() + +for _ in range(WARMUP): + cv(s) + +start = 
time.perf_counter() +for _ in range(ITERATIONS): + cv(s) +total = (time.perf_counter() - start) * 1000 +print(json.dumps({"function": "coefficient_of_variation", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_combine_first.py b/benchmarks/pandas/bench_combine_first.py new file mode 100644 index 00000000..763b6a15 --- /dev/null +++ b/benchmarks/pandas/bench_combine_first.py @@ -0,0 +1,12 @@ +import pandas as pd, json, time, numpy as np +rng = np.random.default_rng(42) +s1 = pd.Series(rng.standard_normal(100_000)) +s2 = pd.Series(rng.standard_normal(100_000)) +# Put NaN in s1 +s1[::3] = float("nan") +for _ in range(3): s1.combine_first(s2) +N = 50 +t0 = time.perf_counter() +for _ in range(N): s1.combine_first(s2) +elapsed = time.perf_counter() - t0 +print(json.dumps({"function": "combine_first", "mean_ms": elapsed/N*1000, "iterations": N, "total_ms": elapsed*1000})) diff --git a/benchmarks/pandas/bench_concat_axis1.py b/benchmarks/pandas/bench_concat_axis1.py new file mode 100644 index 00000000..6257eb3b --- /dev/null +++ b/benchmarks/pandas/bench_concat_axis1.py @@ -0,0 +1,21 @@ +"""Benchmark: pd.concat([df1, df2], axis=1) — column-wise concat on 100k-row DataFrames.""" +import json, time +import numpy as np +import pandas as pd + +ROWS = 100_000 +WARMUP = 5 +ITERATIONS = 20 + +df1 = pd.DataFrame({"a": np.arange(ROWS) * 1.0, "b": np.arange(ROWS) * 2.0}) +df2 = pd.DataFrame({"c": np.arange(ROWS) * 3.0, "d": np.arange(ROWS) * 4.0}) + +for _ in range(WARMUP): pd.concat([df1, df2], axis=1) + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + pd.concat([df1, df2], axis=1) + times.append(time.perf_counter() - t0) +total = sum(times) * 1000 +print(json.dumps({ "function": "concat_axis1", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total })) diff --git a/benchmarks/pandas/bench_corr.py b/benchmarks/pandas/bench_corr.py new file mode 100644 index 00000000..fde4e7c3 
--- /dev/null +++ b/benchmarks/pandas/bench_corr.py @@ -0,0 +1,21 @@ +"""Benchmark: DataFrame.corr — pairwise correlation of numeric columns.""" +import json, time +import pandas as pd + +SIZE = 10_000 +WARMUP = 5 +ITERATIONS = 50 + +df = pd.DataFrame({"a":[float(i*1.1) for i in range(SIZE)],"b":[float(i*0.7+0.3) for i in range(SIZE)],"c":[float(i*-0.5+100) for i in range(SIZE)]}) + +for _ in range(WARMUP): + df.corr() + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + df.corr() + times.append((time.perf_counter() - t0) * 1000) + +total_ms = sum(times) +print(json.dumps({"function":"corr","mean_ms":round(total_ms/ITERATIONS,3),"iterations":ITERATIONS,"total_ms":round(total_ms,3)})) diff --git a/benchmarks/pandas/bench_count_valid.py b/benchmarks/pandas/bench_count_valid.py new file mode 100644 index 00000000..36b819e7 --- /dev/null +++ b/benchmarks/pandas/bench_count_valid.py @@ -0,0 +1,19 @@ +"""Benchmark: Series.count on 100k-element pandas Series with NaN""" +import json, time +import numpy as np +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 +data = [np.nan if i % 7 == 0 else i * 0.1 for i in range(ROWS)] +s = pd.Series(data) + +for _ in range(WARMUP): + s.count() + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.count() +total = (time.perf_counter() - start) * 1000 +print(json.dumps({"function": "count_valid", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_countna.py b/benchmarks/pandas/bench_countna.py new file mode 100644 index 00000000..52ca3eed --- /dev/null +++ b/benchmarks/pandas/bench_countna.py @@ -0,0 +1,26 @@ +"""Benchmark: countna — count NaN/null values in a Series with 10% nulls""" +import json, time +import pandas as pd +import numpy as np + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 20 + +data = [None if i % 10 == 0 else float(i) for i in range(ROWS)] +s = pd.Series(data) + +for _ in range(WARMUP): + s.isna().sum() + +start 
= time.perf_counter() +for _ in range(ITERATIONS): + s.isna().sum() +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "countna", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_cov.py b/benchmarks/pandas/bench_cov.py new file mode 100644 index 00000000..95e9c5c3 --- /dev/null +++ b/benchmarks/pandas/bench_cov.py @@ -0,0 +1,21 @@ +"""Benchmark: DataFrame.cov — pairwise covariance of numeric columns.""" +import json, time +import pandas as pd + +SIZE = 10_000 +WARMUP = 5 +ITERATIONS = 50 + +df = pd.DataFrame({"a":[float(i*1.1) for i in range(SIZE)],"b":[float(i*0.7+0.3) for i in range(SIZE)],"c":[float(i*-0.5+100) for i in range(SIZE)]}) + +for _ in range(WARMUP): + df.cov() + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + df.cov() + times.append((time.perf_counter() - t0) * 1000) + +total_ms = sum(times) +print(json.dumps({"function":"cov","mean_ms":round(total_ms/ITERATIONS,3),"iterations":ITERATIONS,"total_ms":round(total_ms,3)})) diff --git a/benchmarks/pandas/bench_crosstab.py b/benchmarks/pandas/bench_crosstab.py new file mode 100644 index 00000000..10237533 --- /dev/null +++ b/benchmarks/pandas/bench_crosstab.py @@ -0,0 +1,24 @@ +"""Benchmark: pd.crosstab() — compute a cross-tabulation.""" +import json, time +import pandas as pd + +SIZE = 50_000 +WARMUP = 5 +ITERATIONS = 50 + +import random +random.seed(42) +a = pd.Series([random.choice(["x","y","z"]) for _ in range(SIZE)]) +b = pd.Series([random.choice(["p","q","r","s"]) for _ in range(SIZE)]) + +for _ in range(WARMUP): + pd.crosstab(a, b) + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + pd.crosstab(a, b) + times.append((time.perf_counter() - t0) * 1000) + +total_ms = sum(times) +print(json.dumps({"function":"crosstab","mean_ms":round(total_ms/ITERATIONS,3),"iterations":ITERATIONS,"total_ms":round(total_ms,3)})) diff --git a/benchmarks/pandas/bench_cummax.py 
b/benchmarks/pandas/bench_cummax.py new file mode 100644 index 00000000..63c57326 --- /dev/null +++ b/benchmarks/pandas/bench_cummax.py @@ -0,0 +1,9 @@ +import pandas as pd, json, time, numpy as np +rng = np.random.default_rng(42) +s = pd.Series(rng.standard_normal(100_000)) +for _ in range(3): s.cummax() +N = 100 +t0 = time.perf_counter() +for _ in range(N): s.cummax() +elapsed = time.perf_counter() - t0 +print(json.dumps({"function": "cummax", "mean_ms": elapsed/N*1000, "iterations": N, "total_ms": elapsed*1000})) diff --git a/benchmarks/pandas/bench_cummin.py b/benchmarks/pandas/bench_cummin.py new file mode 100644 index 00000000..114e5d07 --- /dev/null +++ b/benchmarks/pandas/bench_cummin.py @@ -0,0 +1,9 @@ +import pandas as pd, json, time, numpy as np +rng = np.random.default_rng(42) +s = pd.Series(rng.standard_normal(100_000)) +for _ in range(3): s.cummin() +N = 100 +t0 = time.perf_counter() +for _ in range(N): s.cummin() +elapsed = time.perf_counter() - t0 +print(json.dumps({"function": "cummin", "mean_ms": elapsed/N*1000, "iterations": N, "total_ms": elapsed*1000})) diff --git a/benchmarks/pandas/bench_dataframe_abs.py b/benchmarks/pandas/bench_dataframe_abs.py new file mode 100644 index 00000000..38dd6518 --- /dev/null +++ b/benchmarks/pandas/bench_dataframe_abs.py @@ -0,0 +1,14 @@ +import pandas as pd, time, json +N = 100_000 +cols = 5 +data = {f"col{c}": [(i % 200) - 100 for i in range(N)] for c in range(cols)} +df = pd.DataFrame(data) +WARMUP = 3 +ITERS = 20 +for _ in range(WARMUP): + df.abs() +t0 = time.perf_counter() +for _ in range(ITERS): + df.abs() +total = (time.perf_counter() - t0) * 1000 +print(json.dumps({"function": "dataframe_abs", "mean_ms": total / ITERS, "iterations": ITERS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_dataframe_apply_axis1.py b/benchmarks/pandas/bench_dataframe_apply_axis1.py new file mode 100644 index 00000000..26885a4e --- /dev/null +++ b/benchmarks/pandas/bench_dataframe_apply_axis1.py @@ -0,0 +1,27 @@ 
+"""Benchmark: DataFrame.apply with axis=1 (row-wise) on 10k-row DataFrame""" +import json, time +import numpy as np +import pandas as pd + +ROWS = 10_000 +WARMUP = 2 +ITERATIONS = 10 + +a = np.arange(ROWS) * 0.1 +b = np.arange(ROWS) * 0.2 +df = pd.DataFrame({"a": a, "b": b}) + +for _ in range(WARMUP): + df.apply(lambda row: row.sum(), axis=1) + +start = time.perf_counter() +for _ in range(ITERATIONS): + df.apply(lambda row: row.sum(), axis=1) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "dataframe_apply_axis1", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_dataframe_apply_col.py b/benchmarks/pandas/bench_dataframe_apply_col.py new file mode 100644 index 00000000..e8bdabc9 --- /dev/null +++ b/benchmarks/pandas/bench_dataframe_apply_col.py @@ -0,0 +1,9 @@ +import pandas as pd, json, time, numpy as np +rng = np.random.default_rng(42) +df = pd.DataFrame(rng.standard_normal((10_000, 5)), columns=list("ABCDE")) +for _ in range(3): df.apply(lambda col: col.mean(), axis=0) +N = 100 +t0 = time.perf_counter() +for _ in range(N): df.apply(lambda col: col.mean(), axis=0) +elapsed = time.perf_counter() - t0 +print(json.dumps({"function": "dataframe_apply_col", "mean_ms": elapsed/N*1000, "iterations": N, "total_ms": elapsed*1000})) diff --git a/benchmarks/pandas/bench_dataframe_apply_map.py b/benchmarks/pandas/bench_dataframe_apply_map.py new file mode 100644 index 00000000..e084e62b --- /dev/null +++ b/benchmarks/pandas/bench_dataframe_apply_map.py @@ -0,0 +1,17 @@ +"""Benchmark: DataFrame.map element-wise on 10k-row DataFrame""" +import json, time +import pandas as pd + +ROWS = 10_000 +WARMUP = 3 +ITERATIONS = 10 +df = pd.DataFrame({"a": [i * 0.1 for i in range(ROWS)], "b": [i * 0.2 for i in range(ROWS)]}) + +for _ in range(WARMUP): + df.map(lambda v: v + 1) + +start = time.perf_counter() +for _ in range(ITERATIONS): + df.map(lambda v: v + 1) +total = 
(time.perf_counter() - start) * 1000 +print(json.dumps({"function": "dataframe_apply_map", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_dataframe_assign.py b/benchmarks/pandas/bench_dataframe_assign.py new file mode 100644 index 00000000..2e699c1f --- /dev/null +++ b/benchmarks/pandas/bench_dataframe_assign.py @@ -0,0 +1,20 @@ +"""Benchmark: DataFrame.assign(c=series) on 100k-row DataFrame.""" +import json, time +import numpy as np +import pandas as pd + +ROWS = 100_000 +WARMUP = 5 +ITERATIONS = 30 + +df = pd.DataFrame({"a": np.arange(ROWS) * 1.0, "b": np.arange(ROWS) * 2.0}) +new_col = pd.Series(np.arange(ROWS) * 3.0) +for _ in range(WARMUP): df.assign(c=new_col) + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + df.assign(c=new_col) + times.append(time.perf_counter() - t0) +total = sum(times) * 1000 +print(json.dumps({ "function": "dataframe_assign", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total })) diff --git a/benchmarks/pandas/bench_dataframe_astype.py b/benchmarks/pandas/bench_dataframe_astype.py new file mode 100644 index 00000000..f2f685f0 --- /dev/null +++ b/benchmarks/pandas/bench_dataframe_astype.py @@ -0,0 +1,21 @@ +"""Benchmark: DataFrame.astype() — cast column dtypes.""" +import json, time +import pandas as pd + +SIZE = 100_000 +WARMUP = 5 +ITERATIONS = 50 + +df = pd.DataFrame({"a":[float(i) for i in range(SIZE)],"b":[i for i in range(SIZE)]}) + +for _ in range(WARMUP): + df.astype({"a": "float32", "b": "int32"}) + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + df.astype({"a": "float32", "b": "int32"}) + times.append((time.perf_counter() - t0) * 1000) + +total_ms = sum(times) +print(json.dumps({"function":"dataframe_astype","mean_ms":round(total_ms/ITERATIONS,3),"iterations":ITERATIONS,"total_ms":round(total_ms,3)})) diff --git a/benchmarks/pandas/bench_dataframe_clip.py 
b/benchmarks/pandas/bench_dataframe_clip.py new file mode 100644 index 00000000..73abc09a --- /dev/null +++ b/benchmarks/pandas/bench_dataframe_clip.py @@ -0,0 +1,14 @@ +import pandas as pd, time, json +N = 100_000 +cols = 5 +data = {f"col{c}": [(i % 200) - 100 for i in range(N)] for c in range(cols)} +df = pd.DataFrame(data) +WARMUP = 3 +ITERS = 20 +for _ in range(WARMUP): + df.clip(lower=-50, upper=50) +t0 = time.perf_counter() +for _ in range(ITERS): + df.clip(lower=-50, upper=50) +total = (time.perf_counter() - t0) * 1000 +print(json.dumps({"function": "dataframe_clip", "mean_ms": total / ITERS, "iterations": ITERS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_dataframe_col_has.py b/benchmarks/pandas/bench_dataframe_col_has.py new file mode 100644 index 00000000..b5e412b7 --- /dev/null +++ b/benchmarks/pandas/bench_dataframe_col_has.py @@ -0,0 +1,21 @@ +"""Benchmark: DataFrame column access via [] and 'in' on a 100k-row DataFrame""" +import json, time +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 +df = pd.DataFrame({"a": range(ROWS), "b": [i * 2.0 for i in range(ROWS)]}) + +for _ in range(WARMUP): + df["a"] + "b" in df.columns + df.get("c") + +start = time.perf_counter() +for _ in range(ITERATIONS): + df["a"] + "b" in df.columns + df.get("c") +total = (time.perf_counter() - start) * 1000 +print(json.dumps({"function": "dataframe_col_has", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_dataframe_corr.py b/benchmarks/pandas/bench_dataframe_corr.py new file mode 100644 index 00000000..f724a4b2 --- /dev/null +++ b/benchmarks/pandas/bench_dataframe_corr.py @@ -0,0 +1,31 @@ +"""Benchmark: DataFrame correlation matrix on 10k-row x 5-column DataFrame""" +import json, time +import numpy as np +import pandas as pd + +ROWS = 10_000 +WARMUP = 3 +ITERATIONS = 10 + +df = pd.DataFrame({ + "A": np.sin(np.arange(ROWS) * 0.01), + "B": np.cos(np.arange(ROWS) * 0.01), + "C": 
np.sin(np.arange(ROWS) * 0.02), + "D": np.cos(np.arange(ROWS) * 0.02), + "E": np.sin(np.arange(ROWS) * 0.03), +}) + +for _ in range(WARMUP): + df.corr() + +start = time.perf_counter() +for _ in range(ITERATIONS): + df.corr() +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "dataframe_corr", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_dataframe_count.py b/benchmarks/pandas/bench_dataframe_count.py new file mode 100644 index 00000000..4f5e4b4e --- /dev/null +++ b/benchmarks/pandas/bench_dataframe_count.py @@ -0,0 +1,22 @@ +"""Benchmark: DataFrame.count() on 100k-row DataFrame with some NAs.""" +import json, time +import numpy as np +import pandas as pd + +ROWS = 100_000 +WARMUP = 5 +ITERATIONS = 30 + +a = np.where(np.arange(ROWS) % 3 == 0, np.nan, np.arange(ROWS, dtype=float)) +b = np.where(np.arange(ROWS) % 5 == 0, np.nan, np.arange(ROWS, dtype=float) * 2) +c = np.arange(ROWS, dtype=float) * 3 +df = pd.DataFrame({"a": a, "b": b, "c": c}) +for _ in range(WARMUP): df.count() + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + df.count() + times.append(time.perf_counter() - t0) +total = sum(times) * 1000 +print(json.dumps({ "function": "dataframe_count", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total })) diff --git a/benchmarks/pandas/bench_dataframe_cummax.py b/benchmarks/pandas/bench_dataframe_cummax.py new file mode 100644 index 00000000..f0662644 --- /dev/null +++ b/benchmarks/pandas/bench_dataframe_cummax.py @@ -0,0 +1,14 @@ +import pandas as pd, time, json +N = 100_000 +cols = 4 +data = {f"col{c}": [(i % 100) * 1.0 for i in range(N)] for c in range(cols)} +df = pd.DataFrame(data) +WARMUP = 3 +ITERS = 20 +for _ in range(WARMUP): + df.cummax() +t0 = time.perf_counter() +for _ in range(ITERS): + df.cummax() +total = (time.perf_counter() - t0) * 1000 +print(json.dumps({"function": "dataframe_cummax", 
"mean_ms": total / ITERS, "iterations": ITERS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_dataframe_cummin.py b/benchmarks/pandas/bench_dataframe_cummin.py new file mode 100644 index 00000000..4cbd1c87 --- /dev/null +++ b/benchmarks/pandas/bench_dataframe_cummin.py @@ -0,0 +1,14 @@ +import pandas as pd, time, json +N = 100_000 +cols = 4 +data = {f"col{c}": [(i % 100) * 1.0 for i in range(N)] for c in range(cols)} +df = pd.DataFrame(data) +WARMUP = 3 +ITERS = 20 +for _ in range(WARMUP): + df.cummin() +t0 = time.perf_counter() +for _ in range(ITERS): + df.cummin() +total = (time.perf_counter() - t0) * 1000 +print(json.dumps({"function": "dataframe_cummin", "mean_ms": total / ITERS, "iterations": ITERS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_dataframe_cumprod.py b/benchmarks/pandas/bench_dataframe_cumprod.py new file mode 100644 index 00000000..e117b503 --- /dev/null +++ b/benchmarks/pandas/bench_dataframe_cumprod.py @@ -0,0 +1,14 @@ +import pandas as pd, time, json +N = 10_000 +cols = 4 +data = {f"col{c}": [(i % 5) + 1 for i in range(N)] for c in range(cols)} +df = pd.DataFrame(data) +WARMUP = 3 +ITERS = 20 +for _ in range(WARMUP): + df.cumprod() +t0 = time.perf_counter() +for _ in range(ITERS): + df.cumprod() +total = (time.perf_counter() - t0) * 1000 +print(json.dumps({"function": "dataframe_cumprod", "mean_ms": total / ITERS, "iterations": ITERS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_dataframe_cumsum.py b/benchmarks/pandas/bench_dataframe_cumsum.py new file mode 100644 index 00000000..147df106 --- /dev/null +++ b/benchmarks/pandas/bench_dataframe_cumsum.py @@ -0,0 +1,14 @@ +import pandas as pd, time, json +N = 100_000 +cols = 4 +data = {f"col{c}": [(i % 10) + 1 for i in range(N)] for c in range(cols)} +df = pd.DataFrame(data) +WARMUP = 3 +ITERS = 20 +for _ in range(WARMUP): + df.cumsum() +t0 = time.perf_counter() +for _ in range(ITERS): + df.cumsum() +total = (time.perf_counter() - t0) * 1000 
+print(json.dumps({"function": "dataframe_cumsum", "mean_ms": total / ITERS, "iterations": ITERS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_dataframe_describe.py b/benchmarks/pandas/bench_dataframe_describe.py new file mode 100644 index 00000000..e8d17fdc --- /dev/null +++ b/benchmarks/pandas/bench_dataframe_describe.py @@ -0,0 +1,23 @@ +"""Benchmark: DataFrame.describe() on 100k-row DataFrame.""" +import json, time +import numpy as np +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 20 + +df = pd.DataFrame({ + "a": (np.arange(ROWS) * 1.23) % 9000, + "b": (np.arange(ROWS) * 4.56) % 7000, + "c": np.arange(ROWS) * 0.5, +}) +for _ in range(WARMUP): df.describe() + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + df.describe() + times.append(time.perf_counter() - t0) +total = sum(times) * 1000 +print(json.dumps({ "function": "dataframe_describe", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total })) diff --git a/benchmarks/pandas/bench_dataframe_drop.py b/benchmarks/pandas/bench_dataframe_drop.py new file mode 100644 index 00000000..06ffe9d2 --- /dev/null +++ b/benchmarks/pandas/bench_dataframe_drop.py @@ -0,0 +1,24 @@ +"""Benchmark: DataFrame.drop(columns) on 100k-row DataFrame.""" +import json, time +import numpy as np +import pandas as pd + +ROWS = 100_000 +WARMUP = 5 +ITERATIONS = 50 + +df = pd.DataFrame({ + "a": np.arange(ROWS) * 1.0, + "b": np.arange(ROWS) * 2.0, + "c": np.arange(ROWS) * 3.0, + "d": np.arange(ROWS) * 4.0, +}) +for _ in range(WARMUP): df.drop(columns=["b", "d"]) + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + df.drop(columns=["b", "d"]) + times.append(time.perf_counter() - t0) +total = sum(times) * 1000 +print(json.dumps({ "function": "dataframe_drop", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total })) diff --git a/benchmarks/pandas/bench_dataframe_ewm.py b/benchmarks/pandas/bench_dataframe_ewm.py new file mode 100644 
index 00000000..192f7e03 --- /dev/null +++ b/benchmarks/pandas/bench_dataframe_ewm.py @@ -0,0 +1,17 @@ +"""Benchmark: DataFrame ewm mean on 10k-row DataFrame""" +import json, time +import pandas as pd + +ROWS = 10_000 +WARMUP = 3 +ITERATIONS = 10 +df = pd.DataFrame({"a": [i * 0.1 for i in range(ROWS)], "b": [i * 0.2 for i in range(ROWS)]}) + +for _ in range(WARMUP): + df.ewm(alpha=0.3).mean() + +start = time.perf_counter() +for _ in range(ITERATIONS): + df.ewm(alpha=0.3).mean() +total = (time.perf_counter() - start) * 1000 +print(json.dumps({"function": "dataframe_ewm", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_dataframe_ewm_std_var.py b/benchmarks/pandas/bench_dataframe_ewm_std_var.py new file mode 100644 index 00000000..21a1bc7c --- /dev/null +++ b/benchmarks/pandas/bench_dataframe_ewm_std_var.py @@ -0,0 +1,29 @@ +"""Benchmark: DataFrame EWM std and var on 100k-row DataFrame""" +import json, time +import numpy as np +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 + +a = np.sin(np.arange(ROWS) * 0.05) +b = np.cos(np.arange(ROWS) * 0.05) +df = pd.DataFrame({"a": a, "b": b}) + +for _ in range(WARMUP): + df.ewm(span=20).std() + df.ewm(span=20).var() + +start = time.perf_counter() +for _ in range(ITERATIONS): + df.ewm(span=20).std() + df.ewm(span=20).var() +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "dataframe_ewm_std_var", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_dataframe_expanding.py b/benchmarks/pandas/bench_dataframe_expanding.py new file mode 100644 index 00000000..484b84fb --- /dev/null +++ b/benchmarks/pandas/bench_dataframe_expanding.py @@ -0,0 +1,17 @@ +"""Benchmark: DataFrame expanding mean on 10k-row DataFrame""" +import json, time +import pandas as pd + +ROWS = 10_000 +WARMUP = 3 +ITERATIONS = 10 +df = pd.DataFrame({"a": [i * 0.1 for i in 
range(ROWS)], "b": [i * 0.2 for i in range(ROWS)]}) + +for _ in range(WARMUP): + df.expanding().mean() + +start = time.perf_counter() +for _ in range(ITERATIONS): + df.expanding().mean() +total = (time.perf_counter() - start) * 1000 +print(json.dumps({"function": "dataframe_expanding", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_dataframe_expanding_min_max.py b/benchmarks/pandas/bench_dataframe_expanding_min_max.py new file mode 100644 index 00000000..ff468557 --- /dev/null +++ b/benchmarks/pandas/bench_dataframe_expanding_min_max.py @@ -0,0 +1,29 @@ +"""Benchmark: DataFrame expanding min and max on 100k-row DataFrame""" +import json, time +import numpy as np +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 + +a = np.sin(np.arange(ROWS) * 0.01) +b = np.cos(np.arange(ROWS) * 0.01) +df = pd.DataFrame({"a": a, "b": b}) + +for _ in range(WARMUP): + df.expanding().min() + df.expanding().max() + +start = time.perf_counter() +for _ in range(ITERATIONS): + df.expanding().min() + df.expanding().max() +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "dataframe_expanding_min_max", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_dataframe_fillna.py b/benchmarks/pandas/bench_dataframe_fillna.py new file mode 100644 index 00000000..9ea28f3a --- /dev/null +++ b/benchmarks/pandas/bench_dataframe_fillna.py @@ -0,0 +1,21 @@ +"""Benchmark: DataFrame.fillna(value) on 100k-row DataFrame with NAs.""" +import json, time +import numpy as np +import pandas as pd + +ROWS = 100_000 +WARMUP = 5 +ITERATIONS = 30 + +a = np.where(np.arange(ROWS) % 4 == 0, np.nan, np.arange(ROWS, dtype=float)) +b = np.where(np.arange(ROWS) % 6 == 0, np.nan, np.arange(ROWS, dtype=float) * 2) +df = pd.DataFrame({"a": a, "b": b}) +for _ in range(WARMUP): df.fillna(0) + +times = [] +for _ in range(ITERATIONS): + t0 = 
time.perf_counter() + df.fillna(0) + times.append(time.perf_counter() - t0) +total = sum(times) * 1000 +print(json.dumps({ "function": "dataframe_fillna", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total })) diff --git a/benchmarks/pandas/bench_dataframe_from2d_select.py b/benchmarks/pandas/bench_dataframe_from2d_select.py new file mode 100644 index 00000000..671aa5b4 --- /dev/null +++ b/benchmarks/pandas/bench_dataframe_from2d_select.py @@ -0,0 +1,33 @@ +"""Benchmark: DataFrame from 2D array and column selection""" +import json, time +import numpy as np +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 + +data2d = np.column_stack([ + np.arange(ROWS, dtype=float), + np.arange(ROWS, dtype=float) * 2, + np.arange(ROWS, dtype=float) * 3, +]) +cols = ["a", "b", "c"] +df = pd.DataFrame(data2d, columns=cols) + +for _ in range(WARMUP): + pd.DataFrame(data2d, columns=cols) + df[["a", "c"]] + +start = time.perf_counter() +for _ in range(ITERATIONS): + pd.DataFrame(data2d, columns=cols) + df[["a", "c"]] +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "dataframe_from2d_select", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_dataframe_from_pairs.py b/benchmarks/pandas/bench_dataframe_from_pairs.py new file mode 100644 index 00000000..5b0d1520 --- /dev/null +++ b/benchmarks/pandas/bench_dataframe_from_pairs.py @@ -0,0 +1,19 @@ +"""Benchmark: pd.DataFrame construction from dict of arrays (100k rows)""" +import json, time +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 +a = list(range(ROWS)) +b = [i * 2.5 for i in range(ROWS)] +c = [f"str_{i % 1000}" for i in range(ROWS)] + +for _ in range(WARMUP): + pd.DataFrame({"a": a, "b": b, "c": c}) + +start = time.perf_counter() +for _ in range(ITERATIONS): + pd.DataFrame({"a": a, "b": b, "c": c}) +total = (time.perf_counter() - start) * 1000 
+print(json.dumps({"function": "dataframe_from_pairs", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_dataframe_fromrecords.py b/benchmarks/pandas/bench_dataframe_fromrecords.py new file mode 100644 index 00000000..4e496b31 --- /dev/null +++ b/benchmarks/pandas/bench_dataframe_fromrecords.py @@ -0,0 +1,25 @@ +"""Benchmark: dataframe_fromrecords — pd.DataFrame(records) on 10k records with 5 columns""" +import json, time + +ROWS = 10_000 +WARMUP = 5 +ITERATIONS = 50 + +records = [{"a": i, "b": i * 2.0, "c": i % 100, "d": i * 0.5, "e": i % 10} for i in range(ROWS)] + +import pandas as pd + +for _ in range(WARMUP): + pd.DataFrame(records) + +start = time.perf_counter() +for _ in range(ITERATIONS): + pd.DataFrame(records) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "dataframe_fromrecords", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_dataframe_head_tail.py b/benchmarks/pandas/bench_dataframe_head_tail.py new file mode 100644 index 00000000..7f7891f6 --- /dev/null +++ b/benchmarks/pandas/bench_dataframe_head_tail.py @@ -0,0 +1,23 @@ +"""Benchmark: DataFrame.head() and .tail() — slice first/last N rows.""" +import json, time +import pandas as pd + +SIZE = 100_000 +WARMUP = 5 +ITERATIONS = 50 + +df = pd.DataFrame({"a":[float(i) for i in range(SIZE)],"b":[i*2 for i in range(SIZE)],"c":[str(i) for i in range(SIZE)]}) + +for _ in range(WARMUP): + df.head(100) + df.tail(100) + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + df.head(100) + df.tail(100) + times.append((time.perf_counter() - t0) * 1000) + +total_ms = sum(times) +print(json.dumps({"function":"dataframe_head_tail","mean_ms":round(total_ms/ITERATIONS,3),"iterations":ITERATIONS,"total_ms":round(total_ms,3)})) diff --git a/benchmarks/pandas/bench_dataframe_iloc.py b/benchmarks/pandas/bench_dataframe_iloc.py new 
file mode 100644 index 00000000..de9f3c5f --- /dev/null +++ b/benchmarks/pandas/bench_dataframe_iloc.py @@ -0,0 +1,20 @@ +"""Benchmark: DataFrame.iloc[] on 100k-row DataFrame.""" +import json, time +import numpy as np +import pandas as pd + +ROWS = 100_000 +WARMUP = 5 +ITERATIONS = 30 + +df = pd.DataFrame({"a": np.arange(ROWS) * 1.0, "b": np.arange(ROWS) * 2.0, "c": np.arange(ROWS) * 3.0}) +positions = list(range(0, ROWS, 100)) +for _ in range(WARMUP): df.iloc[positions] + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + df.iloc[positions] + times.append(time.perf_counter() - t0) +total = sum(times) * 1000 +print(json.dumps({ "function": "dataframe_iloc", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total })) diff --git a/benchmarks/pandas/bench_dataframe_isna.py b/benchmarks/pandas/bench_dataframe_isna.py new file mode 100644 index 00000000..9601c2ec --- /dev/null +++ b/benchmarks/pandas/bench_dataframe_isna.py @@ -0,0 +1,21 @@ +"""Benchmark: DataFrame.isna() on 100k-row DataFrame with some NAs.""" +import json, time +import numpy as np +import pandas as pd + +ROWS = 100_000 +WARMUP = 5 +ITERATIONS = 30 + +a = np.where(np.arange(ROWS) % 5 == 0, np.nan, np.arange(ROWS, dtype=float)) +b = np.where(np.arange(ROWS) % 7 == 0, np.nan, np.arange(ROWS, dtype=float) * 2) +df = pd.DataFrame({"a": a, "b": b}) +for _ in range(WARMUP): df.isna() + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + df.isna() + times.append(time.perf_counter() - t0) +total = sum(times) * 1000 +print(json.dumps({ "function": "dataframe_isna", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total })) diff --git a/benchmarks/pandas/bench_dataframe_loc.py b/benchmarks/pandas/bench_dataframe_loc.py new file mode 100644 index 00000000..f8f683b4 --- /dev/null +++ b/benchmarks/pandas/bench_dataframe_loc.py @@ -0,0 +1,21 @@ +"""Benchmark: DataFrame.loc[] on 100k-row DataFrame.""" +import json, time +import numpy as np 
+import pandas as pd + +ROWS = 100_000 +WARMUP = 5 +ITERATIONS = 30 + +idx = np.arange(ROWS) +df = pd.DataFrame({"a": np.arange(ROWS) * 1.0, "b": np.arange(ROWS) * 2.0}, index=idx) +select_labels = np.arange(0, ROWS, 100) +for _ in range(WARMUP): df.loc[select_labels] + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + df.loc[select_labels] + times.append(time.perf_counter() - t0) +total = sum(times) * 1000 +print(json.dumps({ "function": "dataframe_loc", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total })) diff --git a/benchmarks/pandas/bench_dataframe_mask.py b/benchmarks/pandas/bench_dataframe_mask.py new file mode 100644 index 00000000..f4eeb7c6 --- /dev/null +++ b/benchmarks/pandas/bench_dataframe_mask.py @@ -0,0 +1,15 @@ +import pandas as pd, time, json +N = 100_000 +cols = 4 +data = {f"col{c}": [(i % 200) - 100 for i in range(N)] for c in range(cols)} +df = pd.DataFrame(data) +mask = pd.DataFrame({f"col{c}": [i % 3 == 0 for i in range(N)] for c in range(cols)}) +WARMUP = 3 +ITERS = 20 +for _ in range(WARMUP): + df.mask(mask, other=0) +t0 = time.perf_counter() +for _ in range(ITERS): + df.mask(mask, other=0) +total = (time.perf_counter() - t0) * 1000 +print(json.dumps({"function": "dataframe_mask", "mean_ms": total / ITERS, "iterations": ITERS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_dataframe_min_max.py b/benchmarks/pandas/bench_dataframe_min_max.py new file mode 100644 index 00000000..9f5cf6ce --- /dev/null +++ b/benchmarks/pandas/bench_dataframe_min_max.py @@ -0,0 +1,24 @@ +"""Benchmark: DataFrame.min() and DataFrame.max() on 100k-row DataFrame.""" +import json, time +import numpy as np +import pandas as pd + +ROWS = 100_000 +WARMUP = 5 +ITERATIONS = 30 + +df = pd.DataFrame({ + "a": (np.arange(ROWS) * 3.14) % 5000, + "b": (np.arange(ROWS) * 2.71) % 8000, + "c": np.arange(ROWS, dtype=float), +}) +for _ in range(WARMUP): df.min(); df.max() + +times = [] +for _ in range(ITERATIONS): + t0 = 
time.perf_counter() + df.min() + df.max() + times.append(time.perf_counter() - t0) +total = sum(times) * 1000 +print(json.dumps({ "function": "dataframe_min_max", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total })) diff --git a/benchmarks/pandas/bench_dataframe_nlargest_nsmallest.py b/benchmarks/pandas/bench_dataframe_nlargest_nsmallest.py new file mode 100644 index 00000000..8259c34c --- /dev/null +++ b/benchmarks/pandas/bench_dataframe_nlargest_nsmallest.py @@ -0,0 +1,18 @@ +import pandas as pd, time, json +N = 100_000 +df = pd.DataFrame({ + "a": [(i * 1337) % 100_007 for i in range(N)], + "b": [(i * 7919) % 100_003 for i in range(N)], + "c": [(i * 3571) % 99_991 for i in range(N)], +}) +WARMUP = 3 +ITERS = 20 +for _ in range(WARMUP): + df.nlargest(100, "a") + df.nsmallest(100, "a") +t0 = time.perf_counter() +for _ in range(ITERS): + df.nlargest(100, "a") + df.nsmallest(100, "a") +total = (time.perf_counter() - t0) * 1000 +print(json.dumps({"function": "dataframe_nlargest_nsmallest", "mean_ms": total / ITERS, "iterations": ITERS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_dataframe_notna.py b/benchmarks/pandas/bench_dataframe_notna.py new file mode 100644 index 00000000..406d16ff --- /dev/null +++ b/benchmarks/pandas/bench_dataframe_notna.py @@ -0,0 +1,21 @@ +"""Benchmark: DataFrame.notna() on 100k-row DataFrame with some NAs.""" +import json, time +import numpy as np +import pandas as pd + +ROWS = 100_000 +WARMUP = 5 +ITERATIONS = 30 + +a = np.where(np.arange(ROWS) % 5 == 0, np.nan, np.arange(ROWS, dtype=float)) +b = np.arange(ROWS, dtype=float) * 2 +df = pd.DataFrame({"a": a, "b": b}) +for _ in range(WARMUP): df.notna() + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + df.notna() + times.append(time.perf_counter() - t0) +total = sum(times) * 1000 +print(json.dumps({ "function": "dataframe_notna", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total })) diff --git 
a/benchmarks/pandas/bench_dataframe_rank.py b/benchmarks/pandas/bench_dataframe_rank.py new file mode 100644 index 00000000..b82832e2 --- /dev/null +++ b/benchmarks/pandas/bench_dataframe_rank.py @@ -0,0 +1,27 @@ +"""Benchmark: DataFrame.rank on a 10k-row DataFrame""" +import json, time +import numpy as np +import pandas as pd + +ROWS = 10_000 +WARMUP = 3 +ITERATIONS = 20 + +a = np.sin(np.arange(ROWS) * 0.1) +b = np.cos(np.arange(ROWS) * 0.1) +df = pd.DataFrame({"a": a, "b": b}) + +for _ in range(WARMUP): + df.rank() + +start = time.perf_counter() +for _ in range(ITERATIONS): + df.rank() +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "dataframe_rank", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_dataframe_resetindex.py b/benchmarks/pandas/bench_dataframe_resetindex.py new file mode 100644 index 00000000..9d1f1cd6 --- /dev/null +++ b/benchmarks/pandas/bench_dataframe_resetindex.py @@ -0,0 +1,20 @@ +"""Benchmark: DataFrame.reset_index() on 100k-row DataFrame.""" +import json, time +import numpy as np +import pandas as pd + +ROWS = 100_000 +WARMUP = 5 +ITERATIONS = 30 + +idx = np.arange(ROWS - 1, -1, -1) +df = pd.DataFrame({"a": np.arange(ROWS) * 1.0, "b": np.arange(ROWS) * 2.0}, index=idx) +for _ in range(WARMUP): df.reset_index(drop=True) + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + df.reset_index(drop=True) + times.append(time.perf_counter() - t0) +total = sum(times) * 1000 +print(json.dumps({ "function": "dataframe_resetindex", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total })) diff --git a/benchmarks/pandas/bench_dataframe_rolling.py b/benchmarks/pandas/bench_dataframe_rolling.py new file mode 100644 index 00000000..d8cd4e3f --- /dev/null +++ b/benchmarks/pandas/bench_dataframe_rolling.py @@ -0,0 +1,17 @@ +"""Benchmark: DataFrame rolling mean on 100k-row DataFrame""" +import json, time +import 
pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 +df = pd.DataFrame({"a": [i * 0.1 for i in range(ROWS)], "b": [i * 0.2 for i in range(ROWS)]}) + +for _ in range(WARMUP): + df.rolling(10).mean() + +start = time.perf_counter() +for _ in range(ITERATIONS): + df.rolling(10).mean() +total = (time.perf_counter() - start) * 1000 +print(json.dumps({"function": "dataframe_rolling", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_dataframe_rolling_agg.py b/benchmarks/pandas/bench_dataframe_rolling_agg.py new file mode 100644 index 00000000..fa53580b --- /dev/null +++ b/benchmarks/pandas/bench_dataframe_rolling_agg.py @@ -0,0 +1,28 @@ +"""Benchmark: DataFrame rolling multi-aggregation on 100k rows""" +import json, time +import numpy as np +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 + +df = pd.DataFrame({ + "a": np.sin(np.arange(ROWS) * 0.01), + "b": np.cos(np.arange(ROWS) * 0.01), +}) + +for _ in range(WARMUP): + df.rolling(10).agg(["mean", "sum"]) + +start = time.perf_counter() +for _ in range(ITERATIONS): + df.rolling(10).agg(["mean", "sum"]) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "dataframe_rolling_agg", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_dataframe_rolling_apply.py b/benchmarks/pandas/bench_dataframe_rolling_apply.py new file mode 100644 index 00000000..a46f3170 --- /dev/null +++ b/benchmarks/pandas/bench_dataframe_rolling_apply.py @@ -0,0 +1,27 @@ +"""Benchmark: DataFrame rolling apply with custom function on 10k-row DataFrame""" +import json, time +import numpy as np +import pandas as pd + +ROWS = 10_000 +WARMUP = 2 +ITERATIONS = 5 + +a = np.sin(np.arange(ROWS) * 0.01) +b = np.cos(np.arange(ROWS) * 0.01) +df = pd.DataFrame({"a": a, "b": b}) + +for _ in range(WARMUP): + df.rolling(10).apply(np.sum, raw=True) + +start = time.perf_counter() 
+for _ in range(ITERATIONS): + df.rolling(10).apply(np.sum, raw=True) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "dataframe_rolling_apply", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_dataframe_round.py b/benchmarks/pandas/bench_dataframe_round.py new file mode 100644 index 00000000..0b3d4c6b --- /dev/null +++ b/benchmarks/pandas/bench_dataframe_round.py @@ -0,0 +1,14 @@ +import pandas as pd, time, json +N = 100_000 +cols = 5 +data = {f"col{c}": [(i % 100) * 1.5 for i in range(N)] for c in range(cols)} +df = pd.DataFrame(data) +WARMUP = 3 +ITERS = 20 +for _ in range(WARMUP): + df.round(2) +t0 = time.perf_counter() +for _ in range(ITERS): + df.round(2) +total = (time.perf_counter() - t0) * 1000 +print(json.dumps({"function": "dataframe_round", "mean_ms": total / ITERS, "iterations": ITERS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_dataframe_select.py b/benchmarks/pandas/bench_dataframe_select.py new file mode 100644 index 00000000..7148942c --- /dev/null +++ b/benchmarks/pandas/bench_dataframe_select.py @@ -0,0 +1,22 @@ +"""Benchmark: DataFrame[[cols]] column selection on 100k-row DataFrame.""" +import json, time +import numpy as np +import pandas as pd + +ROWS = 100_000 +WARMUP = 5 +ITERATIONS = 50 + +df = pd.DataFrame({ + "a": np.arange(ROWS) * 1.0, "b": np.arange(ROWS) * 2.0, + "c": np.arange(ROWS) * 3.0, "d": np.arange(ROWS) * 4.0, +}) +for _ in range(WARMUP): df[["a", "c"]] + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + df[["a", "c"]] + times.append(time.perf_counter() - t0) +total = sum(times) * 1000 +print(json.dumps({ "function": "dataframe_select", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total })) diff --git a/benchmarks/pandas/bench_dataframe_sem_var.py b/benchmarks/pandas/bench_dataframe_sem_var.py new file mode 100644 index 00000000..7af54eca --- /dev/null +++ 
b/benchmarks/pandas/bench_dataframe_sem_var.py @@ -0,0 +1,29 @@ +""" +Benchmark: DataFrame.var() / DataFrame.sem() — variance and SEM on a 10k×10 DataFrame. +Outputs JSON: {"function": "dataframe_sem_var", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import math +import time +import pandas as pd +import numpy as np + +ROWS = 10_000 +COLS = 10 +WARMUP = 5 +ITERATIONS = 20 + +data = {f"col{c}": np.array([math.sin((i + c) * 0.01) * 100 for i in range(ROWS)]) for c in range(COLS)} +df = pd.DataFrame(data) + +for _ in range(WARMUP): + df.var() + df.sem() + +start = time.perf_counter() +for _ in range(ITERATIONS): + df.var() + df.sem() +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({"function": "dataframe_sem_var", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_dataframe_set_index.py b/benchmarks/pandas/bench_dataframe_set_index.py new file mode 100644 index 00000000..f6d446c9 --- /dev/null +++ b/benchmarks/pandas/bench_dataframe_set_index.py @@ -0,0 +1,23 @@ +"""Benchmark: DataFrame.set_index(col) on 100k-row DataFrame.""" +import json, time +import numpy as np +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 20 + +df = pd.DataFrame({ + "id": np.arange(ROWS), + "a": np.arange(ROWS) * 1.5, + "b": np.arange(ROWS) * 2.5, +}) +for _ in range(WARMUP): df.set_index("id") + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + df.set_index("id") + times.append(time.perf_counter() - t0) +total = sum(times) * 1000 +print(json.dumps({ "function": "dataframe_set_index", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total })) diff --git a/benchmarks/pandas/bench_dataframe_setindex.py b/benchmarks/pandas/bench_dataframe_setindex.py new file mode 100644 index 00000000..0f3dd944 --- /dev/null +++ b/benchmarks/pandas/bench_dataframe_setindex.py @@ -0,0 +1,29 @@ +"""Benchmark: dataframe_setindex — df.set_index(col) on a 
10k-row DataFrame""" +import json, time +import numpy as np +import pandas as pd + +ROWS = 10_000 +WARMUP = 5 +ITERATIONS = 50 + +df = pd.DataFrame({ + "id": np.arange(ROWS), + "a": np.arange(ROWS, dtype=float) * 2.0, + "b": np.arange(ROWS) % 100, +}) + +for _ in range(WARMUP): + df.set_index("id") + +start = time.perf_counter() +for _ in range(ITERATIONS): + df.set_index("id") +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "dataframe_setindex", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_dataframe_skew_kurt.py b/benchmarks/pandas/bench_dataframe_skew_kurt.py new file mode 100644 index 00000000..f7b6c943 --- /dev/null +++ b/benchmarks/pandas/bench_dataframe_skew_kurt.py @@ -0,0 +1,29 @@ +""" +Benchmark: DataFrame.skew() / DataFrame.kurt() — skewness and kurtosis on a 10k×10 DataFrame. +Outputs JSON: {"function": "dataframe_skew_kurt", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import math +import time +import pandas as pd +import numpy as np + +ROWS = 10_000 +COLS = 10 +WARMUP = 5 +ITERATIONS = 20 + +data = {f"col{c}": np.array([math.sin((i + c) * 0.01) * 100 for i in range(ROWS)]) for c in range(COLS)} +df = pd.DataFrame(data) + +for _ in range(WARMUP): + df.skew() + df.kurt() + +start = time.perf_counter() +for _ in range(ITERATIONS): + df.skew() + df.kurt() +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({"function": "dataframe_skew_kurt", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_dataframe_sort_index.py b/benchmarks/pandas/bench_dataframe_sort_index.py new file mode 100644 index 00000000..4de05f5b --- /dev/null +++ b/benchmarks/pandas/bench_dataframe_sort_index.py @@ -0,0 +1,20 @@ +"""Benchmark: DataFrame.sort_index() on 100k-row DataFrame with shuffled index.""" +import json, time +import numpy as np +import pandas as pd + +ROWS = 
100_000 +WARMUP = 3 +ITERATIONS = 10 + +idx = np.arange(ROWS - 1, -1, -1) +df = pd.DataFrame({"a": np.arange(ROWS) * 1.1, "b": np.arange(ROWS) * 2.2}, index=idx) +for _ in range(WARMUP): df.sort_index() + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + df.sort_index() + times.append(time.perf_counter() - t0) +total = sum(times) * 1000 +print(json.dumps({ "function": "dataframe_sort_index", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total })) diff --git a/benchmarks/pandas/bench_dataframe_std_var.py b/benchmarks/pandas/bench_dataframe_std_var.py new file mode 100644 index 00000000..de1ef841 --- /dev/null +++ b/benchmarks/pandas/bench_dataframe_std_var.py @@ -0,0 +1,23 @@ +"""Benchmark: DataFrame.std() and DataFrame.var() on 100k-row DataFrame.""" +import json, time +import numpy as np +import pandas as pd + +ROWS = 100_000 +WARMUP = 5 +ITERATIONS = 20 + +df = pd.DataFrame({ + "a": (np.arange(ROWS) * 1.23) % 9000, + "b": (np.arange(ROWS) * 4.56) % 7000, +}) +for _ in range(WARMUP): df.std(); df.var() + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + df.std() + df.var() + times.append(time.perf_counter() - t0) +total = sum(times) * 1000 +print(json.dumps({ "function": "dataframe_std_var", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total })) diff --git a/benchmarks/pandas/bench_dataframe_sum_mean.py b/benchmarks/pandas/bench_dataframe_sum_mean.py new file mode 100644 index 00000000..b6700570 --- /dev/null +++ b/benchmarks/pandas/bench_dataframe_sum_mean.py @@ -0,0 +1,24 @@ +"""Benchmark: DataFrame.sum() and DataFrame.mean() on 100k-row DataFrame.""" +import json, time +import numpy as np +import pandas as pd + +ROWS = 100_000 +WARMUP = 5 +ITERATIONS = 30 + +df = pd.DataFrame({ + "a": np.arange(ROWS) * 1.0, + "b": np.arange(ROWS) * 2.0, + "c": np.arange(ROWS) * 3.0, +}) +for _ in range(WARMUP): df.sum(); df.mean() + +times = [] +for _ in range(ITERATIONS): + t0 = 
time.perf_counter() + df.sum() + df.mean() + times.append(time.perf_counter() - t0) +total = sum(times) * 1000 +print(json.dumps({ "function": "dataframe_sum_mean", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total })) diff --git a/benchmarks/pandas/bench_dataframe_to_array.py b/benchmarks/pandas/bench_dataframe_to_array.py new file mode 100644 index 00000000..1f9cd145 --- /dev/null +++ b/benchmarks/pandas/bench_dataframe_to_array.py @@ -0,0 +1,23 @@ +"""Benchmark: DataFrame.to_numpy() on 100k-row DataFrame.""" +import json, time +import numpy as np +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 + +df = pd.DataFrame({ + "a": np.arange(ROWS) * 1.0, + "b": np.arange(ROWS) * 2.0, + "c": np.arange(ROWS) * 3.0, +}) +for _ in range(WARMUP): df.to_numpy() + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + df.to_numpy() + times.append(time.perf_counter() - t0) +total = sum(times) * 1000 +print(json.dumps({ "function": "dataframe_to_array", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total })) diff --git a/benchmarks/pandas/bench_dataframe_to_dict.py b/benchmarks/pandas/bench_dataframe_to_dict.py new file mode 100644 index 00000000..75703800 --- /dev/null +++ b/benchmarks/pandas/bench_dataframe_to_dict.py @@ -0,0 +1,19 @@ +"""Benchmark: DataFrame.to_dict() (column-oriented) on 100k-row DataFrame.""" +import json, time +import numpy as np +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 + +df = pd.DataFrame({"a": np.arange(ROWS) * 1.0, "b": np.arange(ROWS) * 2.0}) +for _ in range(WARMUP): df.to_dict() + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + df.to_dict() + times.append(time.perf_counter() - t0) +total = sum(times) * 1000 +print(json.dumps({ "function": "dataframe_to_dict", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total })) diff --git a/benchmarks/pandas/bench_dataframe_to_records.py 
b/benchmarks/pandas/bench_dataframe_to_records.py new file mode 100644 index 00000000..128a15c7 --- /dev/null +++ b/benchmarks/pandas/bench_dataframe_to_records.py @@ -0,0 +1,19 @@ +"""Benchmark: DataFrame.to_dict(orient='records') on 100k-row DataFrame.""" +import json, time +import numpy as np +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 + +df = pd.DataFrame({"a": np.arange(ROWS) * 1.0, "b": np.arange(ROWS) * 2.0}) +for _ in range(WARMUP): df.to_dict(orient="records") + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + df.to_dict(orient="records") + times.append(time.perf_counter() - t0) +total = sum(times) * 1000 +print(json.dumps({ "function": "dataframe_to_records", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total })) diff --git a/benchmarks/pandas/bench_dataframe_to_string.py b/benchmarks/pandas/bench_dataframe_to_string.py new file mode 100644 index 00000000..0621ce56 --- /dev/null +++ b/benchmarks/pandas/bench_dataframe_to_string.py @@ -0,0 +1,17 @@ +"""Benchmark: DataFrame.to_string on 1k-row pandas DataFrame""" +import json, time +import pandas as pd + +ROWS = 1_000 +WARMUP = 3 +ITERATIONS = 10 +df = pd.DataFrame({"a": range(ROWS), "b": [i * 1.5 for i in range(ROWS)]}) + +for _ in range(WARMUP): + df.to_string() + +start = time.perf_counter() +for _ in range(ITERATIONS): + df.to_string() +total = (time.perf_counter() - start) * 1000 +print(json.dumps({"function": "dataframe_to_string", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_dataframe_torecords.py b/benchmarks/pandas/bench_dataframe_torecords.py new file mode 100644 index 00000000..e6592100 --- /dev/null +++ b/benchmarks/pandas/bench_dataframe_torecords.py @@ -0,0 +1,31 @@ +"""Benchmark: dataframe_torecords — df.to_dict(orient='records') on a 10k-row DataFrame""" +import json, time +import numpy as np +import pandas as pd + +ROWS = 10_000 +WARMUP = 5 +ITERATIONS = 
50 + +df = pd.DataFrame({ + "a": np.arange(ROWS), + "b": np.arange(ROWS, dtype=float) * 2.0, + "c": np.arange(ROWS) % 100, + "d": np.arange(ROWS, dtype=float) * 0.5, + "e": np.arange(ROWS) % 10, +}) + +for _ in range(WARMUP): + df.to_dict(orient="records") + +start = time.perf_counter() +for _ in range(ITERATIONS): + df.to_dict(orient="records") +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "dataframe_torecords", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_dataframe_transform.py b/benchmarks/pandas/bench_dataframe_transform.py new file mode 100644 index 00000000..80da19fe --- /dev/null +++ b/benchmarks/pandas/bench_dataframe_transform.py @@ -0,0 +1,17 @@ +"""Benchmark: DataFrame.transform element-wise on 100k-row DataFrame""" +import json, time +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 +df = pd.DataFrame({"a": [i * 0.1 for i in range(ROWS)], "b": [i * 0.2 for i in range(ROWS)]}) + +for _ in range(WARMUP): + df.transform(lambda x: x * 2) + +start = time.perf_counter() +for _ in range(ITERATIONS): + df.transform(lambda x: x * 2) +total = (time.perf_counter() - start) * 1000 +print(json.dumps({"function": "dataframe_transform", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_dataframe_transform_rows.py b/benchmarks/pandas/bench_dataframe_transform_rows.py new file mode 100644 index 00000000..304b390f --- /dev/null +++ b/benchmarks/pandas/bench_dataframe_transform_rows.py @@ -0,0 +1,17 @@ +"""Benchmark: DataFrame row-wise transform on 10k-row DataFrame""" +import json, time +import pandas as pd + +ROWS = 10_000 +WARMUP = 3 +ITERATIONS = 10 +df = pd.DataFrame({"a": [i * 1.0 for i in range(ROWS)], "b": [i * 2.0 for i in range(ROWS)]}) + +for _ in range(WARMUP): + df.apply(lambda row: pd.Series({"a": row["a"] * 2, "b": row["b"] + 1}), axis=1) + +start = 
time.perf_counter() +for _ in range(ITERATIONS): + df.apply(lambda row: pd.Series({"a": row["a"] * 2, "b": row["b"] + 1}), axis=1) +total = (time.perf_counter() - start) * 1000 +print(json.dumps({"function": "dataframe_transform_rows", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_dataframe_value_counts.py b/benchmarks/pandas/bench_dataframe_value_counts.py new file mode 100644 index 00000000..21616a6c --- /dev/null +++ b/benchmarks/pandas/bench_dataframe_value_counts.py @@ -0,0 +1,16 @@ +import pandas as pd, time, json +N = 100_000 +cats = ["apple", "banana", "cherry", "date", "elderberry"] +df = pd.DataFrame({ + "fruit": [cats[i % len(cats)] for i in range(N)], + "color": ["red" if i % 3 == 0 else "yellow" if i % 3 == 1 else "purple" for i in range(N)], +}) +WARMUP = 3 +ITERS = 20 +for _ in range(WARMUP): + df.value_counts(subset=["fruit", "color"]) +t0 = time.perf_counter() +for _ in range(ITERS): + df.value_counts(subset=["fruit", "color"]) +total = (time.perf_counter() - t0) * 1000 +print(json.dumps({"function": "dataframe_value_counts", "mean_ms": total / ITERS, "iterations": ITERS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_dataframe_where.py b/benchmarks/pandas/bench_dataframe_where.py new file mode 100644 index 00000000..7c2b3d8d --- /dev/null +++ b/benchmarks/pandas/bench_dataframe_where.py @@ -0,0 +1,15 @@ +import pandas as pd, time, json +N = 100_000 +cols = 4 +data = {f"col{c}": [(i % 200) - 100 for i in range(N)] for c in range(cols)} +df = pd.DataFrame(data) +mask = pd.DataFrame({f"col{c}": [i % 2 == 0 for i in range(N)] for c in range(cols)}) +WARMUP = 3 +ITERS = 20 +for _ in range(WARMUP): + df.where(mask, other=0) +t0 = time.perf_counter() +for _ in range(ITERS): + df.where(mask, other=0) +total = (time.perf_counter() - t0) * 1000 +print(json.dumps({"function": "dataframe_where", "mean_ms": total / ITERS, "iterations": ITERS, "total_ms": total})) diff --git 
a/benchmarks/pandas/bench_date_offset.py b/benchmarks/pandas/bench_date_offset.py new file mode 100644 index 00000000..3f48db46 --- /dev/null +++ b/benchmarks/pandas/bench_date_offset.py @@ -0,0 +1,36 @@ +"""Benchmark: DateOffset — MonthEnd, BusinessDay, YearBegin apply.""" +import json, time +import pandas as pd +from pandas.tseries.offsets import MonthEnd, BusinessDay, YearBegin, Day +from datetime import datetime, timezone, timedelta + +SIZE = 10_000 +WARMUP = 5 +ITERATIONS = 50 + +month_end = MonthEnd(1) +biz_day = BusinessDay(5) +year_begin = YearBegin(1) +day_off = Day(30) +base = pd.Timestamp("2020-01-15", tz="UTC") +dates = [base + timedelta(days=i) for i in range(SIZE)] + +for _ in range(WARMUP): + for d in dates: + d + month_end + d + biz_day + d + year_begin + d + day_off + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + for d in dates: + d + month_end + d + biz_day + d + year_begin + d + day_off + times.append((time.perf_counter() - t0) * 1000) + +total_ms = sum(times) +print(json.dumps({"function":"date_offset","mean_ms":round(total_ms/ITERATIONS,3),"iterations":ITERATIONS,"total_ms":round(total_ms,3)})) diff --git a/benchmarks/pandas/bench_datetime_accessor.py b/benchmarks/pandas/bench_datetime_accessor.py new file mode 100644 index 00000000..0ab3a0f9 --- /dev/null +++ b/benchmarks/pandas/bench_datetime_accessor.py @@ -0,0 +1,17 @@ +import pandas as pd, time, json +N = 100_000 +dates = pd.date_range("2020-01-01", periods=N, freq="D") +s = pd.Series(dates) +WARMUP = 3 +ITERS = 20 +for _ in range(WARMUP): + s.dt.year + s.dt.month + s.dt.dayofweek +t0 = time.perf_counter() +for _ in range(ITERS): + s.dt.year + s.dt.month + s.dt.dayofweek +total = (time.perf_counter() - t0) * 1000 +print(json.dumps({"function": "datetime_accessor", "mean_ms": total / ITERS, "iterations": ITERS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_df_from_pairs.py b/benchmarks/pandas/bench_df_from_pairs.py new file mode 100644 index 
00000000..d199bc73 --- /dev/null +++ b/benchmarks/pandas/bench_df_from_pairs.py @@ -0,0 +1,22 @@ +"""Benchmark: pandas DataFrame from dict of Series (equivalent to dataFrameFromPairs)""" +import json, time +import pandas as pd + +N = 10_000 +pairs = { + "a": pd.Series(range(N)), + "b": pd.Series(range(0, N * 2, 2)), + "c": pd.Series(range(0, N * 3, 3)), +} + +WARMUP = 3 +ITERATIONS = 100 + +for _ in range(WARMUP): + pd.DataFrame(pairs) + +start = time.perf_counter() +for _ in range(ITERATIONS): + pd.DataFrame(pairs) +total = (time.perf_counter() - start) * 1000 +print(json.dumps({"function": "df_from_pairs", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_diff.py b/benchmarks/pandas/bench_diff.py new file mode 100644 index 00000000..72ff53a5 --- /dev/null +++ b/benchmarks/pandas/bench_diff.py @@ -0,0 +1,21 @@ +"""Benchmark: Series.diff() — first discrete difference.""" +import json, time +import pandas as pd + +SIZE = 100_000 +WARMUP = 5 +ITERATIONS = 50 + +s = pd.Series([float(i*1.1+0.5) for i in range(SIZE)]) + +for _ in range(WARMUP): + s.diff() + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + s.diff() + times.append((time.perf_counter() - t0) * 1000) + +total_ms = sum(times) +print(json.dumps({"function":"diff","mean_ms":round(total_ms/ITERATIONS,3),"iterations":ITERATIONS,"total_ms":round(total_ms,3)})) diff --git a/benchmarks/pandas/bench_drop_duplicates.py b/benchmarks/pandas/bench_drop_duplicates.py new file mode 100644 index 00000000..eafc3158 --- /dev/null +++ b/benchmarks/pandas/bench_drop_duplicates.py @@ -0,0 +1,21 @@ +"""Benchmark: DataFrame.drop_duplicates() — remove duplicate rows.""" +import json, time +import pandas as pd + +SIZE = 100_000 +WARMUP = 5 +ITERATIONS = 50 + +df = pd.DataFrame({"a":[i % 1000 for i in range(SIZE)],"b":[i % 500 for i in range(SIZE)]}) + +for _ in range(WARMUP): + df.drop_duplicates() + +times = [] +for _ in range(ITERATIONS): + t0 = 
time.perf_counter() + df.drop_duplicates() + times.append((time.perf_counter() - t0) * 1000) + +total_ms = sum(times) +print(json.dumps({"function":"drop_duplicates","mean_ms":round(total_ms/ITERATIONS,3),"iterations":ITERATIONS,"total_ms":round(total_ms,3)})) diff --git a/benchmarks/pandas/bench_dt_date.py b/benchmarks/pandas/bench_dt_date.py new file mode 100644 index 00000000..0fc250b3 --- /dev/null +++ b/benchmarks/pandas/bench_dt_date.py @@ -0,0 +1,21 @@ +"""Benchmark: dt_date — pandas dt.date on 100k datetime values""" +import json +import time +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 + +data = pd.date_range("2020-01-01", periods=ROWS, freq="D") +s = pd.Series(data) + +for _ in range(WARMUP): + _ = s.dt.date + +start = time.perf_counter() +for _ in range(ITERATIONS): + _ = s.dt.date +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({"function": "dt_date", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_dt_dayofyear_weekday.py b/benchmarks/pandas/bench_dt_dayofyear_weekday.py new file mode 100644 index 00000000..a2d34327 --- /dev/null +++ b/benchmarks/pandas/bench_dt_dayofyear_weekday.py @@ -0,0 +1,23 @@ +"""Benchmark: dt_dayofyear_weekday — pandas dt.dayofyear, dt.weekday on 100k values""" +import json +import time +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 + +data = pd.date_range("2020-01-01", periods=ROWS, freq="D") +s = pd.Series(data) + +for _ in range(WARMUP): + _ = s.dt.dayofyear + _ = s.dt.weekday + +start = time.perf_counter() +for _ in range(ITERATIONS): + _ = s.dt.dayofyear + _ = s.dt.weekday +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({"function": "dt_dayofyear_weekday", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_dt_days_in_month.py b/benchmarks/pandas/bench_dt_days_in_month.py new file mode 100644 index 00000000..1984cecf 
--- /dev/null +++ b/benchmarks/pandas/bench_dt_days_in_month.py @@ -0,0 +1,25 @@ +"""Benchmark: dt_days_in_month — dt.days_in_month on 100k datetime values""" +import json, time +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 + +dates = pd.date_range("2020-01-01", periods=ROWS, freq="D") +s = pd.Series(dates) + +for _ in range(WARMUP): + s.dt.days_in_month + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.dt.days_in_month +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "dt_days_in_month", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_dt_floor_ceil.py b/benchmarks/pandas/bench_dt_floor_ceil.py new file mode 100644 index 00000000..04089602 --- /dev/null +++ b/benchmarks/pandas/bench_dt_floor_ceil.py @@ -0,0 +1,28 @@ +"""Benchmark: dt_floor_ceil — dt.floor and dt.ceil on 100k datetime values""" +import json, time +import pandas as pd +import numpy as np + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 + +dates = pd.date_range("2024-01-01", periods=ROWS, freq="1min") +s = pd.Series(dates) + +for _ in range(WARMUP): + s.dt.floor("h") + s.dt.ceil("h") + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.dt.floor("h") + s.dt.ceil("h") +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "dt_floor_ceil", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_dt_hour_minute_second.py b/benchmarks/pandas/bench_dt_hour_minute_second.py new file mode 100644 index 00000000..c3503396 --- /dev/null +++ b/benchmarks/pandas/bench_dt_hour_minute_second.py @@ -0,0 +1,29 @@ +"""Benchmark: dt_hour_minute_second — dt.hour, dt.minute, dt.second on 100k datetime values""" +import json, time +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 + +dates = pd.date_range("2024-01-01", periods=ROWS, freq="1min") +s = 
pd.Series(dates) + +for _ in range(WARMUP): + s.dt.hour + s.dt.minute + s.dt.second + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.dt.hour + s.dt.minute + s.dt.second +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "dt_hour_minute_second", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_dt_is_leap_year.py b/benchmarks/pandas/bench_dt_is_leap_year.py new file mode 100644 index 00000000..61e7e7f6 --- /dev/null +++ b/benchmarks/pandas/bench_dt_is_leap_year.py @@ -0,0 +1,25 @@ +"""Benchmark: dt_is_leap_year — dt.is_leap_year on 100k datetime values""" +import json, time +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 + +dates = pd.date_range("2020-01-01", periods=ROWS, freq="D") +s = pd.Series(dates) + +for _ in range(WARMUP): + s.dt.is_leap_year + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.dt.is_leap_year +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "dt_is_leap_year", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_dt_is_month_start_end.py b/benchmarks/pandas/bench_dt_is_month_start_end.py new file mode 100644 index 00000000..2f8532d4 --- /dev/null +++ b/benchmarks/pandas/bench_dt_is_month_start_end.py @@ -0,0 +1,27 @@ +"""Benchmark: dt_is_month_start_end — dt.is_month_start and dt.is_month_end on 100k datetime values""" +import json, time +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 + +dates = pd.date_range("2020-01-01", periods=ROWS, freq="D") +s = pd.Series(dates) + +for _ in range(WARMUP): + s.dt.is_month_start + s.dt.is_month_end + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.dt.is_month_start + s.dt.is_month_end +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "dt_is_month_start_end", + "mean_ms": total / ITERATIONS, 
+ "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_dt_is_quarter_start_end.py b/benchmarks/pandas/bench_dt_is_quarter_start_end.py new file mode 100644 index 00000000..7b9bf0fd --- /dev/null +++ b/benchmarks/pandas/bench_dt_is_quarter_start_end.py @@ -0,0 +1,27 @@ +"""Benchmark: dt_is_quarter_start_end — is_quarter_start, is_quarter_end on 100k datetime values""" +import json, time +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 + +dates = pd.date_range("2024-01-01", periods=ROWS, freq="1D") +s = pd.Series(dates) + +for _ in range(WARMUP): + s.dt.is_quarter_start + s.dt.is_quarter_end + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.dt.is_quarter_start + s.dt.is_quarter_end +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "dt_is_quarter_start_end", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_dt_is_year_start_end.py b/benchmarks/pandas/bench_dt_is_year_start_end.py new file mode 100644 index 00000000..957d0d0d --- /dev/null +++ b/benchmarks/pandas/bench_dt_is_year_start_end.py @@ -0,0 +1,27 @@ +"""Benchmark: dt_is_year_start_end — dt.is_year_start and dt.is_year_end on 100k datetime values""" +import json, time +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 + +dates = pd.date_range("2020-01-01", periods=ROWS, freq="D") +s = pd.Series(dates) + +for _ in range(WARMUP): + s.dt.is_year_start + s.dt.is_year_end + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.dt.is_year_start + s.dt.is_year_end +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "dt_is_year_start_end", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_dt_millisecond_microsecond_nanosecond.py b/benchmarks/pandas/bench_dt_millisecond_microsecond_nanosecond.py new file mode 100644 
index 00000000..5ae7e45c --- /dev/null +++ b/benchmarks/pandas/bench_dt_millisecond_microsecond_nanosecond.py @@ -0,0 +1,24 @@ +"""Benchmark: dt_millisecond_microsecond_nanosecond — pandas dt.microsecond, dt.nanosecond on 100k values""" +import json +import time +import pandas as pd +import numpy as np + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 + +data = pd.date_range("2020-01-01", periods=ROWS, freq="s") +s = pd.Series(data) + +for _ in range(WARMUP): + _ = s.dt.microsecond + _ = s.dt.nanosecond + +start = time.perf_counter() +for _ in range(ITERATIONS): + _ = s.dt.microsecond + _ = s.dt.nanosecond +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({"function": "dt_millisecond_microsecond_nanosecond", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_dt_normalize.py b/benchmarks/pandas/bench_dt_normalize.py new file mode 100644 index 00000000..7ee5d29b --- /dev/null +++ b/benchmarks/pandas/bench_dt_normalize.py @@ -0,0 +1,25 @@ +"""Benchmark: dt_normalize — dt.normalize (truncate to midnight) on 100k datetime values""" +import json, time +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 + +dates = pd.date_range("2024-01-01", periods=ROWS, freq="1min") +s = pd.Series(dates) + +for _ in range(WARMUP): + s.dt.normalize() + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.dt.normalize() +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "dt_normalize", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_dt_quarter_month.py b/benchmarks/pandas/bench_dt_quarter_month.py new file mode 100644 index 00000000..5b858b24 --- /dev/null +++ b/benchmarks/pandas/bench_dt_quarter_month.py @@ -0,0 +1,29 @@ +"""Benchmark: dt_quarter_month — dt.quarter, dt.is_month_start, dt.is_month_end on 100k datetime values""" +import json, time +import pandas as pd + +ROWS = 
100_000 +WARMUP = 3 +ITERATIONS = 10 + +dates = pd.date_range("2024-01-01", periods=ROWS, freq="1D") +s = pd.Series(dates) + +for _ in range(WARMUP): + s.dt.quarter + s.dt.is_month_start + s.dt.is_month_end + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.dt.quarter + s.dt.is_month_start + s.dt.is_month_end +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "dt_quarter_month", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_dt_round.py b/benchmarks/pandas/bench_dt_round.py new file mode 100644 index 00000000..7ca8dd7b --- /dev/null +++ b/benchmarks/pandas/bench_dt_round.py @@ -0,0 +1,21 @@ +"""Benchmark: dt_round — pandas dt.round() to hour on 100k values""" +import json +import time +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 + +data = pd.date_range("2020-01-01", periods=ROWS, freq="min") +s = pd.Series(data) + +for _ in range(WARMUP): + _ = s.dt.round("h") + +start = time.perf_counter() +for _ in range(ITERATIONS): + _ = s.dt.round("h") +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({"function": "dt_round", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_dt_year_month_day.py b/benchmarks/pandas/bench_dt_year_month_day.py new file mode 100644 index 00000000..e0fcfc63 --- /dev/null +++ b/benchmarks/pandas/bench_dt_year_month_day.py @@ -0,0 +1,29 @@ +"""Benchmark: dt_year_month_day — dt.year, dt.month, dt.day on 100k datetime values""" +import json, time +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 + +dates = pd.date_range("2024-01-01", periods=ROWS, freq="1D") +s = pd.Series(dates) + +for _ in range(WARMUP): + s.dt.year + s.dt.month + s.dt.day + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.dt.year + s.dt.month + s.dt.day +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + 
"function": "dt_year_month_day", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_dtype.py b/benchmarks/pandas/bench_dtype.py new file mode 100644 index 00000000..7f10345e --- /dev/null +++ b/benchmarks/pandas/bench_dtype.py @@ -0,0 +1,33 @@ +"""Benchmark: pandas dtype access — dtype property, kind, itemsize, numeric checks""" +import json, time +import pandas as pd +import numpy as np + +WARMUP = 3 +ITERATIONS = 10_000 + +values = list(range(100)) +arr = np.array(values, dtype=np.float64) + +for _ in range(WARMUP): + dt = arr.dtype + _ = dt.kind + _ = dt.itemsize + _ = np.dtype("float64") + _ = np.result_type(np.dtype("float32"), np.dtype("float64")) + _ = pd.api.types.is_numeric_dtype(dt) + _ = pd.api.types.is_float_dtype(dt) + _ = pd.api.types.is_integer_dtype(dt) + +start = time.perf_counter() +for _ in range(ITERATIONS): + dt = arr.dtype + _ = dt.kind + _ = dt.itemsize + _ = np.dtype("float64") + _ = np.result_type(np.dtype("float32"), np.dtype("float64")) + _ = pd.api.types.is_numeric_dtype(dt) + _ = pd.api.types.is_float_dtype(dt) + _ = pd.api.types.is_integer_dtype(dt) +total = (time.perf_counter() - start) * 1000 +print(json.dumps({"function": "dtype", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_dtype_predicates.py b/benchmarks/pandas/bench_dtype_predicates.py new file mode 100644 index 00000000..7eb9826b --- /dev/null +++ b/benchmarks/pandas/bench_dtype_predicates.py @@ -0,0 +1,50 @@ +"""Benchmark: dtype predicate functions using pandas.api.types""" +import json +import time +import pandas as pd +import numpy as np + +WARMUP = 3 +ITERATIONS = 10_000 + +dtypes = [ + np.dtype("float64"), + np.dtype("int32"), + np.dtype("uint8"), + np.dtype("bool"), + pd.StringDtype(), + np.dtype("datetime64[ns]"), + pd.CategoricalDtype(), + np.dtype("O"), + np.dtype("timedelta64[ns]"), +] + + +def run_checks(): + for d in dtypes: 
+ pd.api.types.is_numeric_dtype(d) + pd.api.types.is_integer_dtype(d) + pd.api.types.is_float_dtype(d) + pd.api.types.is_bool_dtype(d) + pd.api.types.is_string_dtype(d) + pd.api.types.is_datetime64_any_dtype(d) + pd.api.types.is_categorical_dtype(d) + pd.api.types.is_signed_integer_dtype(d) + pd.api.types.is_unsigned_integer_dtype(d) + pd.api.types.is_timedelta64_dtype(d) + pd.api.types.is_object_dtype(d) + pd.api.types.is_complex_dtype(d) + pd.api.types.is_extension_array_dtype(d) + pd.api.types.is_period_dtype(d) + pd.api.types.is_interval_dtype(d) + + +for _ in range(WARMUP): + run_checks() + +start = time.perf_counter() +for _ in range(ITERATIONS): + run_checks() +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({"function": "dtype_predicates", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_duplicated.py b/benchmarks/pandas/bench_duplicated.py new file mode 100644 index 00000000..e5eb52d3 --- /dev/null +++ b/benchmarks/pandas/bench_duplicated.py @@ -0,0 +1,21 @@ +"""Benchmark: DataFrame.duplicated() — detect duplicate rows.""" +import json, time +import pandas as pd + +SIZE = 100_000 +WARMUP = 5 +ITERATIONS = 50 + +df = pd.DataFrame({"a":[i % 1000 for i in range(SIZE)],"b":[i % 500 for i in range(SIZE)]}) + +for _ in range(WARMUP): + df.duplicated() + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + df.duplicated() + times.append((time.perf_counter() - t0) * 1000) + +total_ms = sum(times) +print(json.dumps({"function":"duplicated","mean_ms":round(total_ms/ITERATIONS,3),"iterations":ITERATIONS,"total_ms":round(total_ms,3)})) diff --git a/benchmarks/pandas/bench_ewm_apply.py b/benchmarks/pandas/bench_ewm_apply.py new file mode 100644 index 00000000..3567da86 --- /dev/null +++ b/benchmarks/pandas/bench_ewm_apply.py @@ -0,0 +1,29 @@ +"""Benchmark: EWM.apply with custom function on 100k-element Series""" +import json, time +import numpy as np +import pandas as pd 
+ +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 + +data = np.sin(np.arange(ROWS) * 0.05) +s = pd.Series(data) + +def weighted_mean(x): + return x.mean() + +for _ in range(WARMUP): + s.ewm(span=20).apply(weighted_mean, raw=True) + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.ewm(span=20).apply(weighted_mean, raw=True) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "ewm_apply", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_ewm_corr.py b/benchmarks/pandas/bench_ewm_corr.py new file mode 100644 index 00000000..9ac12550 --- /dev/null +++ b/benchmarks/pandas/bench_ewm_corr.py @@ -0,0 +1,21 @@ +"""Benchmark: Series.ewm(span=10).corr(other) on 100k-element Series.""" +import json, time +import numpy as np +import pandas as pd + +SIZE = 100_000 +WARMUP = 3 +ITERATIONS = 10 + +idx = np.arange(SIZE) +a = pd.Series(np.sin(idx * 0.01)) +b = pd.Series(np.cos(idx * 0.01)) +for _ in range(WARMUP): a.ewm(span=10).corr(b) + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + a.ewm(span=10).corr(b) + times.append(time.perf_counter() - t0) +total = sum(times) * 1000 +print(json.dumps({ "function": "ewm_corr", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total })) diff --git a/benchmarks/pandas/bench_ewm_cov.py b/benchmarks/pandas/bench_ewm_cov.py new file mode 100644 index 00000000..cdddb474 --- /dev/null +++ b/benchmarks/pandas/bench_ewm_cov.py @@ -0,0 +1,28 @@ +"""Benchmark: EWM.cov between two 100k-element Series""" +import json, time +import numpy as np +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 + +data1 = np.sin(np.arange(ROWS) * 0.05) +data2 = np.cos(np.arange(ROWS) * 0.05) +s1 = pd.Series(data1) +s2 = pd.Series(data2) + +for _ in range(WARMUP): + s1.ewm(span=20).cov(s2) + +start = time.perf_counter() +for _ in range(ITERATIONS): + s1.ewm(span=20).cov(s2) +total = (time.perf_counter() - 
start) * 1000 + +print(json.dumps({ + "function": "ewm_cov", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_expanding_apply.py b/benchmarks/pandas/bench_expanding_apply.py new file mode 100644 index 00000000..64bc9ed8 --- /dev/null +++ b/benchmarks/pandas/bench_expanding_apply.py @@ -0,0 +1,29 @@ +"""Benchmark: expanding apply with custom function on 100k-element Series""" +import json, time +import numpy as np +import pandas as pd + +ROWS = 100_000 +WARMUP = 2 +ITERATIONS = 5 + +data = np.sin(np.arange(ROWS) * 0.01) +s = pd.Series(data) + +def fn(values): + return values.mean() + +for _ in range(WARMUP): + s.expanding().apply(fn, raw=True) + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.expanding().apply(fn, raw=True) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "expanding_apply", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_expanding_count.py b/benchmarks/pandas/bench_expanding_count.py new file mode 100644 index 00000000..bb445c00 --- /dev/null +++ b/benchmarks/pandas/bench_expanding_count.py @@ -0,0 +1,26 @@ +"""Benchmark: Expanding.count on 100k-element Series""" +import json, time +import numpy as np +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 + +data = np.where(np.arange(ROWS) % 10 == 0, np.nan, np.sin(np.arange(ROWS) * 0.01)) +s = pd.Series(data) + +for _ in range(WARMUP): + s.expanding().count() + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.expanding().count() +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "expanding_count", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_expanding_max.py b/benchmarks/pandas/bench_expanding_max.py new file mode 100644 index 00000000..a6586c4e --- /dev/null +++ 
b/benchmarks/pandas/bench_expanding_max.py @@ -0,0 +1,26 @@ +"""Benchmark: Expanding.max on 100k-element Series""" +import json, time +import numpy as np +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 + +data = np.sin(np.arange(ROWS) * 0.01) +s = pd.Series(data) + +for _ in range(WARMUP): + s.expanding().max() + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.expanding().max() +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "expanding_max", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_expanding_mean.py b/benchmarks/pandas/bench_expanding_mean.py new file mode 100644 index 00000000..536fd8b7 --- /dev/null +++ b/benchmarks/pandas/bench_expanding_mean.py @@ -0,0 +1,26 @@ +"""Benchmark: expanding mean on 100k-element Series""" +import json, time +import numpy as np +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 + +data = np.sin(np.arange(ROWS) * 0.01) +s = pd.Series(data) + +for _ in range(WARMUP): + s.expanding().mean() + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.expanding().mean() +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "expanding_mean", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_expanding_median.py b/benchmarks/pandas/bench_expanding_median.py new file mode 100644 index 00000000..8bce05a9 --- /dev/null +++ b/benchmarks/pandas/bench_expanding_median.py @@ -0,0 +1,26 @@ +"""Benchmark: Expanding.median on 10k-element Series""" +import json, time +import numpy as np +import pandas as pd + +ROWS = 10_000 +WARMUP = 2 +ITERATIONS = 5 + +data = np.sin(np.arange(ROWS) * 0.01) +s = pd.Series(data) + +for _ in range(WARMUP): + s.expanding().median() + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.expanding().median() +total = (time.perf_counter() - start) 
* 1000 + +print(json.dumps({ + "function": "expanding_median", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_expanding_min.py b/benchmarks/pandas/bench_expanding_min.py new file mode 100644 index 00000000..4f29d95a --- /dev/null +++ b/benchmarks/pandas/bench_expanding_min.py @@ -0,0 +1,26 @@ +"""Benchmark: Expanding.min on 100k-element Series""" +import json, time +import numpy as np +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 + +data = np.sin(np.arange(ROWS) * 0.01) +s = pd.Series(data) + +for _ in range(WARMUP): + s.expanding().min() + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.expanding().min() +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "expanding_min", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_explode.py b/benchmarks/pandas/bench_explode.py new file mode 100644 index 00000000..f473ad62 --- /dev/null +++ b/benchmarks/pandas/bench_explode.py @@ -0,0 +1,11 @@ +import pandas as pd, json, time, numpy as np +rng = np.random.default_rng(42) +# Each row has a list of 1-5 items +data = [[int(x) for x in rng.integers(0, 100, size=rng.integers(1, 6))] for _ in range(10_000)] +s = pd.Series(data) +for _ in range(3): s.explode() +N = 50 +t0 = time.perf_counter() +for _ in range(N): s.explode() +elapsed = time.perf_counter() - t0 +print(json.dumps({"function": "explode", "mean_ms": elapsed/N*1000, "iterations": N, "total_ms": elapsed*1000})) diff --git a/benchmarks/pandas/bench_factorize.py b/benchmarks/pandas/bench_factorize.py new file mode 100644 index 00000000..80bc8888 --- /dev/null +++ b/benchmarks/pandas/bench_factorize.py @@ -0,0 +1,26 @@ +"""Benchmark: factorize / pd.Categorical.from_codes — encode values as integer codes.""" +import json, time +import pandas as pd +import numpy as np + +SIZE = 100_000 +WARMUP = 5 +ITERATIONS = 
50 + +categories = ["cat", "dog", "bird", "fish", "hamster"] +data = [categories[i % len(categories)] for i in range(SIZE)] +s = pd.Series(data) + +for _ in range(WARMUP): + pd.factorize(data) + s.factorize() + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + pd.factorize(data) + s.factorize() + times.append((time.perf_counter() - t0) * 1000) + +total_ms = sum(times) +print(json.dumps({"function": "factorize", "mean_ms": round(total_ms / ITERATIONS, 3), "iterations": ITERATIONS, "total_ms": round(total_ms, 3)})) diff --git a/benchmarks/pandas/bench_fillna_dropna.py b/benchmarks/pandas/bench_fillna_dropna.py new file mode 100644 index 00000000..1c9c0adc --- /dev/null +++ b/benchmarks/pandas/bench_fillna_dropna.py @@ -0,0 +1,15 @@ +import pandas as pd, time, json +N = 100_000 +data = [None if i % 7 == 0 else i * 1.5 for i in range(N)] +s = pd.Series(data) +WARMUP = 3 +ITERS = 20 +for _ in range(WARMUP): + s.fillna(0) + s.dropna() +t0 = time.perf_counter() +for _ in range(ITERS): + s.fillna(0) + s.dropna() +total = (time.perf_counter() - t0) * 1000 +print(json.dumps({"function": "fillna_dropna", "mean_ms": total / ITERS, "iterations": ITERS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_format_compact.py b/benchmarks/pandas/bench_format_compact.py new file mode 100644 index 00000000..827ab88d --- /dev/null +++ b/benchmarks/pandas/bench_format_compact.py @@ -0,0 +1,22 @@ +"""Benchmark: format compact (K/M/B) on 100k numbers""" +import json, time + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 +data = [i * 1234 for i in range(ROWS)] + +def fmt_compact(v): + if abs(v) >= 1e9: return f"{v/1e9:.1f}B" + if abs(v) >= 1e6: return f"{v/1e6:.1f}M" + if abs(v) >= 1e3: return f"{v/1e3:.1f}K" + return str(v) + +for _ in range(WARMUP): + [fmt_compact(v) for v in data] + +start = time.perf_counter() +for _ in range(ITERATIONS): + [fmt_compact(v) for v in data] +total = (time.perf_counter() - start) * 1000 +print(json.dumps({"function": 
"format_compact", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_format_currency.py b/benchmarks/pandas/bench_format_currency.py new file mode 100644 index 00000000..12068e18 --- /dev/null +++ b/benchmarks/pandas/bench_format_currency.py @@ -0,0 +1,16 @@ +"""Benchmark: format currency on 100k numbers""" +import json, time + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 +data = [i * 9.99 for i in range(ROWS)] + +for _ in range(WARMUP): + [f"${v:,.2f}" for v in data] + +start = time.perf_counter() +for _ in range(ITERATIONS): + [f"${v:,.2f}" for v in data] +total = (time.perf_counter() - start) * 1000 +print(json.dumps({"function": "format_currency", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_format_engineering.py b/benchmarks/pandas/bench_format_engineering.py new file mode 100644 index 00000000..beded433 --- /dev/null +++ b/benchmarks/pandas/bench_format_engineering.py @@ -0,0 +1,16 @@ +"""Benchmark: format engineering notation on 100k numbers""" +import json, time + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 +data = [i * 1.5e3 for i in range(ROWS)] + +for _ in range(WARMUP): + [f"{v:.3g}" for v in data] + +start = time.perf_counter() +for _ in range(ITERATIONS): + [f"{v:.3g}" for v in data] +total = (time.perf_counter() - start) * 1000 +print(json.dumps({"function": "format_engineering", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_format_float.py b/benchmarks/pandas/bench_format_float.py new file mode 100644 index 00000000..acd1970c --- /dev/null +++ b/benchmarks/pandas/bench_format_float.py @@ -0,0 +1,16 @@ +"""Benchmark: format float on 100k numbers""" +import json, time + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 +data = [i * 3.14159 for i in range(ROWS)] + +for _ in range(WARMUP): + [f"{v:.3f}" for v in data] + +start = time.perf_counter() +for _ in 
range(ITERATIONS): + [f"{v:.3f}" for v in data] +total = (time.perf_counter() - start) * 1000 +print(json.dumps({"function": "format_float", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_format_percent.py b/benchmarks/pandas/bench_format_percent.py new file mode 100644 index 00000000..44af0dee --- /dev/null +++ b/benchmarks/pandas/bench_format_percent.py @@ -0,0 +1,16 @@ +"""Benchmark: format percent on 100k numbers""" +import json, time + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 +data = [i / ROWS for i in range(ROWS)] + +for _ in range(WARMUP): + [f"{v:.2%}" for v in data] + +start = time.perf_counter() +for _ in range(ITERATIONS): + [f"{v:.2%}" for v in data] +total = (time.perf_counter() - start) * 1000 +print(json.dumps({"function": "format_percent", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_format_scientific.py b/benchmarks/pandas/bench_format_scientific.py new file mode 100644 index 00000000..5b4224c7 --- /dev/null +++ b/benchmarks/pandas/bench_format_scientific.py @@ -0,0 +1,16 @@ +"""Benchmark: format scientific notation on 100k numbers""" +import json, time + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 +data = [i * 1.23456e-5 for i in range(ROWS)] + +for _ in range(WARMUP): + [f"{v:.2e}" for v in data] + +start = time.perf_counter() +for _ in range(ITERATIONS): + [f"{v:.2e}" for v in data] +total = (time.perf_counter() - start) * 1000 +print(json.dumps({"function": "format_scientific", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_format_thousands.py b/benchmarks/pandas/bench_format_thousands.py new file mode 100644 index 00000000..cbb2b5eb --- /dev/null +++ b/benchmarks/pandas/bench_format_thousands.py @@ -0,0 +1,16 @@ +"""Benchmark: format thousands separator on 100k numbers""" +import json, time + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 
+data = [i * 1234.56 for i in range(ROWS)] + +for _ in range(WARMUP): + [f"{v:,.2f}" for v in data] + +start = time.perf_counter() +for _ in range(ITERATIONS): + [f"{v:,.2f}" for v in data] +total = (time.perf_counter() - start) * 1000 +print(json.dumps({"function": "format_thousands", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_from_dict_oriented.py b/benchmarks/pandas/bench_from_dict_oriented.py new file mode 100644 index 00000000..5b8c8688 --- /dev/null +++ b/benchmarks/pandas/bench_from_dict_oriented.py @@ -0,0 +1,17 @@ +"""Benchmark: pd.DataFrame.from_records on 10k records""" +import json, time +import pandas as pd + +ROWS = 10_000 +WARMUP = 3 +ITERATIONS = 10 +records = [{"id": i, "val": i * 1.5, "name": f"item_{i}"} for i in range(ROWS)] + +for _ in range(WARMUP): + pd.DataFrame.from_records(records) + +start = time.perf_counter() +for _ in range(ITERATIONS): + pd.DataFrame.from_records(records) +total = (time.perf_counter() - start) * 1000 +print(json.dumps({"function": "from_dict_oriented", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_get_dummies.py b/benchmarks/pandas/bench_get_dummies.py new file mode 100644 index 00000000..440445f7 --- /dev/null +++ b/benchmarks/pandas/bench_get_dummies.py @@ -0,0 +1,28 @@ +"""Benchmark: pd.get_dummies — one-hot encoding of categorical data.""" +import json, time +import pandas as pd + +SIZE = 10_000 +WARMUP = 3 +ITERATIONS = 30 + +categories = ["A", "B", "C", "D", "E"] +s = pd.Series([categories[i % len(categories)] for i in range(SIZE)]) +df = pd.DataFrame({ + "cat1": [categories[i % len(categories)] for i in range(SIZE)], + "cat2": [["x", "y", "z"][i % 3] for i in range(SIZE)], +}) + +for _ in range(WARMUP): + pd.get_dummies(s) + pd.get_dummies(df, columns=["cat1", "cat2"]) + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + pd.get_dummies(s) + 
pd.get_dummies(df, columns=["cat1", "cat2"]) + times.append((time.perf_counter() - t0) * 1000) + +total_ms = sum(times) +print(json.dumps({"function": "get_dummies", "mean_ms": round(total_ms / ITERATIONS, 3), "iterations": ITERATIONS, "total_ms": round(total_ms, 3)})) diff --git a/benchmarks/pandas/bench_groupby_agg.py b/benchmarks/pandas/bench_groupby_agg.py new file mode 100644 index 00000000..7b72ae90 --- /dev/null +++ b/benchmarks/pandas/bench_groupby_agg.py @@ -0,0 +1,13 @@ +import pandas as pd, json, time, numpy as np +rng = np.random.default_rng(42) +df = pd.DataFrame({ + "group": rng.choice(["A","B","C","D","E"], size=100_000), + "val1": rng.standard_normal(100_000), + "val2": rng.standard_normal(100_000), +}) +for _ in range(3): df.groupby("group").agg({"val1": ["mean","std","min","max"], "val2": ["sum","count"]}) +N = 30 +t0 = time.perf_counter() +for _ in range(N): df.groupby("group").agg({"val1": ["mean","std","min","max"], "val2": ["sum","count"]}) +elapsed = time.perf_counter() - t0 +print(json.dumps({"function": "groupby_agg", "mean_ms": elapsed/N*1000, "iterations": N, "total_ms": elapsed*1000})) diff --git a/benchmarks/pandas/bench_groupby_apply.py b/benchmarks/pandas/bench_groupby_apply.py new file mode 100644 index 00000000..49b84bf0 --- /dev/null +++ b/benchmarks/pandas/bench_groupby_apply.py @@ -0,0 +1,19 @@ +"""Benchmark: GroupBy apply (identity) on 100k-row DataFrame""" +import json, time +import pandas as pd + +ROWS = 100_000 +WARMUP = 2 +ITERATIONS = 5 +keys = [f"g{i % 50}" for i in range(ROWS)] +vals = [i * 0.1 for i in range(ROWS)] +df = pd.DataFrame({"key": keys, "value": vals}) + +for _ in range(WARMUP): + df.groupby("key").apply(lambda x: x) + +start = time.perf_counter() +for _ in range(ITERATIONS): + df.groupby("key").apply(lambda x: x) +total = (time.perf_counter() - start) * 1000 +print(json.dumps({"function": "groupby_apply", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total})) diff --git 
a/benchmarks/pandas/bench_groupby_count.py b/benchmarks/pandas/bench_groupby_count.py new file mode 100644 index 00000000..57ee3a30 --- /dev/null +++ b/benchmarks/pandas/bench_groupby_count.py @@ -0,0 +1,17 @@ +import pandas as pd, time, json +N = 100_000 +keys = ["A", "B", "C", "D", "E"] +df = pd.DataFrame({ + "key": [keys[i % len(keys)] for i in range(N)], + "val": [i * 1.0 for i in range(N)], +}) +gb = df.groupby("key") +WARMUP = 3 +ITERS = 20 +for _ in range(WARMUP): + gb.count() +t0 = time.perf_counter() +for _ in range(ITERS): + gb.count() +total = (time.perf_counter() - t0) * 1000 +print(json.dumps({"function": "groupby_count", "mean_ms": total / ITERS, "iterations": ITERS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_groupby_custom_agg.py b/benchmarks/pandas/bench_groupby_custom_agg.py new file mode 100644 index 00000000..110a62b5 --- /dev/null +++ b/benchmarks/pandas/bench_groupby_custom_agg.py @@ -0,0 +1,19 @@ +"""Benchmark: GroupBy custom agg on 100k-row DataFrame""" +import json, time +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 +keys = [f"g{i % 100}" for i in range(ROWS)] +vals = [i * 0.1 for i in range(ROWS)] +df = pd.DataFrame({"key": keys, "value": vals}) + +for _ in range(WARMUP): + df.groupby("key")["value"].agg(lambda x: x.max() - x.min()) + +start = time.perf_counter() +for _ in range(ITERATIONS): + df.groupby("key")["value"].agg(lambda x: x.max() - x.min()) +total = (time.perf_counter() - start) * 1000 +print(json.dumps({"function": "groupby_custom_agg", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_groupby_filter.py b/benchmarks/pandas/bench_groupby_filter.py new file mode 100644 index 00000000..445d3e39 --- /dev/null +++ b/benchmarks/pandas/bench_groupby_filter.py @@ -0,0 +1,19 @@ +"""Benchmark: GroupBy filter on 100k-row DataFrame""" +import json, time +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 +keys = [f"g{i % 
200}" for i in range(ROWS)] +vals = [i * 0.1 for i in range(ROWS)] +df = pd.DataFrame({"key": keys, "value": vals}) + +for _ in range(WARMUP): + df.groupby("key").filter(lambda x: len(x) > 400) + +start = time.perf_counter() +for _ in range(ITERATIONS): + df.groupby("key").filter(lambda x: len(x) > 400) +total = (time.perf_counter() - start) * 1000 +print(json.dumps({"function": "groupby_filter", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_groupby_first.py b/benchmarks/pandas/bench_groupby_first.py new file mode 100644 index 00000000..e449c58a --- /dev/null +++ b/benchmarks/pandas/bench_groupby_first.py @@ -0,0 +1,18 @@ +import pandas as pd, time, json +N = 100_000 +keys = ["A", "B", "C", "D", "E"] +df = pd.DataFrame({ + "key": [keys[i % len(keys)] for i in range(N)], + "val": [i * 0.5 for i in range(N)], + "val2": [i % 100 for i in range(N)], +}) +gb = df.groupby("key") +WARMUP = 3 +ITERS = 20 +for _ in range(WARMUP): + gb.first() +t0 = time.perf_counter() +for _ in range(ITERS): + gb.first() +total = (time.perf_counter() - t0) * 1000 +print(json.dumps({"function": "groupby_first", "mean_ms": total / ITERS, "iterations": ITERS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_groupby_get_group.py b/benchmarks/pandas/bench_groupby_get_group.py new file mode 100644 index 00000000..58b67dca --- /dev/null +++ b/benchmarks/pandas/bench_groupby_get_group.py @@ -0,0 +1,30 @@ +"""Benchmark: groupby_get_group — DataFrameGroupBy.get_group on 100k rows""" +import json, time +import pandas as pd +import numpy as np + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 20 + +group_keys = [f"group_{i % 5}" for i in range(ROWS)] +values = list(range(ROWS)) +df = pd.DataFrame({"group": group_keys, "value": values}) +grouped = df.groupby("group") + +for _ in range(WARMUP): + grouped.get_group("group_0") + grouped.get_group("group_1") + +start = time.perf_counter() +for _ in range(ITERATIONS): + 
grouped.get_group("group_0") + grouped.get_group("group_1") +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "groupby_get_group", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_groupby_last.py b/benchmarks/pandas/bench_groupby_last.py new file mode 100644 index 00000000..39d6c576 --- /dev/null +++ b/benchmarks/pandas/bench_groupby_last.py @@ -0,0 +1,18 @@ +import pandas as pd, time, json +N = 100_000 +keys = ["A", "B", "C", "D", "E"] +df = pd.DataFrame({ + "key": [keys[i % len(keys)] for i in range(N)], + "val": [i * 0.5 for i in range(N)], + "val2": [i % 100 for i in range(N)], +}) +gb = df.groupby("key") +WARMUP = 3 +ITERS = 20 +for _ in range(WARMUP): + gb.last() +t0 = time.perf_counter() +for _ in range(ITERS): + gb.last() +total = (time.perf_counter() - t0) * 1000 +print(json.dumps({"function": "groupby_last", "mean_ms": total / ITERS, "iterations": ITERS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_groupby_max.py b/benchmarks/pandas/bench_groupby_max.py new file mode 100644 index 00000000..8c28787b --- /dev/null +++ b/benchmarks/pandas/bench_groupby_max.py @@ -0,0 +1,17 @@ +import pandas as pd, time, json +N = 100_000 +keys = ["A", "B", "C", "D", "E"] +df = pd.DataFrame({ + "key": [keys[i % len(keys)] for i in range(N)], + "val": [i * 1.0 for i in range(N)], +}) +gb = df.groupby("key") +WARMUP = 3 +ITERS = 20 +for _ in range(WARMUP): + gb.max() +t0 = time.perf_counter() +for _ in range(ITERS): + gb.max() +total = (time.perf_counter() - t0) * 1000 +print(json.dumps({"function": "groupby_max", "mean_ms": total / ITERS, "iterations": ITERS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_groupby_median.py b/benchmarks/pandas/bench_groupby_median.py new file mode 100644 index 00000000..19fe85f5 --- /dev/null +++ b/benchmarks/pandas/bench_groupby_median.py @@ -0,0 +1,22 @@ +"""Benchmark: DataFrame.groupby().median() on 100k-row 
DataFrame.""" +import json, time +import numpy as np +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 + +df = pd.DataFrame({ + "group": np.arange(ROWS) % 100, + "value": (np.arange(ROWS) * 1.414) % 9999, +}) +for _ in range(WARMUP): df.groupby("group")["value"].median() + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + df.groupby("group")["value"].median() + times.append(time.perf_counter() - t0) +total = sum(times) * 1000 +print(json.dumps({ "function": "groupby_median", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total })) diff --git a/benchmarks/pandas/bench_groupby_min.py b/benchmarks/pandas/bench_groupby_min.py new file mode 100644 index 00000000..58f71a8f --- /dev/null +++ b/benchmarks/pandas/bench_groupby_min.py @@ -0,0 +1,17 @@ +import pandas as pd, time, json +N = 100_000 +keys = ["A", "B", "C", "D", "E"] +df = pd.DataFrame({ + "key": [keys[i % len(keys)] for i in range(N)], + "val": [i * 1.0 for i in range(N)], +}) +gb = df.groupby("key") +WARMUP = 3 +ITERS = 20 +for _ in range(WARMUP): + gb.min() +t0 = time.perf_counter() +for _ in range(ITERS): + gb.min() +total = (time.perf_counter() - t0) * 1000 +print(json.dumps({"function": "groupby_min", "mean_ms": total / ITERS, "iterations": ITERS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_groupby_multi_agg.py b/benchmarks/pandas/bench_groupby_multi_agg.py new file mode 100644 index 00000000..4db764c7 --- /dev/null +++ b/benchmarks/pandas/bench_groupby_multi_agg.py @@ -0,0 +1,19 @@ +"""Benchmark: GroupBy multiple aggregations on 100k-row DataFrame""" +import json, time +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 +keys = [f"g{i % 100}" for i in range(ROWS)] +vals = [i * 0.1 for i in range(ROWS)] +df = pd.DataFrame({"key": keys, "value": vals}) + +for _ in range(WARMUP): + df.groupby("key")["value"].agg(["mean", "std", "min", "max"]) + +start = time.perf_counter() +for _ in range(ITERATIONS): + 
df.groupby("key")["value"].agg(["mean", "std", "min", "max"]) +total = (time.perf_counter() - start) * 1000 +print(json.dumps({"function": "groupby_multi_agg", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_groupby_ngroups.py b/benchmarks/pandas/bench_groupby_ngroups.py new file mode 100644 index 00000000..acd7fee6 --- /dev/null +++ b/benchmarks/pandas/bench_groupby_ngroups.py @@ -0,0 +1,26 @@ +"""Benchmark: DataFrameGroupBy.ngroups and .groups property access.""" +import json +import time +import pandas as pd + +ROWS = 10_000 +WARMUP = 5 +ITERATIONS = 50 + +df = pd.DataFrame({ + "key": [f"g{i % 100}" for i in range(ROWS)], + "val": [i * 1.5 for i in range(ROWS)], +}) +gbk = df.groupby("key") + +for _ in range(WARMUP): + gbk.ngroups + list(gbk.groups.keys()) + +t0 = time.perf_counter() +for _ in range(ITERATIONS): + gbk.ngroups + list(gbk.groups.keys()) +total = (time.perf_counter() - t0) * 1000 + +print(json.dumps({"function": "groupby_ngroups", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_groupby_size.py b/benchmarks/pandas/bench_groupby_size.py new file mode 100644 index 00000000..be0e1255 --- /dev/null +++ b/benchmarks/pandas/bench_groupby_size.py @@ -0,0 +1,17 @@ +import pandas as pd, time, json +N = 100_000 +keys = ["A", "B", "C", "D", "E"] +df = pd.DataFrame({ + "key": [keys[i % len(keys)] for i in range(N)], + "val": [i * 1.0 for i in range(N)], +}) +gb = df.groupby("key") +WARMUP = 3 +ITERS = 20 +for _ in range(WARMUP): + gb.size() +t0 = time.perf_counter() +for _ in range(ITERS): + gb.size() +total = (time.perf_counter() - t0) * 1000 +print(json.dumps({"function": "groupby_size", "mean_ms": total / ITERS, "iterations": ITERS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_groupby_std.py b/benchmarks/pandas/bench_groupby_std.py new file mode 100644 index 00000000..aea87a31 --- /dev/null +++ 
b/benchmarks/pandas/bench_groupby_std.py @@ -0,0 +1,19 @@ +"""Benchmark: GroupBy std on 100k-row DataFrame""" +import json, time +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 +keys = [f"g{i % 100}" for i in range(ROWS)] +vals = [i * 0.1 for i in range(ROWS)] +df = pd.DataFrame({"key": keys, "value": vals}) + +for _ in range(WARMUP): + df.groupby("key")["value"].std() + +start = time.perf_counter() +for _ in range(ITERATIONS): + df.groupby("key")["value"].std() +total = (time.perf_counter() - start) * 1000 +print(json.dumps({"function": "groupby_std", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_groupby_std_df.py b/benchmarks/pandas/bench_groupby_std_df.py new file mode 100644 index 00000000..337977a4 --- /dev/null +++ b/benchmarks/pandas/bench_groupby_std_df.py @@ -0,0 +1,23 @@ +"""Benchmark: DataFrame.groupby(by).std() on 100k-row DataFrame.""" +import json, time +import numpy as np +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 + +df = pd.DataFrame({ + "group": np.arange(ROWS) % 50, + "a": (np.arange(ROWS) * 1.23) % 9999, + "b": (np.arange(ROWS) * 4.56) % 9999, +}) +for _ in range(WARMUP): df.groupby("group").std() + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + df.groupby("group").std() + times.append(time.perf_counter() - t0) +total = sum(times) * 1000 +print(json.dumps({ "function": "groupby_std_df", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total })) diff --git a/benchmarks/pandas/bench_groupby_sum.py b/benchmarks/pandas/bench_groupby_sum.py new file mode 100644 index 00000000..76c89cf8 --- /dev/null +++ b/benchmarks/pandas/bench_groupby_sum.py @@ -0,0 +1,18 @@ +import pandas as pd, time, json +N = 100_000 +keys = ["A", "B", "C", "D", "E"] +df = pd.DataFrame({ + "key": [keys[i % len(keys)] for i in range(N)], + "val": [i * 1.0 for i in range(N)], + "val2": [i % 200 for i in range(N)], +}) +gb = 
df.groupby("key") +WARMUP = 3 +ITERS = 20 +for _ in range(WARMUP): + gb.sum() +t0 = time.perf_counter() +for _ in range(ITERS): + gb.sum() +total = (time.perf_counter() - t0) * 1000 +print(json.dumps({"function": "groupby_sum", "mean_ms": total / ITERS, "iterations": ITERS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_groupby_transform.py b/benchmarks/pandas/bench_groupby_transform.py new file mode 100644 index 00000000..aa263534 --- /dev/null +++ b/benchmarks/pandas/bench_groupby_transform.py @@ -0,0 +1,19 @@ +"""Benchmark: GroupBy transform on 100k-row DataFrame""" +import json, time +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 +keys = [f"g{i % 100}" for i in range(ROWS)] +vals = [i * 0.1 for i in range(ROWS)] +df = pd.DataFrame({"key": keys, "value": vals}) + +for _ in range(WARMUP): + df.groupby("key")["value"].transform(lambda x: x) + +start = time.perf_counter() +for _ in range(ITERATIONS): + df.groupby("key")["value"].transform(lambda x: x) +total = (time.perf_counter() - start) * 1000 +print(json.dumps({"function": "groupby_transform", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_groupby_var.py b/benchmarks/pandas/bench_groupby_var.py new file mode 100644 index 00000000..cd5eba58 --- /dev/null +++ b/benchmarks/pandas/bench_groupby_var.py @@ -0,0 +1,19 @@ +"""Benchmark: GroupBy var on 100k-row DataFrame""" +import json, time +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 +keys = [f"g{i % 100}" for i in range(ROWS)] +vals = [i * 0.1 for i in range(ROWS)] +df = pd.DataFrame({"key": keys, "value": vals}) + +for _ in range(WARMUP): + df.groupby("key")["value"].var() + +start = time.perf_counter() +for _ in range(ITERATIONS): + df.groupby("key")["value"].var() +total = (time.perf_counter() - start) * 1000 +print(json.dumps({"function": "groupby_var", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total})) diff --git 
a/benchmarks/pandas/bench_histogram.py b/benchmarks/pandas/bench_histogram.py new file mode 100644 index 00000000..ec4551f6 --- /dev/null +++ b/benchmarks/pandas/bench_histogram.py @@ -0,0 +1,17 @@ +"""Benchmark: np.histogram on 100k-element array""" +import json, time +import numpy as np + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 +data = np.array([(i % 1000) * 0.1 for i in range(ROWS)]) + +for _ in range(WARMUP): + np.histogram(data, bins=50) + +start = time.perf_counter() +for _ in range(ITERATIONS): + np.histogram(data, bins=50) +total = (time.perf_counter() - start) * 1000 +print(json.dumps({"function": "histogram", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_idxmin_idxmax.py b/benchmarks/pandas/bench_idxmin_idxmax.py new file mode 100644 index 00000000..60a1fe98 --- /dev/null +++ b/benchmarks/pandas/bench_idxmin_idxmax.py @@ -0,0 +1,28 @@ +""" +Benchmark: Series.idxmin() / Series.idxmax() — index of min/max on a 100k-element Series. 
+Outputs JSON: {"function": "idxmin_idxmax", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import math +import time +import pandas as pd +import numpy as np + +SIZE = 100_000 +WARMUP = 5 +ITERATIONS = 50 + +data = np.array([math.sin(i * 0.01) * 1000 for i in range(SIZE)]) +s = pd.Series(data) + +for _ in range(WARMUP): + s.idxmin() + s.idxmax() + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.idxmin() + s.idxmax() +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({"function": "idxmin_idxmax", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_index_append.py b/benchmarks/pandas/bench_index_append.py new file mode 100644 index 00000000..d80264f2 --- /dev/null +++ b/benchmarks/pandas/bench_index_append.py @@ -0,0 +1,28 @@ +"""Benchmark: index_append — Index.append concatenating two indices""" +import json +import time +import pandas as pd + +ROWS = 50_000 +WARMUP = 3 +ITERATIONS = 10 + +data1 = [f"key_{i}" for i in range(ROWS)] +data2 = [f"key_{ROWS + i}" for i in range(ROWS)] +idx1 = pd.Index(data1) +idx2 = pd.Index(data2) + +for _ in range(WARMUP): + idx1.append(idx2) + +start = time.perf_counter() +for _ in range(ITERATIONS): + idx1.append(idx2) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "index_append", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_index_arg_sort.py b/benchmarks/pandas/bench_index_arg_sort.py new file mode 100644 index 00000000..075f1b5d --- /dev/null +++ b/benchmarks/pandas/bench_index_arg_sort.py @@ -0,0 +1,26 @@ +"""Benchmark: index_arg_sort — Index.argsort on 100k-element Index""" +import json, time +import pandas as pd +import numpy as np + +SIZE = 100_000 +WARMUP = 3 +ITERATIONS = 20 + +labels = np.arange(SIZE, 0, -1) +idx = pd.Index(labels) + +for _ in range(WARMUP): + idx.argsort() + +start = 
time.perf_counter() +for _ in range(ITERATIONS): + idx.argsort() +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "index_arg_sort", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_index_argmin_argmax.py b/benchmarks/pandas/bench_index_argmin_argmax.py new file mode 100644 index 00000000..f06ed3f1 --- /dev/null +++ b/benchmarks/pandas/bench_index_argmin_argmax.py @@ -0,0 +1,28 @@ +"""Benchmark: index_argmin_argmax — Index.argmin and Index.argmax on 100k-element Index""" +import json, time +import pandas as pd +import numpy as np + +SIZE = 100_000 +WARMUP = 3 +ITERATIONS = 20 + +labels = np.arange(SIZE) +idx = pd.Index(labels) + +for _ in range(WARMUP): + idx.argmin() + idx.argmax() + +start = time.perf_counter() +for _ in range(ITERATIONS): + idx.argmin() + idx.argmax() +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "index_argmin_argmax", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_index_contains.py b/benchmarks/pandas/bench_index_contains.py new file mode 100644 index 00000000..187012b9 --- /dev/null +++ b/benchmarks/pandas/bench_index_contains.py @@ -0,0 +1,31 @@ +"""Benchmark: Index.isin on 100k-element Index""" +import json, time +import numpy as np +import pandas as pd + +SIZE = 100_000 +WARMUP = 3 +ITERATIONS = 20 + +labels = np.arange(SIZE) +idx = pd.Index(labels) +lookups = np.arange(0, 1000) * 100 + +for _ in range(WARMUP): + for lbl in lookups[:10]: + lbl in idx + idx.isin(lookups) + +start = time.perf_counter() +for _ in range(ITERATIONS): + for lbl in lookups[:10]: + lbl in idx + idx.isin(lookups) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "index_contains", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git 
# ===== benchmarks/pandas/bench_index_copy_toarray.py =====
"""Benchmark: Index copy and tolist on 100k-element Index"""
import json, time
import pandas as pd

ROWS = 100_000
WARMUP = 3
ITERATIONS = 10

idx = pd.Index(range(ROWS))

# Warm caches / JIT-ish effects before timing.
for _ in range(WARMUP):
    idx.copy()
    idx.tolist()

start = time.perf_counter()
for _ in range(ITERATIONS):
    idx.copy()
    idx.tolist()
total = (time.perf_counter() - start) * 1000

print(json.dumps({
    "function": "index_copy_toarray",
    "mean_ms": total / ITERATIONS,
    "iterations": ITERATIONS,
    "total_ms": total,
}))

# ===== benchmarks/pandas/bench_index_delete_drop.py =====
"""Benchmark: index_delete_drop — Index.delete and Index.drop on 100k-element Index"""
import json, time
import pandas as pd
import numpy as np

SIZE = 100_000
WARMUP = 3
ITERATIONS = 20

labels = np.arange(SIZE)
idx = pd.Index(labels)

for _ in range(WARMUP):
    idx.delete(500)
    idx.drop([100, 200, 300, 400, 500])

start = time.perf_counter()
for _ in range(ITERATIONS):
    idx.delete(500)
    idx.drop([100, 200, 300, 400, 500])
total = (time.perf_counter() - start) * 1000

print(json.dumps({
    "function": "index_delete_drop",
    "mean_ms": total / ITERATIONS,
    "iterations": ITERATIONS,
    "total_ms": total,
}))

# ===== benchmarks/pandas/bench_index_drop_duplicates.py =====
"""Benchmark: index_drop_duplicates — Index.drop_duplicates on 100k Index with 50% dupes"""
import json, time
import pandas as pd
import numpy as np

SIZE = 100_000
WARMUP = 3
ITERATIONS = 20

# Modulo folds the range in half, so every label appears exactly twice.
labels = np.arange(SIZE) % (SIZE // 2)
idx = pd.Index(labels)

for _ in range(WARMUP):
    idx.drop_duplicates()

start = time.perf_counter()
for _ in range(ITERATIONS):
    idx.drop_duplicates()
total = (time.perf_counter() - start) * 1000

print(json.dumps({
    "function": "index_drop_duplicates",
    "mean_ms": total / ITERATIONS,
    "iterations": ITERATIONS,
    "total_ms": total,
}))

# ===== benchmarks/pandas/bench_index_duplicated.py =====
"""Benchmark: index_duplicated — pandas Index.duplicated() on 100k-element Index with duplicates"""
import json
import time
import pandas as pd

ROWS = 100_000
WARMUP = 3
ITERATIONS = 10

# 90k distinct values over 100k rows -> 10k duplicates.
idx = pd.Index([i % 90_000 for i in range(ROWS)])

for _ in range(WARMUP):
    _ = idx.duplicated(keep="first")

start = time.perf_counter()
for _ in range(ITERATIONS):
    _ = idx.duplicated(keep="first")
total = (time.perf_counter() - start) * 1000

print(json.dumps({"function": "index_duplicated", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total}))

# ===== benchmarks/pandas/bench_index_equals_identical.py =====
"""Benchmark: index_equals_identical — Index.equals and Index.identical on 100k-element Index"""
import json, time
import pandas as pd
import numpy as np

SIZE = 100_000
WARMUP = 3
ITERATIONS = 20

labels = np.arange(SIZE)
idx = pd.Index(labels)
# Distinct underlying buffer so comparison cannot short-circuit on identity.
idx2 = pd.Index(labels.copy())

for _ in range(WARMUP):
    idx.equals(idx2)
    idx.identical(idx2)

start = time.perf_counter()
for _ in range(ITERATIONS):
    idx.equals(idx2)
    idx.identical(idx2)
total = (time.perf_counter() - start) * 1000

print(json.dumps({
    "function": "index_equals_identical",
    "mean_ms": total / ITERATIONS,
    "iterations": ITERATIONS,
    "total_ms": total,
}))
# ===== benchmarks/pandas/bench_index_fillna.py =====
"""Benchmark: index_fillna — Index.fillna replacing null values on 100k-element index"""
import json
import time
import pandas as pd
import numpy as np

ROWS = 100_000
WARMUP = 3
ITERATIONS = 10

# Every 10th label is None, the rest are strings.
data = [None if i % 10 == 0 else f"key_{i}" for i in range(ROWS)]
idx = pd.Index(data)

for _ in range(WARMUP):
    idx.fillna("missing")

start = time.perf_counter()
for _ in range(ITERATIONS):
    idx.fillna("missing")
total = (time.perf_counter() - start) * 1000

print(json.dumps({
    "function": "index_fillna",
    "mean_ms": total / ITERATIONS,
    "iterations": ITERATIONS,
    "total_ms": total,
}))

# ===== benchmarks/pandas/bench_index_getindexer.py =====
"""Benchmark: index_getindexer — pd.Index.get_indexer(target) on 10k-element Index"""
import json, time
import numpy as np
import pandas as pd

ROWS = 10_000
WARMUP = 3
ITERATIONS = 10

base_idx = pd.Index(np.arange(ROWS, dtype=float))
target_idx = pd.Index(np.arange(1000, dtype=float) * 10)

for _ in range(WARMUP):
    base_idx.get_indexer(target_idx)

start = time.perf_counter()
for _ in range(ITERATIONS):
    base_idx.get_indexer(target_idx)
total = (time.perf_counter() - start) * 1000

print(json.dumps({
    "function": "index_getindexer",
    "mean_ms": total / ITERATIONS,
    "iterations": ITERATIONS,
    "total_ms": total,
}))

# ===== benchmarks/pandas/bench_index_getloc.py =====
"""Benchmark: Index.get_loc (pandas equivalent)."""
import json
import time
import pandas as pd

SIZE = 10_000
WARMUP = 5
ITERATIONS = 30

idx = pd.Index(range(SIZE))

for _ in range(WARMUP):
    idx.get_loc(5000)

t0 = time.perf_counter()
for i in range(ITERATIONS):
    # Vary the key each iteration so the lookup is not trivially cached.
    idx.get_loc(i % SIZE)
total = (time.perf_counter() - t0) * 1000

print(json.dumps({"function": "index_getloc", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total}))

# ===== benchmarks/pandas/bench_index_insert.py =====
"""Benchmark: index_insert — Index.insert on 100k-element Index"""
import json, time
import pandas as pd
import numpy as np

SIZE = 100_000
WARMUP = 3
ITERATIONS = 20

labels = np.arange(SIZE)
idx = pd.Index(labels)

for _ in range(WARMUP):
    idx.insert(500, 999_999)
    idx.insert(0, -1)

start = time.perf_counter()
for _ in range(ITERATIONS):
    idx.insert(500, 999_999)
    idx.insert(0, -1)
total = (time.perf_counter() - start) * 1000

print(json.dumps({
    "function": "index_insert",
    "mean_ms": total / ITERATIONS,
    "iterations": ITERATIONS,
    "total_ms": total,
}))

# ===== benchmarks/pandas/bench_index_isin.py =====
"""Benchmark: index_isin — pandas Index.isin() membership check on 100k-element Index"""
import json
import time
import pandas as pd

ROWS = 100_000
WARMUP = 3
ITERATIONS = 10

idx = pd.Index(range(ROWS))
# Every 100th value -> 1k lookups against 100k labels.
lookup = list(range(0, ROWS, 100))

for _ in range(WARMUP):
    _ = idx.isin(lookup)

start = time.perf_counter()
for _ in range(ITERATIONS):
    _ = idx.isin(lookup)
total = (time.perf_counter() - start) * 1000

print(json.dumps({"function": "index_isin", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total}))
# ===== benchmarks/pandas/bench_index_isna_dropna.py =====
"""Benchmark: index_isna_dropna — Index.isna and Index.dropna on 100k-element Index with nulls"""
import json, time
import pandas as pd
import numpy as np

SIZE = 100_000
WARMUP = 3
ITERATIONS = 20

# Every 5th label is None (20% nulls).
labels = [None if i % 5 == 0 else i for i in range(SIZE)]
idx = pd.Index(labels)

for _ in range(WARMUP):
    idx.isna()
    idx.dropna()

start = time.perf_counter()
for _ in range(ITERATIONS):
    idx.isna()
    idx.dropna()
total = (time.perf_counter() - start) * 1000

print(json.dumps({
    "function": "index_isna_dropna",
    "mean_ms": total / ITERATIONS,
    "iterations": ITERATIONS,
    "total_ms": total,
}))

# ===== benchmarks/pandas/bench_index_min_max.py =====
"""Benchmark: index_min_max — Index.min and Index.max on 100k-element Index"""
import json, time
import pandas as pd
import numpy as np

SIZE = 100_000
WARMUP = 3
ITERATIONS = 20

labels = np.arange(SIZE)
idx = pd.Index(labels)

for _ in range(WARMUP):
    idx.min()
    idx.max()

start = time.perf_counter()
for _ in range(ITERATIONS):
    idx.min()
    idx.max()
total = (time.perf_counter() - start) * 1000

print(json.dumps({
    "function": "index_min_max",
    "mean_ms": total / ITERATIONS,
    "iterations": ITERATIONS,
    "total_ms": total,
}))

# ===== benchmarks/pandas/bench_index_monotonic.py =====
"""Benchmark: Index.is_monotonic_increasing, is_monotonic_decreasing, is_unique on 100k Index"""
import json, time
import pandas as pd

N = 100_000
WARMUP = 3
ITERATIONS = 10
idx_inc = pd.Index(range(N))
idx_dec = pd.Index(range(N, 0, -1))

for _ in range(WARMUP):
    idx_inc.is_monotonic_increasing
    idx_dec.is_monotonic_decreasing
    idx_inc.is_unique

start = time.perf_counter()
for _ in range(ITERATIONS):
    idx_inc.is_monotonic_increasing
    idx_dec.is_monotonic_decreasing
    idx_inc.is_unique
total = (time.perf_counter() - start) * 1000
print(json.dumps({"function": "index_monotonic", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total}))

# ===== benchmarks/pandas/bench_index_nunique.py =====
"""Benchmark: index_nunique — Index.nunique on 100k-element Index with 50% unique values"""
import json, time
import pandas as pd
import numpy as np

SIZE = 100_000
WARMUP = 3
ITERATIONS = 20

labels = np.arange(SIZE) % (SIZE // 2)
idx = pd.Index(labels)

for _ in range(WARMUP):
    idx.nunique()

start = time.perf_counter()
for _ in range(ITERATIONS):
    idx.nunique()
total = (time.perf_counter() - start) * 1000

print(json.dumps({
    "function": "index_nunique",
    "mean_ms": total / ITERATIONS,
    "iterations": ITERATIONS,
    "total_ms": total,
}))

# ===== benchmarks/pandas/bench_index_ops.py =====
"""Benchmark: Index set operations (union, intersection, difference) on 50k-element Index"""
import json, time
import numpy as np
import pandas as pd

SIZE = 50_000
WARMUP = 3
ITERATIONS = 20

# Half-overlapping ranges so each set op does non-trivial work.
labelsA = np.arange(SIZE)
labelsB = np.arange(SIZE // 2, SIZE + SIZE // 2)
idxA = pd.Index(labelsA)
idxB = pd.Index(labelsB)

for _ in range(WARMUP):
    idxA.union(idxB)
    idxA.intersection(idxB)
    idxA.difference(idxB)

start = time.perf_counter()
for _ in range(ITERATIONS):
    idxA.union(idxB)
    idxA.intersection(idxB)
    idxA.difference(idxB)
total = (time.perf_counter() - start) * 1000

print(json.dumps({
    "function": "index_ops",
    "mean_ms": total / ITERATIONS,
    "iterations": ITERATIONS,
    "total_ms": total,
}))
# ===== benchmarks/pandas/bench_index_rename.py =====
"""Benchmark: index_rename — Index.rename changing the index name"""
import json
import time
import pandas as pd

ROWS = 100_000
WARMUP = 3
ITERATIONS = 10

data = [f"key_{i}" for i in range(ROWS)]
idx = pd.Index(data, name="original_name")

for _ in range(WARMUP):
    idx.rename("new_name")

start = time.perf_counter()
for _ in range(ITERATIONS):
    idx.rename("new_name")
total = (time.perf_counter() - start) * 1000

print(json.dumps({
    "function": "index_rename",
    "mean_ms": total / ITERATIONS,
    "iterations": ITERATIONS,
    "total_ms": total,
}))

# ===== benchmarks/pandas/bench_index_slice_take.py =====
"""Benchmark: index_slice_take — Index slice and take on 100k-element Index"""
import json, time
import pandas as pd
import numpy as np

SIZE = 100_000
WARMUP = 3
ITERATIONS = 20

labels = np.arange(SIZE)
idx = pd.Index(labels)
positions = np.arange(0, SIZE, 100)

for _ in range(WARMUP):
    idx[0:50_000]
    idx.take(positions)

start = time.perf_counter()
for _ in range(ITERATIONS):
    idx[0:50_000]
    idx.take(positions)
total = (time.perf_counter() - start) * 1000

print(json.dumps({
    "function": "index_slice_take",
    "mean_ms": total / ITERATIONS,
    "iterations": ITERATIONS,
    "total_ms": total,
}))

# ===== benchmarks/pandas/bench_index_sort.py =====
"""Benchmark: Index.sort_values on 100k-element Index"""
import json, time
import numpy as np
import pandas as pd

SIZE = 100_000
WARMUP = 3
ITERATIONS = 20

# Reversed so sort_values always has real work to do.
labels = np.arange(SIZE)[::-1]
idx = pd.Index(labels)

for _ in range(WARMUP):
    idx.sort_values()

start = time.perf_counter()
for _ in range(ITERATIONS):
    idx.sort_values()
total = (time.perf_counter() - start) * 1000

print(json.dumps({
    "function": "index_sort",
    "mean_ms": total / ITERATIONS,
    "iterations": ITERATIONS,
    "total_ms": total,
}))

# ===== benchmarks/pandas/bench_index_symmetric_diff.py =====
"""Benchmark: pandas Index.symmetric_difference on 10k-element integer indexes"""
import json, time
import pandas as pd

N = 10_000
a = pd.Index(range(N))
b = pd.Index(range(N // 2, N + N // 2))

WARMUP = 3
ITERATIONS = 50

for _ in range(WARMUP):
    a.symmetric_difference(b)

start = time.perf_counter()
for _ in range(ITERATIONS):
    a.symmetric_difference(b)
total = (time.perf_counter() - start) * 1000
print(json.dumps({"function": "index_symmetric_diff", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total}))

# ===== benchmarks/pandas/bench_interpolate.py =====
"""Benchmark: Series.interpolate() — linear interpolation over NaN values."""
import json, time
import pandas as pd
import math

SIZE = 100_000
WARMUP = 5
ITERATIONS = 50

# Every 5th value is NaN so interpolation has gaps to fill.
data = [float(i) if i % 5 != 0 else math.nan for i in range(SIZE)]
s = pd.Series(data)

for _ in range(WARMUP):
    s.interpolate(method="linear")

times = []
for _ in range(ITERATIONS):
    t0 = time.perf_counter()
    s.interpolate(method="linear")
    times.append((time.perf_counter() - t0) * 1000)

total_ms = sum(times)
print(json.dumps({"function": "interpolate", "mean_ms": round(total_ms / ITERATIONS, 3), "iterations": ITERATIONS, "total_ms": round(total_ms, 3)}))
# ===== benchmarks/pandas/bench_interval.py =====
"""Benchmark: Interval / IntervalIndex — closed/open intervals."""
import json, time
import pandas as pd

SIZE = 10_000
WARMUP = 5
ITERATIONS = 50

intervals = [pd.Interval(i, i + 1) for i in range(SIZE)]
breaks = list(range(1_001))

# Warm-up touches a 100-interval subset; timed loop covers all of them.
for _ in range(WARMUP):
    for iv in intervals[:100]:
        iv.mid in iv
        _ = iv.length
        str(iv)
    pd.IntervalIndex.from_breaks(breaks)

times = []
for _ in range(ITERATIONS):
    t0 = time.perf_counter()
    for iv in intervals:
        iv.mid in iv
        _ = iv.length
        str(iv)
    pd.IntervalIndex.from_breaks(breaks)
    times.append((time.perf_counter() - t0) * 1000)

total_ms = sum(times)
print(json.dumps({"function": "interval", "mean_ms": round(total_ms / ITERATIONS, 3), "iterations": ITERATIONS, "total_ms": round(total_ms, 3)}))

# ===== benchmarks/pandas/bench_isin.py =====
"""Benchmark: Series.isin() — membership test."""
import json, time
import pandas as pd

SIZE = 100_000
WARMUP = 5
ITERATIONS = 50

s = pd.Series([i % 5000 for i in range(SIZE)])
# Half of the 5000 distinct values are in the lookup set.
test_set = list(range(0, 2500))

for _ in range(WARMUP):
    s.isin(test_set)

times = []
for _ in range(ITERATIONS):
    t0 = time.perf_counter()
    s.isin(test_set)
    times.append((time.perf_counter() - t0) * 1000)

total_ms = sum(times)
print(json.dumps({"function": "isin", "mean_ms": round(total_ms / ITERATIONS, 3), "iterations": ITERATIONS, "total_ms": round(total_ms, 3)}))

# ===== benchmarks/pandas/bench_json_normalize.py =====
"""Benchmark: json_normalize — flatten nested JSON to a flat DataFrame."""
import json, time
import pandas as pd

SIZE = 1_000
WARMUP = 5
ITERATIONS = 50

records = [
    {"id": i, "name": f"user_{i}", "address": {"city": f"city_{i % 10}", "zip": str(10000 + i)}, "scores": [i, i + 1, i + 2]}
    for i in range(SIZE)
]

for _ in range(WARMUP):
    pd.json_normalize(records, max_level=2)

times = []
for _ in range(ITERATIONS):
    t0 = time.perf_counter()
    pd.json_normalize(records, max_level=2)
    times.append((time.perf_counter() - t0) * 1000)

total_ms = sum(times)
print(json.dumps({"function": "json_normalize", "mean_ms": round(total_ms / ITERATIONS, 3), "iterations": ITERATIONS, "total_ms": round(total_ms, 3)}))

# ===== benchmarks/pandas/bench_make_formatter.py =====
# Benchmarks the cost of constructing formatter closures (not applying them).
import pandas as pd, time, json
WARMUP = 3
ITERS = 10_000


def make_float_fmt(d):
    # Fixed-precision float formatter.
    return lambda x: f"{x:.{d}f}"


def make_pct_fmt(d):
    # Percentage formatter (scales by 100).
    return lambda x: f"{x*100:.{d}f}%"


def make_curr_fmt(sym, d):
    # Currency formatter with a leading symbol.
    return lambda x: f"{sym}{x:.{d}f}"


for _ in range(WARMUP):
    make_float_fmt(2)
    make_pct_fmt(1)
    make_curr_fmt("$", 2)

t0 = time.perf_counter()
for _ in range(ITERS):
    make_float_fmt(2)
    make_pct_fmt(1)
    make_curr_fmt("$", 2)
total = (time.perf_counter() - t0) * 1000
print(json.dumps({"function": "make_formatter", "mean_ms": total / ITERS, "iterations": ITERS, "total_ms": total}))
# ===== benchmarks/pandas/bench_mask.py =====
# Benchmark: Series.mask() replacing negatives with 0.0 on a 100k-element Series.
import pandas as pd, json, time, numpy as np

rng = np.random.default_rng(42)  # fixed seed for reproducible data
s = pd.Series(rng.standard_normal(100_000))
cond = s < 0

for _ in range(3):
    s.mask(cond, 0.0)

N = 100
t0 = time.perf_counter()
for _ in range(N):
    s.mask(cond, 0.0)
elapsed = time.perf_counter() - t0
print(json.dumps({"function": "mask", "mean_ms": elapsed / N * 1000, "iterations": N, "total_ms": elapsed * 1000}))

# ===== benchmarks/pandas/bench_melt.py =====
"""Benchmark: melt (wide to long) on 10k-row DataFrame"""
import json, time
import numpy as np
import pandas as pd

ROWS = 10_000
WARMUP = 3
ITERATIONS = 10

df = pd.DataFrame({
    "A": np.arange(ROWS) * 0.1,
    "B": np.arange(ROWS) * 0.2,
    "C": np.arange(ROWS) * 0.3,
})

for _ in range(WARMUP):
    df.melt(value_vars=["A", "B", "C"])

start = time.perf_counter()
for _ in range(ITERATIONS):
    df.melt(value_vars=["A", "B", "C"])
total = (time.perf_counter() - start) * 1000

print(json.dumps({
    "function": "melt",
    "mean_ms": total / ITERATIONS,
    "iterations": ITERATIONS,
    "total_ms": total,
}))

# ===== benchmarks/pandas/bench_merge_inner.py =====
"""Benchmark: pd.merge(left, right, how='inner') on 50k-row DataFrames."""
import json, time
import numpy as np
import pandas as pd

ROWS = 50_000
WARMUP = 3
ITERATIONS = 10

# Keys overlap by 40k rows (right ids are shifted by 10000).
left = pd.DataFrame({"id": np.arange(ROWS), "val": np.arange(ROWS) * 1.5})
right = pd.DataFrame({"id": np.arange(ROWS) + 10000, "extra": np.arange(ROWS) * 2.0})

for _ in range(WARMUP):
    pd.merge(left, right, on="id", how="inner")

times = []
for _ in range(ITERATIONS):
    t0 = time.perf_counter()
    pd.merge(left, right, on="id", how="inner")
    times.append(time.perf_counter() - t0)
total = sum(times) * 1000
print(json.dumps({"function": "merge_inner", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total}))

# ===== benchmarks/pandas/bench_merge_left.py =====
"""Benchmark: pd.merge(left, right, how='left') on 50k-row DataFrames."""
import json, time
import numpy as np
import pandas as pd

ROWS = 50_000
WARMUP = 3
ITERATIONS = 10

left = pd.DataFrame({"id": np.arange(ROWS), "val": np.arange(ROWS) * 1.5})
# Right keys repeat (modulo), producing a many-to-one left join.
right = pd.DataFrame({"id": np.arange(ROWS) % (ROWS // 2), "extra": np.arange(ROWS) * 2.0})

for _ in range(WARMUP):
    pd.merge(left, right, on="id", how="left")

times = []
for _ in range(ITERATIONS):
    t0 = time.perf_counter()
    pd.merge(left, right, on="id", how="left")
    times.append(time.perf_counter() - t0)
total = sum(times) * 1000
print(json.dumps({"function": "merge_left", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total}))

# ===== benchmarks/pandas/bench_merge_left_on_right_on.py =====
"""Benchmark: merge with left_on/right_on (pandas equivalent)."""
import json
import time
import pandas as pd

ROWS = 20_000
WARMUP = 3
ITERATIONS = 10

left = pd.DataFrame({
    "emp_id": list(range(ROWS)),
    "salary": [30000 + i * 10 for i in range(ROWS)],
})
right = pd.DataFrame({
    "id": list(range(ROWS // 2)),
    "dept": [f"dept{i % 10}" for i in range(ROWS // 2)],
})

for _ in range(WARMUP):
    pd.merge(left, right, left_on="emp_id", right_on="id")

t0 = time.perf_counter()
for _ in range(ITERATIONS):
    pd.merge(left, right, left_on="emp_id", right_on="id")
total = (time.perf_counter() - t0) * 1000

print(json.dumps({"function": "merge_left_on_right_on", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total}))
# ===== benchmarks/pandas/bench_merge_outer.py =====
"""Benchmark: pd.merge(left, right, how='outer') on 30k-row DataFrames."""
import json, time
import numpy as np
import pandas as pd

ROWS = 30_000
WARMUP = 3
ITERATIONS = 10

# Half-overlapping keys so the outer join produces rows unique to each side.
left = pd.DataFrame({"id": np.arange(ROWS), "val": np.arange(ROWS) * 1.5})
right = pd.DataFrame({"id": np.arange(ROWS) + ROWS // 2, "extra": np.arange(ROWS) * 2.0})

for _ in range(WARMUP):
    pd.merge(left, right, on="id", how="outer")

times = []
for _ in range(ITERATIONS):
    t0 = time.perf_counter()
    pd.merge(left, right, on="id", how="outer")
    times.append(time.perf_counter() - t0)
total = sum(times) * 1000
print(json.dumps({"function": "merge_outer", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total}))

# ===== benchmarks/pandas/bench_merge_right.py =====
"""Benchmark: pd.merge(left, right, how='right') on 50k-row DataFrames."""
import json, time
import numpy as np
import pandas as pd

ROWS = 50_000
WARMUP = 3
ITERATIONS = 10

# Left keys repeat (modulo), producing a many-to-one right join.
left = pd.DataFrame({"id": np.arange(ROWS) % (ROWS // 2), "val": np.arange(ROWS) * 1.5})
right = pd.DataFrame({"id": np.arange(ROWS), "extra": np.arange(ROWS) * 2.0})

for _ in range(WARMUP):
    pd.merge(left, right, on="id", how="right")

times = []
for _ in range(ITERATIONS):
    t0 = time.perf_counter()
    pd.merge(left, right, on="id", how="right")
    times.append(time.perf_counter() - t0)
total = sum(times) * 1000
print(json.dumps({"function": "merge_right", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total}))

# ===== benchmarks/pandas/bench_min_max_normalize.py =====
"""Benchmark: min-max normalization on 100k-element Series"""
import json, time
import numpy as np
import pandas as pd

ROWS = 100_000
WARMUP = 3
ITERATIONS = 10

data = np.sin(np.arange(ROWS) * 0.01) * 100 + 50
s = pd.Series(data)

for _ in range(WARMUP):
    (s - s.min()) / (s.max() - s.min())

start = time.perf_counter()
for _ in range(ITERATIONS):
    (s - s.min()) / (s.max() - s.min())
total = (time.perf_counter() - start) * 1000

print(json.dumps({
    "function": "min_max_normalize",
    "mean_ms": total / ITERATIONS,
    "iterations": ITERATIONS,
    "total_ms": total,
}))

# ===== benchmarks/pandas/bench_mode_series.py =====
"""
Benchmark: Series.mode() — mode of a 10k-element integer Series.
Outputs JSON: {"function": "mode_series", "mean_ms": ..., "iterations": ..., "total_ms": ...}
"""
import json
import time
import pandas as pd

SIZE = 10_000
WARMUP = 5
ITERATIONS = 50

data = [i % 200 for i in range(SIZE)]
s = pd.Series(data)

for _ in range(WARMUP):
    s.mode()

start = time.perf_counter()
for _ in range(ITERATIONS):
    s.mode()
total = (time.perf_counter() - start) * 1000

print(json.dumps({"function": "mode_series", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total}))
# ===== benchmarks/pandas/bench_move_column.py =====
"""Benchmark: move column (reindex) on a 100k-row DataFrame"""
import json, time
import pandas as pd

ROWS = 100_000
WARMUP = 3
ITERATIONS = 10
df = pd.DataFrame({"a": range(ROWS), "b": [i * 2 for i in range(ROWS)], "c": [i * 3 for i in range(ROWS)]})

for _ in range(WARMUP):
    # Move column "c" to the front, then select in that order.
    cols = ["c"] + [c for c in df.columns if c != "c"]
    df[cols]

start = time.perf_counter()
for _ in range(ITERATIONS):
    cols = ["c"] + [c for c in df.columns if c != "c"]
    df[cols]
total = (time.perf_counter() - start) * 1000
print(json.dumps({"function": "move_column", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total}))

# ===== benchmarks/pandas/bench_multi_index.py =====
"""Benchmark: MultiIndex construction on 100k pairs"""
import json, time
import pandas as pd

ROWS = 100_000
WARMUP = 3
ITERATIONS = 10
a = [f"a{i % 100}" for i in range(ROWS)]
b = [i % 1000 for i in range(ROWS)]
tuples = list(zip(a, b))

for _ in range(WARMUP):
    pd.MultiIndex.from_tuples(tuples)

start = time.perf_counter()
for _ in range(ITERATIONS):
    pd.MultiIndex.from_tuples(tuples)
total = (time.perf_counter() - start) * 1000
print(json.dumps({"function": "multi_index", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total}))

# ===== benchmarks/pandas/bench_multi_index_contains.py =====
"""Benchmark: MultiIndex.__contains__ (pandas equivalent)."""
import json
import time
import pandas as pd

SIZE = 5_000
WARMUP = 5
ITERATIONS = 50

arr1 = [f"a{i % 50}" for i in range(SIZE)]
arr2 = [i % 100 for i in range(SIZE)]
mi = pd.MultiIndex.from_arrays([arr1, arr2])

for _ in range(WARMUP):
    ("a0", 0) in mi

t0 = time.perf_counter()
for i in range(ITERATIONS):
    # Vary the probed key each iteration.
    (f"a{i % 50}", i % 100) in mi
total = (time.perf_counter() - t0) * 1000

print(json.dumps({"function": "multi_index_contains", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total}))

# ===== benchmarks/pandas/bench_multi_index_droplevel.py =====
"""Benchmark: MultiIndex droplevel, reorder_levels, set_names"""
import json, time
import pandas as pd

ROWS = 100_000
WARMUP = 3
ITERATIONS = 10

a = [f"a{i % 100}" for i in range(ROWS)]
b = [i % 1000 for i in range(ROWS)]
c = [i % 50 for i in range(ROWS)]
tuples = list(zip(a, b, c))
mi = pd.MultiIndex.from_tuples(tuples, names=["x", "y", "z"])

for _ in range(WARMUP):
    mi.droplevel(0)
    mi.reorder_levels([2, 1, 0])
    mi.set_names(["a", "b", "c"])

start = time.perf_counter()
for _ in range(ITERATIONS):
    mi.droplevel(0)
    mi.reorder_levels([2, 1, 0])
    mi.set_names(["a", "b", "c"])
total = (time.perf_counter() - start) * 1000

print(json.dumps({
    "function": "multi_index_droplevel",
    "mean_ms": total / ITERATIONS,
    "iterations": ITERATIONS,
    "total_ms": total,
}))
# ===== benchmarks/pandas/bench_multi_index_duplicated.py =====
"""Benchmark: MultiIndex.duplicated() and drop_duplicates() on 100k-pair MultiIndex"""
import json, time
import pandas as pd

ROWS = 100_000
WARMUP = 3
ITERATIONS = 10

# Create a MultiIndex with duplicates (10k unique pairs repeated 10 times)
a = [f"a{i % 100}" for i in range(ROWS)]
b = [i % 1000 for i in range(ROWS)]
tuples = list(zip(a, b))

mi = pd.MultiIndex.from_tuples(tuples)

for _ in range(WARMUP):
    mi.duplicated()
    mi.drop_duplicates()

start = time.perf_counter()
for _ in range(ITERATIONS):
    mi.duplicated()
    mi.drop_duplicates()
total = (time.perf_counter() - start) * 1000

print(json.dumps({
    "function": "multi_index_duplicated",
    "mean_ms": total / ITERATIONS,
    "iterations": ITERATIONS,
    "total_ms": total,
}))

# ===== benchmarks/pandas/bench_multi_index_fromarrays.py =====
"""Benchmark: MultiIndex.from_arrays (pandas equivalent)."""
import json
import time
import pandas as pd

SIZE = 5_000
WARMUP = 3
ITERATIONS = 20

arr1 = [f"a{i % 50}" for i in range(SIZE)]
arr2 = [i % 100 for i in range(SIZE)]

for _ in range(WARMUP):
    pd.MultiIndex.from_arrays([arr1, arr2])

t0 = time.perf_counter()
for _ in range(ITERATIONS):
    pd.MultiIndex.from_arrays([arr1, arr2])
total = (time.perf_counter() - t0) * 1000

print(json.dumps({"function": "multi_index_fromarrays", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total}))

# ===== benchmarks/pandas/bench_multi_index_fromproduct.py =====
"""Benchmark: MultiIndex.from_product (pandas equivalent)."""
import json
import time
import pandas as pd

WARMUP = 3
ITERATIONS = 30

# 50 x 100 cartesian product -> 5k pairs per construction.
level1 = [f"a{i}" for i in range(50)]
level2 = list(range(100))

for _ in range(WARMUP):
    pd.MultiIndex.from_product([level1, level2])

t0 = time.perf_counter()
for _ in range(ITERATIONS):
    pd.MultiIndex.from_product([level1, level2])
total = (time.perf_counter() - t0) * 1000

print(json.dumps({"function": "multi_index_fromproduct", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total}))

# ===== benchmarks/pandas/bench_multi_index_getloc.py =====
"""Benchmark: MultiIndex.get_loc key lookup"""
import json, time
import pandas as pd

ROWS = 100_000
WARMUP = 3
ITERATIONS = 10

a = [f"a{i % 100}" for i in range(ROWS)]
b = [i % 1000 for i in range(ROWS)]
tuples = list(zip(a, b))
mi = pd.MultiIndex.from_tuples(tuples)
key = ("a50", 500)

for _ in range(WARMUP):
    mi.get_loc(key)

start = time.perf_counter()
for _ in range(ITERATIONS):
    mi.get_loc(key)
total = (time.perf_counter() - start) * 1000

print(json.dumps({
    "function": "multi_index_getloc",
    "mean_ms": total / ITERATIONS,
    "iterations": ITERATIONS,
    "total_ms": total,
}))

# ===== benchmarks/pandas/bench_multi_index_isin.py =====
"""Benchmark: MultiIndex.isin() on 100k-pair MultiIndex"""
import json, time
import pandas as pd

ROWS = 100_000
WARMUP = 3
ITERATIONS = 10
a = [f"a{i % 100}" for i in range(ROWS)]
b = [i % 1000 for i in range(ROWS)]
tuples = list(zip(a, b))
mi = pd.MultiIndex.from_tuples(tuples)
lookup_tuples = [(f"a{i % 100}", i % 1000) for i in range(1000)]

for _ in range(WARMUP):
    mi.isin(lookup_tuples)

start = time.perf_counter()
for _ in range(ITERATIONS):
    mi.isin(lookup_tuples)
total = (time.perf_counter() - start) * 1000
print(json.dumps({"function": "multi_index_isin", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total}))
"""Benchmark: level-wise isna()/notna() and dropna() on 100k-pair MultiIndex with nulls"""
import json, time
import pandas as pd

ROWS = 100_000
WARMUP = 3
ITERATIONS = 10

# MultiIndex with nulls: every 10th level-0 value and every 20th level-1
# value is None (the level-1 nulls are a subset of the level-0 nulls).
a = [None if i % 10 == 0 else f"a{i % 100}" for i in range(ROWS)]
b = [None if i % 20 == 0 else i % 1000 for i in range(ROWS)]
tuples = list(zip(a, b))

mi = pd.MultiIndex.from_tuples(tuples)

def run_ops():
    # NOTE: calling mi.isna()/mi.notna() directly raises
    # NotImplementedError ("isna is not defined for MultiIndex") in pandas,
    # which crashed the original benchmark.  Benchmark the level-wise
    # equivalents instead; dropna() IS supported on MultiIndex.
    mi.get_level_values(0).isna()
    mi.get_level_values(1).isna()
    mi.get_level_values(0).notna()
    mi.get_level_values(1).notna()
    mi.dropna()

for _ in range(WARMUP):
    run_ops()

start = time.perf_counter()
for _ in range(ITERATIONS):
    run_ops()
total = (time.perf_counter() - start) * 1000

print(json.dumps({
    "function": "multi_index_isna_dropna",
    "mean_ms": total / ITERATIONS,
    "iterations": ITERATIONS,
    "total_ms": total,
}))
"""Benchmark: MultiIndex.set_names() on 100k-pair MultiIndex"""
import json, time
import pandas as pd

ROWS = 100_000
WARMUP = 3
ITERATIONS = 10

# Build the 100k-pair MultiIndex under test directly from pairs.
pairs = [(f"a{i % 100}", i % 1000) for i in range(ROWS)]
mi = pd.MultiIndex.from_tuples(pairs)

def rename_levels():
    """One benchmark unit: return a new index with fresh level names."""
    return mi.set_names(["level0", "level1"])

for _ in range(WARMUP):
    rename_levels()

start = time.perf_counter()
for _ in range(ITERATIONS):
    rename_levels()
total = (time.perf_counter() - start) * 1000
print(json.dumps({"function": "multi_index_set_names", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total}))
"""Benchmark: MultiIndex sort_values and equals on 100k-pair MultiIndex"""
import json, time
import pandas as pd

ROWS = 100_000
WARMUP = 3
ITERATIONS = 10

# Two independent but element-equal MultiIndexes.
pairs = [(f"a{i % 100}", i % 1000) for i in range(ROWS)]
mi = pd.MultiIndex.from_tuples(pairs)
mi2 = pd.MultiIndex.from_tuples(list(pairs))

def exercise():
    mi.sort_values()
    mi.equals(mi2)

for _ in range(WARMUP):
    exercise()

start = time.perf_counter()
for _ in range(ITERATIONS):
    exercise()
total = (time.perf_counter() - start) * 1000

print(json.dumps({
    "function": "multi_index_sort_equals",
    "mean_ms": total / ITERATIONS,
    "iterations": ITERATIONS,
    "total_ms": total,
}))
"""Benchmark: MultiIndex.to_flat_index() (equivalent of toArray()) on 100k-pair MultiIndex"""
import json, time
import pandas as pd

ROWS = 100_000
WARMUP = 3
ITERATIONS = 10

pairs = [(f"a{i % 100}", i % 1000) for i in range(ROWS)]
mi = pd.MultiIndex.from_tuples(pairs)

def flatten():
    """Collapse the two levels into a flat Index of tuples."""
    return mi.to_flat_index()

for _ in range(WARMUP):
    flatten()

start = time.perf_counter()
for _ in range(ITERATIONS):
    flatten()
total = (time.perf_counter() - start) * 1000
print(json.dumps({"function": "multi_index_to_array", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total}))
"""Benchmark: natural sort using natsort library (equivalent to natSorted/natArgSort)."""
import json, time

SIZE = 10_000
WARMUP = 5
ITERATIONS = 50

data = [f"item{i % 1000}_v{i % 10}" for i in range(SIZE)]

try:
    from natsort import natsorted, index_natsorted

    def run():
        natsorted(data)
        index_natsorted(data)
except ImportError:
    # Pure-python fallback: split digit runs and compare them numerically.
    import re

    def nat_key(s):
        return [int(c) if c.isdigit() else c.lower() for c in re.split(r'(\d+)', s)]

    def run():
        sorted(data, key=nat_key)
        sorted(range(len(data)), key=lambda i: nat_key(data[i]))

for _ in range(WARMUP):
    run()

per_iter = []
for _ in range(ITERATIONS):
    began = time.perf_counter()
    run()
    per_iter.append((time.perf_counter() - began) * 1000)

total_ms = sum(per_iter)
print(json.dumps({"function": "nat_sort", "mean_ms": round(total_ms / ITERATIONS, 3), "iterations": ITERATIONS, "total_ms": round(total_ms, 3)}))
"""Benchmark: notna/isna on 100k-element pandas Series with NaN"""
import json, time
import numpy as np
import pandas as pd

ROWS = 100_000
WARMUP = 3
ITERATIONS = 10

# Every 5th element is NaN.
values = [np.nan if idx % 5 == 0 else idx * 0.1 for idx in range(ROWS)]
s = pd.Series(values)

def check_null_masks():
    s.notna()
    s.isna()

for _ in range(WARMUP):
    check_null_masks()

start = time.perf_counter()
for _ in range(ITERATIONS):
    check_null_masks()
total = (time.perf_counter() - start) * 1000
print(json.dumps({"function": "notna_isna", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total}))
"""Benchmark: Series.pct_change() — percentage change between elements."""
import json, time
import pandas as pd

SIZE = 100_000
WARMUP = 5
ITERATIONS = 50

# Strictly increasing positive values, so every ratio is well-defined.
s = pd.Series([float(i * 1.1 + 1.0) for i in range(SIZE)])

for _ in range(WARMUP):
    s.pct_change()

samples = []
for _ in range(ITERATIONS):
    began = time.perf_counter()
    s.pct_change()
    samples.append((time.perf_counter() - began) * 1000)

total_ms = sum(samples)
print(json.dumps({"function": "pct_change", "mean_ms": round(total_ms / ITERATIONS, 3), "iterations": ITERATIONS, "total_ms": round(total_ms, 3)}))
"""Benchmark: scipy.stats.percentileofscore on a 100k-element list."""
import time, json
from scipy import stats as sp_stats

# NOTE: the original imported pandas but never used it — removed.
N = 100_000
WARMUP = 3
ITERS = 20

# Values 0.0 .. 99.9, each repeated 100 times.
data = [(i % 1000) * 0.1 for i in range(N)]

for _ in range(WARMUP):
    sp_stats.percentileofscore(data, 50.0)

t0 = time.perf_counter()
for _ in range(ITERS):
    sp_stats.percentileofscore(data, 50.0)
total = (time.perf_counter() - t0) * 1000
print(json.dumps({"function": "percentile_of_score", "mean_ms": total / ITERS, "iterations": ITERS, "total_ms": total}))
"""Benchmark: pipe with 3 transforms on a 100k-element pandas Series"""
import json, time
import pandas as pd

ROWS = 100_000
WARMUP = 3
ITERATIONS = 10

s = pd.Series([i * 0.5 for i in range(ROWS)])

# Transform stages, as named functions instead of lambdas.
def double(x):
    return x * 2

def add_one(x):
    return x + 1

def absfn(x):
    return x.abs()

def piped():
    return s.pipe(double).pipe(add_one).pipe(absfn)

for _ in range(WARMUP):
    piped()

start = time.perf_counter()
for _ in range(ITERATIONS):
    piped()
total = (time.perf_counter() - start) * 1000
print(json.dumps({"function": "pipe_bench", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total}))
"""Benchmark: Series.quantile at the 25/50/75th percentiles on a 100k Series."""
import pandas as pd, time, json

# NOTE: the original imported numpy but never used it — removed.
N = 100_000
WARMUP = 3
ITERS = 20

# Evenly spaced values 0.0 .. 99.999.
data = [i * 0.001 for i in range(N)]
s = pd.Series(data)

def quartiles():
    s.quantile(0.25)
    s.quantile(0.5)
    s.quantile(0.75)

for _ in range(WARMUP):
    quartiles()

t0 = time.perf_counter()
for _ in range(ITERS):
    quartiles()
total = (time.perf_counter() - t0) * 1000
print(json.dumps({"function": "quantile", "mean_ms": total / ITERS, "iterations": ITERS, "total_ms": total}))
"""
Benchmark: Series rank

Ranks a large numeric Series using average tie-breaking.
Outputs JSON: {"function": "rank", "mean_ms": ..., "iterations": ..., "total_ms": ...}
"""

import json
import time

import pandas as pd

SIZE = 100_000
WARMUP = 5
ITERATIONS = 50

# Every 3 consecutive entries tie, exercising the tie-breaking path.
s = pd.Series([float((i // 3) * 1.5) for i in range(SIZE)])

for _ in range(WARMUP):
    s.rank(method="average")

times: "list[float]" = []
for _ in range(ITERATIONS):
    t0 = time.perf_counter()
    s.rank(method="average")
    times.append((time.perf_counter() - t0) * 1000)

total_ms = sum(times)
print(json.dumps({
    "function": "rank",
    "mean_ms": round(total_ms / ITERATIONS, 3),
    "iterations": ITERATIONS,
    "total_ms": round(total_ms, 3),
}))
"""Benchmark: reorder DataFrame columns on a 100k-row DataFrame"""
import json, time
import pandas as pd

ROWS = 100_000
WARMUP = 3
ITERATIONS = 10

df = pd.DataFrame({
    "a": range(ROWS),
    "b": [i * 2 for i in range(ROWS)],
    "c": [i * 3 for i in range(ROWS)],
})

def reorder():
    """Select all columns in a new order (copies the frame)."""
    return df[["c", "a", "b"]]

for _ in range(WARMUP):
    reorder()

start = time.perf_counter()
for _ in range(ITERATIONS):
    reorder()
total = (time.perf_counter() - start) * 1000
print(json.dumps({"function": "reorder_columns", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total}))
"""Benchmark: Series.resample('1h').mean() on 100k minute-frequency points."""
import pandas as pd, json, time, numpy as np

POINTS = 100_000
WARMUP = 3
N = 50

rng = np.random.default_rng(42)  # fixed seed: reproducible data
idx = pd.date_range("2020-01-01", periods=POINTS, freq="1min")
s = pd.Series(rng.standard_normal(POINTS), index=idx)

def hourly_mean():
    return s.resample("1h").mean()

for _ in range(WARMUP):
    hourly_mean()

t0 = time.perf_counter()
for _ in range(N):
    hourly_mean()
elapsed = time.perf_counter() - t0
print(json.dumps({"function": "resample", "mean_ms": elapsed / N * 1000, "iterations": N, "total_ms": elapsed * 1000}))
"""Benchmark: rolling.apply on 10k-element pandas Series"""
import json, time
import numpy as np
import pandas as pd

ROWS = 10_000
WARMUP = 3
ITERATIONS = 10

s = pd.Series([i * 0.1 for i in range(ROWS)])

def rolling_mean_via_apply():
    # raw=True hands plain numpy arrays (not Series) to np.mean.
    return s.rolling(10).apply(np.mean, raw=True)

for _ in range(WARMUP):
    rolling_mean_via_apply()

start = time.perf_counter()
for _ in range(ITERATIONS):
    rolling_mean_via_apply()
total = (time.perf_counter() - start) * 1000
print(json.dumps({"function": "rolling_apply", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total}))
"""Benchmark: Series.sample(n=1000) from a 100k-element Series."""
import pandas as pd, json, time, numpy as np

rng = np.random.default_rng(42)
s = pd.Series(rng.standard_normal(100_000))

def draw():
    # Fixed random_state makes every draw identical and reproducible.
    return s.sample(n=1000, random_state=42)

for _ in range(3):
    draw()

N = 100
t0 = time.perf_counter()
for _ in range(N):
    draw()
elapsed = time.perf_counter() - t0
print(json.dumps({"function": "sample", "mean_ms": elapsed / N * 1000, "iterations": N, "total_ms": elapsed * 1000}))
"""Benchmark: DataFrame.select_dtypes — filter columns by dtype."""
import json, time
import pandas as pd

SIZE = 100_000
WARMUP = 5
ITERATIONS = 50

# Mixed dtypes: two int, one float, two object, one bool column.
df = pd.DataFrame({
    "a": list(range(SIZE)),
    "b": [i * 1.5 for i in range(SIZE)],
    "c": [f"str{i % 1000}" for i in range(SIZE)],
    "d": [i % 2 == 0 for i in range(SIZE)],
    "e": list(range(0, SIZE * 2, 2)),
    "f": [f"label{i % 100}" for i in range(SIZE)],
})

def select_all():
    df.select_dtypes(include=["number"])
    df.select_dtypes(include=["object"])
    df.select_dtypes(exclude=["bool"])

for _ in range(WARMUP):
    select_all()

times = []
for _ in range(ITERATIONS):
    t0 = time.perf_counter()
    select_all()
    times.append((time.perf_counter() - t0) * 1000)

total_ms = sum(times)
print(json.dumps({"function": "select_dtypes", "mean_ms": round(total_ms / ITERATIONS, 3), "iterations": ITERATIONS, "total_ms": round(total_ms, 3)}))
"""Benchmark: Series.abs() — element-wise absolute value."""
import json, time
import pandas as pd

SIZE = 100_000
WARMUP = 5
ITERATIONS = 50

# Centered range: half the values are negative.
s = pd.Series([float(v) for v in range(-50000, SIZE - 50000)])

for _ in range(WARMUP):
    s.abs()

times = []
for _ in range(ITERATIONS):
    began = time.perf_counter()
    s.abs()
    times.append((time.perf_counter() - began) * 1000)

total_ms = sum(times)
print(json.dumps({"function": "series_abs", "mean_ms": round(total_ms / ITERATIONS, 3), "iterations": ITERATIONS, "total_ms": round(total_ms, 3)}))
"series_apply", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_series_at_iat.py b/benchmarks/pandas/bench_series_at_iat.py new file mode 100644 index 00000000..1a5d22fa --- /dev/null +++ b/benchmarks/pandas/bench_series_at_iat.py @@ -0,0 +1,28 @@ +"""Benchmark: series_at_iat — pd.Series.at and .iat point access on 100k-element Series""" +import json, time +import numpy as np +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 + +data = np.arange(ROWS, dtype=float) * 1.5 +s = pd.Series(data) + +for _ in range(WARMUP): + for j in range(1000): s.iat[j] + for j in range(1000): s.at[j] + +start = time.perf_counter() +for _ in range(ITERATIONS): + for j in range(1000): s.iat[j] + for j in range(1000): s.at[j] +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "series_at_iat", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_series_compare.py b/benchmarks/pandas/bench_series_compare.py new file mode 100644 index 00000000..389e53d9 --- /dev/null +++ b/benchmarks/pandas/bench_series_compare.py @@ -0,0 +1,37 @@ +"""Benchmark: Series comparison operators (eq, ne, lt, gt, le, ge) on 100k Series""" +import json, time +import numpy as np +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 20 + +data = np.arange(ROWS) * 0.1 +s = pd.Series(data) +threshold = ROWS * 0.05 + +for _ in range(WARMUP): + s.eq(threshold) + s.ne(threshold) + s.lt(threshold) + s.gt(threshold) + s.le(threshold) + s.ge(threshold) + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.eq(threshold) + s.ne(threshold) + s.lt(threshold) + s.gt(threshold) + s.le(threshold) + s.ge(threshold) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "series_compare", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git 
a/benchmarks/pandas/bench_series_copy.py b/benchmarks/pandas/bench_series_copy.py new file mode 100644 index 00000000..b4f12e3d --- /dev/null +++ b/benchmarks/pandas/bench_series_copy.py @@ -0,0 +1,19 @@ +"""Benchmark: Series.copy() on 100k Series.""" +import json, time +import numpy as np +import pandas as pd + +SIZE = 100_000 +WARMUP = 5 +ITERATIONS = 50 + +s = pd.Series(np.arange(SIZE) * 0.5, name="original") +for _ in range(WARMUP): s.copy() + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + s.copy() + times.append(time.perf_counter() - t0) +total = sum(times) * 1000 +print(json.dumps({ "function": "series_copy", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total })) diff --git a/benchmarks/pandas/bench_series_corr.py b/benchmarks/pandas/bench_series_corr.py new file mode 100644 index 00000000..5246281d --- /dev/null +++ b/benchmarks/pandas/bench_series_corr.py @@ -0,0 +1,22 @@ +"""Benchmark: Series.corr(other) Pearson correlation on 100k-element Series.""" +import json, time +import numpy as np +import pandas as pd + +SIZE = 100_000 +WARMUP = 3 +ITERATIONS = 20 + +rng = np.random.default_rng(42) +a = pd.Series(np.arange(SIZE) * 0.1) +b = pd.Series(np.arange(SIZE) * 0.2 + rng.random(SIZE)) + +for _ in range(WARMUP): a.corr(b) + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + a.corr(b) + times.append(time.perf_counter() - t0) +total = sum(times) * 1000 +print(json.dumps({ "function": "series_corr", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total })) diff --git a/benchmarks/pandas/bench_series_count.py b/benchmarks/pandas/bench_series_count.py new file mode 100644 index 00000000..2f949499 --- /dev/null +++ b/benchmarks/pandas/bench_series_count.py @@ -0,0 +1,20 @@ +"""Benchmark: Series.count() — non-NA count on 100k Series with some NAs.""" +import json, time +import numpy as np +import pandas as pd + +SIZE = 100_000 +WARMUP = 10 +ITERATIONS = 100 + +data = 
np.where(np.arange(SIZE) % 5 == 0, np.nan, np.arange(SIZE, dtype=float)) +s = pd.Series(data) +for _ in range(WARMUP): s.count() + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + s.count() + times.append(time.perf_counter() - t0) +total = sum(times) * 1000 +print(json.dumps({ "function": "series_count", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total })) diff --git a/benchmarks/pandas/bench_series_describe.py b/benchmarks/pandas/bench_series_describe.py new file mode 100644 index 00000000..e20b5ba2 --- /dev/null +++ b/benchmarks/pandas/bench_series_describe.py @@ -0,0 +1,19 @@ +"""Benchmark: Series.describe() — summary statistics on 100k Series.""" +import json, time +import numpy as np +import pandas as pd + +SIZE = 100_000 +WARMUP = 3 +ITERATIONS = 20 + +s = pd.Series((np.arange(SIZE) * 1.1) % 9999) +for _ in range(WARMUP): s.describe() + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + s.describe() + times.append(time.perf_counter() - t0) +total = sum(times) * 1000 +print(json.dumps({ "function": "series_describe", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total })) diff --git a/benchmarks/pandas/bench_series_digitize.py b/benchmarks/pandas/bench_series_digitize.py new file mode 100644 index 00000000..368ccc20 --- /dev/null +++ b/benchmarks/pandas/bench_series_digitize.py @@ -0,0 +1,18 @@ +"""Benchmark: np.digitize on 100k-element array""" +import json, time +import numpy as np + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 +data = np.array([i * 0.001 for i in range(ROWS)]) +bins = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100] + +for _ in range(WARMUP): + np.digitize(data, bins) + +start = time.perf_counter() +for _ in range(ITERATIONS): + np.digitize(data, bins) +total = (time.perf_counter() - start) * 1000 +print(json.dumps({"function": "series_digitize", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total})) diff --git 
a/benchmarks/pandas/bench_series_dropna.py b/benchmarks/pandas/bench_series_dropna.py new file mode 100644 index 00000000..8c214b16 --- /dev/null +++ b/benchmarks/pandas/bench_series_dropna.py @@ -0,0 +1,20 @@ +"""Benchmark: Series.dropna() on 100k Series with ~20% NAs.""" +import json, time +import numpy as np +import pandas as pd + +SIZE = 100_000 +WARMUP = 5 +ITERATIONS = 30 + +data = np.where(np.arange(SIZE) % 5 == 0, np.nan, np.arange(SIZE, dtype=float)) +s = pd.Series(data) +for _ in range(WARMUP): s.dropna() + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + s.dropna() + times.append(time.perf_counter() - t0) +total = sum(times) * 1000 +print(json.dumps({ "function": "series_dropna", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total })) diff --git a/benchmarks/pandas/bench_series_dt_strftime.py b/benchmarks/pandas/bench_series_dt_strftime.py new file mode 100644 index 00000000..f5ad146d --- /dev/null +++ b/benchmarks/pandas/bench_series_dt_strftime.py @@ -0,0 +1,13 @@ +import pandas as pd, time, json +N = 100_000 +dates = pd.date_range("2020-01-01", periods=N, freq="D") +s = pd.Series(dates) +WARMUP = 3 +ITERS = 20 +for _ in range(WARMUP): + s.dt.strftime("%Y-%m-%d") +t0 = time.perf_counter() +for _ in range(ITERS): + s.dt.strftime("%Y-%m-%d") +total = (time.perf_counter() - t0) * 1000 +print(json.dumps({"function": "series_dt_strftime", "mean_ms": total / ITERS, "iterations": ITERS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_series_filter.py b/benchmarks/pandas/bench_series_filter.py new file mode 100644 index 00000000..c872512f --- /dev/null +++ b/benchmarks/pandas/bench_series_filter.py @@ -0,0 +1,20 @@ +"""Benchmark: Series boolean selection on 100k Series.""" +import json, time +import numpy as np +import pandas as pd + +SIZE = 100_000 +WARMUP = 5 +ITERATIONS = 50 + +s = pd.Series(np.arange(SIZE)) +mask = pd.Series(np.arange(SIZE) % 2 == 0) +for _ in range(WARMUP): s[mask] + +times = [] +for _ 
in range(ITERATIONS): + t0 = time.perf_counter() + s[mask] + times.append(time.perf_counter() - t0) +total = sum(times) * 1000 +print(json.dumps({ "function": "series_filter", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total })) diff --git a/benchmarks/pandas/bench_series_floordiv_mod_pow.py b/benchmarks/pandas/bench_series_floordiv_mod_pow.py new file mode 100644 index 00000000..e85b483e --- /dev/null +++ b/benchmarks/pandas/bench_series_floordiv_mod_pow.py @@ -0,0 +1,30 @@ +"""Benchmark: Series floordiv, mod, and pow operators on 100k Series""" +import json, time +import numpy as np +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 20 + +data = (np.arange(ROWS) + 1) * 0.5 +s = pd.Series(data) + +for _ in range(WARMUP): + s.floordiv(3) + s.mod(7) + s.pow(2) + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.floordiv(3) + s.mod(7) + s.pow(2) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "series_floordiv_mod_pow", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_series_from_object.py b/benchmarks/pandas/bench_series_from_object.py new file mode 100644 index 00000000..5e4f5d6c --- /dev/null +++ b/benchmarks/pandas/bench_series_from_object.py @@ -0,0 +1,17 @@ +"""Benchmark: pd.Series from dict on 10k-key dict""" +import json, time +import pandas as pd + +ROWS = 10_000 +WARMUP = 3 +ITERATIONS = 10 +obj = {f"key_{i}": i * 1.5 for i in range(ROWS)} + +for _ in range(WARMUP): + pd.Series(obj) + +start = time.perf_counter() +for _ in range(ITERATIONS): + pd.Series(obj) +total = (time.perf_counter() - start) * 1000 +print(json.dumps({"function": "series_from_object", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_series_groupby.py b/benchmarks/pandas/bench_series_groupby.py new file mode 100644 index 00000000..465b77c3 --- /dev/null +++ 
b/benchmarks/pandas/bench_series_groupby.py @@ -0,0 +1,20 @@ +"""Benchmark: Series.groupby(by).sum() on 100k Series.""" +import json, time +import numpy as np +import pandas as pd + +SIZE = 100_000 +WARMUP = 3 +ITERATIONS = 20 + +s = pd.Series((np.arange(SIZE) * 1.5) % 9999) +by = pd.Series(np.arange(SIZE) % 100) +for _ in range(WARMUP): s.groupby(by).sum() + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + s.groupby(by).sum() + times.append(time.perf_counter() - t0) +total = sum(times) * 1000 +print(json.dumps({ "function": "series_groupby", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total })) diff --git a/benchmarks/pandas/bench_series_groupby_apply.py b/benchmarks/pandas/bench_series_groupby_apply.py new file mode 100644 index 00000000..7c73e3bc --- /dev/null +++ b/benchmarks/pandas/bench_series_groupby_apply.py @@ -0,0 +1,23 @@ +"""Benchmark: SeriesGroupBy.apply (pandas equivalent).""" +import json +import time +import pandas as pd +import numpy as np + +ROWS = 10_000 +WARMUP = 3 +ITERATIONS = 20 + +data = [i * 0.5 for i in range(ROWS)] +by = [i % 100 for i in range(ROWS)] +s = pd.Series(data) + +for _ in range(WARMUP): + s.groupby(by).apply(lambda g: g) + +t0 = time.perf_counter() +for _ in range(ITERATIONS): + s.groupby(by).apply(lambda g: g - g.mean()) +total = (time.perf_counter() - t0) * 1000 + +print(json.dumps({"function": "series_groupby_apply", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_series_groupby_filter.py b/benchmarks/pandas/bench_series_groupby_filter.py new file mode 100644 index 00000000..59cd5b33 --- /dev/null +++ b/benchmarks/pandas/bench_series_groupby_filter.py @@ -0,0 +1,22 @@ +"""Benchmark: SeriesGroupBy.filter (pandas equivalent).""" +import json +import time +import pandas as pd + +ROWS = 10_000 +WARMUP = 3 +ITERATIONS = 20 + +data = [i * 1.0 for i in range(ROWS)] +by = [i % 100 for i in range(ROWS)] +s = 
pd.Series(data) + +for _ in range(WARMUP): + s.groupby(by).filter(lambda g: g.sum() > 1000) + +t0 = time.perf_counter() +for _ in range(ITERATIONS): + s.groupby(by).filter(lambda g: g.sum() > 1000) +total = (time.perf_counter() - t0) * 1000 + +print(json.dumps({"function": "series_groupby_filter", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_series_groupby_transform.py b/benchmarks/pandas/bench_series_groupby_transform.py new file mode 100644 index 00000000..ca7de2b4 --- /dev/null +++ b/benchmarks/pandas/bench_series_groupby_transform.py @@ -0,0 +1,27 @@ +"""Benchmark: SeriesGroupBy.transform on 100k Series""" +import json, time +import numpy as np +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 + +data = (np.arange(ROWS) * 1.5) % 9999 +by = np.arange(ROWS) % 50 +s = pd.Series(data) + +for _ in range(WARMUP): + s.groupby(by).transform(lambda x: x - x.mean()) + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.groupby(by).transform(lambda x: x - x.mean()) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "series_groupby_transform", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_series_iloc.py b/benchmarks/pandas/bench_series_iloc.py new file mode 100644 index 00000000..13f0cb24 --- /dev/null +++ b/benchmarks/pandas/bench_series_iloc.py @@ -0,0 +1,20 @@ +"""Benchmark: Series.iloc[] — integer position selection on 100k Series.""" +import json, time +import numpy as np +import pandas as pd + +SIZE = 100_000 +WARMUP = 5 +ITERATIONS = 30 + +s = pd.Series(np.arange(SIZE) * 3.0) +positions = list(range(0, SIZE, 100)) +for _ in range(WARMUP): s.iloc[positions] + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + s.iloc[positions] + times.append(time.perf_counter() - t0) +total = sum(times) * 1000 +print(json.dumps({ "function": "series_iloc", 
"mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total })) diff --git a/benchmarks/pandas/bench_series_isin.py b/benchmarks/pandas/bench_series_isin.py new file mode 100644 index 00000000..e1c7991d --- /dev/null +++ b/benchmarks/pandas/bench_series_isin.py @@ -0,0 +1,20 @@ +"""Benchmark: Series.isin(values) on 100k Series with 100-element lookup set.""" +import json, time +import numpy as np +import pandas as pd + +SIZE = 100_000 +WARMUP = 5 +ITERATIONS = 30 + +s = pd.Series(np.arange(SIZE) % 500) +lookup = list(range(0, 500, 5)) +for _ in range(WARMUP): s.isin(lookup) + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + s.isin(lookup) + times.append(time.perf_counter() - t0) +total = sum(times) * 1000 +print(json.dumps({ "function": "series_isin", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total })) diff --git a/benchmarks/pandas/bench_series_isna_notna.py b/benchmarks/pandas/bench_series_isna_notna.py new file mode 100644 index 00000000..0f7e2dfb --- /dev/null +++ b/benchmarks/pandas/bench_series_isna_notna.py @@ -0,0 +1,21 @@ +"""Benchmark: Series.isna() and Series.notna() on 100k Series with NAs.""" +import json, time +import numpy as np +import pandas as pd + +SIZE = 100_000 +WARMUP = 5 +ITERATIONS = 50 + +data = np.where(np.arange(SIZE) % 3 == 0, np.nan, np.arange(SIZE, dtype=float)) +s = pd.Series(data) +for _ in range(WARMUP): s.isna(); s.notna() + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + s.isna() + s.notna() + times.append(time.perf_counter() - t0) +total = sum(times) * 1000 +print(json.dumps({ "function": "series_isna_notna", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total })) diff --git a/benchmarks/pandas/bench_series_loc.py b/benchmarks/pandas/bench_series_loc.py new file mode 100644 index 00000000..d6ff9fb7 --- /dev/null +++ b/benchmarks/pandas/bench_series_loc.py @@ -0,0 +1,20 @@ +"""Benchmark: Series.loc[] — label-based selection 
on 100k Series.""" +import json, time +import numpy as np +import pandas as pd + +SIZE = 100_000 +WARMUP = 5 +ITERATIONS = 30 + +s = pd.Series(np.arange(SIZE) * 2.0, index=np.arange(SIZE)) +select_labels = np.arange(0, SIZE, 100) +for _ in range(WARMUP): s.loc[select_labels] + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + s.loc[select_labels] + times.append(time.perf_counter() - t0) +total = sum(times) * 1000 +print(json.dumps({ "function": "series_loc", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total })) diff --git a/benchmarks/pandas/bench_series_map.py b/benchmarks/pandas/bench_series_map.py new file mode 100644 index 00000000..c7ffd0ff --- /dev/null +++ b/benchmarks/pandas/bench_series_map.py @@ -0,0 +1,22 @@ +"""Benchmark: Series.map() with a dictionary lookup.""" +import json, time +import pandas as pd + +SIZE = 100_000 +WARMUP = 5 +ITERATIONS = 50 + +s = pd.Series([i % 1000 for i in range(SIZE)]) +lookup = {i: float(i * 2.5) for i in range(1000)} + +for _ in range(WARMUP): + s.map(lookup) + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + s.map(lookup) + times.append((time.perf_counter() - t0) * 1000) + +total_ms = sum(times) +print(json.dumps({"function":"series_map","mean_ms":round(total_ms/ITERATIONS,3),"iterations":ITERATIONS,"total_ms":round(total_ms,3)})) diff --git a/benchmarks/pandas/bench_series_median.py b/benchmarks/pandas/bench_series_median.py new file mode 100644 index 00000000..5156e9e2 --- /dev/null +++ b/benchmarks/pandas/bench_series_median.py @@ -0,0 +1,20 @@ +"""Benchmark: Series.median() on 100k-element numeric Series.""" +import json, time +import numpy as np +import pandas as pd + +SIZE = 100_000 +WARMUP = 5 +ITERATIONS = 20 + +s = pd.Series((np.arange(SIZE) * 1.7) % 9999) + +for _ in range(WARMUP): s.median() + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + s.median() + times.append(time.perf_counter() - t0) +total = sum(times) * 1000 
+print(json.dumps({ "function": "series_median", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total })) diff --git a/benchmarks/pandas/bench_series_min_max.py b/benchmarks/pandas/bench_series_min_max.py new file mode 100644 index 00000000..269b6c2e --- /dev/null +++ b/benchmarks/pandas/bench_series_min_max.py @@ -0,0 +1,19 @@ +"""Benchmark: Series.min() and Series.max() on 100k numeric Series.""" +import json, time +import numpy as np +import pandas as pd + +SIZE = 100_000 +WARMUP = 5 +ITERATIONS = 50 + +s = pd.Series((np.arange(SIZE) * 3.14) % 5000) +for _ in range(WARMUP): s.min(); s.max() + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + s.min(); s.max() + times.append(time.perf_counter() - t0) +total = sum(times) * 1000 +print(json.dumps({ "function": "series_min_max", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total })) diff --git a/benchmarks/pandas/bench_series_nlargest.py b/benchmarks/pandas/bench_series_nlargest.py new file mode 100644 index 00000000..39d07d73 --- /dev/null +++ b/benchmarks/pandas/bench_series_nlargest.py @@ -0,0 +1,26 @@ +"""Benchmark: nlargest on 100k-element Series (top 1000)""" +import json, time +import numpy as np +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 + +data = np.sin(np.arange(ROWS) * 0.01) * 1000 +s = pd.Series(data) + +for _ in range(WARMUP): + s.nlargest(1000) + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.nlargest(1000) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "series_nlargest", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_series_nunique.py b/benchmarks/pandas/bench_series_nunique.py new file mode 100644 index 00000000..db67b43c --- /dev/null +++ b/benchmarks/pandas/bench_series_nunique.py @@ -0,0 +1,21 @@ +"""Benchmark: Series.nunique() — count unique values.""" +import json, time +import 
pandas as pd + +SIZE = 100_000 +WARMUP = 5 +ITERATIONS = 50 + +s = pd.Series([i % 1000 for i in range(SIZE)]) + +for _ in range(WARMUP): + s.nunique() + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + s.nunique() + times.append((time.perf_counter() - t0) * 1000) + +total_ms = sum(times) +print(json.dumps({"function":"series_nunique","mean_ms":round(total_ms/ITERATIONS,3),"iterations":ITERATIONS,"total_ms":round(total_ms,3)})) diff --git a/benchmarks/pandas/bench_series_properties.py b/benchmarks/pandas/bench_series_properties.py new file mode 100644 index 00000000..6d968a3b --- /dev/null +++ b/benchmarks/pandas/bench_series_properties.py @@ -0,0 +1,18 @@ +"""Benchmark: pandas Series property access — shape, ndim, size, empty, values, dtype, name""" +import json, time +import pandas as pd + +N = 100_000 +s = pd.Series(range(N), name="x", dtype=float) + +WARMUP = 3 +ITERATIONS = 100_000 + +for _ in range(WARMUP): + _ = s.shape; _ = s.ndim; _ = s.size; _ = s.empty; _ = s.values; _ = s.dtype; _ = s.name + +start = time.perf_counter() +for _ in range(ITERATIONS): + _ = s.shape; _ = s.ndim; _ = s.size; _ = s.empty; _ = s.values; _ = s.dtype; _ = s.name +total = (time.perf_counter() - start) * 1000 +print(json.dumps({"function": "series_properties", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_series_quantile.py b/benchmarks/pandas/bench_series_quantile.py new file mode 100644 index 00000000..10d8b7b0 --- /dev/null +++ b/benchmarks/pandas/bench_series_quantile.py @@ -0,0 +1,20 @@ +"""Benchmark: Series.quantile(q) on 100k numeric Series.""" +import json, time +import numpy as np +import pandas as pd + +SIZE = 100_000 +WARMUP = 5 +ITERATIONS = 20 + +s = pd.Series((np.arange(SIZE) * 1.41) % 10000) +for _ in range(WARMUP): s.quantile(0.25); s.quantile(0.75) + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + s.quantile(0.25) + s.quantile(0.75) + 
times.append(time.perf_counter() - t0) +total = sum(times) * 1000 +print(json.dumps({ "function": "series_quantile", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total })) diff --git a/benchmarks/pandas/bench_series_rank.py b/benchmarks/pandas/bench_series_rank.py new file mode 100644 index 00000000..378445ac --- /dev/null +++ b/benchmarks/pandas/bench_series_rank.py @@ -0,0 +1,26 @@ +"""Benchmark: Series rank on 100k-element Series""" +import json, time +import numpy as np +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 + +data = np.sin(np.arange(ROWS) * 0.01) * 1000 +s = pd.Series(data) + +for _ in range(WARMUP): + s.rank() + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.rank() +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "series_rank", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_series_rename.py b/benchmarks/pandas/bench_series_rename.py new file mode 100644 index 00000000..e7c7f202 --- /dev/null +++ b/benchmarks/pandas/bench_series_rename.py @@ -0,0 +1,19 @@ +"""Benchmark: Series.rename(name) on 100k Series.""" +import json, time +import numpy as np +import pandas as pd + +SIZE = 100_000 +WARMUP = 10 +ITERATIONS = 100 + +s = pd.Series(np.arange(SIZE), name="old_name") +for _ in range(WARMUP): s.rename("new_name") + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + s.rename("new_name") + times.append(time.perf_counter() - t0) +total = sum(times) * 1000 +print(json.dumps({ "function": "series_rename", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total })) diff --git a/benchmarks/pandas/bench_series_replace.py b/benchmarks/pandas/bench_series_replace.py new file mode 100644 index 00000000..e7a23698 --- /dev/null +++ b/benchmarks/pandas/bench_series_replace.py @@ -0,0 +1,10 @@ +import pandas as pd, json, time, numpy as np +rng = 
np.random.default_rng(42) +s = pd.Series(rng.integers(0, 10, size=100_000)) +mapping = {i: i*10 for i in range(10)} +for _ in range(3): s.replace(mapping) +N = 50 +t0 = time.perf_counter() +for _ in range(N): s.replace(mapping) +elapsed = time.perf_counter() - t0 +print(json.dumps({"function": "series_replace", "mean_ms": elapsed/N*1000, "iterations": N, "total_ms": elapsed*1000})) diff --git a/benchmarks/pandas/bench_series_resetindex.py b/benchmarks/pandas/bench_series_resetindex.py new file mode 100644 index 00000000..2b91ebdb --- /dev/null +++ b/benchmarks/pandas/bench_series_resetindex.py @@ -0,0 +1,20 @@ +"""Benchmark: Series.reset_index() on 100k Series.""" +import json, time +import numpy as np +import pandas as pd + +SIZE = 100_000 +WARMUP = 5 +ITERATIONS = 30 + +labels = [f"key_{i}" for i in range(SIZE)] +s = pd.Series(np.arange(SIZE), index=labels) +for _ in range(WARMUP): s.reset_index(drop=True) + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + s.reset_index(drop=True) + times.append(time.perf_counter() - t0) +total = sum(times) * 1000 +print(json.dumps({ "function": "series_resetindex", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total })) diff --git a/benchmarks/pandas/bench_series_setindex.py b/benchmarks/pandas/bench_series_setindex.py new file mode 100644 index 00000000..1045eedd --- /dev/null +++ b/benchmarks/pandas/bench_series_setindex.py @@ -0,0 +1,27 @@ +"""Benchmark: series_setindex — pd.Series with new index on a 100k-element Series""" +import json, time +import numpy as np +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 + +data = np.arange(ROWS, dtype=float) * 1.5 +s = pd.Series(data) +new_index = pd.Index([f"key{i}" for i in range(ROWS)]) + +for _ in range(WARMUP): + s.set_axis(new_index) + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.set_axis(new_index) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "series_setindex", + 
"mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_series_sort_index.py b/benchmarks/pandas/bench_series_sort_index.py new file mode 100644 index 00000000..c458e355 --- /dev/null +++ b/benchmarks/pandas/bench_series_sort_index.py @@ -0,0 +1,20 @@ +"""Benchmark: Series.sort_index() on 100k Series with string labels.""" +import json, time +import numpy as np +import pandas as pd + +SIZE = 100_000 +WARMUP = 3 +ITERATIONS = 10 + +labels = [f"lbl_{(SIZE - i):06d}" for i in range(SIZE)] +s = pd.Series(np.arange(SIZE), index=labels) +for _ in range(WARMUP): s.sort_index() + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + s.sort_index() + times.append(time.perf_counter() - t0) +total = sum(times) * 1000 +print(json.dumps({ "function": "series_sort_index", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total })) diff --git a/benchmarks/pandas/bench_series_std_var.py b/benchmarks/pandas/bench_series_std_var.py new file mode 100644 index 00000000..e0d1fb62 --- /dev/null +++ b/benchmarks/pandas/bench_series_std_var.py @@ -0,0 +1,19 @@ +"""Benchmark: Series.std() and Series.var() on 100k numeric Series.""" +import json, time +import numpy as np +import pandas as pd + +SIZE = 100_000 +WARMUP = 5 +ITERATIONS = 30 + +s = pd.Series((np.arange(SIZE) * 2.71) % 10000) +for _ in range(WARMUP): s.std(); s.var() + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + s.std(); s.var() + times.append(time.perf_counter() - t0) +total = sum(times) * 1000 +print(json.dumps({ "function": "series_std_var", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total })) diff --git a/benchmarks/pandas/bench_series_str_replace.py b/benchmarks/pandas/bench_series_str_replace.py new file mode 100644 index 00000000..c8d4f349 --- /dev/null +++ b/benchmarks/pandas/bench_series_str_replace.py @@ -0,0 +1,25 @@ +"""Benchmark: series_str_replace — str.replace on 
100k strings""" +import json, time +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 + +data = [f"hello_world_{i % 200}" for i in range(ROWS)] +s = pd.Series(data) + +for _ in range(WARMUP): + s.str.replace("world", "there", regex=False) + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.str.replace("world", "there", regex=False) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "series_str_replace", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_series_sum_mean.py b/benchmarks/pandas/bench_series_sum_mean.py new file mode 100644 index 00000000..dd86b461 --- /dev/null +++ b/benchmarks/pandas/bench_series_sum_mean.py @@ -0,0 +1,19 @@ +"""Benchmark: Series.sum() and Series.mean() on 100k numeric Series.""" +import json, time +import numpy as np +import pandas as pd + +SIZE = 100_000 +WARMUP = 5 +ITERATIONS = 50 + +s = pd.Series(np.arange(SIZE) * 0.001) +for _ in range(WARMUP): s.sum(); s.mean() + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + s.sum(); s.mean() + times.append(time.perf_counter() - t0) +total = sum(times) * 1000 +print(json.dumps({ "function": "series_sum_mean", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total })) diff --git a/benchmarks/pandas/bench_series_to_string.py b/benchmarks/pandas/bench_series_to_string.py new file mode 100644 index 00000000..7f60b824 --- /dev/null +++ b/benchmarks/pandas/bench_series_to_string.py @@ -0,0 +1,17 @@ +"""Benchmark: Series.to_string on 1k-element pandas Series""" +import json, time +import pandas as pd + +N = 1_000 +WARMUP = 3 +ITERATIONS = 10 +s = pd.Series([i * 0.1 for i in range(N)]) + +for _ in range(WARMUP): + s.to_string() + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.to_string() +total = (time.perf_counter() - start) * 1000 +print(json.dumps({"function": "series_to_string", "mean_ms": total / ITERATIONS, 
"iterations": ITERATIONS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_series_toarray_tolist.py b/benchmarks/pandas/bench_series_toarray_tolist.py new file mode 100644 index 00000000..abaac1b2 --- /dev/null +++ b/benchmarks/pandas/bench_series_toarray_tolist.py @@ -0,0 +1,28 @@ +"""Benchmark: Series tolist and to_numpy on 100k-element Series""" +import json, time +import numpy as np +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 + +data = np.arange(ROWS) * 0.5 +s = pd.Series(data) + +for _ in range(WARMUP): + s.tolist() + s.to_numpy() + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.tolist() + s.to_numpy() +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "series_toarray_tolist", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_series_toobject.py b/benchmarks/pandas/bench_series_toobject.py new file mode 100644 index 00000000..4c055ff2 --- /dev/null +++ b/benchmarks/pandas/bench_series_toobject.py @@ -0,0 +1,19 @@ +"""Benchmark: Series.to_dict() — convert to dict on 100k Series.""" +import json, time +import numpy as np +import pandas as pd + +SIZE = 100_000 +WARMUP = 3 +ITERATIONS = 10 + +s = pd.Series(np.arange(SIZE) * 1.5) +for _ in range(WARMUP): s.to_dict() + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + s.to_dict() + times.append(time.perf_counter() - t0) +total = sum(times) * 1000 +print(json.dumps({ "function": "series_toobject", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total })) diff --git a/benchmarks/pandas/bench_series_transform.py b/benchmarks/pandas/bench_series_transform.py new file mode 100644 index 00000000..bac0402e --- /dev/null +++ b/benchmarks/pandas/bench_series_transform.py @@ -0,0 +1,17 @@ +"""Benchmark: Series.transform on 100k-element pandas Series""" +import json, time +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 
10 +s = pd.Series([i * 0.1 for i in range(ROWS)]) + +for _ in range(WARMUP): + s.transform(lambda v: v ** 2) + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.transform(lambda v: v ** 2) +total = (time.perf_counter() - start) * 1000 +print(json.dumps({"function": "series_transform", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_series_unique.py b/benchmarks/pandas/bench_series_unique.py new file mode 100644 index 00000000..07edf7a5 --- /dev/null +++ b/benchmarks/pandas/bench_series_unique.py @@ -0,0 +1,19 @@ +"""Benchmark: Series.unique() on 100k-element Series with 1000 distinct values.""" +import json, time +import numpy as np +import pandas as pd + +SIZE = 100_000 +WARMUP = 5 +ITERATIONS = 30 + +s = pd.Series(np.arange(SIZE) % 1000) +for _ in range(WARMUP): s.unique() + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + s.unique() + times.append(time.perf_counter() - t0) +total = sum(times) * 1000 +print(json.dumps({ "function": "series_unique", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total })) diff --git a/benchmarks/pandas/bench_series_with_values.py b/benchmarks/pandas/bench_series_with_values.py new file mode 100644 index 00000000..7b6f6920 --- /dev/null +++ b/benchmarks/pandas/bench_series_with_values.py @@ -0,0 +1,20 @@ +"""Benchmark: rebuild a pd.Series from new values, preserving index and name, on a 100k-element Series (equivalent to withValues)""" +import json, time +import pandas as pd +import numpy as np + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 +data = list(range(ROWS)) +new_data = [i * 2.0 for i in range(ROWS)] +s = pd.Series(data, name="x") + +for _ in range(WARMUP): + pd.Series(new_data, index=s.index, name=s.name) + +start = time.perf_counter() +for _ in range(ITERATIONS): + pd.Series(new_data, index=s.index, name=s.name) +total = (time.perf_counter() - start) * 1000 +print(json.dumps({"function": "series_with_values", "mean_ms": total / ITERATIONS,
"iterations": ITERATIONS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_skew_kurt.py b/benchmarks/pandas/bench_skew_kurt.py new file mode 100644 index 00000000..34ff812f --- /dev/null +++ b/benchmarks/pandas/bench_skew_kurt.py @@ -0,0 +1,28 @@ +""" +Benchmark: Series.skew() / Series.kurt() — skewness and kurtosis on a 100k-element Series. +Outputs JSON: {"function": "skew_kurt", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import math +import time +import pandas as pd +import numpy as np + +SIZE = 100_000 +WARMUP = 5 +ITERATIONS = 50 + +data = np.array([math.sin(i * 0.01) * 100 for i in range(SIZE)]) +s = pd.Series(data) + +for _ in range(WARMUP): + s.skew() + s.kurt() + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.skew() + s.kurt() +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({"function": "skew_kurt", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_str_byte_length.py b/benchmarks/pandas/bench_str_byte_length.py new file mode 100644 index 00000000..39bd5cc0 --- /dev/null +++ b/benchmarks/pandas/bench_str_byte_length.py @@ -0,0 +1,14 @@ +import pandas as pd, time, json +N = 100_000 +words = ["hello", "world", "typescript", "benchmark", "tsb"] +data = [words[i % len(words)] for i in range(N)] +s = pd.Series(data) +WARMUP = 3 +ITERS = 20 +for _ in range(WARMUP): + s.str.encode("utf-8").str.len() +t0 = time.perf_counter() +for _ in range(ITERS): + s.str.encode("utf-8").str.len() +total = (time.perf_counter() - t0) * 1000 +print(json.dumps({"function": "str_byte_length", "mean_ms": total / ITERS, "iterations": ITERS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_str_case.py b/benchmarks/pandas/bench_str_case.py new file mode 100644 index 00000000..2bfa8270 --- /dev/null +++ b/benchmarks/pandas/bench_str_case.py @@ -0,0 +1,30 @@ +"""Benchmark: str_case — str.title, str.capitalize, str.swapcase on 100k strings""" 
+import json +import time +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 + +data = [f"hello world {i}" for i in range(ROWS)] +s = pd.Series(data) + +for _ in range(WARMUP): + s.str.title() + s.str.capitalize() + s.str.swapcase() + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.str.title() + s.str.capitalize() + s.str.swapcase() +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "str_case", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_str_cat.py b/benchmarks/pandas/bench_str_cat.py new file mode 100644 index 00000000..3007647a --- /dev/null +++ b/benchmarks/pandas/bench_str_cat.py @@ -0,0 +1,27 @@ +"""Benchmark: str_cat — str.cat concatenating a Series with another array on 100k strings""" +import json, time +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 + +data = [f"hello_{i % 200}" for i in range(ROWS)] +other = [f"_world_{i % 100}" for i in range(ROWS)] +s = pd.Series(data) +t = pd.Series(other) + +for _ in range(WARMUP): + s.str.cat(t, sep="-") + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.str.cat(t, sep="-") +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "str_cat", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_str_char_width.py b/benchmarks/pandas/bench_str_char_width.py new file mode 100644 index 00000000..158027e5 --- /dev/null +++ b/benchmarks/pandas/bench_str_char_width.py @@ -0,0 +1,14 @@ +import pandas as pd, time, json +N = 100_000 +words = ["hello", "world", "café", "résumé", "naïve"] +data = [words[i % len(words)] for i in range(N)] +s = pd.Series(data) +WARMUP = 3 +ITERS = 20 +for _ in range(WARMUP): + s.str.len() +t0 = time.perf_counter() +for _ in range(ITERS): + s.str.len() +total = (time.perf_counter() - t0) * 1000 +print(json.dumps({"function": 
"str_char_width", "mean_ms": total / ITERS, "iterations": ITERS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_str_count.py b/benchmarks/pandas/bench_str_count.py new file mode 100644 index 00000000..4f3815fb --- /dev/null +++ b/benchmarks/pandas/bench_str_count.py @@ -0,0 +1,26 @@ +"""Benchmark: str_count — str.count occurrences of pattern on 100k strings""" +import json +import time +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 + +data = [f"abc abc abc {'abc' if i % 5 == 0 else 'xyz'}" for i in range(ROWS)] +s = pd.Series(data) + +for _ in range(WARMUP): + s.str.count("abc") + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.str.count("abc") +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "str_count", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_str_dedent.py b/benchmarks/pandas/bench_str_dedent.py new file mode 100644 index 00000000..8927d5bb --- /dev/null +++ b/benchmarks/pandas/bench_str_dedent.py @@ -0,0 +1,17 @@ +"""Benchmark: textwrap.dedent on 50k multi-line strings""" +import json, time +import textwrap + +N = 50_000 +WARMUP = 3 +ITERATIONS = 10 +data = [f" line1 {i}\n line2 {i}\n line3 {i}" for i in range(N)] + +for _ in range(WARMUP): + [textwrap.dedent(s) for s in data] + +start = time.perf_counter() +for _ in range(ITERATIONS): + [textwrap.dedent(s) for s in data] +total = (time.perf_counter() - start) * 1000 +print(json.dumps({"function": "str_dedent", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_str_encode.py b/benchmarks/pandas/bench_str_encode.py new file mode 100644 index 00000000..79a92155 --- /dev/null +++ b/benchmarks/pandas/bench_str_encode.py @@ -0,0 +1,26 @@ +"""Benchmark: str_encode — str.encode byte-length encoding on 100k strings""" +import json +import time +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 
+ITERATIONS = 10 + +data = [f"hello world {i}" for i in range(ROWS)] +s = pd.Series(data) + +for _ in range(WARMUP): + s.str.encode("utf-8") + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.str.encode("utf-8") +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "str_encode", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_str_extract_all.py b/benchmarks/pandas/bench_str_extract_all.py new file mode 100644 index 00000000..fd3bfce5 --- /dev/null +++ b/benchmarks/pandas/bench_str_extract_all.py @@ -0,0 +1,18 @@ +"""Benchmark: str.extractall on 10k-element string Series""" +import json, time +import pandas as pd + +ROWS = 10_000 +WARMUP = 3 +ITERATIONS = 10 +data = [f"val{i} num{i*2} extra{i}" for i in range(ROWS)] +s = pd.Series(data) + +for _ in range(WARMUP): + s.str.extractall(r"(\d+)") + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.str.extractall(r"(\d+)") +total = (time.perf_counter() - start) * 1000 +print(json.dumps({"function": "str_extract_all", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_str_extract_groups.py b/benchmarks/pandas/bench_str_extract_groups.py new file mode 100644 index 00000000..13f60fdc --- /dev/null +++ b/benchmarks/pandas/bench_str_extract_groups.py @@ -0,0 +1,18 @@ +"""Benchmark: str.extract on 10k-element string Series""" +import json, time +import pandas as pd + +ROWS = 10_000 +WARMUP = 3 +ITERATIONS = 10 +data = [f"user_{i}_score_{i % 100}" for i in range(ROWS)] +s = pd.Series(data) + +for _ in range(WARMUP): + s.str.extract(r"user_(\d+)_score_(\d+)") + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.str.extract(r"user_(\d+)_score_(\d+)") +total = (time.perf_counter() - start) * 1000 +print(json.dumps({"function": "str_extract_groups", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms":
total})) diff --git a/benchmarks/pandas/bench_str_find.py b/benchmarks/pandas/bench_str_find.py new file mode 100644 index 00000000..2b29bcc0 --- /dev/null +++ b/benchmarks/pandas/bench_str_find.py @@ -0,0 +1,27 @@ +"""Benchmark: str_find — str.find and str.rfind on 100k strings""" +import json, time +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 + +data = [f"hello_world_{i % 200}_end" for i in range(ROWS)] +s = pd.Series(data) + +for _ in range(WARMUP): + s.str.find("world") + s.str.rfind("_") + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.str.find("world") + s.str.rfind("_") +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "str_find", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_str_fullmatch.py b/benchmarks/pandas/bench_str_fullmatch.py new file mode 100644 index 00000000..cebc283b --- /dev/null +++ b/benchmarks/pandas/bench_str_fullmatch.py @@ -0,0 +1,25 @@ +"""Benchmark: str_fullmatch — str.fullmatch (regex full match) on 100k strings""" +import json, time +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 + +data = [f"item_{i % 200}" for i in range(ROWS)] +s = pd.Series(data) + +for _ in range(WARMUP): + s.str.fullmatch(r"item_\d+") + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.str.fullmatch(r"item_\d+") +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "str_fullmatch", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_str_get_dummies.py b/benchmarks/pandas/bench_str_get_dummies.py new file mode 100644 index 00000000..141a5d96 --- /dev/null +++ b/benchmarks/pandas/bench_str_get_dummies.py @@ -0,0 +1,18 @@ +"""Benchmark: str.get_dummies on 10k-element string Series""" +import json, time +import pandas as pd + +ROWS = 10_000 +WARMUP = 3 +ITERATIONS = 10 +data = [f"a|b|{chr(97 
+ (i % 5))}" for i in range(ROWS)] +s = pd.Series(data) + +for _ in range(WARMUP): + s.str.get_dummies(sep="|") + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.str.get_dummies(sep="|") +total = (time.perf_counter() - start) * 1000 +print(json.dumps({"function": "str_get_dummies", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_str_indent.py b/benchmarks/pandas/bench_str_indent.py new file mode 100644 index 00000000..32a1f39a --- /dev/null +++ b/benchmarks/pandas/bench_str_indent.py @@ -0,0 +1,17 @@ +"""Benchmark: textwrap.indent on 50k multi-line strings""" +import json, time +import textwrap + +N = 50_000 +WARMUP = 3 +ITERATIONS = 10 +data = [f"line1 {i}\nline2 {i}\nline3 {i}" for i in range(N)] + +for _ in range(WARMUP): + [textwrap.indent(s, " ") for s in data] + +start = time.perf_counter() +for _ in range(ITERATIONS): + [textwrap.indent(s, " ") for s in data] +total = (time.perf_counter() - start) * 1000 +print(json.dumps({"function": "str_indent", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_str_is_alpha_digit.py b/benchmarks/pandas/bench_str_is_alpha_digit.py new file mode 100644 index 00000000..7da9fa58 --- /dev/null +++ b/benchmarks/pandas/bench_str_is_alpha_digit.py @@ -0,0 +1,27 @@ +"""Benchmark: str_is_alpha_digit — str.isalpha and str.isdigit on 100k strings""" +import json, time +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 + +data = ["hello" if i % 2 == 0 else "12345" for i in range(ROWS)] +s = pd.Series(data) + +for _ in range(WARMUP): + s.str.isalpha() + s.str.isdigit() + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.str.isalpha() + s.str.isdigit() +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "str_is_alpha_digit", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git 
a/benchmarks/pandas/bench_str_isalnum_isnumeric.py b/benchmarks/pandas/bench_str_isalnum_isnumeric.py new file mode 100644 index 00000000..33168e77 --- /dev/null +++ b/benchmarks/pandas/bench_str_isalnum_isnumeric.py @@ -0,0 +1,28 @@ +"""Benchmark: str_isalnum_isnumeric — str.isalnum and str.isnumeric on 100k strings""" +import json +import time +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 + +data = ["abc123" if i % 3 == 0 else ("12345" if i % 3 == 1 else "hello!") for i in range(ROWS)] +s = pd.Series(data) + +for _ in range(WARMUP): + s.str.isalnum() + s.str.isnumeric() + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.str.isalnum() + s.str.isnumeric() +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "str_isalnum_isnumeric", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_str_islower_isupper.py b/benchmarks/pandas/bench_str_islower_isupper.py new file mode 100644 index 00000000..e9b62eb0 --- /dev/null +++ b/benchmarks/pandas/bench_str_islower_isupper.py @@ -0,0 +1,28 @@ +"""Benchmark: str_islower_isupper — str.islower and str.isupper on 100k strings""" +import json +import time +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 + +data = ["hello" if i % 2 == 0 else "WORLD" for i in range(ROWS)] +s = pd.Series(data) + +for _ in range(WARMUP): + s.str.islower() + s.str.isupper() + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.str.islower() + s.str.isupper() +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "str_islower_isupper", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_str_istitle_isspace.py b/benchmarks/pandas/bench_str_istitle_isspace.py new file mode 100644 index 00000000..5724d028 --- /dev/null +++ b/benchmarks/pandas/bench_str_istitle_isspace.py @@ -0,0 +1,28 @@ 
+"""Benchmark: str_istitle_isspace — str.istitle and str.isspace on 100k strings""" +import json +import time +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 + +data = ["Hello World" if i % 3 == 0 else (" " if i % 3 == 1 else "hello world") for i in range(ROWS)] +s = pd.Series(data) + +for _ in range(WARMUP): + s.str.istitle() + s.str.isspace() + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.str.istitle() + s.str.isspace() +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "str_istitle_isspace", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_str_join.py b/benchmarks/pandas/bench_str_join.py new file mode 100644 index 00000000..ad2b4379 --- /dev/null +++ b/benchmarks/pandas/bench_str_join.py @@ -0,0 +1,25 @@ +"""Benchmark: str_join — str.join on 100k list-of-strings Series values""" +import json, time +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 + +data = [[f"a{i % 10}", f"b{i % 5}", f"c{i % 3}"] for i in range(ROWS)] +s = pd.Series(data) + +for _ in range(WARMUP): + s.str.join("-") + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.str.join("-") +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "str_join", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_str_len.py b/benchmarks/pandas/bench_str_len.py new file mode 100644 index 00000000..4d241baa --- /dev/null +++ b/benchmarks/pandas/bench_str_len.py @@ -0,0 +1,18 @@ +"""Benchmark: Series.str.len() on 100k-element string Series""" +import json, time +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 +data = [f"item_{i}_value" for i in range(ROWS)] +s = pd.Series(data, name="text") + +for _ in range(WARMUP): + s.str.len() + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.str.len() +total = 
(time.perf_counter() - start) * 1000 +print(json.dumps({"function": "str_len", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_str_lower_upper.py b/benchmarks/pandas/bench_str_lower_upper.py new file mode 100644 index 00000000..d8c21199 --- /dev/null +++ b/benchmarks/pandas/bench_str_lower_upper.py @@ -0,0 +1,27 @@ +"""Benchmark: str_lower_upper — str.lower and str.upper on 100k strings""" +import json, time +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 + +data = [f"Hello_World_{i % 200}" for i in range(ROWS)] +s = pd.Series(data) + +for _ in range(WARMUP): + s.str.lower() + s.str.upper() + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.str.lower() + s.str.upper() +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "str_lower_upper", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_str_match.py b/benchmarks/pandas/bench_str_match.py new file mode 100644 index 00000000..d8291f53 --- /dev/null +++ b/benchmarks/pandas/bench_str_match.py @@ -0,0 +1,25 @@ +"""Benchmark: str_match — str.match regex matching on 100k strings""" +import json, time +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 + +data = [f"item_{i % 500}_abc" for i in range(ROWS)] +s = pd.Series(data) + +for _ in range(WARMUP): + s.str.match(r"^item_\d+") + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.str.match(r"^item_\d+") +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "str_match", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_str_multi_replace.py b/benchmarks/pandas/bench_str_multi_replace.py new file mode 100644 index 00000000..eb1537df --- /dev/null +++ b/benchmarks/pandas/bench_str_multi_replace.py @@ -0,0 +1,23 @@ +"""Benchmark: multiple 
str.replace on 100k-element string Series""" +import json, time +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 +data = [f"foo bar baz {i}" for i in range(ROWS)] +s = pd.Series(data) +pairs = [("foo", "alpha"), ("bar", "beta"), ("baz", "gamma")] + +for _ in range(WARMUP): + tmp = s + for old, new in pairs: + tmp = tmp.str.replace(old, new, regex=False) + +start = time.perf_counter() +for _ in range(ITERATIONS): + tmp = s + for old, new in pairs: + tmp = tmp.str.replace(old, new, regex=False) +total = (time.perf_counter() - start) * 1000 +print(json.dumps({"function": "str_multi_replace", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_str_normalize.py b/benchmarks/pandas/bench_str_normalize.py new file mode 100644 index 00000000..67e46d93 --- /dev/null +++ b/benchmarks/pandas/bench_str_normalize.py @@ -0,0 +1,18 @@ +"""Benchmark: str normalize on 100k-element string Series""" +import json, time +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 +data = [f"caf\u00e9 {i}" for i in range(ROWS)] +s = pd.Series(data) + +for _ in range(WARMUP): + s.str.normalize("NFC") + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.str.normalize("NFC") +total = (time.perf_counter() - start) * 1000 +print(json.dumps({"function": "str_normalize", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_str_pad.py b/benchmarks/pandas/bench_str_pad.py new file mode 100644 index 00000000..c54e9f2e --- /dev/null +++ b/benchmarks/pandas/bench_str_pad.py @@ -0,0 +1,29 @@ +"""Benchmark: str_pad — str.pad, str.ljust, str.rjust on 100k strings""" +import json, time +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 + +data = [f"hello_{i % 200}" for i in range(ROWS)] +s = pd.Series(data) + +for _ in range(WARMUP): + s.str.pad(20) + s.str.ljust(20) + s.str.rjust(20) + +start = time.perf_counter() +for _ in 
range(ITERATIONS): + s.str.pad(20) + s.str.ljust(20) + s.str.rjust(20) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "str_pad", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_str_partition.py b/benchmarks/pandas/bench_str_partition.py new file mode 100644 index 00000000..c1ff1531 --- /dev/null +++ b/benchmarks/pandas/bench_str_partition.py @@ -0,0 +1,18 @@ +"""Benchmark: str.partition on 100k-element string Series""" +import json, time +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 +data = [f"prefix_{i}_suffix" for i in range(ROWS)] +s = pd.Series(data) + +for _ in range(WARMUP): + s.str.partition("_") + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.str.partition("_") +total = (time.perf_counter() - start) * 1000 +print(json.dumps({"function": "str_partition", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_str_remove_prefix.py b/benchmarks/pandas/bench_str_remove_prefix.py new file mode 100644 index 00000000..7b09214b --- /dev/null +++ b/benchmarks/pandas/bench_str_remove_prefix.py @@ -0,0 +1,18 @@ +"""Benchmark: str.removeprefix on 100k-element string Series""" +import json, time +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 +data = [f"prefix_value_{i}" for i in range(ROWS)] +s = pd.Series(data) + +for _ in range(WARMUP): + s.str.removeprefix("prefix_") + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.str.removeprefix("prefix_") +total = (time.perf_counter() - start) * 1000 +print(json.dumps({"function": "str_remove_prefix", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_str_remove_suffix.py b/benchmarks/pandas/bench_str_remove_suffix.py new file mode 100644 index 00000000..704bccf6 --- /dev/null +++ b/benchmarks/pandas/bench_str_remove_suffix.py 
@@ -0,0 +1,18 @@ +"""Benchmark: str.removesuffix on 100k-element string Series""" +import json, time +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 +data = [f"value_{i}_suffix" for i in range(ROWS)] +s = pd.Series(data) + +for _ in range(WARMUP): + s.str.removesuffix("_suffix") + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.str.removesuffix("_suffix") +total = (time.perf_counter() - start) * 1000 +print(json.dumps({"function": "str_remove_suffix", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_str_repeat.py b/benchmarks/pandas/bench_str_repeat.py new file mode 100644 index 00000000..d238725b --- /dev/null +++ b/benchmarks/pandas/bench_str_repeat.py @@ -0,0 +1,25 @@ +"""Benchmark: str_repeat — str.repeat on 100k strings""" +import json, time +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 + +data = [f"ab_{i % 100}" for i in range(ROWS)] +s = pd.Series(data) + +for _ in range(WARMUP): + s.str.repeat(3) + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.str.repeat(3) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "str_repeat", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_str_rpartition.py b/benchmarks/pandas/bench_str_rpartition.py new file mode 100644 index 00000000..1033aad9 --- /dev/null +++ b/benchmarks/pandas/bench_str_rpartition.py @@ -0,0 +1,18 @@ +"""Benchmark: str.rpartition on 100k-element string Series""" +import json, time +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 +data = [f"prefix_{i}_suffix" for i in range(ROWS)] +s = pd.Series(data) + +for _ in range(WARMUP): + s.str.rpartition("_") + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.str.rpartition("_") +total = (time.perf_counter() - start) * 1000 +print(json.dumps({"function": "str_rpartition", "mean_ms": total / 
ITERATIONS, "iterations": ITERATIONS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_str_rsplit.py b/benchmarks/pandas/bench_str_rsplit.py new file mode 100644 index 00000000..0f2bdf01 --- /dev/null +++ b/benchmarks/pandas/bench_str_rsplit.py @@ -0,0 +1,21 @@ +"""Benchmark: str_rsplit — pandas str.rsplit() on 100k strings""" +import json +import time +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 + +data = [f"part_{i % 100}_b_c_d" for i in range(ROWS)] +s = pd.Series(data) + +for _ in range(WARMUP): + _ = s.str.rsplit("_", n=2) + +start = time.perf_counter() +for _ in range(ITERATIONS): + _ = s.str.rsplit("_", n=2) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({"function": "str_rsplit", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_str_slice_get.py b/benchmarks/pandas/bench_str_slice_get.py new file mode 100644 index 00000000..cd88b905 --- /dev/null +++ b/benchmarks/pandas/bench_str_slice_get.py @@ -0,0 +1,28 @@ +"""Benchmark: str_slice_get — str.slice and str.get character extraction on 100k strings""" +import json +import time +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 + +data = [f"hello_world_{i}" for i in range(ROWS)] +s = pd.Series(data) + +for _ in range(WARMUP): + s.str[0:5] + s.str.get(0) + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.str[0:5] + s.str.get(0) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "str_slice_get", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_str_slice_replace.py b/benchmarks/pandas/bench_str_slice_replace.py new file mode 100644 index 00000000..7d2be501 --- /dev/null +++ b/benchmarks/pandas/bench_str_slice_replace.py @@ -0,0 +1,21 @@ +"""Benchmark: str_slice_replace — pandas str.slice_replace() on 100k strings""" +import json +import time +import pandas as pd + 
+ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 + +data = [f"hello_world_{i % 1000}" for i in range(ROWS)] +s = pd.Series(data) + +for _ in range(WARMUP): + _ = s.str.slice_replace(0, 5, "goodbye") + +start = time.perf_counter() +for _ in range(ITERATIONS): + _ = s.str.slice_replace(0, 5, "goodbye") +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({"function": "str_slice_replace", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_str_split_expand.py b/benchmarks/pandas/bench_str_split_expand.py new file mode 100644 index 00000000..f1720904 --- /dev/null +++ b/benchmarks/pandas/bench_str_split_expand.py @@ -0,0 +1,18 @@ +"""Benchmark: str.split(expand=True) on 10k-element string Series""" +import json, time +import pandas as pd + +ROWS = 10_000 +WARMUP = 3 +ITERATIONS = 10 +data = [f"a_{i}_b_{i*2}_c" for i in range(ROWS)] +s = pd.Series(data) + +for _ in range(WARMUP): + s.str.split("_", expand=True) + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.str.split("_", expand=True) +total = (time.perf_counter() - start) * 1000 +print(json.dumps({"function": "str_split_expand", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_str_startswith_endswith.py b/benchmarks/pandas/bench_str_startswith_endswith.py new file mode 100644 index 00000000..5e469c55 --- /dev/null +++ b/benchmarks/pandas/bench_str_startswith_endswith.py @@ -0,0 +1,27 @@ +"""Benchmark: str_startswith_endswith — str.startswith and str.endswith on 100k strings""" +import json, time +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 + +data = [f"hello_world_{i % 200}_suffix" for i in range(ROWS)] +s = pd.Series(data) + +for _ in range(WARMUP): + s.str.startswith("hello") + s.str.endswith("suffix") + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.str.startswith("hello") + s.str.endswith("suffix") +total = 
(time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "str_startswith_endswith", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_str_strip.py b/benchmarks/pandas/bench_str_strip.py new file mode 100644 index 00000000..1eb327ed --- /dev/null +++ b/benchmarks/pandas/bench_str_strip.py @@ -0,0 +1,29 @@ +"""Benchmark: str_strip — str.strip, str.lstrip, str.rstrip on 100k strings""" +import json, time +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 + +data = [f" hello_world_{i % 200} " for i in range(ROWS)] +s = pd.Series(data) + +for _ in range(WARMUP): + s.str.strip() + s.str.lstrip() + s.str.rstrip() + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.str.strip() + s.str.lstrip() + s.str.rstrip() +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "str_strip", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_str_translate.py b/benchmarks/pandas/bench_str_translate.py new file mode 100644 index 00000000..c4e9e9b4 --- /dev/null +++ b/benchmarks/pandas/bench_str_translate.py @@ -0,0 +1,19 @@ +"""Benchmark: str.translate on 100k-element string Series""" +import json, time +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 +data = [f"hello world {i}" for i in range(ROWS)] +s = pd.Series(data) +table = str.maketrans({"h": "H", "w": "W", "o": "0"}) + +for _ in range(WARMUP): + s.str.translate(table) + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.str.translate(table) +total = (time.perf_counter() - start) * 1000 +print(json.dumps({"function": "str_translate", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_str_wrap.py b/benchmarks/pandas/bench_str_wrap.py new file mode 100644 index 00000000..904876d4 --- /dev/null +++ 
b/benchmarks/pandas/bench_str_wrap.py @@ -0,0 +1,26 @@ +"""Benchmark: str_wrap — str.wrap word wrapping on 100k strings""" +import json +import time +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 + +data = ["the quick brown fox jumps over the lazy dog"] * ROWS +s = pd.Series(data) + +for _ in range(WARMUP): + s.str.wrap(20) + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.str.wrap(20) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "str_wrap", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_str_zfill_center_ljust_rjust.py b/benchmarks/pandas/bench_str_zfill_center_ljust_rjust.py new file mode 100644 index 00000000..540c7681 --- /dev/null +++ b/benchmarks/pandas/bench_str_zfill_center_ljust_rjust.py @@ -0,0 +1,32 @@ +"""Benchmark: str_zfill_center_ljust_rjust — padding operations on 100k strings""" +import json +import time +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 + +data = [str(i) for i in range(ROWS)] +s = pd.Series(data) + +for _ in range(WARMUP): + s.str.zfill(10) + s.str.center(10) + s.str.ljust(10) + s.str.rjust(10) + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.str.zfill(10) + s.str.center(10) + s.str.ljust(10) + s.str.rjust(10) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "str_zfill_center_ljust_rjust", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_string_contains.py b/benchmarks/pandas/bench_string_contains.py new file mode 100644 index 00000000..364d6965 --- /dev/null +++ b/benchmarks/pandas/bench_string_contains.py @@ -0,0 +1,10 @@ +import pandas as pd, json, time, numpy as np +rng = np.random.default_rng(42) +words = ["apple", "banana", "cherry", "date", "elderberry"] +s = pd.Series(rng.choice(words, size=100_000)) +for _ in range(3): s.str.contains("an", 
regex=False) +N = 50 +t0 = time.perf_counter() +for _ in range(N): s.str.contains("an", regex=False) +elapsed = time.perf_counter() - t0 +print(json.dumps({"function": "string_contains", "mean_ms": elapsed/N*1000, "iterations": N, "total_ms": elapsed*1000})) diff --git a/benchmarks/pandas/bench_timedelta.py b/benchmarks/pandas/bench_timedelta.py new file mode 100644 index 00000000..7de586b2 --- /dev/null +++ b/benchmarks/pandas/bench_timedelta.py @@ -0,0 +1,31 @@ +"""Benchmark: Timedelta — construction and arithmetic.""" +import json, time +import pandas as pd + +SIZE = 10_000 +WARMUP = 5 +ITERATIONS = 50 + +td1 = pd.Timedelta(days=1, hours=2, minutes=30) +td2 = pd.Timedelta(hours=3, minutes=45, seconds=10) +deltas = [pd.Timedelta(days=i % 365, hours=i % 24) for i in range(SIZE)] + +for _ in range(WARMUP): + for d in deltas: + d + td1 + d - td2 + _ = d.total_seconds() / 3600 + _ = d.total_seconds() + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + for d in deltas: + d + td1 + d - td2 + _ = d.total_seconds() / 3600 + _ = d.total_seconds() + times.append((time.perf_counter() - t0) * 1000) + +total_ms = sum(times) +print(json.dumps({"function":"timedelta","mean_ms":round(total_ms/ITERATIONS,3),"iterations":ITERATIONS,"total_ms":round(total_ms,3)})) diff --git a/benchmarks/pandas/bench_timestamp.py b/benchmarks/pandas/bench_timestamp.py new file mode 100644 index 00000000..9263fa53 --- /dev/null +++ b/benchmarks/pandas/bench_timestamp.py @@ -0,0 +1,31 @@ +"""Benchmark: Timestamp — construction and component accessors.""" +import json, time +import pandas as pd +from datetime import datetime, timezone, timedelta + +SIZE = 10_000 +WARMUP = 5 +ITERATIONS = 50 + +base = datetime(2020, 1, 1, tzinfo=timezone.utc) +dates = [base + timedelta(days=i) for i in range(SIZE)] + +for _ in range(WARMUP): + for d in dates: + ts = pd.Timestamp(d) + _ = ts.year + _ = ts.month + _ = ts.dayofweek + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + 
for d in dates: + ts = pd.Timestamp(d) + _ = ts.year + _ = ts.month + _ = ts.dayofweek + times.append((time.perf_counter() - t0) * 1000) + +total_ms = sum(times) +print(json.dumps({"function":"timestamp","mean_ms":round(total_ms/ITERATIONS,3),"iterations":ITERATIONS,"total_ms":round(total_ms,3)})) diff --git a/benchmarks/pandas/bench_to_csv.py b/benchmarks/pandas/bench_to_csv.py new file mode 100644 index 00000000..c2e0298a --- /dev/null +++ b/benchmarks/pandas/bench_to_csv.py @@ -0,0 +1,30 @@ +"""Benchmark: to_csv — serialize a 10k-row DataFrame to CSV string""" +import json, time +import numpy as np +import pandas as pd +import io + +ROWS = 10_000 +WARMUP = 3 +ITERATIONS = 10 + +df = pd.DataFrame({ + "id": np.arange(ROWS, dtype=float), + "value": np.arange(ROWS) * 1.1, + "score": np.sin(np.arange(ROWS) * 0.01), +}) + +for _ in range(WARMUP): + df.to_csv(index=False) + +start = time.perf_counter() +for _ in range(ITERATIONS): + df.to_csv(index=False) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "to_csv", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_to_datetime.py b/benchmarks/pandas/bench_to_datetime.py new file mode 100644 index 00000000..a495ccc2 --- /dev/null +++ b/benchmarks/pandas/bench_to_datetime.py @@ -0,0 +1,26 @@ +"""Benchmark: pd.to_datetime — parse string/numeric values to datetime.""" +import json, time +import pandas as pd +from datetime import datetime, timedelta + +SIZE = 10_000 +WARMUP = 5 +ITERATIONS = 50 + +base = datetime(2020, 1, 1) +date_strings = [(base + timedelta(days=i)).strftime("%Y-%m-%d") for i in range(SIZE)] +timestamps = [int((base + timedelta(days=i)).timestamp() * 1000) for i in range(SIZE)] + +for _ in range(WARMUP): + pd.to_datetime(date_strings) + pd.to_datetime(timestamps, unit="ms") + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + pd.to_datetime(date_strings) + pd.to_datetime(timestamps, 
unit="ms") + times.append((time.perf_counter() - t0) * 1000) + +total_ms = sum(times) +print(json.dumps({"function": "to_datetime", "mean_ms": round(total_ms / ITERATIONS, 3), "iterations": ITERATIONS, "total_ms": round(total_ms, 3)})) diff --git a/benchmarks/pandas/bench_to_dict_oriented_all.py b/benchmarks/pandas/bench_to_dict_oriented_all.py new file mode 100644 index 00000000..5ff74673 --- /dev/null +++ b/benchmarks/pandas/bench_to_dict_oriented_all.py @@ -0,0 +1,21 @@ +"""Benchmark: DataFrame.to_dict with records, list, split orientations on 10k-row DataFrame""" +import json, time +import pandas as pd + +ROWS = 10_000 +WARMUP = 3 +ITERATIONS = 10 +df = pd.DataFrame({"a": range(ROWS), "b": [i * 1.5 for i in range(ROWS)], "c": [f"s{i}" for i in range(ROWS)]}) + +for _ in range(WARMUP): + df.to_dict(orient="records") + df.to_dict(orient="list") + df.to_dict(orient="split") + +start = time.perf_counter() +for _ in range(ITERATIONS): + df.to_dict(orient="records") + df.to_dict(orient="list") + df.to_dict(orient="split") +total = (time.perf_counter() - start) * 1000 +print(json.dumps({"function": "to_dict_oriented_all", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_to_json.py b/benchmarks/pandas/bench_to_json.py new file mode 100644 index 00000000..d76578da --- /dev/null +++ b/benchmarks/pandas/bench_to_json.py @@ -0,0 +1,29 @@ +"""Benchmark: to_json — serialize a 10k-row DataFrame to JSON string""" +import json, time +import numpy as np +import pandas as pd + +ROWS = 10_000 +WARMUP = 3 +ITERATIONS = 10 + +df = pd.DataFrame({ + "id": np.arange(ROWS, dtype=float), + "value": np.arange(ROWS) * 1.1, + "score": np.sin(np.arange(ROWS) * 0.01), +}) + +for _ in range(WARMUP): + df.to_json(orient="records") + +start = time.perf_counter() +for _ in range(ITERATIONS): + df.to_json(orient="records") +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "to_json", + "mean_ms": total 
/ ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_to_numeric.py b/benchmarks/pandas/bench_to_numeric.py new file mode 100644 index 00000000..3b20255d --- /dev/null +++ b/benchmarks/pandas/bench_to_numeric.py @@ -0,0 +1,24 @@ +"""Benchmark: pd.to_numeric — coerce string arrays to numeric.""" +import json, time +import pandas as pd + +SIZE = 100_000 +WARMUP = 5 +ITERATIONS = 50 + +str_nums = [str(i * 1.5) for i in range(SIZE)] +s = pd.Series(str_nums) + +for _ in range(WARMUP): + pd.to_numeric(str_nums, errors="coerce") + pd.to_numeric(s, errors="coerce") + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + pd.to_numeric(str_nums, errors="coerce") + pd.to_numeric(s, errors="coerce") + times.append((time.perf_counter() - t0) * 1000) + +total_ms = sum(times) +print(json.dumps({"function": "to_numeric", "mean_ms": round(total_ms / ITERATIONS, 3), "iterations": ITERATIONS, "total_ms": round(total_ms, 3)})) diff --git a/benchmarks/pandas/bench_type_checks.py b/benchmarks/pandas/bench_type_checks.py new file mode 100644 index 00000000..098d9d5d --- /dev/null +++ b/benchmarks/pandas/bench_type_checks.py @@ -0,0 +1,28 @@ +"""Benchmark: pandas api.types checks on mixed values""" +import json, time +import pandas as pd +from pandas.api.types import is_scalar, is_list_like, is_dict_like, is_iterator + +ITERATIONS = 100_000 +WARMUP = 3 +MEASURED = 10 + +values = [42, "hello", None, [1, 2, 3], {"a": 1}, {1, 2}, {}.items()] + +def run_checks(): + for v in values: + is_scalar(v) + is_list_like(v) + is_dict_like(v) + is_iterator(v) + +for _ in range(WARMUP): + for _ in range(ITERATIONS): + run_checks() + +start = time.perf_counter() +for _ in range(MEASURED): + for _ in range(ITERATIONS): + run_checks() +total = (time.perf_counter() - start) * 1000 +print(json.dumps({"function": "type_checks", "mean_ms": total / MEASURED, "iterations": MEASURED, "total_ms": total})) diff --git 
a/benchmarks/pandas/bench_unstack.py b/benchmarks/pandas/bench_unstack.py new file mode 100644 index 00000000..4b2dca2a --- /dev/null +++ b/benchmarks/pandas/bench_unstack.py @@ -0,0 +1,24 @@ +"""Benchmark: DataFrame.unstack() — pivot innermost index level to columns.""" +import json, time +import pandas as pd + +ROWS = 500 +COLS = 10 +WARMUP = 5 +ITERATIONS = 50 + +import numpy as np +idx = pd.MultiIndex.from_product([range(ROWS), range(COLS)], names=["row","col"]) +s = pd.Series([float(i) for i in range(ROWS * COLS)], index=idx) + +for _ in range(WARMUP): + s.unstack() + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + s.unstack() + times.append((time.perf_counter() - t0) * 1000) + +total_ms = sum(times) +print(json.dumps({"function":"unstack","mean_ms":round(total_ms/ITERATIONS,3),"iterations":ITERATIONS,"total_ms":round(total_ms,3)})) diff --git a/benchmarks/pandas/bench_value_type_checks.py b/benchmarks/pandas/bench_value_type_checks.py new file mode 100644 index 00000000..e684cfb6 --- /dev/null +++ b/benchmarks/pandas/bench_value_type_checks.py @@ -0,0 +1,91 @@ +"""Benchmark: extended value type predicates in Python (closest equivalents)""" +import json +import time +import math +import re + +WARMUP = 3 +ITERATIONS = 10_000 + +mixed = [42, 3.14, True, "hello", None, float("nan"), re.compile(r"abc")] + + +def is_number(v): + return isinstance(v, (int, float)) and not isinstance(v, bool) + + +def is_bool(v): + return isinstance(v, bool) + + +def is_string_value(v): + return isinstance(v, str) + + +def is_float(v): + return isinstance(v, float) + + +def is_integer(v): + return isinstance(v, int) and not isinstance(v, bool) + + +def is_big_int(v): + return isinstance(v, int) and not isinstance(v, bool) and (v > 2**53 or v < -(2**53)) + + +def is_regexp(v): + return isinstance(v, re.Pattern) + + +def is_re_compilable(v): + if isinstance(v, re.Pattern): + return True + if isinstance(v, str): + try: + re.compile(v) + return True + except 
re.error: + return False + return False + + +def is_missing(v): + if v is None: + return True + if isinstance(v, float) and math.isnan(v): + return True + return False + + +def is_hashable(v): + try: + hash(v) + return True + except TypeError: + return False + + +def run_checks(): + for v in mixed: + is_number(v) + is_bool(v) + is_string_value(v) + is_float(v) + is_integer(v) + is_big_int(v) + is_regexp(v) + is_re_compilable(v) + is_missing(v) + is_hashable(v) + + +for _ in range(WARMUP): + run_checks() + +start = time.perf_counter() +for _ in range(ITERATIONS): + run_checks() +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({"function": "value_type_checks", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_where.py b/benchmarks/pandas/bench_where.py new file mode 100644 index 00000000..096f6b48 --- /dev/null +++ b/benchmarks/pandas/bench_where.py @@ -0,0 +1,22 @@ +"""Benchmark: Series.where() — conditional replacement.""" +import json, time +import pandas as pd + +SIZE = 100_000 +WARMUP = 5 +ITERATIONS = 50 + +s = pd.Series([float(i) for i in range(SIZE)]) +cond = s > 50000.0 + +for _ in range(WARMUP): + s.where(cond, other=0.0) + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + s.where(cond, other=0.0) + times.append((time.perf_counter() - t0) * 1000) + +total_ms = sum(times) +print(json.dumps({"function":"where","mean_ms":round(total_ms/ITERATIONS,3),"iterations":ITERATIONS,"total_ms":round(total_ms,3)})) diff --git a/benchmarks/pandas/bench_zscore.py b/benchmarks/pandas/bench_zscore.py new file mode 100644 index 00000000..b6050e5a --- /dev/null +++ b/benchmarks/pandas/bench_zscore.py @@ -0,0 +1,26 @@ +"""Benchmark: zscore normalization on 100k-element Series""" +import json, time +import numpy as np +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 + +data = np.sin(np.arange(ROWS) * 0.01) * 100 + 50 +s = pd.Series(data) + +for _ in 
range(WARMUP): + (s - s.mean()) / s.std() + +start = time.perf_counter() +for _ in range(ITERATIONS): + (s - s.mean()) / s.std() +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "zscore", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/tsb/bench_apply_dataframe_formatter.ts b/benchmarks/tsb/bench_apply_dataframe_formatter.ts new file mode 100644 index 00000000..c3744a14 --- /dev/null +++ b/benchmarks/tsb/bench_apply_dataframe_formatter.ts @@ -0,0 +1,25 @@ +/** + * Benchmark: applyDataFrameFormatter on 10k-row DataFrame + */ +import { DataFrame, applyDataFrameFormatter, formatFloat } from "../../src/index.js"; + +const ROWS = 10_000; +const WARMUP = 3; +const ITERATIONS = 10; +const a = Array.from({ length: ROWS }, (_, i) => i * 1.234); +const b = Array.from({ length: ROWS }, (_, i) => i * 5.678); +const df = DataFrame.fromColumns({ a, b }); +const fmt = formatFloat(2); + +for (let i = 0; i < WARMUP; i++) applyDataFrameFormatter(df, fmt); +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) applyDataFrameFormatter(df, fmt); +const total = performance.now() - start; +console.log( + JSON.stringify({ + function: "apply_dataframe_formatter", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_apply_series_formatter.ts b/benchmarks/tsb/bench_apply_series_formatter.ts new file mode 100644 index 00000000..00a7f6ae --- /dev/null +++ b/benchmarks/tsb/bench_apply_series_formatter.ts @@ -0,0 +1,24 @@ +/** + * Benchmark: applySeriesFormatter on 100k-element numeric Series + */ +import { Series, applySeriesFormatter, formatFloat } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; +const data = Array.from({ length: ROWS }, (_, i) => i * 1.234); +const s = new Series({ data }); +const fmt = formatFloat(2); + +for (let i = 0; i < WARMUP; i++) 
applySeriesFormatter(s, fmt); +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) applySeriesFormatter(s, fmt); +const total = performance.now() - start; +console.log( + JSON.stringify({ + function: "apply_series_formatter", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_arange_linspace.ts b/benchmarks/tsb/bench_arange_linspace.ts new file mode 100644 index 00000000..20785e2a --- /dev/null +++ b/benchmarks/tsb/bench_arange_linspace.ts @@ -0,0 +1,27 @@ +/** + * Benchmark: arange and linspace generating 100k-element arrays + */ +import { arange, linspace } from "../../src/index.js"; + +const N = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +for (let i = 0; i < WARMUP; i++) { + arange(0, N, 1); + linspace(0, 1, N); +} +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + arange(0, N, 1); + linspace(0, 1, N); +} +const total = performance.now() - start; +console.log( + JSON.stringify({ + function: "arange_linspace", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_astype_series.ts b/benchmarks/tsb/bench_astype_series.ts new file mode 100644 index 00000000..b07923aa --- /dev/null +++ b/benchmarks/tsb/bench_astype_series.ts @@ -0,0 +1,38 @@ +/** + * Benchmark: astypeSeries — cast Series dtype. 
+ * Outputs JSON: {"function": "astype_series", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series, astypeSeries } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const floatSeries = new Series(Array.from({ length: SIZE }, (_, i) => i * 1.5)); +const intSeries = new Series(Array.from({ length: SIZE }, (_, i) => i)); + +for (let i = 0; i < WARMUP; i++) { + astypeSeries(floatSeries, "int32"); + astypeSeries(intSeries, "float64"); + astypeSeries(intSeries, "string"); +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const start = performance.now(); + astypeSeries(floatSeries, "int32"); + astypeSeries(intSeries, "float64"); + astypeSeries(intSeries, "string"); + times.push(performance.now() - start); +} + +const totalMs = times.reduce((a, b) => a + b, 0); +const meanMs = totalMs / ITERATIONS; +console.log( + JSON.stringify({ + function: "astype_series", + mean_ms: Math.round(meanMs * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(totalMs * 1000) / 1000, + }), +); diff --git a/benchmarks/tsb/bench_attrs_advanced.ts b/benchmarks/tsb/bench_attrs_advanced.ts new file mode 100644 index 00000000..1069713a --- /dev/null +++ b/benchmarks/tsb/bench_attrs_advanced.ts @@ -0,0 +1,41 @@ +/** + * Benchmark: advanced attrs helpers — getAttr/setAttr/deleteAttr/clearAttrs/copyAttrs/mergeAttrs/hasAttrs + */ +import { Series, getAttr, setAttr, deleteAttr, clearAttrs, copyAttrs, mergeAttrs, hasAttrs } from "../../src/index.js"; + +const N = 1_000; +const s = new Series({ data: Array.from({ length: N }, (_, i) => i) }); +const s2 = new Series({ data: Array.from({ length: N }, (_, i) => i * 2) }); + +const WARMUP = 3; +const ITERATIONS = 1_000; + +for (let i = 0; i < WARMUP; i++) { + setAttr(s, "unit", "meters"); + getAttr(s, "unit"); + hasAttrs(s); + copyAttrs(s, s2); + mergeAttrs(s, { version: 1 }); + deleteAttr(s, "unit"); + clearAttrs(s); +} + +const start = 
performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + setAttr(s, "unit", "meters"); + getAttr(s, "unit"); + hasAttrs(s); + copyAttrs(s, s2); + mergeAttrs(s, { version: i }); + deleteAttr(s, "unit"); + clearAttrs(s); +} +const total = performance.now() - start; +console.log( + JSON.stringify({ + function: "attrs_advanced", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_attrs_count_keys.ts b/benchmarks/tsb/bench_attrs_count_keys.ts new file mode 100644 index 00000000..ef9c5cb0 --- /dev/null +++ b/benchmarks/tsb/bench_attrs_count_keys.ts @@ -0,0 +1,20 @@ +import { attrsCount, attrsKeys } from "tsb"; +import { Series } from "tsb"; +const N = 100_000; +const s = new Series(Array.from({ length: N }, (_, i) => i)); +const attrs = { a: 1, b: 2, c: 3, d: 4, e: 5, f: 6, g: 7, h: 8 }; +import { setAttrs } from "tsb"; +setAttrs(s, attrs); +const WARMUP = 3; +const ITERS = 10_000; +for (let i = 0; i < WARMUP; i++) { + attrsCount(s); + attrsKeys(s); +} +const t0 = performance.now(); +for (let i = 0; i < ITERS; i++) { + attrsCount(s); + attrsKeys(s); +} +const total = performance.now() - t0; +console.log(JSON.stringify({ function: "attrs_count_keys", mean_ms: total / ITERS, iterations: ITERS, total_ms: total })); diff --git a/benchmarks/tsb/bench_attrs_ops.ts b/benchmarks/tsb/bench_attrs_ops.ts new file mode 100644 index 00000000..2fae1b01 --- /dev/null +++ b/benchmarks/tsb/bench_attrs_ops.ts @@ -0,0 +1,22 @@ +import { getAttrs, setAttrs, updateAttrs, withAttrs } from "tsb"; +import { Series } from "tsb"; +const N = 10_000; +const s = new Series(Array.from({ length: N }, (_, i) => i)); +const attrs = { unit: "meters", created: "2024-01-01", source: "sensor-1", version: 2 }; +const WARMUP = 3; +const ITERS = 100; +for (let i = 0; i < WARMUP; i++) { + setAttrs(s, attrs); + getAttrs(s); + updateAttrs(s, { version: i }); + withAttrs(s, { extra: "x" }); +} +const t0 = performance.now(); +for (let i = 0; i 
< ITERS; i++) { + setAttrs(s, attrs); + getAttrs(s); + updateAttrs(s, { version: i }); + withAttrs(s, { extra: "x" }); +} +const total = performance.now() - t0; +console.log(JSON.stringify({ function: "attrs_ops", mean_ms: total / ITERS, iterations: ITERS, total_ms: total })); diff --git a/benchmarks/tsb/bench_between.ts b/benchmarks/tsb/bench_between.ts new file mode 100644 index 00000000..4e06570c --- /dev/null +++ b/benchmarks/tsb/bench_between.ts @@ -0,0 +1,33 @@ +/** + * Benchmark: Series.between() — element-wise range check. + * Outputs JSON: {"function": "between", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const s = new Series({ data: Array.from({ length: SIZE }, (_, i) => i * 1.0) }); + +for (let i = 0; i < WARMUP; i++) { + s.between(25000.0, 75000.0); +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const start = performance.now(); + s.between(25000.0, 75000.0); + times.push(performance.now() - start); +} + +const totalMs = times.reduce((a, b) => a + b, 0); +const meanMs = totalMs / ITERATIONS; +console.log( + JSON.stringify({ + function: "between", + mean_ms: Math.round(meanMs * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(totalMs * 1000) / 1000, + }), +); diff --git a/benchmarks/tsb/bench_cat_add_remove_categories.ts b/benchmarks/tsb/bench_cat_add_remove_categories.ts new file mode 100644 index 00000000..d508cb70 --- /dev/null +++ b/benchmarks/tsb/bench_cat_add_remove_categories.ts @@ -0,0 +1,32 @@ +/** + * Benchmark: cat_add_remove_categories — CategoricalAccessor addCategories/removeCategories on 100k-element Series + */ +import { Series } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const cats = ["a", "b", "c", "d"]; +const s = new Series({ data: Array.from({ length: ROWS }, (_, i) => cats[i % cats.length]) }); + +for (let i 
= 0; i < WARMUP; i++) { + s.cat.addCategories(["e", "f"]); + s.cat.removeCategories(["d"]); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + s.cat.addCategories(["e", "f"]); + s.cat.removeCategories(["d"]); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "cat_add_remove_categories", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_cat_cross_tab.ts b/benchmarks/tsb/bench_cat_cross_tab.ts new file mode 100644 index 00000000..36049471 --- /dev/null +++ b/benchmarks/tsb/bench_cat_cross_tab.ts @@ -0,0 +1,25 @@ +/** + * Benchmark: catCrossTab on two 100k-element categorical Series + */ +import { Series, catCrossTab } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; +const cats1 = ["a", "b", "c", "d"]; +const cats2 = ["x", "y", "z"]; +const s1 = new Series({ data: Array.from({ length: ROWS }, (_, i) => cats1[i % 4]) }); +const s2 = new Series({ data: Array.from({ length: ROWS }, (_, i) => cats2[i % 3]) }); + +for (let i = 0; i < WARMUP; i++) catCrossTab(s1, s2); +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) catCrossTab(s1, s2); +const total = performance.now() - start; +console.log( + JSON.stringify({ + function: "cat_cross_tab", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_cat_equal_categories.ts b/benchmarks/tsb/bench_cat_equal_categories.ts new file mode 100644 index 00000000..6b76ab4d --- /dev/null +++ b/benchmarks/tsb/bench_cat_equal_categories.ts @@ -0,0 +1,27 @@ +/** + * Benchmark: catEqualCategories on two categorical Series (10k iterations) + */ +import { Series, catEqualCategories } from "../../src/index.js"; + +const WARMUP = 3; +const ITERATIONS = 10; +const s1 = new Series({ data: ["cat_0", "cat_1", "cat_2"] }); +const s2 = new Series({ data: ["cat_0", "cat_1", "cat_2"] 
}); +const REPS = 10_000; + +for (let i = 0; i < WARMUP; i++) { + for (let j = 0; j < REPS; j++) catEqualCategories(s1, s2); +} +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + for (let j = 0; j < REPS; j++) catEqualCategories(s1, s2); +} +const total = performance.now() - start; +console.log( + JSON.stringify({ + function: "cat_equal_categories", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_cat_freq_table.ts b/benchmarks/tsb/bench_cat_freq_table.ts new file mode 100644 index 00000000..05cb9ca1 --- /dev/null +++ b/benchmarks/tsb/bench_cat_freq_table.ts @@ -0,0 +1,23 @@ +/** + * Benchmark: catFreqTable on 100k-element categorical Series + */ +import { Series, catFreqTable } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; +const cats = ["low", "med", "high", "ultra"]; +const s = new Series({ data: Array.from({ length: ROWS }, (_, i) => cats[i % 4]) }); + +for (let i = 0; i < WARMUP; i++) catFreqTable(s); +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) catFreqTable(s); +const total = performance.now() - start; +console.log( + JSON.stringify({ + function: "cat_freq_table", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_cat_recode.ts b/benchmarks/tsb/bench_cat_recode.ts new file mode 100644 index 00000000..238bf6a3 --- /dev/null +++ b/benchmarks/tsb/bench_cat_recode.ts @@ -0,0 +1,25 @@ +/** + * Benchmark: catRecode on 100k-element categorical Series + */ +import { Series, catRecode } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; +const cats = ["a", "b", "c"]; +const data = Array.from({ length: ROWS }, (_, i) => cats[i % 3]); +const s = new Series({ data }); +const map: Record<string, string> = { a: "x", b: "y", c: "z" }; + +for (let i = 0; i < WARMUP; i++) catRecode(s, map); +const start =
performance.now(); +for (let i = 0; i < ITERATIONS; i++) catRecode(s, map); +const total = performance.now() - start; +console.log( + JSON.stringify({ + function: "cat_recode", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_cat_remove_unused.ts b/benchmarks/tsb/bench_cat_remove_unused.ts new file mode 100644 index 00000000..e1b33d2f --- /dev/null +++ b/benchmarks/tsb/bench_cat_remove_unused.ts @@ -0,0 +1,32 @@ +/** + * Benchmark: cat_remove_unused — CategoricalAccessor.removeUnusedCategories() on 100k-element Series + */ +import { Series } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const cats = ["a", "b", "c"]; +const base = new Series({ data: Array.from({ length: ROWS }, (_, i) => cats[i % cats.length]) }); +// Add extra categories that are unused so removeUnusedCategories has work to do +const s = base.cat.addCategories(["x", "y", "z"]); + +for (let i = 0; i < WARMUP; i++) { + s.cat.removeUnusedCategories(); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + s.cat.removeUnusedCategories(); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "cat_remove_unused", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_cat_rename_set_categories.ts b/benchmarks/tsb/bench_cat_rename_set_categories.ts new file mode 100644 index 00000000..a2837ead --- /dev/null +++ b/benchmarks/tsb/bench_cat_rename_set_categories.ts @@ -0,0 +1,32 @@ +/** + * Benchmark: cat_rename_set_categories — CategoricalAccessor renameCategories/setCategories on 100k-element Series + */ +import { Series } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const cats = ["a", "b", "c", "d"]; +const s = new Series({ data: Array.from({ length: ROWS }, (_, i) => cats[i % cats.length]) }); + +for (let 
i = 0; i < WARMUP; i++) { + s.cat.renameCategories({ a: "alpha", b: "beta" }); + s.cat.setCategories(["a", "b", "c", "d", "e"], false); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + s.cat.renameCategories({ a: "alpha", b: "beta" }); + s.cat.setCategories(["a", "b", "c", "d", "e"], false); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "cat_rename_set_categories", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_cat_reorder_as_ordered.ts b/benchmarks/tsb/bench_cat_reorder_as_ordered.ts new file mode 100644 index 00000000..1644bbb2 --- /dev/null +++ b/benchmarks/tsb/bench_cat_reorder_as_ordered.ts @@ -0,0 +1,34 @@ +/** + * Benchmark: cat_reorder_as_ordered — CategoricalAccessor reorderCategories/asOrdered/asUnordered on 100k-element Series + */ +import { Series } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const cats = ["a", "b", "c", "d"]; +const s = new Series({ data: Array.from({ length: ROWS }, (_, i) => cats[i % cats.length]) }); + +for (let i = 0; i < WARMUP; i++) { + s.cat.reorderCategories(["d", "c", "b", "a"]); + s.cat.asOrdered(); + s.cat.asUnordered(); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + s.cat.reorderCategories(["d", "c", "b", "a"]); + s.cat.asOrdered(); + s.cat.asUnordered(); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "cat_reorder_as_ordered", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_cat_set_ops.ts b/benchmarks/tsb/bench_cat_set_ops.ts new file mode 100644 index 00000000..c34f1047 --- /dev/null +++ b/benchmarks/tsb/bench_cat_set_ops.ts @@ -0,0 +1,38 @@ +/** + * Benchmark: catUnionCategories / catIntersectCategories / catDiffCategories + */ +import { + Series, + catUnionCategories, + 
catIntersectCategories, + catDiffCategories, +} from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; +const cats1 = Array.from({ length: 500 }, (_, i) => `cat_${i}`); +const cats2 = Array.from({ length: 500 }, (_, i) => `cat_${i + 250}`); +const s1 = new Series({ data: Array.from({ length: ROWS }, (_, i) => cats1[i % cats1.length]) }); +const s2 = new Series({ data: Array.from({ length: ROWS }, (_, i) => cats2[i % cats2.length]) }); + +for (let i = 0; i < WARMUP; i++) { + catUnionCategories(s1, s2); + catIntersectCategories(s1, s2); + catDiffCategories(s1, s2); +} +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + catUnionCategories(s1, s2); + catIntersectCategories(s1, s2); + catDiffCategories(s1, s2); +} +const total = performance.now() - start; +console.log( + JSON.stringify({ + function: "cat_set_ops", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_cat_sort_by_freq.ts b/benchmarks/tsb/bench_cat_sort_by_freq.ts new file mode 100644 index 00000000..2ba37f6f --- /dev/null +++ b/benchmarks/tsb/bench_cat_sort_by_freq.ts @@ -0,0 +1,28 @@ +/** + * Benchmark: catSortByFreq on 100k-element categorical Series + */ +import { Series, catSortByFreq } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; +const cats = ["rare", "common", "very_common", "ultra_common"]; +const data: string[] = []; +for (let i = 0; i < ROWS; i++) { + const r = i % 51; + data.push(r < 1 ? cats[0] : r < 6 ? cats[1] : r < 21 ? 
cats[2] : cats[3]); +} +const s = new Series({ data }); + +for (let i = 0; i < WARMUP; i++) catSortByFreq(s); +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) catSortByFreq(s); +const total = performance.now() - start; +console.log( + JSON.stringify({ + function: "cat_sort_by_freq", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_cat_to_ordinal.ts b/benchmarks/tsb/bench_cat_to_ordinal.ts new file mode 100644 index 00000000..6f791903 --- /dev/null +++ b/benchmarks/tsb/bench_cat_to_ordinal.ts @@ -0,0 +1,24 @@ +/** + * Benchmark: catToOrdinal on 100k-element categorical Series + */ +import { Series, catToOrdinal } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; +const cats = ["low", "med", "high"]; +const data = Array.from({ length: ROWS }, (_, i) => cats[i % 3]); +const s = new Series({ data }); + +for (let i = 0; i < WARMUP; i++) catToOrdinal(s, cats); +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) catToOrdinal(s, cats); +const total = performance.now() - start; +console.log( + JSON.stringify({ + function: "cat_to_ordinal", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_cat_union_intersect_diff.ts b/benchmarks/tsb/bench_cat_union_intersect_diff.ts new file mode 100644 index 00000000..91a47176 --- /dev/null +++ b/benchmarks/tsb/bench_cat_union_intersect_diff.ts @@ -0,0 +1,23 @@ +import { Series, catUnionCategories, catIntersectCategories, catDiffCategories } from "tsb"; +const N = 50_000; +const cats1 = ["A", "B", "C", "D"]; +const cats2 = ["C", "D", "E", "F"]; +const s1 = new Series(Array.from({ length: N }, (_, i) => cats1[i % cats1.length])); +const s2 = new Series(Array.from({ length: N }, (_, i) => cats2[i % cats2.length])); +const c1 = s1.cat; +const c2 = s2.cat; +const WARMUP = 3; +const ITERS = 20; +for (let i = 0; i < 
WARMUP; i++) { + catUnionCategories(c1, c2); + catIntersectCategories(c1, c2); + catDiffCategories(c1, c2); +} +const t0 = performance.now(); +for (let i = 0; i < ITERS; i++) { + catUnionCategories(c1, c2); + catIntersectCategories(c1, c2); + catDiffCategories(c1, c2); +} +const total = performance.now() - t0; +console.log(JSON.stringify({ function: "cat_union_intersect_diff", mean_ms: total / ITERS, iterations: ITERS, total_ms: total })); diff --git a/benchmarks/tsb/bench_cat_value_counts.ts b/benchmarks/tsb/bench_cat_value_counts.ts new file mode 100644 index 00000000..3fb739c1 --- /dev/null +++ b/benchmarks/tsb/bench_cat_value_counts.ts @@ -0,0 +1,30 @@ +/** + * Benchmark: cat_value_counts — CategoricalAccessor.valueCounts() on 100k-element Series + */ +import { Series } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const cats = ["a", "b", "c", "d", "e"]; +const s = new Series({ data: Array.from({ length: ROWS }, (_, i) => cats[i % cats.length]) }); + +for (let i = 0; i < WARMUP; i++) { + s.cat.valueCounts(); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + s.cat.valueCounts(); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "cat_value_counts", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_clip.ts b/benchmarks/tsb/bench_clip.ts new file mode 100644 index 00000000..77ce3688 --- /dev/null +++ b/benchmarks/tsb/bench_clip.ts @@ -0,0 +1,33 @@ +/** + * Benchmark: Series.clip() — clip values to a range. 
+ * Outputs JSON: {"function": "clip", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const s = new Series({ data: Array.from({ length: SIZE }, (_, i) => i * 1.0) }); + +for (let i = 0; i < WARMUP; i++) { + s.clip({ lower: 10000.0, upper: 90000.0 }); +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const start = performance.now(); + s.clip({ lower: 10000.0, upper: 90000.0 }); + times.push(performance.now() - start); +} + +const totalMs = times.reduce((a, b) => a + b, 0); +const meanMs = totalMs / ITERATIONS; +console.log( + JSON.stringify({ + function: "clip", + mean_ms: Math.round(meanMs * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(totalMs * 1000) / 1000, + }), +); diff --git a/benchmarks/tsb/bench_clip_advanced.ts b/benchmarks/tsb/bench_clip_advanced.ts new file mode 100644 index 00000000..a6af65ac --- /dev/null +++ b/benchmarks/tsb/bench_clip_advanced.ts @@ -0,0 +1,36 @@ +/** + * Benchmark: clipAdvancedSeries / clipAdvancedDataFrame — per-element clipping with array bounds. 
+ * Outputs JSON: {"function": "clip_advanced", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series, DataFrame, clipAdvancedSeries, clipAdvancedDataFrame } from "../../src/index.ts"; + +const ROWS = 50_000; +const WARMUP = 5; +const ITERATIONS = 30; + +const data = Float64Array.from({ length: ROWS }, (_, i) => Math.sin(i * 0.01) * 200); +const lower = Float64Array.from({ length: ROWS }, () => -50); +const upper = Float64Array.from({ length: ROWS }, () => 50); +const s = new Series(data); +const lowerArr = Array.from(lower); +const upperArr = Array.from(upper); + +const dfCols: Record = {}; +for (let c = 0; c < 5; c++) { + dfCols[`col${c}`] = Array.from({ length: ROWS }, (_, i) => Math.sin((i + c) * 0.01) * 200); +} +const df = new DataFrame(dfCols); + +for (let i = 0; i < WARMUP; i++) { + clipAdvancedSeries(s, { lower: lowerArr, upper: upperArr }); + clipAdvancedDataFrame(df, { lower: -50, upper: 50 }); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + clipAdvancedSeries(s, { lower: lowerArr, upper: upperArr }); + clipAdvancedDataFrame(df, { lower: -50, upper: 50 }); +} +const total = performance.now() - start; + +console.log(JSON.stringify({ function: "clip_advanced", mean_ms: total / ITERATIONS, iterations: ITERATIONS, total_ms: total })); diff --git a/benchmarks/tsb/bench_coefficient_of_variation.ts b/benchmarks/tsb/bench_coefficient_of_variation.ts new file mode 100644 index 00000000..9acff25a --- /dev/null +++ b/benchmarks/tsb/bench_coefficient_of_variation.ts @@ -0,0 +1,23 @@ +/** + * Benchmark: coefficientOfVariation on 100k-element Series + */ +import { Series, coefficientOfVariation } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; +const data = Array.from({ length: ROWS }, (_, i) => i * 0.1 + 1); +const s = new Series({ data }); + +for (let i = 0; i < WARMUP; i++) coefficientOfVariation(s); +const start = performance.now(); +for (let i = 0; i < ITERATIONS; 
i++) coefficientOfVariation(s); +const total = performance.now() - start; +console.log( + JSON.stringify({ + function: "coefficient_of_variation", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_combine_first.ts b/benchmarks/tsb/bench_combine_first.ts new file mode 100644 index 00000000..83b61b9c --- /dev/null +++ b/benchmarks/tsb/bench_combine_first.ts @@ -0,0 +1,14 @@ +import { Series } from "tsb"; + +const rng = (seed: number) => { let s = seed; return () => { s = (s * 1664525 + 1013904223) & 0xffffffff; return ((s >>> 0) / 0xffffffff) * 2 - 1; }; }; +const rand = rng(42); +const d1: (number | null)[] = Array.from({ length: 100_000 }, (_, i) => i % 3 === 0 ? null : rand() * 3); +const d2 = Array.from({ length: 100_000 }, () => rand() * 3); +const s1 = new Series(d1); +const s2 = new Series(d2); +for (let i = 0; i < 3; i++) s1.combineFirst(s2); +const N = 50; +const t0 = performance.now(); +for (let i = 0; i < N; i++) s1.combineFirst(s2); +const elapsed = performance.now() - t0; +console.log(JSON.stringify({ function: "combine_first", mean_ms: elapsed / N, iterations: N, total_ms: elapsed })); diff --git a/benchmarks/tsb/bench_concat_axis1.ts b/benchmarks/tsb/bench_concat_axis1.ts new file mode 100644 index 00000000..a27ad58e --- /dev/null +++ b/benchmarks/tsb/bench_concat_axis1.ts @@ -0,0 +1,28 @@ +/** + * Benchmark: concat([df1, df2], { axis: 1 }) — column-wise concat on 100k-row DataFrames. 
+ */ +import { DataFrame, concat } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 5; +const ITERATIONS = 20; + +const df1 = DataFrame.fromColumns({ + a: Array.from({ length: ROWS }, (_, i) => i * 1.0), + b: Array.from({ length: ROWS }, (_, i) => i * 2.0), +}); +const df2 = DataFrame.fromColumns({ + c: Array.from({ length: ROWS }, (_, i) => i * 3.0), + d: Array.from({ length: ROWS }, (_, i) => i * 4.0), +}); + +for (let i = 0; i < WARMUP; i++) concat([df1, df2], { axis: 1 }); + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + concat([df1, df2], { axis: 1 }); + times.push(performance.now() - t0); +} +const total = times.reduce((a, b) => a + b, 0); +console.log(JSON.stringify({ function: "concat_axis1", mean_ms: total / ITERATIONS, iterations: ITERATIONS, total_ms: total })); diff --git a/benchmarks/tsb/bench_corr.ts b/benchmarks/tsb/bench_corr.ts new file mode 100644 index 00000000..39821d71 --- /dev/null +++ b/benchmarks/tsb/bench_corr.ts @@ -0,0 +1,37 @@ +/** + * Benchmark: DataFrame.corr — pairwise correlation of numeric columns. 
+ * Outputs JSON: {"function": "corr", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { DataFrame } from "../../src/index.ts"; + +const SIZE = 10_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const df = new DataFrame({ + a: Array.from({ length: SIZE }, (_, i) => i * 1.1), + b: Array.from({ length: SIZE }, (_, i) => i * 0.7 + 0.3), + c: Array.from({ length: SIZE }, (_, i) => i * -0.5 + 100), +}); + +for (let i = 0; i < WARMUP; i++) { + df.corr(); +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const start = performance.now(); + df.corr(); + times.push(performance.now() - start); +} + +const totalMs = times.reduce((a, b) => a + b, 0); +const meanMs = totalMs / ITERATIONS; +console.log( + JSON.stringify({ + function: "corr", + mean_ms: Math.round(meanMs * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(totalMs * 1000) / 1000, + }), +); diff --git a/benchmarks/tsb/bench_count_valid.ts b/benchmarks/tsb/bench_count_valid.ts new file mode 100644 index 00000000..17c912f5 --- /dev/null +++ b/benchmarks/tsb/bench_count_valid.ts @@ -0,0 +1,25 @@ +/** + * Benchmark: countValid on 100k-element Series with NaN + */ +import { Series, countValid } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; +const data: (number | null)[] = Array.from({ length: ROWS }, (_, i) => + i % 7 === 0 ? 
null : i * 0.1, +); +const s = new Series({ data }); + +for (let i = 0; i < WARMUP; i++) countValid(s); +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) countValid(s); +const total = performance.now() - start; +console.log( + JSON.stringify({ + function: "count_valid", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_countna.ts b/benchmarks/tsb/bench_countna.ts new file mode 100644 index 00000000..13961a7a --- /dev/null +++ b/benchmarks/tsb/bench_countna.ts @@ -0,0 +1,31 @@ +/** + * Benchmark: countna — count NaN/null values in a Series with 10% nulls + */ +import { Series } from "../../src/index.js"; +import { countna } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 20; + +const data = Array.from({ length: ROWS }, (_, i) => (i % 10 === 0 ? null : i)); +const s = new Series({ data }); + +for (let i = 0; i < WARMUP; i++) { + countna(s); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + countna(s); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "countna", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_cov.ts b/benchmarks/tsb/bench_cov.ts new file mode 100644 index 00000000..af60be69 --- /dev/null +++ b/benchmarks/tsb/bench_cov.ts @@ -0,0 +1,37 @@ +/** + * Benchmark: DataFrame.cov — pairwise covariance of numeric columns. 
+ * Outputs JSON: {"function": "cov", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { DataFrame } from "../../src/index.ts"; + +const SIZE = 10_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const df = new DataFrame({ + a: Array.from({ length: SIZE }, (_, i) => i * 1.1), + b: Array.from({ length: SIZE }, (_, i) => i * 0.7 + 0.3), + c: Array.from({ length: SIZE }, (_, i) => i * -0.5 + 100), +}); + +for (let i = 0; i < WARMUP; i++) { + df.cov(); +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const start = performance.now(); + df.cov(); + times.push(performance.now() - start); +} + +const totalMs = times.reduce((a, b) => a + b, 0); +const meanMs = totalMs / ITERATIONS; +console.log( + JSON.stringify({ + function: "cov", + mean_ms: Math.round(meanMs * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(totalMs * 1000) / 1000, + }), +); diff --git a/benchmarks/tsb/bench_crosstab.ts b/benchmarks/tsb/bench_crosstab.ts new file mode 100644 index 00000000..24b2fde7 --- /dev/null +++ b/benchmarks/tsb/bench_crosstab.ts @@ -0,0 +1,42 @@ +/** + * Benchmark: crosstab() — compute a cross-tabulation. 
+ * Outputs JSON: {"function": "crosstab", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series, crosstab } from "../../src/index.ts"; + +const SIZE = 50_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const choices_a = ["x", "y", "z"]; +const choices_b = ["p", "q", "r", "s"]; +let seed = 42; +function rand(): number { + seed = (seed * 1664525 + 1013904223) & 0x7fffffff; + return seed; +} + +const a = new Series({ data: Array.from({ length: SIZE }, () => choices_a[rand() % 3]) }); +const b = new Series({ data: Array.from({ length: SIZE }, () => choices_b[rand() % 4]) }); + +for (let i = 0; i < WARMUP; i++) { + crosstab(a, b); +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const start = performance.now(); + crosstab(a, b); + times.push(performance.now() - start); +} + +const totalMs = times.reduce((a, b) => a + b, 0); +const meanMs = totalMs / ITERATIONS; +console.log( + JSON.stringify({ + function: "crosstab", + mean_ms: Math.round(meanMs * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(totalMs * 1000) / 1000, + }), +); diff --git a/benchmarks/tsb/bench_cummax.ts b/benchmarks/tsb/bench_cummax.ts new file mode 100644 index 00000000..a537b210 --- /dev/null +++ b/benchmarks/tsb/bench_cummax.ts @@ -0,0 +1,12 @@ +import { Series } from "tsb"; + +const rng = (seed: number) => { let s = seed; return () => { s = (s * 1664525 + 1013904223) & 0xffffffff; return ((s >>> 0) / 0xffffffff) * 2 - 1; }; }; +const rand = rng(42); +const data = Array.from({ length: 100_000 }, () => rand() * 3); +const s = new Series(data); +for (let i = 0; i < 3; i++) s.cummax(); +const N = 100; +const t0 = performance.now(); +for (let i = 0; i < N; i++) s.cummax(); +const elapsed = performance.now() - t0; +console.log(JSON.stringify({ function: "cummax", mean_ms: elapsed / N, iterations: N, total_ms: elapsed })); diff --git a/benchmarks/tsb/bench_cummin.ts b/benchmarks/tsb/bench_cummin.ts new file mode 100644 index 
00000000..1b773565 --- /dev/null +++ b/benchmarks/tsb/bench_cummin.ts @@ -0,0 +1,12 @@ +import { Series } from "tsb"; + +const rng = (seed: number) => { let s = seed; return () => { s = (s * 1664525 + 1013904223) & 0xffffffff; return ((s >>> 0) / 0xffffffff) * 2 - 1; }; }; +const rand = rng(42); +const data = Array.from({ length: 100_000 }, () => rand() * 3); +const s = new Series(data); +for (let i = 0; i < 3; i++) s.cummin(); +const N = 100; +const t0 = performance.now(); +for (let i = 0; i < N; i++) s.cummin(); +const elapsed = performance.now() - t0; +console.log(JSON.stringify({ function: "cummin", mean_ms: elapsed / N, iterations: N, total_ms: elapsed })); diff --git a/benchmarks/tsb/bench_dataframe_abs.ts b/benchmarks/tsb/bench_dataframe_abs.ts new file mode 100644 index 00000000..209d3787 --- /dev/null +++ b/benchmarks/tsb/bench_dataframe_abs.ts @@ -0,0 +1,15 @@ +import { DataFrame } from "tsb"; +const N = 100_000; +const cols = 5; +const data: Record = {}; +for (let c = 0; c < cols; c++) { + data[`col${c}`] = Array.from({ length: N }, (_, i) => (i % 200) - 100); +} +const df = new DataFrame(data); +const WARMUP = 3; +const ITERS = 20; +for (let i = 0; i < WARMUP; i++) df.abs(); +const t0 = performance.now(); +for (let i = 0; i < ITERS; i++) df.abs(); +const total = performance.now() - t0; +console.log(JSON.stringify({ function: "dataframe_abs", mean_ms: total / ITERS, iterations: ITERS, total_ms: total })); diff --git a/benchmarks/tsb/bench_dataframe_apply_axis1.ts b/benchmarks/tsb/bench_dataframe_apply_axis1.ts new file mode 100644 index 00000000..513a715e --- /dev/null +++ b/benchmarks/tsb/bench_dataframe_apply_axis1.ts @@ -0,0 +1,31 @@ +/** + * Benchmark: DataFrame.apply with axis=1 (row-wise) on 10k-row DataFrame + */ +import { DataFrame } from "../../src/index.js"; + +const ROWS = 10_000; +const WARMUP = 2; +const ITERATIONS = 10; + +const a = Array.from({ length: ROWS }, (_, i) => i * 0.1); +const b = Array.from({ length: ROWS }, (_, i) => i * 0.2); 
+const df = DataFrame.fromColumns({ a, b }); + +for (let i = 0; i < WARMUP; i++) { + df.apply((s) => s.sum(), 1); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + df.apply((s) => s.sum(), 1); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "dataframe_apply_axis1", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_dataframe_apply_col.ts b/benchmarks/tsb/bench_dataframe_apply_col.ts new file mode 100644 index 00000000..1bcc7341 --- /dev/null +++ b/benchmarks/tsb/bench_dataframe_apply_col.ts @@ -0,0 +1,17 @@ +import { DataFrame } from "tsb"; + +const rng = (seed: number) => { let s = seed; return () => { s = (s * 1664525 + 1013904223) & 0xffffffff; return ((s >>> 0) / 0xffffffff) * 2 - 1; }; }; +const rand = rng(42); +const df = new DataFrame({ + A: Array.from({ length: 10_000 }, () => rand() * 3), + B: Array.from({ length: 10_000 }, () => rand() * 3), + C: Array.from({ length: 10_000 }, () => rand() * 3), + D: Array.from({ length: 10_000 }, () => rand() * 3), + E: Array.from({ length: 10_000 }, () => rand() * 3), +}); +for (let i = 0; i < 3; i++) df.apply((col: unknown) => { const c = col as number[]; return c.reduce((a, b) => a + b, 0) / c.length; }, { axis: 0 }); +const N = 100; +const t0 = performance.now(); +for (let i = 0; i < N; i++) df.apply((col: unknown) => { const c = col as number[]; return c.reduce((a, b) => a + b, 0) / c.length; }, { axis: 0 }); +const elapsed = performance.now() - t0; +console.log(JSON.stringify({ function: "dataframe_apply_col", mean_ms: elapsed / N, iterations: N, total_ms: elapsed })); diff --git a/benchmarks/tsb/bench_dataframe_apply_map.ts b/benchmarks/tsb/bench_dataframe_apply_map.ts new file mode 100644 index 00000000..4c6e5a33 --- /dev/null +++ b/benchmarks/tsb/bench_dataframe_apply_map.ts @@ -0,0 +1,24 @@ +/** + * Benchmark: dataFrameApplyMap on 10k-row DataFrame + */ +import { 
DataFrame, dataFrameApplyMap } from "../../src/index.js"; + +const ROWS = 10_000; +const WARMUP = 3; +const ITERATIONS = 10; +const a = Array.from({ length: ROWS }, (_, i) => i * 0.1); +const b = Array.from({ length: ROWS }, (_, i) => i * 0.2); +const df = DataFrame.fromColumns({ a, b }); + +for (let i = 0; i < WARMUP; i++) dataFrameApplyMap(df, (v) => (v as number) + 1); +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) dataFrameApplyMap(df, (v) => (v as number) + 1); +const total = performance.now() - start; +console.log( + JSON.stringify({ + function: "dataframe_apply_map", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_dataframe_assign.ts b/benchmarks/tsb/bench_dataframe_assign.ts new file mode 100644 index 00000000..b7d57daf --- /dev/null +++ b/benchmarks/tsb/bench_dataframe_assign.ts @@ -0,0 +1,25 @@ +/** + * Benchmark: DataFrame.assign({col: series}) on 100k-row DataFrame. + */ +import { DataFrame, Series } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 5; +const ITERATIONS = 30; + +const df = DataFrame.fromColumns({ + a: Array.from({ length: ROWS }, (_, i) => i * 1.0), + b: Array.from({ length: ROWS }, (_, i) => i * 2.0), +}); +const newCol = new Series({ data: Array.from({ length: ROWS }, (_, i) => i * 3.0), name: "c" }); + +for (let i = 0; i < WARMUP; i++) df.assign({ c: newCol }); + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + df.assign({ c: newCol }); + times.push(performance.now() - t0); +} +const total = times.reduce((a, b) => a + b, 0); +console.log(JSON.stringify({ function: "dataframe_assign", mean_ms: total / ITERATIONS, iterations: ITERATIONS, total_ms: total })); diff --git a/benchmarks/tsb/bench_dataframe_astype.ts b/benchmarks/tsb/bench_dataframe_astype.ts new file mode 100644 index 00000000..39a34529 --- /dev/null +++ b/benchmarks/tsb/bench_dataframe_astype.ts @@ -0,0 
+1,36 @@ +/** + * Benchmark: DataFrame.astype() — cast column dtypes. + * Outputs JSON: {"function": "dataframe_astype", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { DataFrame } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const df = new DataFrame({ + a: Array.from({ length: SIZE }, (_, i) => i * 1.0), + b: Array.from({ length: SIZE }, (_, i) => i), +}); + +for (let i = 0; i < WARMUP; i++) { + df.astype({ a: "float32", b: "int32" }); +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const start = performance.now(); + df.astype({ a: "float32", b: "int32" }); + times.push(performance.now() - start); +} + +const totalMs = times.reduce((a, b) => a + b, 0); +const meanMs = totalMs / ITERATIONS; +console.log( + JSON.stringify({ + function: "dataframe_astype", + mean_ms: Math.round(meanMs * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(totalMs * 1000) / 1000, + }), +); diff --git a/benchmarks/tsb/bench_dataframe_clip.ts b/benchmarks/tsb/bench_dataframe_clip.ts new file mode 100644 index 00000000..3aab06c3 --- /dev/null +++ b/benchmarks/tsb/bench_dataframe_clip.ts @@ -0,0 +1,16 @@ +import { dataFrameClip } from "tsb"; +import { DataFrame } from "tsb"; +const N = 100_000; +const cols = 5; +const data: Record = {}; +for (let c = 0; c < cols; c++) { + data[`col${c}`] = Array.from({ length: N }, (_, i) => (i % 200) - 100); +} +const df = new DataFrame(data); +const WARMUP = 3; +const ITERS = 20; +for (let i = 0; i < WARMUP; i++) dataFrameClip(df, { lower: -50, upper: 50 }); +const t0 = performance.now(); +for (let i = 0; i < ITERS; i++) dataFrameClip(df, { lower: -50, upper: 50 }); +const total = performance.now() - t0; +console.log(JSON.stringify({ function: "dataframe_clip", mean_ms: total / ITERS, iterations: ITERS, total_ms: total })); diff --git a/benchmarks/tsb/bench_dataframe_col_has.ts b/benchmarks/tsb/bench_dataframe_col_has.ts new file mode 100644 
index 00000000..0edb6bc0 --- /dev/null +++ b/benchmarks/tsb/bench_dataframe_col_has.ts @@ -0,0 +1,32 @@ +/** + * Benchmark: DataFrame.col(), .has(), .get() on a 100k-row DataFrame + */ +import { DataFrame } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; +const a = Array.from({ length: ROWS }, (_, i) => i); +const b = Array.from({ length: ROWS }, (_, i) => i * 2.0); +const df = new DataFrame({ columns: { a, b } }); + +for (let i = 0; i < WARMUP; i++) { + df.col("a"); + df.has("b"); + df.get("c"); +} +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + df.col("a"); + df.has("b"); + df.get("c"); +} +const total = performance.now() - start; +console.log( + JSON.stringify({ + function: "dataframe_col_has", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_dataframe_corr.ts b/benchmarks/tsb/bench_dataframe_corr.ts new file mode 100644 index 00000000..40e9cf4b --- /dev/null +++ b/benchmarks/tsb/bench_dataframe_corr.ts @@ -0,0 +1,35 @@ +/** + * Benchmark: DataFrame correlation matrix on 10k-row x 5-column DataFrame + */ +import { DataFrame, dataFrameCorr } from "../../src/index.js"; + +const ROWS = 10_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const df = new DataFrame({ + A: Float64Array.from({ length: ROWS }, (_, i) => Math.sin(i * 0.01)), + B: Float64Array.from({ length: ROWS }, (_, i) => Math.cos(i * 0.01)), + C: Float64Array.from({ length: ROWS }, (_, i) => Math.sin(i * 0.02)), + D: Float64Array.from({ length: ROWS }, (_, i) => Math.cos(i * 0.02)), + E: Float64Array.from({ length: ROWS }, (_, i) => Math.sin(i * 0.03)), +}); + +for (let i = 0; i < WARMUP; i++) { + dataFrameCorr(df); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + dataFrameCorr(df); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "dataframe_corr", + mean_ms: total / ITERATIONS, + 
iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_dataframe_count.ts b/benchmarks/tsb/bench_dataframe_count.ts new file mode 100644 index 00000000..8b0e6f58 --- /dev/null +++ b/benchmarks/tsb/bench_dataframe_count.ts @@ -0,0 +1,25 @@ +/** + * Benchmark: DataFrame.count() on 100k-row DataFrame with some NAs. + */ +import { DataFrame } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 5; +const ITERATIONS = 30; + +const df = DataFrame.fromColumns({ + a: Array.from({ length: ROWS }, (_, i) => i % 3 === 0 ? null : i * 1.0), + b: Array.from({ length: ROWS }, (_, i) => i % 5 === 0 ? null : i * 2.0), + c: Array.from({ length: ROWS }, (_, i) => i * 3.0), +}); + +for (let i = 0; i < WARMUP; i++) df.count(); + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + df.count(); + times.push(performance.now() - t0); +} +const total = times.reduce((a, b) => a + b, 0); +console.log(JSON.stringify({ function: "dataframe_count", mean_ms: total / ITERATIONS, iterations: ITERATIONS, total_ms: total })); diff --git a/benchmarks/tsb/bench_dataframe_cummax.ts b/benchmarks/tsb/bench_dataframe_cummax.ts new file mode 100644 index 00000000..955798e8 --- /dev/null +++ b/benchmarks/tsb/bench_dataframe_cummax.ts @@ -0,0 +1,16 @@ +import { dataFrameCummax } from "tsb"; +import { DataFrame } from "tsb"; +const N = 100_000; +const cols = 4; +const data: Record = {}; +for (let c = 0; c < cols; c++) { + data[`col${c}`] = Array.from({ length: N }, (_, i) => (i % 100) * 1.0); +} +const df = new DataFrame(data); +const WARMUP = 3; +const ITERS = 20; +for (let i = 0; i < WARMUP; i++) dataFrameCummax(df); +const t0 = performance.now(); +for (let i = 0; i < ITERS; i++) dataFrameCummax(df); +const total = performance.now() - t0; +console.log(JSON.stringify({ function: "dataframe_cummax", mean_ms: total / ITERS, iterations: ITERS, total_ms: total })); diff --git a/benchmarks/tsb/bench_dataframe_cummin.ts 
b/benchmarks/tsb/bench_dataframe_cummin.ts new file mode 100644 index 00000000..0fb82a0a --- /dev/null +++ b/benchmarks/tsb/bench_dataframe_cummin.ts @@ -0,0 +1,16 @@ +import { dataFrameCummin } from "tsb"; +import { DataFrame } from "tsb"; +const N = 100_000; +const cols = 4; +const data: Record = {}; +for (let c = 0; c < cols; c++) { + data[`col${c}`] = Array.from({ length: N }, (_, i) => (i % 100) * 1.0); +} +const df = new DataFrame(data); +const WARMUP = 3; +const ITERS = 20; +for (let i = 0; i < WARMUP; i++) dataFrameCummin(df); +const t0 = performance.now(); +for (let i = 0; i < ITERS; i++) dataFrameCummin(df); +const total = performance.now() - t0; +console.log(JSON.stringify({ function: "dataframe_cummin", mean_ms: total / ITERS, iterations: ITERS, total_ms: total })); diff --git a/benchmarks/tsb/bench_dataframe_cumprod.ts b/benchmarks/tsb/bench_dataframe_cumprod.ts new file mode 100644 index 00000000..9880cdcd --- /dev/null +++ b/benchmarks/tsb/bench_dataframe_cumprod.ts @@ -0,0 +1,16 @@ +import { dataFrameCumprod } from "tsb"; +import { DataFrame } from "tsb"; +const N = 10_000; +const cols = 4; +const data: Record = {}; +for (let c = 0; c < cols; c++) { + data[`col${c}`] = Array.from({ length: N }, (_, i) => (i % 5) + 1); +} +const df = new DataFrame(data); +const WARMUP = 3; +const ITERS = 20; +for (let i = 0; i < WARMUP; i++) dataFrameCumprod(df); +const t0 = performance.now(); +for (let i = 0; i < ITERS; i++) dataFrameCumprod(df); +const total = performance.now() - t0; +console.log(JSON.stringify({ function: "dataframe_cumprod", mean_ms: total / ITERS, iterations: ITERS, total_ms: total })); diff --git a/benchmarks/tsb/bench_dataframe_cumsum.ts b/benchmarks/tsb/bench_dataframe_cumsum.ts new file mode 100644 index 00000000..17e5393e --- /dev/null +++ b/benchmarks/tsb/bench_dataframe_cumsum.ts @@ -0,0 +1,16 @@ +import { dataFrameCumsum } from "tsb"; +import { DataFrame } from "tsb"; +const N = 100_000; +const cols = 4; +const data: Record = {}; +for 
(let c = 0; c < cols; c++) { + data[`col${c}`] = Array.from({ length: N }, (_, i) => (i % 10) + 1); +} +const df = new DataFrame(data); +const WARMUP = 3; +const ITERS = 20; +for (let i = 0; i < WARMUP; i++) dataFrameCumsum(df); +const t0 = performance.now(); +for (let i = 0; i < ITERS; i++) dataFrameCumsum(df); +const total = performance.now() - t0; +console.log(JSON.stringify({ function: "dataframe_cumsum", mean_ms: total / ITERS, iterations: ITERS, total_ms: total })); diff --git a/benchmarks/tsb/bench_dataframe_describe.ts b/benchmarks/tsb/bench_dataframe_describe.ts new file mode 100644 index 00000000..c6be9a03 --- /dev/null +++ b/benchmarks/tsb/bench_dataframe_describe.ts @@ -0,0 +1,26 @@ +/** + * Benchmark: DataFrame.describe() on 100k-row DataFrame (separate from describe.ts function). + * Uses df.describe() method directly. + */ +import { DataFrame } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 20; + +const df = DataFrame.fromColumns({ + a: Array.from({ length: ROWS }, (_, i) => (i * 1.23) % 9000), + b: Array.from({ length: ROWS }, (_, i) => (i * 4.56) % 7000), + c: Array.from({ length: ROWS }, (_, i) => i * 0.5), +}); + +for (let i = 0; i < WARMUP; i++) df.describe(); + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + df.describe(); + times.push(performance.now() - t0); +} +const total = times.reduce((a, b) => a + b, 0); +console.log(JSON.stringify({ function: "dataframe_describe", mean_ms: total / ITERATIONS, iterations: ITERATIONS, total_ms: total })); diff --git a/benchmarks/tsb/bench_dataframe_drop.ts b/benchmarks/tsb/bench_dataframe_drop.ts new file mode 100644 index 00000000..e4f5a734 --- /dev/null +++ b/benchmarks/tsb/bench_dataframe_drop.ts @@ -0,0 +1,26 @@ +/** + * Benchmark: DataFrame.drop(names[]) on 100k-row DataFrame. 
+ */ +import { DataFrame } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const df = DataFrame.fromColumns({ + a: Array.from({ length: ROWS }, (_, i) => i * 1.0), + b: Array.from({ length: ROWS }, (_, i) => i * 2.0), + c: Array.from({ length: ROWS }, (_, i) => i * 3.0), + d: Array.from({ length: ROWS }, (_, i) => i * 4.0), +}); + +for (let i = 0; i < WARMUP; i++) df.drop(["b", "d"]); + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + df.drop(["b", "d"]); + times.push(performance.now() - t0); +} +const total = times.reduce((a, b) => a + b, 0); +console.log(JSON.stringify({ function: "dataframe_drop", mean_ms: total / ITERATIONS, iterations: ITERATIONS, total_ms: total })); diff --git a/benchmarks/tsb/bench_dataframe_ewm.ts b/benchmarks/tsb/bench_dataframe_ewm.ts new file mode 100644 index 00000000..d6b872d8 --- /dev/null +++ b/benchmarks/tsb/bench_dataframe_ewm.ts @@ -0,0 +1,24 @@ +/** + * Benchmark: DataFrameEwm mean on 10k-row DataFrame + */ +import { DataFrame } from "../../src/index.js"; + +const ROWS = 10_000; +const WARMUP = 3; +const ITERATIONS = 10; +const a = Array.from({ length: ROWS }, (_, i) => i * 0.1); +const b = Array.from({ length: ROWS }, (_, i) => i * 0.2); +const df = DataFrame.fromColumns({ a, b }); + +for (let i = 0; i < WARMUP; i++) df.ewm({ alpha: 0.3 }).mean(); +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) df.ewm({ alpha: 0.3 }).mean(); +const total = performance.now() - start; +console.log( + JSON.stringify({ + function: "dataframe_ewm", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_dataframe_ewm_std_var.ts b/benchmarks/tsb/bench_dataframe_ewm_std_var.ts new file mode 100644 index 00000000..1f98d2ae --- /dev/null +++ b/benchmarks/tsb/bench_dataframe_ewm_std_var.ts @@ -0,0 +1,33 @@ +/** + * Benchmark: DataFrameEwm std and var on 100k-row 
DataFrame + */ +import { DataFrame } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const a = Array.from({ length: ROWS }, (_, i) => Math.sin(i * 0.05)); +const b = Array.from({ length: ROWS }, (_, i) => Math.cos(i * 0.05)); +const df = DataFrame.fromColumns({ a, b }); + +for (let i = 0; i < WARMUP; i++) { + df.ewm({ span: 20 }).std(); + df.ewm({ span: 20 }).var(); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + df.ewm({ span: 20 }).std(); + df.ewm({ span: 20 }).var(); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "dataframe_ewm_std_var", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_dataframe_expanding.ts b/benchmarks/tsb/bench_dataframe_expanding.ts new file mode 100644 index 00000000..33acdb3c --- /dev/null +++ b/benchmarks/tsb/bench_dataframe_expanding.ts @@ -0,0 +1,24 @@ +/** + * Benchmark: DataFrameExpanding mean on 10k-row DataFrame + */ +import { DataFrame } from "../../src/index.js"; + +const ROWS = 10_000; +const WARMUP = 3; +const ITERATIONS = 10; +const a = Array.from({ length: ROWS }, (_, i) => i * 0.1); +const b = Array.from({ length: ROWS }, (_, i) => i * 0.2); +const df = DataFrame.fromColumns({ a, b }); + +for (let i = 0; i < WARMUP; i++) df.expanding().mean(); +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) df.expanding().mean(); +const total = performance.now() - start; +console.log( + JSON.stringify({ + function: "dataframe_expanding", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_dataframe_expanding_min_max.ts b/benchmarks/tsb/bench_dataframe_expanding_min_max.ts new file mode 100644 index 00000000..edc2bbb1 --- /dev/null +++ b/benchmarks/tsb/bench_dataframe_expanding_min_max.ts @@ -0,0 +1,33 @@ +/** + * Benchmark: DataFrameExpanding min and max on 
100k-row DataFrame + */ +import { DataFrame } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const a = Array.from({ length: ROWS }, (_, i) => Math.sin(i * 0.01)); +const b = Array.from({ length: ROWS }, (_, i) => Math.cos(i * 0.01)); +const df = DataFrame.fromColumns({ a, b }); + +for (let i = 0; i < WARMUP; i++) { + df.expanding().min(); + df.expanding().max(); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + df.expanding().min(); + df.expanding().max(); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "dataframe_expanding_min_max", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_dataframe_fillna.ts b/benchmarks/tsb/bench_dataframe_fillna.ts new file mode 100644 index 00000000..d470a527 --- /dev/null +++ b/benchmarks/tsb/bench_dataframe_fillna.ts @@ -0,0 +1,24 @@ +/** + * Benchmark: DataFrame.fillna(value) on 100k-row DataFrame with NAs. + */ +import { DataFrame } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 5; +const ITERATIONS = 30; + +const df = DataFrame.fromColumns({ + a: Array.from({ length: ROWS }, (_, i) => i % 4 === 0 ? null : i * 1.0), + b: Array.from({ length: ROWS }, (_, i) => i % 6 === 0 ? 
null : i * 2.0), +}); + +for (let i = 0; i < WARMUP; i++) df.fillna(0); + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + df.fillna(0); + times.push(performance.now() - t0); +} +const total = times.reduce((a, b) => a + b, 0); +console.log(JSON.stringify({ function: "dataframe_fillna", mean_ms: total / ITERATIONS, iterations: ITERATIONS, total_ms: total })); diff --git a/benchmarks/tsb/bench_dataframe_from2d_select.ts b/benchmarks/tsb/bench_dataframe_from2d_select.ts new file mode 100644 index 00000000..fc65c67d --- /dev/null +++ b/benchmarks/tsb/bench_dataframe_from2d_select.ts @@ -0,0 +1,33 @@ +/** + * Benchmark: DataFrame.from2D and DataFrame.select + */ +import { DataFrame } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const data2D = Array.from({ length: ROWS }, (_, i) => [i * 1.0, i * 2.0, i * 3.0]); +const cols = ["a", "b", "c"]; +let df = DataFrame.from2D(data2D, cols); + +for (let i = 0; i < WARMUP; i++) { + DataFrame.from2D(data2D, cols); + df.select(["a", "c"]); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + DataFrame.from2D(data2D, cols); + df.select(["a", "c"]); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "dataframe_from2d_select", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_dataframe_from_pairs.ts b/benchmarks/tsb/bench_dataframe_from_pairs.ts new file mode 100644 index 00000000..544bf2a7 --- /dev/null +++ b/benchmarks/tsb/bench_dataframe_from_pairs.ts @@ -0,0 +1,24 @@ +/** + * Benchmark: DataFrame.fromColumns with object of arrays (100k rows) + */ +import { DataFrame } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; +const a = Array.from({ length: ROWS }, (_, i) => i); +const b = Array.from({ length: ROWS }, (_, i) => i * 2.5); +const c = 
Array.from({ length: ROWS }, (_, i) => `str_${i % 1000}`); + +for (let i = 0; i < WARMUP; i++) DataFrame.fromColumns({ a, b, c }); +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) DataFrame.fromColumns({ a, b, c }); +const total = performance.now() - start; +console.log( + JSON.stringify({ + function: "dataframe_from_pairs", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_dataframe_fromrecords.ts b/benchmarks/tsb/bench_dataframe_fromrecords.ts new file mode 100644 index 00000000..cc78662b --- /dev/null +++ b/benchmarks/tsb/bench_dataframe_fromrecords.ts @@ -0,0 +1,35 @@ +/** + * Benchmark: dataframe_fromrecords — DataFrame.fromRecords(records) on 10k records with 5 columns + */ +import { DataFrame } from "../../src/index.js"; + +const ROWS = 10_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const records = Array.from({ length: ROWS }, (_, i) => ({ + a: i, + b: i * 2.0, + c: i % 100, + d: i * 0.5, + e: i % 10, +})); + +for (let i = 0; i < WARMUP; i++) { + DataFrame.fromRecords(records); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + DataFrame.fromRecords(records); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "dataframe_fromrecords", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_dataframe_head_tail.ts b/benchmarks/tsb/bench_dataframe_head_tail.ts new file mode 100644 index 00000000..b903c6ab --- /dev/null +++ b/benchmarks/tsb/bench_dataframe_head_tail.ts @@ -0,0 +1,39 @@ +/** + * Benchmark: DataFrame.head() and .tail() — slice first/last N rows. 
+ * Outputs JSON: {"function": "dataframe_head_tail", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { DataFrame } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const df = new DataFrame({ + a: Array.from({ length: SIZE }, (_, i) => i * 1.0), + b: Array.from({ length: SIZE }, (_, i) => i * 2), + c: Array.from({ length: SIZE }, (_, i) => String(i)), +}); + +for (let i = 0; i < WARMUP; i++) { + df.head(100); + df.tail(100); +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const start = performance.now(); + df.head(100); + df.tail(100); + times.push(performance.now() - start); +} + +const totalMs = times.reduce((a, b) => a + b, 0); +const meanMs = totalMs / ITERATIONS; +console.log( + JSON.stringify({ + function: "dataframe_head_tail", + mean_ms: Math.round(meanMs * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(totalMs * 1000) / 1000, + }), +); diff --git a/benchmarks/tsb/bench_dataframe_iloc.ts b/benchmarks/tsb/bench_dataframe_iloc.ts new file mode 100644 index 00000000..3315122c --- /dev/null +++ b/benchmarks/tsb/bench_dataframe_iloc.ts @@ -0,0 +1,26 @@ +/** + * Benchmark: DataFrame.iloc(positions[]) on 100k-row DataFrame. 
+ */ +import { DataFrame } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 5; +const ITERATIONS = 30; + +const df = DataFrame.fromColumns({ + a: Array.from({ length: ROWS }, (_, i) => i * 1.0), + b: Array.from({ length: ROWS }, (_, i) => i * 2.0), + c: Array.from({ length: ROWS }, (_, i) => i * 3.0), +}); +const positions = Array.from({ length: 1000 }, (_, i) => i * 100); + +for (let i = 0; i < WARMUP; i++) df.iloc(positions); + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + df.iloc(positions); + times.push(performance.now() - t0); +} +const total = times.reduce((a, b) => a + b, 0); +console.log(JSON.stringify({ function: "dataframe_iloc", mean_ms: total / ITERATIONS, iterations: ITERATIONS, total_ms: total })); diff --git a/benchmarks/tsb/bench_dataframe_isna.ts b/benchmarks/tsb/bench_dataframe_isna.ts new file mode 100644 index 00000000..dc0bfa60 --- /dev/null +++ b/benchmarks/tsb/bench_dataframe_isna.ts @@ -0,0 +1,24 @@ +/** + * Benchmark: DataFrame.isna() on 100k-row DataFrame with some NAs. + */ +import { DataFrame } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 5; +const ITERATIONS = 30; + +const df = DataFrame.fromColumns({ + a: Array.from({ length: ROWS }, (_, i) => i % 5 === 0 ? null : i * 1.0), + b: Array.from({ length: ROWS }, (_, i) => i % 7 === 0 ? 
null : i * 2.0), +}); + +for (let i = 0; i < WARMUP; i++) df.isna(); + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + df.isna(); + times.push(performance.now() - t0); +} +const total = times.reduce((a, b) => a + b, 0); +console.log(JSON.stringify({ function: "dataframe_isna", mean_ms: total / ITERATIONS, iterations: ITERATIONS, total_ms: total })); diff --git a/benchmarks/tsb/bench_dataframe_loc.ts b/benchmarks/tsb/bench_dataframe_loc.ts new file mode 100644 index 00000000..e0eddd64 --- /dev/null +++ b/benchmarks/tsb/bench_dataframe_loc.ts @@ -0,0 +1,26 @@ +/** + * Benchmark: DataFrame.loc(labels[]) on 100k-row DataFrame. + */ +import { DataFrame, Index } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 5; +const ITERATIONS = 30; + +const rowLabels = Array.from({ length: ROWS }, (_, i) => i); +const df = DataFrame.fromColumns( + { a: Array.from({ length: ROWS }, (_, i) => i * 1.0), b: Array.from({ length: ROWS }, (_, i) => i * 2.0) }, + { index: new Index(rowLabels) }, +); +const selectLabels = Array.from({ length: 1000 }, (_, i) => i * 100); + +for (let i = 0; i < WARMUP; i++) df.loc(selectLabels); + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + df.loc(selectLabels); + times.push(performance.now() - t0); +} +const total = times.reduce((a, b) => a + b, 0); +console.log(JSON.stringify({ function: "dataframe_loc", mean_ms: total / ITERATIONS, iterations: ITERATIONS, total_ms: total })); diff --git a/benchmarks/tsb/bench_dataframe_mask.ts b/benchmarks/tsb/bench_dataframe_mask.ts new file mode 100644 index 00000000..dfad6de9 --- /dev/null +++ b/benchmarks/tsb/bench_dataframe_mask.ts @@ -0,0 +1,17 @@ +import { dataFrameMask } from "tsb"; +import { DataFrame } from "tsb"; +const N = 100_000; +const cols = 4; +const data: Record<string, number[]> = {}; +for (let c = 0; c < cols; c++) { + data[`col${c}`] = Array.from({ length: N }, (_, i) => (i % 200) - 100); 
+} +const df = new DataFrame(data); +const mask = Array.from({ length: N }, (_, i) => i % 3 === 0); +const WARMUP = 3; +const ITERS = 20; +for (let i = 0; i < WARMUP; i++) dataFrameMask(df, mask, { other: 0 }); +const t0 = performance.now(); +for (let i = 0; i < ITERS; i++) dataFrameMask(df, mask, { other: 0 }); +const total = performance.now() - t0; +console.log(JSON.stringify({ function: "dataframe_mask", mean_ms: total / ITERS, iterations: ITERS, total_ms: total })); diff --git a/benchmarks/tsb/bench_dataframe_min_max.ts b/benchmarks/tsb/bench_dataframe_min_max.ts new file mode 100644 index 00000000..23dba9c4 --- /dev/null +++ b/benchmarks/tsb/bench_dataframe_min_max.ts @@ -0,0 +1,26 @@ +/** + * Benchmark: DataFrame.min() and DataFrame.max() on 100k-row DataFrame. + */ +import { DataFrame } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 5; +const ITERATIONS = 30; + +const df = DataFrame.fromColumns({ + a: Array.from({ length: ROWS }, (_, i) => (i * 3.14) % 5000), + b: Array.from({ length: ROWS }, (_, i) => (i * 2.71) % 8000), + c: Array.from({ length: ROWS }, (_, i) => i * 1.0), +}); + +for (let i = 0; i < WARMUP; i++) { df.min(); df.max(); } + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + df.min(); + df.max(); + times.push(performance.now() - t0); +} +const total = times.reduce((a, b) => a + b, 0); +console.log(JSON.stringify({ function: "dataframe_min_max", mean_ms: total / ITERATIONS, iterations: ITERATIONS, total_ms: total })); diff --git a/benchmarks/tsb/bench_dataframe_nlargest_nsmallest.ts b/benchmarks/tsb/bench_dataframe_nlargest_nsmallest.ts new file mode 100644 index 00000000..d959fe46 --- /dev/null +++ b/benchmarks/tsb/bench_dataframe_nlargest_nsmallest.ts @@ -0,0 +1,21 @@ +import { nlargestDataFrame, nsmallestDataFrame } from "tsb"; +import { DataFrame } from "tsb"; +const N = 100_000; +const df = new DataFrame({ + a: Array.from({ length: N }, (_, i) => (i * 1337) % 
100_007), + b: Array.from({ length: N }, (_, i) => (i * 7919) % 100_003), + c: Array.from({ length: N }, (_, i) => (i * 3571) % 99_991), +}); +const WARMUP = 3; +const ITERS = 20; +for (let i = 0; i < WARMUP; i++) { + nlargestDataFrame(df, 100, "a"); + nsmallestDataFrame(df, 100, "a"); +} +const t0 = performance.now(); +for (let i = 0; i < ITERS; i++) { + nlargestDataFrame(df, 100, "a"); + nsmallestDataFrame(df, 100, "a"); +} +const total = performance.now() - t0; +console.log(JSON.stringify({ function: "dataframe_nlargest_nsmallest", mean_ms: total / ITERS, iterations: ITERS, total_ms: total })); diff --git a/benchmarks/tsb/bench_dataframe_notna.ts b/benchmarks/tsb/bench_dataframe_notna.ts new file mode 100644 index 00000000..59f3d1bb --- /dev/null +++ b/benchmarks/tsb/bench_dataframe_notna.ts @@ -0,0 +1,24 @@ +/** + * Benchmark: DataFrame.notna() on 100k-row DataFrame with some NAs. + */ +import { DataFrame } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 5; +const ITERATIONS = 30; + +const df = DataFrame.fromColumns({ + a: Array.from({ length: ROWS }, (_, i) => i % 5 === 0 ? 
null : i * 1.0), + b: Array.from({ length: ROWS }, (_, i) => i * 2.0), +}); + +for (let i = 0; i < WARMUP; i++) df.notna(); + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + df.notna(); + times.push(performance.now() - t0); +} +const total = times.reduce((a, b) => a + b, 0); +console.log(JSON.stringify({ function: "dataframe_notna", mean_ms: total / ITERATIONS, iterations: ITERATIONS, total_ms: total })); diff --git a/benchmarks/tsb/bench_dataframe_rank.ts b/benchmarks/tsb/bench_dataframe_rank.ts new file mode 100644 index 00000000..e9596ae3 --- /dev/null +++ b/benchmarks/tsb/bench_dataframe_rank.ts @@ -0,0 +1,31 @@ +/** + * Benchmark: rankDataFrame on a 10k-row DataFrame + */ +import { DataFrame, rankDataFrame } from "../../src/index.js"; + +const ROWS = 10_000; +const WARMUP = 3; +const ITERATIONS = 20; + +const a = Array.from({ length: ROWS }, (_, i) => Math.sin(i * 0.1)); +const b = Array.from({ length: ROWS }, (_, i) => Math.cos(i * 0.1)); +const df = DataFrame.fromColumns({ a, b }); + +for (let i = 0; i < WARMUP; i++) { + rankDataFrame(df); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + rankDataFrame(df); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "dataframe_rank", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_dataframe_resetindex.ts b/benchmarks/tsb/bench_dataframe_resetindex.ts new file mode 100644 index 00000000..c3ab7544 --- /dev/null +++ b/benchmarks/tsb/bench_dataframe_resetindex.ts @@ -0,0 +1,25 @@ +/** + * Benchmark: DataFrame.resetIndex() on 100k-row DataFrame. 
+ */ +import { DataFrame, Index } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 5; +const ITERATIONS = 30; + +const rowLabels = Array.from({ length: ROWS }, (_, i) => ROWS - i - 1); +const df = DataFrame.fromColumns( + { a: Array.from({ length: ROWS }, (_, i) => i * 1.0), b: Array.from({ length: ROWS }, (_, i) => i * 2.0) }, + { index: new Index(rowLabels) }, +); + +for (let i = 0; i < WARMUP; i++) df.resetIndex(); + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + df.resetIndex(); + times.push(performance.now() - t0); +} +const total = times.reduce((a, b) => a + b, 0); +console.log(JSON.stringify({ function: "dataframe_resetindex", mean_ms: total / ITERATIONS, iterations: ITERATIONS, total_ms: total })); diff --git a/benchmarks/tsb/bench_dataframe_rolling.ts b/benchmarks/tsb/bench_dataframe_rolling.ts new file mode 100644 index 00000000..14576117 --- /dev/null +++ b/benchmarks/tsb/bench_dataframe_rolling.ts @@ -0,0 +1,24 @@ +/** + * Benchmark: DataFrameRolling mean on 100k-row DataFrame + */ +import { DataFrame } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; +const a = Array.from({ length: ROWS }, (_, i) => i * 0.1); +const b = Array.from({ length: ROWS }, (_, i) => i * 0.2); +const df = DataFrame.fromColumns({ a, b }); + +for (let i = 0; i < WARMUP; i++) df.rolling(10).mean(); +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) df.rolling(10).mean(); +const total = performance.now() - start; +console.log( + JSON.stringify({ + function: "dataframe_rolling", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_dataframe_rolling_agg.ts b/benchmarks/tsb/bench_dataframe_rolling_agg.ts new file mode 100644 index 00000000..a32f5f85 --- /dev/null +++ b/benchmarks/tsb/bench_dataframe_rolling_agg.ts @@ -0,0 +1,35 @@ +/** + * Benchmark: dataFrameRollingAgg on a 
100k-row DataFrame + */ +import { DataFrame, Series, dataFrameRollingAgg } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const a = new Series(Float64Array.from({ length: ROWS }, (_, i) => Math.sin(i * 0.01))); +const b = new Series(Float64Array.from({ length: ROWS }, (_, i) => Math.cos(i * 0.01))); +const df = new DataFrame({ a, b }); +const fns = { + mean: (v: readonly number[]) => v.reduce((x, y) => x + y, 0) / v.length, + sum: (v: readonly number[]) => v.reduce((x, y) => x + y, 0), +}; + +for (let i = 0; i < WARMUP; i++) { + dataFrameRollingAgg(df, 10, fns); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + dataFrameRollingAgg(df, 10, fns); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "dataframe_rolling_agg", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_dataframe_rolling_apply.ts b/benchmarks/tsb/bench_dataframe_rolling_apply.ts new file mode 100644 index 00000000..d786d662 --- /dev/null +++ b/benchmarks/tsb/bench_dataframe_rolling_apply.ts @@ -0,0 +1,33 @@ +/** + * Benchmark: DataFrameRolling apply with custom function on 10k-row DataFrame + */ +import { DataFrame } from "../../src/index.js"; + +const ROWS = 10_000; +const WARMUP = 2; +const ITERATIONS = 5; + +const a = Array.from({ length: ROWS }, (_, i) => Math.sin(i * 0.01)); +const b = Array.from({ length: ROWS }, (_, i) => Math.cos(i * 0.01)); +const df = DataFrame.fromColumns({ a, b }); + +const sumFn = (vals: readonly number[]) => vals.reduce((acc, v) => acc + v, 0); + +for (let i = 0; i < WARMUP; i++) { + df.rolling(10).apply(sumFn); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + df.rolling(10).apply(sumFn); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "dataframe_rolling_apply", + mean_ms: total / ITERATIONS, + iterations: 
ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_dataframe_round.ts b/benchmarks/tsb/bench_dataframe_round.ts new file mode 100644 index 00000000..f2d57dc5 --- /dev/null +++ b/benchmarks/tsb/bench_dataframe_round.ts @@ -0,0 +1,15 @@ +import { DataFrame } from "tsb"; +const N = 100_000; +const cols = 5; +const data: Record<string, number[]> = {}; +for (let c = 0; c < cols; c++) { + data[`col${c}`] = Array.from({ length: N }, (_, i) => (i % 100) * 1.5); +} +const df = new DataFrame(data); +const WARMUP = 3; +const ITERS = 20; +for (let i = 0; i < WARMUP; i++) df.round(2); +const t0 = performance.now(); +for (let i = 0; i < ITERS; i++) df.round(2); +const total = performance.now() - t0; +console.log(JSON.stringify({ function: "dataframe_round", mean_ms: total / ITERS, iterations: ITERS, total_ms: total })); diff --git a/benchmarks/tsb/bench_dataframe_select.ts b/benchmarks/tsb/bench_dataframe_select.ts new file mode 100644 index 00000000..800e926e --- /dev/null +++ b/benchmarks/tsb/bench_dataframe_select.ts @@ -0,0 +1,26 @@ +/** + * Benchmark: DataFrame.select(names[]) on 100k-row DataFrame. 
+ */ +import { DataFrame } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const df = DataFrame.fromColumns({ + a: Array.from({ length: ROWS }, (_, i) => i * 1.0), + b: Array.from({ length: ROWS }, (_, i) => i * 2.0), + c: Array.from({ length: ROWS }, (_, i) => i * 3.0), + d: Array.from({ length: ROWS }, (_, i) => i * 4.0), +}); + +for (let i = 0; i < WARMUP; i++) df.select(["a", "c"]); + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + df.select(["a", "c"]); + times.push(performance.now() - t0); +} +const total = times.reduce((a, b) => a + b, 0); +console.log(JSON.stringify({ function: "dataframe_select", mean_ms: total / ITERATIONS, iterations: ITERATIONS, total_ms: total })); diff --git a/benchmarks/tsb/bench_dataframe_sem_var.ts b/benchmarks/tsb/bench_dataframe_sem_var.ts new file mode 100644 index 00000000..3b92da04 --- /dev/null +++ b/benchmarks/tsb/bench_dataframe_sem_var.ts @@ -0,0 +1,30 @@ +/** + * Benchmark: varDataFrame / semDataFrame — variance and SEM on a 10k×10 DataFrame. 
+ * Outputs JSON: {"function": "dataframe_sem_var", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { DataFrame, varDataFrame, semDataFrame } from "../../src/index.ts"; + +const ROWS = 10_000; +const COLS = 10; +const WARMUP = 5; +const ITERATIONS = 20; + +const columns: Record<string, number[]> = {}; +for (let c = 0; c < COLS; c++) { + columns[`col${c}`] = Array.from({ length: ROWS }, (_, i) => Math.sin((i + c) * 0.01) * 100); +} +const df = new DataFrame(columns); + +for (let i = 0; i < WARMUP; i++) { + varDataFrame(df); + semDataFrame(df); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + varDataFrame(df); + semDataFrame(df); +} +const total = performance.now() - start; + +console.log(JSON.stringify({ function: "dataframe_sem_var", mean_ms: total / ITERATIONS, iterations: ITERATIONS, total_ms: total })); diff --git a/benchmarks/tsb/bench_dataframe_set_index.ts b/benchmarks/tsb/bench_dataframe_set_index.ts new file mode 100644 index 00000000..76731861 --- /dev/null +++ b/benchmarks/tsb/bench_dataframe_set_index.ts @@ -0,0 +1,25 @@ +/** + * Benchmark: DataFrame.setIndex(col) on 100k-row DataFrame. 
+ */ +import { DataFrame } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 20; + +const df = DataFrame.fromColumns({ + id: Array.from({ length: ROWS }, (_, i) => i), + a: Array.from({ length: ROWS }, (_, i) => i * 1.5), + b: Array.from({ length: ROWS }, (_, i) => i * 2.5), +}); + +for (let i = 0; i < WARMUP; i++) df.setIndex("id"); + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + df.setIndex("id"); + times.push(performance.now() - t0); +} +const total = times.reduce((a, b) => a + b, 0); +console.log(JSON.stringify({ function: "dataframe_set_index", mean_ms: total / ITERATIONS, iterations: ITERATIONS, total_ms: total })); diff --git a/benchmarks/tsb/bench_dataframe_setindex.ts b/benchmarks/tsb/bench_dataframe_setindex.ts new file mode 100644 index 00000000..55455ec6 --- /dev/null +++ b/benchmarks/tsb/bench_dataframe_setindex.ts @@ -0,0 +1,33 @@ +/** + * Benchmark: dataframe_setindex — DataFrame.setIndex(col) on a 10k-row DataFrame + */ +import { DataFrame } from "../../src/index.js"; + +const ROWS = 10_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const df = DataFrame.fromColumns({ + id: Array.from({ length: ROWS }, (_, i) => i), + a: Array.from({ length: ROWS }, (_, i) => i * 2.0), + b: Array.from({ length: ROWS }, (_, i) => i % 100), +}); + +for (let i = 0; i < WARMUP; i++) { + df.setIndex("id"); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + df.setIndex("id"); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "dataframe_setindex", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_dataframe_skew_kurt.ts b/benchmarks/tsb/bench_dataframe_skew_kurt.ts new file mode 100644 index 00000000..00452b7c --- /dev/null +++ b/benchmarks/tsb/bench_dataframe_skew_kurt.ts @@ -0,0 +1,30 @@ +/** + * Benchmark: skewDataFrame / 
kurtDataFrame — skewness and kurtosis on a 10k×10 DataFrame. + * Outputs JSON: {"function": "dataframe_skew_kurt", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { DataFrame, skewDataFrame, kurtDataFrame } from "../../src/index.ts"; + +const ROWS = 10_000; +const COLS = 10; +const WARMUP = 5; +const ITERATIONS = 20; + +const columns: Record<string, number[]> = {}; +for (let c = 0; c < COLS; c++) { + columns[`col${c}`] = Array.from({ length: ROWS }, (_, i) => Math.sin((i + c) * 0.01) * 100); +} +const df = new DataFrame(columns); + +for (let i = 0; i < WARMUP; i++) { + skewDataFrame(df); + kurtDataFrame(df); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + skewDataFrame(df); + kurtDataFrame(df); +} +const total = performance.now() - start; + +console.log(JSON.stringify({ function: "dataframe_skew_kurt", mean_ms: total / ITERATIONS, iterations: ITERATIONS, total_ms: total })); diff --git a/benchmarks/tsb/bench_dataframe_sort_index.ts b/benchmarks/tsb/bench_dataframe_sort_index.ts new file mode 100644 index 00000000..e4deb92b --- /dev/null +++ b/benchmarks/tsb/bench_dataframe_sort_index.ts @@ -0,0 +1,29 @@ +/** + * Benchmark: DataFrame.sortIndex() on 100k-row DataFrame with shuffled index. 
+ */ +import { DataFrame, Index } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const shuffled = Array.from({ length: ROWS }, (_, i) => ROWS - i - 1); +const idx = new Index(shuffled); +const df = DataFrame.fromColumns( + { + a: Array.from({ length: ROWS }, (_, i) => i * 1.1), + b: Array.from({ length: ROWS }, (_, i) => i * 2.2), + }, + { index: idx }, +); + +for (let i = 0; i < WARMUP; i++) df.sortIndex(); + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + df.sortIndex(); + times.push(performance.now() - t0); +} +const total = times.reduce((a, b) => a + b, 0); +console.log(JSON.stringify({ function: "dataframe_sort_index", mean_ms: total / ITERATIONS, iterations: ITERATIONS, total_ms: total })); diff --git a/benchmarks/tsb/bench_dataframe_std_var.ts b/benchmarks/tsb/bench_dataframe_std_var.ts new file mode 100644 index 00000000..2f326dc0 --- /dev/null +++ b/benchmarks/tsb/bench_dataframe_std_var.ts @@ -0,0 +1,25 @@ +/** + * Benchmark: DataFrame.std() and DataFrame.var() on 100k-row DataFrame. 
+ */ +import { DataFrame } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 5; +const ITERATIONS = 20; + +const df = DataFrame.fromColumns({ + a: Array.from({ length: ROWS }, (_, i) => (i * 1.23) % 9000), + b: Array.from({ length: ROWS }, (_, i) => (i * 4.56) % 7000), +}); + +for (let i = 0; i < WARMUP; i++) { df.std(); df.var(); } + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + df.std(); + df.var(); + times.push(performance.now() - t0); +} +const total = times.reduce((a, b) => a + b, 0); +console.log(JSON.stringify({ function: "dataframe_std_var", mean_ms: total / ITERATIONS, iterations: ITERATIONS, total_ms: total })); diff --git a/benchmarks/tsb/bench_dataframe_sum_mean.ts b/benchmarks/tsb/bench_dataframe_sum_mean.ts new file mode 100644 index 00000000..3408d16c --- /dev/null +++ b/benchmarks/tsb/bench_dataframe_sum_mean.ts @@ -0,0 +1,26 @@ +/** + * Benchmark: DataFrame.sum() and DataFrame.mean() on 100k-row DataFrame. 
+ */ +import { DataFrame } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 5; +const ITERATIONS = 30; + +const df = DataFrame.fromColumns({ + a: Array.from({ length: ROWS }, (_, i) => i * 1.0), + b: Array.from({ length: ROWS }, (_, i) => i * 2.0), + c: Array.from({ length: ROWS }, (_, i) => i * 3.0), +}); + +for (let i = 0; i < WARMUP; i++) { df.sum(); df.mean(); } + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + df.sum(); + df.mean(); + times.push(performance.now() - t0); +} +const total = times.reduce((a, b) => a + b, 0); +console.log(JSON.stringify({ function: "dataframe_sum_mean", mean_ms: total / ITERATIONS, iterations: ITERATIONS, total_ms: total })); diff --git a/benchmarks/tsb/bench_dataframe_to_array.ts b/benchmarks/tsb/bench_dataframe_to_array.ts new file mode 100644 index 00000000..f63b4e88 --- /dev/null +++ b/benchmarks/tsb/bench_dataframe_to_array.ts @@ -0,0 +1,25 @@ +/** + * Benchmark: DataFrame.toArray() on 100k-row DataFrame. 
+ */ +import { DataFrame } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const df = DataFrame.fromColumns({ + a: Array.from({ length: ROWS }, (_, i) => i * 1.0), + b: Array.from({ length: ROWS }, (_, i) => i * 2.0), + c: Array.from({ length: ROWS }, (_, i) => i * 3.0), +}); + +for (let i = 0; i < WARMUP; i++) df.toArray(); + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + df.toArray(); + times.push(performance.now() - t0); +} +const total = times.reduce((a, b) => a + b, 0); +console.log(JSON.stringify({ function: "dataframe_to_array", mean_ms: total / ITERATIONS, iterations: ITERATIONS, total_ms: total })); diff --git a/benchmarks/tsb/bench_dataframe_to_dict.ts b/benchmarks/tsb/bench_dataframe_to_dict.ts new file mode 100644 index 00000000..98f9f94b --- /dev/null +++ b/benchmarks/tsb/bench_dataframe_to_dict.ts @@ -0,0 +1,24 @@ +/** + * Benchmark: DataFrame.toDict() on 100k-row DataFrame. + */ +import { DataFrame } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const df = DataFrame.fromColumns({ + a: Array.from({ length: ROWS }, (_, i) => i * 1.0), + b: Array.from({ length: ROWS }, (_, i) => i * 2.0), +}); + +for (let i = 0; i < WARMUP; i++) df.toDict(); + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + df.toDict(); + times.push(performance.now() - t0); +} +const total = times.reduce((a, b) => a + b, 0); +console.log(JSON.stringify({ function: "dataframe_to_dict", mean_ms: total / ITERATIONS, iterations: ITERATIONS, total_ms: total })); diff --git a/benchmarks/tsb/bench_dataframe_to_records.ts b/benchmarks/tsb/bench_dataframe_to_records.ts new file mode 100644 index 00000000..07919a40 --- /dev/null +++ b/benchmarks/tsb/bench_dataframe_to_records.ts @@ -0,0 +1,24 @@ +/** + * Benchmark: DataFrame.toRecords() on 100k-row DataFrame. 
+ */ +import { DataFrame } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const df = DataFrame.fromColumns({ + a: Array.from({ length: ROWS }, (_, i) => i * 1.0), + b: Array.from({ length: ROWS }, (_, i) => i * 2.0), +}); + +for (let i = 0; i < WARMUP; i++) df.toRecords(); + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + df.toRecords(); + times.push(performance.now() - t0); +} +const total = times.reduce((a, b) => a + b, 0); +console.log(JSON.stringify({ function: "dataframe_to_records", mean_ms: total / ITERATIONS, iterations: ITERATIONS, total_ms: total })); diff --git a/benchmarks/tsb/bench_dataframe_to_string.ts b/benchmarks/tsb/bench_dataframe_to_string.ts new file mode 100644 index 00000000..26ecf2f9 --- /dev/null +++ b/benchmarks/tsb/bench_dataframe_to_string.ts @@ -0,0 +1,24 @@ +/** + * Benchmark: dataFrameToString on 1k-row DataFrame + */ +import { DataFrame, dataFrameToString } from "../../src/index.js"; + +const ROWS = 1_000; +const WARMUP = 3; +const ITERATIONS = 10; +const a = Array.from({ length: ROWS }, (_, i) => i); +const b = Array.from({ length: ROWS }, (_, i) => i * 1.5); +const df = DataFrame.fromColumns({ a, b }); + +for (let i = 0; i < WARMUP; i++) dataFrameToString(df); +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) dataFrameToString(df); +const total = performance.now() - start; +console.log( + JSON.stringify({ + function: "dataframe_to_string", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_dataframe_torecords.ts b/benchmarks/tsb/bench_dataframe_torecords.ts new file mode 100644 index 00000000..fa787be7 --- /dev/null +++ b/benchmarks/tsb/bench_dataframe_torecords.ts @@ -0,0 +1,35 @@ +/** + * Benchmark: dataframe_torecords — DataFrame.toRecords() on a 10k-row DataFrame + */ +import { DataFrame } from "../../src/index.js"; + +const ROWS = 
10_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const df = DataFrame.fromColumns({ + a: Array.from({ length: ROWS }, (_, i) => i), + b: Array.from({ length: ROWS }, (_, i) => i * 2.0), + c: Array.from({ length: ROWS }, (_, i) => i % 100), + d: Array.from({ length: ROWS }, (_, i) => i * 0.5), + e: Array.from({ length: ROWS }, (_, i) => i % 10), +}); + +for (let i = 0; i < WARMUP; i++) { + df.toRecords(); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + df.toRecords(); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "dataframe_torecords", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_dataframe_transform.ts b/benchmarks/tsb/bench_dataframe_transform.ts new file mode 100644 index 00000000..318d8574 --- /dev/null +++ b/benchmarks/tsb/bench_dataframe_transform.ts @@ -0,0 +1,24 @@ +/** + * Benchmark: dataFrameTransform on 100k-row DataFrame + */ +import { DataFrame, dataFrameTransform } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; +const a = Array.from({ length: ROWS }, (_, i) => i * 0.1); +const b = Array.from({ length: ROWS }, (_, i) => i * 0.2); +const df = DataFrame.fromColumns({ a, b }); + +for (let i = 0; i < WARMUP; i++) dataFrameTransform(df, (v) => (v as number) * 2); +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) dataFrameTransform(df, (v) => (v as number) * 2); +const total = performance.now() - start; +console.log( + JSON.stringify({ + function: "dataframe_transform", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_dataframe_transform_rows.ts b/benchmarks/tsb/bench_dataframe_transform_rows.ts new file mode 100644 index 00000000..6f80a885 --- /dev/null +++ b/benchmarks/tsb/bench_dataframe_transform_rows.ts @@ -0,0 +1,26 @@ +/** + * Benchmark: 
dataFrameTransformRows on 10k-row DataFrame + */ +import { DataFrame, dataFrameTransformRows } from "../../src/index.js"; + +const ROWS = 10_000; +const WARMUP = 3; +const ITERATIONS = 10; +const a = Array.from({ length: ROWS }, (_, i) => i * 1.0); +const b = Array.from({ length: ROWS }, (_, i) => i * 2.0); +const df = DataFrame.fromColumns({ a, b }); + +for (let i = 0; i < WARMUP; i++) + dataFrameTransformRows(df, (row) => ({ a: (row.a as number) * 2, b: (row.b as number) + 1 })); +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) + dataFrameTransformRows(df, (row) => ({ a: (row.a as number) * 2, b: (row.b as number) + 1 })); +const total = performance.now() - start; +console.log( + JSON.stringify({ + function: "dataframe_transform_rows", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_dataframe_value_counts.ts b/benchmarks/tsb/bench_dataframe_value_counts.ts new file mode 100644 index 00000000..c2174c82 --- /dev/null +++ b/benchmarks/tsb/bench_dataframe_value_counts.ts @@ -0,0 +1,15 @@ +import { dataFrameValueCounts } from "tsb"; +import { DataFrame } from "tsb"; +const N = 100_000; +const cats = ["apple", "banana", "cherry", "date", "elderberry"]; +const df = new DataFrame({ + fruit: Array.from({ length: N }, (_, i) => cats[i % cats.length]), + color: Array.from({ length: N }, (_, i) => (i % 3 === 0 ? "red" : i % 3 === 1 ? 
"yellow" : "purple")), +}); +const WARMUP = 3; +const ITERS = 20; +for (let i = 0; i < WARMUP; i++) dataFrameValueCounts(df, { subset: ["fruit", "color"] }); +const t0 = performance.now(); +for (let i = 0; i < ITERS; i++) dataFrameValueCounts(df, { subset: ["fruit", "color"] }); +const total = performance.now() - t0; +console.log(JSON.stringify({ function: "dataframe_value_counts", mean_ms: total / ITERS, iterations: ITERS, total_ms: total })); diff --git a/benchmarks/tsb/bench_dataframe_where.ts b/benchmarks/tsb/bench_dataframe_where.ts new file mode 100644 index 00000000..2b300fd9 --- /dev/null +++ b/benchmarks/tsb/bench_dataframe_where.ts @@ -0,0 +1,17 @@ +import { dataFrameWhere } from "tsb"; +import { DataFrame } from "tsb"; +const N = 100_000; +const cols = 4; +const data: Record = {}; +for (let c = 0; c < cols; c++) { + data[`col${c}`] = Array.from({ length: N }, (_, i) => (i % 200) - 100); +} +const df = new DataFrame(data); +const mask = Array.from({ length: N }, (_, i) => i % 2 === 0); +const WARMUP = 3; +const ITERS = 20; +for (let i = 0; i < WARMUP; i++) dataFrameWhere(df, mask, { other: 0 }); +const t0 = performance.now(); +for (let i = 0; i < ITERS; i++) dataFrameWhere(df, mask, { other: 0 }); +const total = performance.now() - t0; +console.log(JSON.stringify({ function: "dataframe_where", mean_ms: total / ITERS, iterations: ITERS, total_ms: total })); diff --git a/benchmarks/tsb/bench_date_offset.ts b/benchmarks/tsb/bench_date_offset.ts new file mode 100644 index 00000000..f236ecc7 --- /dev/null +++ b/benchmarks/tsb/bench_date_offset.ts @@ -0,0 +1,48 @@ +/** + * Benchmark: DateOffset — MonthEnd, BusinessDay, YearBegin apply. 
+ * Outputs JSON: {"function": "date_offset", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { MonthEnd, BusinessDay, YearBegin, Day } from "../../src/index.ts"; + +const SIZE = 10_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const monthEnd = new MonthEnd(1); +const bizDay = new BusinessDay(5); +const yearBegin = new YearBegin(1); +const dayOffset = new Day(30); +const base = new Date(Date.UTC(2020, 0, 15)); +const dates = Array.from({ length: SIZE }, (_, i) => new Date(base.getTime() + i * 86_400_000)); + +for (let i = 0; i < WARMUP; i++) { + for (const d of dates) { + monthEnd.apply(d); + bizDay.apply(d); + yearBegin.apply(d); + dayOffset.apply(d); + } +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const start = performance.now(); + for (const d of dates) { + monthEnd.apply(d); + bizDay.apply(d); + yearBegin.apply(d); + dayOffset.apply(d); + } + times.push(performance.now() - start); +} + +const totalMs = times.reduce((a, b) => a + b, 0); +const meanMs = totalMs / ITERATIONS; +console.log( + JSON.stringify({ + function: "date_offset", + mean_ms: Math.round(meanMs * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(totalMs * 1000) / 1000, + }), +); diff --git a/benchmarks/tsb/bench_datetime_accessor.ts b/benchmarks/tsb/bench_datetime_accessor.ts new file mode 100644 index 00000000..4c6907b9 --- /dev/null +++ b/benchmarks/tsb/bench_datetime_accessor.ts @@ -0,0 +1,21 @@ +import { Series } from "tsb"; +const N = 100_000; +const base = new Date("2020-01-01").getTime(); +const day = 24 * 60 * 60 * 1000; +const dates = Array.from({ length: N }, (_, i) => new Date(base + i * day)); +const s = new Series(dates); +const WARMUP = 3; +const ITERS = 20; +for (let i = 0; i < WARMUP; i++) { + s.dt.year(); + s.dt.month(); + s.dt.dayofweek(); +} +const t0 = performance.now(); +for (let i = 0; i < ITERS; i++) { + s.dt.year(); + s.dt.month(); + s.dt.dayofweek(); +} +const total = performance.now() - t0; 
+console.log(JSON.stringify({ function: "datetime_accessor", mean_ms: total / ITERS, iterations: ITERS, total_ms: total })); diff --git a/benchmarks/tsb/bench_df_from_pairs.ts b/benchmarks/tsb/bench_df_from_pairs.ts new file mode 100644 index 00000000..8ca447fb --- /dev/null +++ b/benchmarks/tsb/bench_df_from_pairs.ts @@ -0,0 +1,32 @@ +/** + * Benchmark: dataFrameFromPairs — build a DataFrame from [column, Series] pairs + */ +import { DataFrame, Series, dataFrameFromPairs } from "../../src/index.js"; + +const N = 10_000; +const pairs: [string, Series][] = [ + ["a", new Series({ data: Array.from({ length: N }, (_, i) => i) })], + ["b", new Series({ data: Array.from({ length: N }, (_, i) => i * 2) })], + ["c", new Series({ data: Array.from({ length: N }, (_, i) => i * 3) })], +]; + +const WARMUP = 3; +const ITERATIONS = 100; + +for (let i = 0; i < WARMUP; i++) { + dataFrameFromPairs(pairs); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + dataFrameFromPairs(pairs); +} +const total = performance.now() - start; +console.log( + JSON.stringify({ + function: "df_from_pairs", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_diff.ts b/benchmarks/tsb/bench_diff.ts new file mode 100644 index 00000000..b65b42ac --- /dev/null +++ b/benchmarks/tsb/bench_diff.ts @@ -0,0 +1,33 @@ +/** + * Benchmark: Series.diff() — first discrete difference. 
+ * Outputs JSON: {"function": "diff", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const s = new Series({ data: Array.from({ length: SIZE }, (_, i) => i * 1.1 + 0.5) }); + +for (let i = 0; i < WARMUP; i++) { + s.diff(); +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const start = performance.now(); + s.diff(); + times.push(performance.now() - start); +} + +const totalMs = times.reduce((a, b) => a + b, 0); +const meanMs = totalMs / ITERATIONS; +console.log( + JSON.stringify({ + function: "diff", + mean_ms: Math.round(meanMs * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(totalMs * 1000) / 1000, + }), +); diff --git a/benchmarks/tsb/bench_drop_duplicates.ts b/benchmarks/tsb/bench_drop_duplicates.ts new file mode 100644 index 00000000..bfc65bc6 --- /dev/null +++ b/benchmarks/tsb/bench_drop_duplicates.ts @@ -0,0 +1,36 @@ +/** + * Benchmark: DataFrame.drop_duplicates() — remove duplicate rows. 
+ * Outputs JSON: {"function": "drop_duplicates", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { DataFrame } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const df = new DataFrame({ + a: Array.from({ length: SIZE }, (_, i) => i % 1000), + b: Array.from({ length: SIZE }, (_, i) => i % 500), +}); + +for (let i = 0; i < WARMUP; i++) { + df.drop_duplicates(); +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const start = performance.now(); + df.drop_duplicates(); + times.push(performance.now() - start); +} + +const totalMs = times.reduce((a, b) => a + b, 0); +const meanMs = totalMs / ITERATIONS; +console.log( + JSON.stringify({ + function: "drop_duplicates", + mean_ms: Math.round(meanMs * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(totalMs * 1000) / 1000, + }), +); diff --git a/benchmarks/tsb/bench_dt_date.ts b/benchmarks/tsb/bench_dt_date.ts new file mode 100644 index 00000000..591e182a --- /dev/null +++ b/benchmarks/tsb/bench_dt_date.ts @@ -0,0 +1,31 @@ +/** + * Benchmark: dt_date — DatetimeAccessor date() on 100k datetime values + */ +import { Series } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const now = Date.now(); +const data = Array.from({ length: ROWS }, (_, i) => new Date(now + i * 86_400_000)); +const s = new Series({ data }); + +for (let i = 0; i < WARMUP; i++) { + s.dt.date(); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + s.dt.date(); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "dt_date", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_dt_dayofyear_weekday.ts b/benchmarks/tsb/bench_dt_dayofyear_weekday.ts new file mode 100644 index 00000000..ca90b29a --- /dev/null +++ b/benchmarks/tsb/bench_dt_dayofyear_weekday.ts @@ -0,0 +1,33 @@ 
+/** + * Benchmark: dt_dayofyear_weekday — DatetimeAccessor dayofyear, weekday on 100k values + */ +import { Series } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const now = Date.now(); +const data = Array.from({ length: ROWS }, (_, i) => new Date(now + i * 86_400_000)); +const s = new Series({ data }); + +for (let i = 0; i < WARMUP; i++) { + s.dt.dayofyear(); + s.dt.weekday(); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + s.dt.dayofyear(); + s.dt.weekday(); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "dt_dayofyear_weekday", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_dt_days_in_month.ts b/benchmarks/tsb/bench_dt_days_in_month.ts new file mode 100644 index 00000000..0e88bd4a --- /dev/null +++ b/benchmarks/tsb/bench_dt_days_in_month.ts @@ -0,0 +1,31 @@ +/** + * Benchmark: dt_days_in_month — dt.days_in_month on 100k datetime values + */ +import { Series } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const start2020 = new Date("2020-01-01").getTime(); +const data = Array.from({ length: ROWS }, (_, i) => new Date(start2020 + i * 86_400_000)); +const s = new Series({ data }); + +for (let i = 0; i < WARMUP; i++) { + s.dt.days_in_month(); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + s.dt.days_in_month(); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "dt_days_in_month", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_dt_floor_ceil.ts b/benchmarks/tsb/bench_dt_floor_ceil.ts new file mode 100644 index 00000000..622f41a6 --- /dev/null +++ b/benchmarks/tsb/bench_dt_floor_ceil.ts @@ -0,0 +1,33 @@ +/** + * Benchmark: dt_floor_ceil — dt.floor and dt.ceil on 100k 
datetime values + */ +import { Series } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const now = Date.now(); +const data = Array.from({ length: ROWS }, (_, i) => new Date(now + i * 60_000)); +const s = new Series({ data }); + +for (let i = 0; i < WARMUP; i++) { + s.dt.floor("H"); + s.dt.ceil("H"); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + s.dt.floor("H"); + s.dt.ceil("H"); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "dt_floor_ceil", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_dt_hour_minute_second.ts b/benchmarks/tsb/bench_dt_hour_minute_second.ts new file mode 100644 index 00000000..048b3dcc --- /dev/null +++ b/benchmarks/tsb/bench_dt_hour_minute_second.ts @@ -0,0 +1,35 @@ +/** + * Benchmark: dt_hour_minute_second — dt.hour, dt.minute, dt.second on 100k datetime values + */ +import { Series } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const now = Date.now(); +const data = Array.from({ length: ROWS }, (_, i) => new Date(now + i * 60_000)); +const s = new Series({ data }); + +for (let i = 0; i < WARMUP; i++) { + s.dt.hour(); + s.dt.minute(); + s.dt.second(); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + s.dt.hour(); + s.dt.minute(); + s.dt.second(); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "dt_hour_minute_second", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_dt_is_leap_year.ts b/benchmarks/tsb/bench_dt_is_leap_year.ts new file mode 100644 index 00000000..a5e7294e --- /dev/null +++ b/benchmarks/tsb/bench_dt_is_leap_year.ts @@ -0,0 +1,31 @@ +/** + * Benchmark: dt_is_leap_year — dt.is_leap_year on 100k datetime values + */ +import { Series } 
from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const start2020 = new Date("2020-01-01").getTime(); +const data = Array.from({ length: ROWS }, (_, i) => new Date(start2020 + i * 86_400_000)); +const s = new Series({ data }); + +for (let i = 0; i < WARMUP; i++) { + s.dt.is_leap_year(); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + s.dt.is_leap_year(); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "dt_is_leap_year", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_dt_is_month_start_end.ts b/benchmarks/tsb/bench_dt_is_month_start_end.ts new file mode 100644 index 00000000..57c00315 --- /dev/null +++ b/benchmarks/tsb/bench_dt_is_month_start_end.ts @@ -0,0 +1,33 @@ +/** + * Benchmark: dt_is_month_start_end — dt.is_month_start and dt.is_month_end on 100k datetime values + */ +import { Series } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const start2020 = new Date("2020-01-01").getTime(); +const data = Array.from({ length: ROWS }, (_, i) => new Date(start2020 + i * 86_400_000)); +const s = new Series({ data }); + +for (let i = 0; i < WARMUP; i++) { + s.dt.is_month_start(); + s.dt.is_month_end(); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + s.dt.is_month_start(); + s.dt.is_month_end(); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "dt_is_month_start_end", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_dt_is_quarter_start_end.ts b/benchmarks/tsb/bench_dt_is_quarter_start_end.ts new file mode 100644 index 00000000..10b5ca4a --- /dev/null +++ b/benchmarks/tsb/bench_dt_is_quarter_start_end.ts @@ -0,0 +1,33 @@ +/** + * Benchmark: dt_is_quarter_start_end — is_quarter_start, 
is_quarter_end on 100k datetime values + */ +import { Series } from "../../src/index.ts"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const now = new Date("2024-01-01").getTime(); +const data = Array.from({ length: ROWS }, (_, i) => new Date(now + i * 24 * 3600 * 1000)); +const s = new Series({ data }); + +for (let i = 0; i < WARMUP; i++) { + s.dt.is_quarter_start(); + s.dt.is_quarter_end(); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + s.dt.is_quarter_start(); + s.dt.is_quarter_end(); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "dt_is_quarter_start_end", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_dt_is_year_start_end.ts b/benchmarks/tsb/bench_dt_is_year_start_end.ts new file mode 100644 index 00000000..53e12cda --- /dev/null +++ b/benchmarks/tsb/bench_dt_is_year_start_end.ts @@ -0,0 +1,33 @@ +/** + * Benchmark: dt_is_year_start_end — dt.is_year_start and dt.is_year_end on 100k datetime values + */ +import { Series } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const start2020 = new Date("2020-01-01").getTime(); +const data = Array.from({ length: ROWS }, (_, i) => new Date(start2020 + i * 86_400_000)); +const s = new Series({ data }); + +for (let i = 0; i < WARMUP; i++) { + s.dt.is_year_start(); + s.dt.is_year_end(); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + s.dt.is_year_start(); + s.dt.is_year_end(); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "dt_is_year_start_end", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_dt_millisecond_microsecond_nanosecond.ts b/benchmarks/tsb/bench_dt_millisecond_microsecond_nanosecond.ts new file mode 100644 index 00000000..f57d886e --- 
/dev/null +++ b/benchmarks/tsb/bench_dt_millisecond_microsecond_nanosecond.ts @@ -0,0 +1,35 @@ +/** + * Benchmark: dt_millisecond_microsecond_nanosecond — DatetimeAccessor millisecond, microsecond, nanosecond on 100k values + */ +import { Series } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const now = Date.now(); +const data = Array.from({ length: ROWS }, (_, i) => new Date(now + i * 1_000)); +const s = new Series({ data }); + +for (let i = 0; i < WARMUP; i++) { + s.dt.millisecond(); + s.dt.microsecond(); + s.dt.nanosecond(); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + s.dt.millisecond(); + s.dt.microsecond(); + s.dt.nanosecond(); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "dt_millisecond_microsecond_nanosecond", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_dt_normalize.ts b/benchmarks/tsb/bench_dt_normalize.ts new file mode 100644 index 00000000..2251202d --- /dev/null +++ b/benchmarks/tsb/bench_dt_normalize.ts @@ -0,0 +1,31 @@ +/** + * Benchmark: dt_normalize — dt.normalize (truncate to midnight) on 100k datetime values + */ +import { Series } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const now = Date.now(); +const data = Array.from({ length: ROWS }, (_, i) => new Date(now + i * 60_000)); +const s = new Series({ data }); + +for (let i = 0; i < WARMUP; i++) { + s.dt.normalize(); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + s.dt.normalize(); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "dt_normalize", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_dt_quarter_month.ts b/benchmarks/tsb/bench_dt_quarter_month.ts new file mode 100644 index 
00000000..ad7f2918 --- /dev/null +++ b/benchmarks/tsb/bench_dt_quarter_month.ts @@ -0,0 +1,35 @@ +/** + * Benchmark: dt_quarter_month — dt.quarter, dt.is_month_start, dt.is_month_end on 100k datetime values + */ +import { Series } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const now = new Date("2024-01-01").getTime(); +const data = Array.from({ length: ROWS }, (_, i) => new Date(now + i * 24 * 3600 * 1000)); +const s = new Series({ data }); + +for (let i = 0; i < WARMUP; i++) { + s.dt.quarter(); + s.dt.is_month_start(); + s.dt.is_month_end(); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + s.dt.quarter(); + s.dt.is_month_start(); + s.dt.is_month_end(); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "dt_quarter_month", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_dt_round.ts b/benchmarks/tsb/bench_dt_round.ts new file mode 100644 index 00000000..2e74d552 --- /dev/null +++ b/benchmarks/tsb/bench_dt_round.ts @@ -0,0 +1,31 @@ +/** + * Benchmark: dt_round — DatetimeAccessor round() to hour on 100k values + */ +import { Series } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const now = Date.now(); +const data = Array.from({ length: ROWS }, (_, i) => new Date(now + i * 60_000)); +const s = new Series({ data }); + +for (let i = 0; i < WARMUP; i++) { + s.dt.round("H"); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + s.dt.round("H"); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "dt_round", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_dt_year_month_day.ts b/benchmarks/tsb/bench_dt_year_month_day.ts new file mode 100644 index 00000000..f213609c --- /dev/null +++ 
b/benchmarks/tsb/bench_dt_year_month_day.ts @@ -0,0 +1,35 @@ +/** + * Benchmark: dt_year_month_day — dt.year(), dt.month(), dt.day() on 100k datetime values + */ +import { Series } from "../../src/index.ts"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const now = new Date("2024-01-01").getTime(); +const data = Array.from({ length: ROWS }, (_, i) => new Date(now + i * 24 * 3600 * 1000)); +const s = new Series({ data }); + +for (let i = 0; i < WARMUP; i++) { + s.dt.year(); + s.dt.month(); + s.dt.day(); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + s.dt.year(); + s.dt.month(); + s.dt.day(); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "dt_year_month_day", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_dtype.ts b/benchmarks/tsb/bench_dtype.ts new file mode 100644 index 00000000..6613e11b --- /dev/null +++ b/benchmarks/tsb/bench_dtype.ts @@ -0,0 +1,36 @@ +/** + * Benchmark: Dtype — singleton lookup, inferFrom, commonType, and property access + */ +import { Dtype } from "../../src/index.js"; + +const WARMUP = 3; +const ITERATIONS = 10_000; + +const values = Array.from({ length: 100 }, (_, i) => i * 1.5); +const mixed = [1, 2.5, "hello", true]; + +for (let i = 0; i < WARMUP; i++) { + Dtype.from("float64"); + Dtype.inferFrom(values); + Dtype.commonType(Dtype.float32, Dtype.float64); + const dt = Dtype.from("float64"); + dt.isNumeric; dt.isFloat; dt.isInteger; dt.kind; dt.itemsize; +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + Dtype.from("float64"); + Dtype.inferFrom(values); + Dtype.commonType(Dtype.float32, Dtype.float64); + const dt = Dtype.from("float64"); + dt.isNumeric; dt.isFloat; dt.isInteger; dt.kind; dt.itemsize; +} +const total = performance.now() - start; +console.log( + JSON.stringify({ + function: "dtype", + mean_ms: total / ITERATIONS, + 
iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_dtype_predicates.ts b/benchmarks/tsb/bench_dtype_predicates.ts new file mode 100644 index 00000000..b20e21ef --- /dev/null +++ b/benchmarks/tsb/bench_dtype_predicates.ts @@ -0,0 +1,63 @@ +/** + * Benchmark: dtype predicate functions — isNumericDtype, isIntegerDtype, isFloatDtype, + * isBoolDtype, isStringDtype, isDatetimeDtype, isCategoricalDtype, isSignedIntegerDtype, + * isUnsignedIntegerDtype, isTimedeltaDtype, isObjectDtype, isComplexDtype, + * isExtensionArrayDtype, isPeriodDtype, isIntervalDtype + */ +import { + isNumericDtype, + isIntegerDtype, + isFloatDtype, + isBoolDtype, + isStringDtype, + isDatetimeDtype, + isCategoricalDtype, + isSignedIntegerDtype, + isUnsignedIntegerDtype, + isTimedeltaDtype, + isObjectDtype, + isComplexDtype, + isExtensionArrayDtype, + isPeriodDtype, + isIntervalDtype, +} from "../../src/index.js"; + +const WARMUP = 3; +const ITERATIONS = 10_000; + +const dtypes = ["float64", "int32", "uint8", "bool", "string", "datetime", "category", "object", "timedelta"] as const; + +function runChecks(): void { + for (const d of dtypes) { + isNumericDtype(d); + isIntegerDtype(d); + isFloatDtype(d); + isBoolDtype(d); + isStringDtype(d); + isDatetimeDtype(d); + isCategoricalDtype(d); + isSignedIntegerDtype(d); + isUnsignedIntegerDtype(d); + isTimedeltaDtype(d); + isObjectDtype(d); + isComplexDtype(d); + isExtensionArrayDtype(d); + isPeriodDtype(d); + isIntervalDtype(d); + } +} + +for (let i = 0; i < WARMUP; i++) runChecks(); + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) runChecks(); +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "dtype_predicates", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_duplicated.ts b/benchmarks/tsb/bench_duplicated.ts new file mode 100644 index 00000000..054e80e9 --- /dev/null +++ 
b/benchmarks/tsb/bench_duplicated.ts @@ -0,0 +1,36 @@ +/** + * Benchmark: DataFrame.duplicated() — detect duplicate rows. + * Outputs JSON: {"function": "duplicated", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { DataFrame } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const df = new DataFrame({ + a: Array.from({ length: SIZE }, (_, i) => i % 1000), + b: Array.from({ length: SIZE }, (_, i) => i % 500), +}); + +for (let i = 0; i < WARMUP; i++) { + df.duplicated(); +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const start = performance.now(); + df.duplicated(); + times.push(performance.now() - start); +} + +const totalMs = times.reduce((a, b) => a + b, 0); +const meanMs = totalMs / ITERATIONS; +console.log( + JSON.stringify({ + function: "duplicated", + mean_ms: Math.round(meanMs * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(totalMs * 1000) / 1000, + }), +); diff --git a/benchmarks/tsb/bench_ewm_apply.ts b/benchmarks/tsb/bench_ewm_apply.ts new file mode 100644 index 00000000..fdd70e25 --- /dev/null +++ b/benchmarks/tsb/bench_ewm_apply.ts @@ -0,0 +1,46 @@ +/** + * Benchmark: EWM.apply with custom function on 100k-element Series + */ +import { Series } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const data = Array.from({ length: ROWS }, (_, i) => Math.sin(i * 0.05)); +const s = new Series({ data }); + +for (let i = 0; i < WARMUP; i++) { + s.ewm({ span: 20 }).apply((vals, weights) => { + let sum = 0; + let wsum = 0; + for (let j = 0; j < vals.length; j++) { + sum += (vals[j] as number) * (weights[j] as number); + wsum += weights[j] as number; + } + return wsum === 0 ? 
0 : sum / wsum; + }); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + s.ewm({ span: 20 }).apply((vals, weights) => { + let sum = 0; + let wsum = 0; + for (let j = 0; j < vals.length; j++) { + sum += (vals[j] as number) * (weights[j] as number); + wsum += weights[j] as number; + } + return wsum === 0 ? 0 : sum / wsum; + }); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "ewm_apply", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_ewm_corr.ts b/benchmarks/tsb/bench_ewm_corr.ts new file mode 100644 index 00000000..100be4de --- /dev/null +++ b/benchmarks/tsb/bench_ewm_corr.ts @@ -0,0 +1,23 @@ +/** + * Benchmark: EWM.corr(other) on two 100k-element Series. + */ +import { Series, EWM } from "../../src/index.js"; + +const SIZE = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const a = new Series({ data: Array.from({ length: SIZE }, (_, i) => Math.sin(i * 0.01)) }); +const b = new Series({ data: Array.from({ length: SIZE }, (_, i) => Math.cos(i * 0.01)) }); +const ewmA = new EWM(a, { span: 10 }); + +for (let i = 0; i < WARMUP; i++) ewmA.corr(b); + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + ewmA.corr(b); + times.push(performance.now() - t0); +} +const total = times.reduce((a, b) => a + b, 0); +console.log(JSON.stringify({ function: "ewm_corr", mean_ms: total / ITERATIONS, iterations: ITERATIONS, total_ms: total })); diff --git a/benchmarks/tsb/bench_ewm_cov.ts b/benchmarks/tsb/bench_ewm_cov.ts new file mode 100644 index 00000000..66dbe7da --- /dev/null +++ b/benchmarks/tsb/bench_ewm_cov.ts @@ -0,0 +1,32 @@ +/** + * Benchmark: EWM.cov between two 100k-element Series + */ +import { Series } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const data1 = Array.from({ length: ROWS }, (_, i) => Math.sin(i * 0.05)); 
+const data2 = Array.from({ length: ROWS }, (_, i) => Math.cos(i * 0.05)); +const s1 = new Series({ data: data1 }); +const s2 = new Series({ data: data2 }); + +for (let i = 0; i < WARMUP; i++) { + s1.ewm({ span: 20 }).cov(s2); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + s1.ewm({ span: 20 }).cov(s2); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "ewm_cov", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_expanding_apply.ts b/benchmarks/tsb/bench_expanding_apply.ts new file mode 100644 index 00000000..23338a8c --- /dev/null +++ b/benchmarks/tsb/bench_expanding_apply.ts @@ -0,0 +1,35 @@ +/** + * Benchmark: expanding apply with custom function on 100k-element Series + */ +import { Series } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 2; +const ITERATIONS = 5; + +const data = Float64Array.from({ length: ROWS }, (_, i) => Math.sin(i * 0.01)); +const s = new Series(data); +const fn = (values: readonly number[]) => { + let sum = 0; + for (const v of values) sum += v; + return sum / values.length; +}; + +for (let i = 0; i < WARMUP; i++) { + s.expanding().apply(fn); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + s.expanding().apply(fn); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "expanding_apply", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_expanding_count.ts b/benchmarks/tsb/bench_expanding_count.ts new file mode 100644 index 00000000..03acc9f5 --- /dev/null +++ b/benchmarks/tsb/bench_expanding_count.ts @@ -0,0 +1,30 @@ +/** + * Benchmark: Expanding.count on 100k-element Series + */ +import { Series } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const data = Float64Array.from({ length: ROWS 
}, (_, i) => (i % 10 === 0 ? NaN : Math.sin(i * 0.01))); +const s = new Series(data); + +for (let i = 0; i < WARMUP; i++) { + s.expanding().count(); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + s.expanding().count(); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "expanding_count", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_expanding_max.ts b/benchmarks/tsb/bench_expanding_max.ts new file mode 100644 index 00000000..1697f3df --- /dev/null +++ b/benchmarks/tsb/bench_expanding_max.ts @@ -0,0 +1,30 @@ +/** + * Benchmark: Expanding.max on 100k-element Series + */ +import { Series } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const data = Float64Array.from({ length: ROWS }, (_, i) => Math.sin(i * 0.01)); +const s = new Series(data); + +for (let i = 0; i < WARMUP; i++) { + s.expanding().max(); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + s.expanding().max(); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "expanding_max", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_expanding_mean.ts b/benchmarks/tsb/bench_expanding_mean.ts new file mode 100644 index 00000000..4ea94a4a --- /dev/null +++ b/benchmarks/tsb/bench_expanding_mean.ts @@ -0,0 +1,30 @@ +/** + * Benchmark: expanding mean on 100k-element Series + */ +import { Series } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const data = Float64Array.from({ length: ROWS }, (_, i) => Math.sin(i * 0.01)); +const s = new Series(data); + +for (let i = 0; i < WARMUP; i++) { + s.expanding().mean(); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + s.expanding().mean(); +} +const total = 
performance.now() - start; + +console.log( + JSON.stringify({ + function: "expanding_mean", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_expanding_median.ts b/benchmarks/tsb/bench_expanding_median.ts new file mode 100644 index 00000000..7d203484 --- /dev/null +++ b/benchmarks/tsb/bench_expanding_median.ts @@ -0,0 +1,30 @@ +/** + * Benchmark: Expanding.median on 10k-element Series (median is O(n^2)) + */ +import { Series } from "../../src/index.js"; + +const ROWS = 10_000; +const WARMUP = 2; +const ITERATIONS = 5; + +const data = Float64Array.from({ length: ROWS }, (_, i) => Math.sin(i * 0.01)); +const s = new Series(data); + +for (let i = 0; i < WARMUP; i++) { + s.expanding().median(); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + s.expanding().median(); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "expanding_median", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_expanding_min.ts b/benchmarks/tsb/bench_expanding_min.ts new file mode 100644 index 00000000..f707ba78 --- /dev/null +++ b/benchmarks/tsb/bench_expanding_min.ts @@ -0,0 +1,30 @@ +/** + * Benchmark: Expanding.min on 100k-element Series + */ +import { Series } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const data = Float64Array.from({ length: ROWS }, (_, i) => Math.sin(i * 0.01)); +const s = new Series(data); + +for (let i = 0; i < WARMUP; i++) { + s.expanding().min(); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + s.expanding().min(); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "expanding_min", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_explode.ts 
b/benchmarks/tsb/bench_explode.ts new file mode 100644 index 00000000..a42bd4ed --- /dev/null +++ b/benchmarks/tsb/bench_explode.ts @@ -0,0 +1,15 @@ +import { Series } from "tsb"; + +const rng = (seed: number) => { let s = seed; return () => { s = (s * 1664525 + 1013904223) & 0xffffffff; return (s >>> 0) / 0xffffffff; }; }; +const rand = rng(42); +const data = Array.from({ length: 10_000 }, () => { + const len = Math.floor(rand() * 5) + 1; + return Array.from({ length: len }, () => Math.floor(rand() * 100)); +}); +const s = new Series(data); +for (let i = 0; i < 3; i++) s.explode(); +const N = 50; +const t0 = performance.now(); +for (let i = 0; i < N; i++) s.explode(); +const elapsed = performance.now() - t0; +console.log(JSON.stringify({ function: "explode", mean_ms: elapsed / N, iterations: N, total_ms: elapsed })); diff --git a/benchmarks/tsb/bench_factorize.ts b/benchmarks/tsb/bench_factorize.ts new file mode 100644 index 00000000..6147cfa4 --- /dev/null +++ b/benchmarks/tsb/bench_factorize.ts @@ -0,0 +1,37 @@ +/** + * Benchmark: factorize / seriesFactorize — encode values as integer codes. 
+ * Outputs JSON: {"function": "factorize", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { factorize, seriesFactorize, Series } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const categories = ["cat", "dog", "bird", "fish", "hamster"]; +const data = Array.from({ length: SIZE }, (_, i) => categories[i % categories.length]); +const s = new Series({ data }); + +for (let i = 0; i < WARMUP; i++) { + factorize(data); + seriesFactorize(s); +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const start = performance.now(); + factorize(data); + seriesFactorize(s); + times.push(performance.now() - start); +} + +const totalMs = times.reduce((a, b) => a + b, 0); +const meanMs = totalMs / ITERATIONS; +console.log( + JSON.stringify({ + function: "factorize", + mean_ms: Math.round(meanMs * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(totalMs * 1000) / 1000, + }), +); diff --git a/benchmarks/tsb/bench_fillna_dropna.ts b/benchmarks/tsb/bench_fillna_dropna.ts new file mode 100644 index 00000000..5bded048 --- /dev/null +++ b/benchmarks/tsb/bench_fillna_dropna.ts @@ -0,0 +1,18 @@ +import { fillna, dropna } from "tsb"; +import { Series } from "tsb"; +const N = 100_000; +const data: (number | null)[] = Array.from({ length: N }, (_, i) => (i % 7 === 0 ? 
null : i * 1.5)); +const s = new Series(data); +const WARMUP = 3; +const ITERS = 20; +for (let i = 0; i < WARMUP; i++) { + fillna(s, { value: 0 }); + dropna(s); +} +const t0 = performance.now(); +for (let i = 0; i < ITERS; i++) { + fillna(s, { value: 0 }); + dropna(s); +} +const total = performance.now() - t0; +console.log(JSON.stringify({ function: "fillna_dropna", mean_ms: total / ITERS, iterations: ITERS, total_ms: total })); diff --git a/benchmarks/tsb/bench_format_compact.ts b/benchmarks/tsb/bench_format_compact.ts new file mode 100644 index 00000000..11119286 --- /dev/null +++ b/benchmarks/tsb/bench_format_compact.ts @@ -0,0 +1,22 @@ +/** + * Benchmark: formatCompact on 100k numbers + */ +import { formatCompact } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; +const data = Array.from({ length: ROWS }, (_, i) => i * 1234); + +for (let i = 0; i < WARMUP; i++) data.map((v) => formatCompact(v)); +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) data.map((v) => formatCompact(v)); +const total = performance.now() - start; +console.log( + JSON.stringify({ + function: "format_compact", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_format_currency.ts b/benchmarks/tsb/bench_format_currency.ts new file mode 100644 index 00000000..8769d99e --- /dev/null +++ b/benchmarks/tsb/bench_format_currency.ts @@ -0,0 +1,22 @@ +/** + * Benchmark: formatCurrency on 100k numbers + */ +import { formatCurrency } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; +const data = Array.from({ length: ROWS }, (_, i) => i * 9.99); + +for (let i = 0; i < WARMUP; i++) data.map((v) => formatCurrency(v)); +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) data.map((v) => formatCurrency(v)); +const total = performance.now() - start; +console.log( + JSON.stringify({ + function: 
"format_currency", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_format_engineering.ts b/benchmarks/tsb/bench_format_engineering.ts new file mode 100644 index 00000000..5ccaf28a --- /dev/null +++ b/benchmarks/tsb/bench_format_engineering.ts @@ -0,0 +1,22 @@ +/** + * Benchmark: formatEngineering on 100k numbers + */ +import { formatEngineering } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; +const data = Array.from({ length: ROWS }, (_, i) => i * 1.5e3); + +for (let i = 0; i < WARMUP; i++) data.map((v) => formatEngineering(v)); +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) data.map((v) => formatEngineering(v)); +const total = performance.now() - start; +console.log( + JSON.stringify({ + function: "format_engineering", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_format_float.ts b/benchmarks/tsb/bench_format_float.ts new file mode 100644 index 00000000..00cfc6b4 --- /dev/null +++ b/benchmarks/tsb/bench_format_float.ts @@ -0,0 +1,23 @@ +/** + * Benchmark: formatFloat on 100k numbers + */ +import { formatFloat } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; +const data = Array.from({ length: ROWS }, (_, i) => i * 3.14159); +const fmt = formatFloat(3); + +for (let i = 0; i < WARMUP; i++) data.map((v) => fmt(v)); +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) data.map((v) => fmt(v)); +const total = performance.now() - start; +console.log( + JSON.stringify({ + function: "format_float", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_format_percent.ts b/benchmarks/tsb/bench_format_percent.ts new file mode 100644 index 00000000..a78ed3f6 --- /dev/null +++ b/benchmarks/tsb/bench_format_percent.ts @@ -0,0 +1,22 
@@ +/** + * Benchmark: formatPercent on 100k numbers + */ +import { formatPercent } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; +const data = Array.from({ length: ROWS }, (_, i) => i / ROWS); + +for (let i = 0; i < WARMUP; i++) data.map((v) => formatPercent(v)); +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) data.map((v) => formatPercent(v)); +const total = performance.now() - start; +console.log( + JSON.stringify({ + function: "format_percent", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_format_scientific.ts b/benchmarks/tsb/bench_format_scientific.ts new file mode 100644 index 00000000..596cdd61 --- /dev/null +++ b/benchmarks/tsb/bench_format_scientific.ts @@ -0,0 +1,22 @@ +/** + * Benchmark: formatScientific on 100k numbers + */ +import { formatScientific } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; +const data = Array.from({ length: ROWS }, (_, i) => i * 1.23456e-5); + +for (let i = 0; i < WARMUP; i++) data.map((v) => formatScientific(v)); +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) data.map((v) => formatScientific(v)); +const total = performance.now() - start; +console.log( + JSON.stringify({ + function: "format_scientific", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_format_thousands.ts b/benchmarks/tsb/bench_format_thousands.ts new file mode 100644 index 00000000..4dee6890 --- /dev/null +++ b/benchmarks/tsb/bench_format_thousands.ts @@ -0,0 +1,22 @@ +/** + * Benchmark: formatThousands on 100k numbers + */ +import { formatThousands } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; +const data = Array.from({ length: ROWS }, (_, i) => i * 1234.56); + +for (let i = 0; i < WARMUP; i++) data.map((v) => 
formatThousands(v)); +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) data.map((v) => formatThousands(v)); +const total = performance.now() - start; +console.log( + JSON.stringify({ + function: "format_thousands", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_from_dict_oriented.ts b/benchmarks/tsb/bench_from_dict_oriented.ts new file mode 100644 index 00000000..6c2f91eb --- /dev/null +++ b/benchmarks/tsb/bench_from_dict_oriented.ts @@ -0,0 +1,22 @@ +/** + * Benchmark: fromDictOriented (records orient) on 10k records + */ +import { fromDictOriented } from "../../src/index.js"; + +const ROWS = 10_000; +const WARMUP = 3; +const ITERATIONS = 10; +const records = Array.from({ length: ROWS }, (_, i) => ({ id: i, val: i * 1.5, name: `item_${i}` })); + +for (let i = 0; i < WARMUP; i++) fromDictOriented({ orient: "records", data: records }); +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) fromDictOriented({ orient: "records", data: records }); +const total = performance.now() - start; +console.log( + JSON.stringify({ + function: "from_dict_oriented", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_get_dummies.ts b/benchmarks/tsb/bench_get_dummies.ts new file mode 100644 index 00000000..aea1448d --- /dev/null +++ b/benchmarks/tsb/bench_get_dummies.ts @@ -0,0 +1,40 @@ +/** + * Benchmark: getDummies / dataFrameGetDummies — one-hot encoding. 
+ * Outputs JSON: {"function": "get_dummies", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { getDummies, dataFrameGetDummies, Series, DataFrame } from "../../src/index.ts"; + +const SIZE = 10_000; +const WARMUP = 3; +const ITERATIONS = 30; + +const categories = ["A", "B", "C", "D", "E"]; +const s = new Series({ data: Array.from({ length: SIZE }, (_, i) => categories[i % categories.length]) }); +const df = new DataFrame({ + cat1: Array.from({ length: SIZE }, (_, i) => categories[i % categories.length]), + cat2: Array.from({ length: SIZE }, (_, i) => ["x", "y", "z"][i % 3]), +}); + +for (let i = 0; i < WARMUP; i++) { + getDummies(s); + dataFrameGetDummies(df, { columns: ["cat1", "cat2"] }); +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const start = performance.now(); + getDummies(s); + dataFrameGetDummies(df, { columns: ["cat1", "cat2"] }); + times.push(performance.now() - start); +} + +const totalMs = times.reduce((a, b) => a + b, 0); +const meanMs = totalMs / ITERATIONS; +console.log( + JSON.stringify({ + function: "get_dummies", + mean_ms: Math.round(meanMs * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(totalMs * 1000) / 1000, + }), +); diff --git a/benchmarks/tsb/bench_groupby_agg.ts b/benchmarks/tsb/bench_groupby_agg.ts new file mode 100644 index 00000000..11eb6994 --- /dev/null +++ b/benchmarks/tsb/bench_groupby_agg.ts @@ -0,0 +1,16 @@ +import { DataFrame } from "tsb"; + +const rng = (seed: number) => { let s = seed; return () => { s = (s * 1664525 + 1013904223) & 0xffffffff; return (s >>> 0) / 0xffffffff; }; }; +const rand = rng(42); +const groups = ["A","B","C","D","E"]; +const df = new DataFrame({ + group: Array.from({ length: 100_000 }, () => groups[Math.floor(rand() * 5)]), + val1: Array.from({ length: 100_000 }, () => (rand() * 2 - 1) * 3), + val2: Array.from({ length: 100_000 }, () => (rand() * 2 - 1) * 3), +}); +for (let i = 0; i < 3; i++) df.groupby("group").agg({ val1: 
["mean","std","min","max"], val2: ["sum","count"] }); +const N = 30; +const t0 = performance.now(); +for (let i = 0; i < N; i++) df.groupby("group").agg({ val1: ["mean","std","min","max"], val2: ["sum","count"] }); +const elapsed = performance.now() - t0; +console.log(JSON.stringify({ function: "groupby_agg", mean_ms: elapsed / N, iterations: N, total_ms: elapsed })); diff --git a/benchmarks/tsb/bench_groupby_apply.ts b/benchmarks/tsb/bench_groupby_apply.ts new file mode 100644 index 00000000..79557f43 --- /dev/null +++ b/benchmarks/tsb/bench_groupby_apply.ts @@ -0,0 +1,24 @@ +/** + * Benchmark: GroupBy apply (identity) on 100k-row DataFrame + */ +import { DataFrame } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 2; +const ITERATIONS = 5; +const keys = Array.from({ length: ROWS }, (_, i) => `g${i % 50}`); +const vals = Array.from({ length: ROWS }, (_, i) => i * 0.1); +const df = DataFrame.fromColumns({ key: keys, value: vals }); + +for (let i = 0; i < WARMUP; i++) df.groupby("key").apply((sub) => sub); +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) df.groupby("key").apply((sub) => sub); +const total = performance.now() - start; +console.log( + JSON.stringify({ + function: "groupby_apply", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_groupby_count.ts b/benchmarks/tsb/bench_groupby_count.ts new file mode 100644 index 00000000..b7144b6e --- /dev/null +++ b/benchmarks/tsb/bench_groupby_count.ts @@ -0,0 +1,15 @@ +import { DataFrame, DataFrameGroupBy } from "tsb"; +const N = 100_000; +const keys = ["A", "B", "C", "D", "E"]; +const df = new DataFrame({ + key: Array.from({ length: N }, (_, i) => keys[i % keys.length]), + val: Array.from({ length: N }, (_, i) => i * 1.0), +}); +const gbObj = new DataFrameGroupBy(df, ["key"]); +const WARMUP = 3; +const ITERS = 20; +for (let i = 0; i < WARMUP; i++) gbObj.count(); +const t0 = performance.now(); +for (let i 
= 0; i < ITERS; i++) gbObj.count(); +const total = performance.now() - t0; +console.log(JSON.stringify({ function: "groupby_count", mean_ms: total / ITERS, iterations: ITERS, total_ms: total })); diff --git a/benchmarks/tsb/bench_groupby_custom_agg.ts b/benchmarks/tsb/bench_groupby_custom_agg.ts new file mode 100644 index 00000000..062bae71 --- /dev/null +++ b/benchmarks/tsb/bench_groupby_custom_agg.ts @@ -0,0 +1,28 @@ +/** + * Benchmark: GroupBy agg with custom function on 100k-row DataFrame + */ +import { DataFrame } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; +const keys = Array.from({ length: ROWS }, (_, i) => `g${i % 100}`); +const vals = Array.from({ length: ROWS }, (_, i) => i * 0.1); +const df = DataFrame.fromColumns({ key: keys, value: vals }); +const rangeFn = (vals: readonly (string | number | null | boolean | bigint)[]) => { + const nums = vals.filter((v): v is number => typeof v === "number"); + return nums.length ? Math.max(...nums) - Math.min(...nums) : null; +}; + +for (let i = 0; i < WARMUP; i++) df.groupby("key").agg(rangeFn); +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) df.groupby("key").agg(rangeFn); +const total = performance.now() - start; +console.log( + JSON.stringify({ + function: "groupby_custom_agg", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_groupby_filter.ts b/benchmarks/tsb/bench_groupby_filter.ts new file mode 100644 index 00000000..ea8012dd --- /dev/null +++ b/benchmarks/tsb/bench_groupby_filter.ts @@ -0,0 +1,24 @@ +/** + * Benchmark: GroupBy filter on 100k-row DataFrame + */ +import { DataFrame } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; +const keys = Array.from({ length: ROWS }, (_, i) => `g${i % 200}`); +const vals = Array.from({ length: ROWS }, (_, i) => i * 0.1); +const df = DataFrame.fromColumns({ key: keys, value: vals }); 
+ +for (let i = 0; i < WARMUP; i++) df.groupby("key").filter((sub) => sub.shape[0] > 400); +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) df.groupby("key").filter((sub) => sub.shape[0] > 400); +const total = performance.now() - start; +console.log( + JSON.stringify({ + function: "groupby_filter", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_groupby_first.ts b/benchmarks/tsb/bench_groupby_first.ts new file mode 100644 index 00000000..d5e204d6 --- /dev/null +++ b/benchmarks/tsb/bench_groupby_first.ts @@ -0,0 +1,16 @@ +import { DataFrame, DataFrameGroupBy } from "tsb"; +const N = 100_000; +const keys = ["A", "B", "C", "D", "E"]; +const df = new DataFrame({ + key: Array.from({ length: N }, (_, i) => keys[i % keys.length]), + val: Array.from({ length: N }, (_, i) => i * 0.5), + val2: Array.from({ length: N }, (_, i) => i % 100), +}); +const gbObj = new DataFrameGroupBy(df, ["key"]); +const WARMUP = 3; +const ITERS = 20; +for (let i = 0; i < WARMUP; i++) gbObj.first(); +const t0 = performance.now(); +for (let i = 0; i < ITERS; i++) gbObj.first(); +const total = performance.now() - t0; +console.log(JSON.stringify({ function: "groupby_first", mean_ms: total / ITERS, iterations: ITERS, total_ms: total })); diff --git a/benchmarks/tsb/bench_groupby_get_group.ts b/benchmarks/tsb/bench_groupby_get_group.ts new file mode 100644 index 00000000..17d5933a --- /dev/null +++ b/benchmarks/tsb/bench_groupby_get_group.ts @@ -0,0 +1,36 @@ +/** + * Benchmark: groupby_get_group — DataFrameGroupBy.getGroup on 100k rows + */ +import { DataFrame, Series } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 20; + +const groupKeys = Array.from({ length: ROWS }, (_, i) => `group_${i % 5}`); +const values = Array.from({ length: ROWS }, (_, i) => i); +const df = new DataFrame({ + data: { group: groupKeys, value: values }, +}); +const grouped = 
df.groupby("group"); + +for (let i = 0; i < WARMUP; i++) { + grouped.getGroup("group_0"); + grouped.getGroup("group_1"); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + grouped.getGroup("group_0"); + grouped.getGroup("group_1"); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "groupby_get_group", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_groupby_last.ts b/benchmarks/tsb/bench_groupby_last.ts new file mode 100644 index 00000000..c8554166 --- /dev/null +++ b/benchmarks/tsb/bench_groupby_last.ts @@ -0,0 +1,16 @@ +import { DataFrame, DataFrameGroupBy } from "tsb"; +const N = 100_000; +const keys = ["A", "B", "C", "D", "E"]; +const df = new DataFrame({ + key: Array.from({ length: N }, (_, i) => keys[i % keys.length]), + val: Array.from({ length: N }, (_, i) => i * 0.5), + val2: Array.from({ length: N }, (_, i) => i % 100), +}); +const gbObj = new DataFrameGroupBy(df, ["key"]); +const WARMUP = 3; +const ITERS = 20; +for (let i = 0; i < WARMUP; i++) gbObj.last(); +const t0 = performance.now(); +for (let i = 0; i < ITERS; i++) gbObj.last(); +const total = performance.now() - t0; +console.log(JSON.stringify({ function: "groupby_last", mean_ms: total / ITERS, iterations: ITERS, total_ms: total })); diff --git a/benchmarks/tsb/bench_groupby_max.ts b/benchmarks/tsb/bench_groupby_max.ts new file mode 100644 index 00000000..0beef0db --- /dev/null +++ b/benchmarks/tsb/bench_groupby_max.ts @@ -0,0 +1,15 @@ +import { DataFrame, DataFrameGroupBy } from "tsb"; +const N = 100_000; +const keys = ["A", "B", "C", "D", "E"]; +const df = new DataFrame({ + key: Array.from({ length: N }, (_, i) => keys[i % keys.length]), + val: Array.from({ length: N }, (_, i) => i * 1.0), +}); +const gbObj = new DataFrameGroupBy(df, ["key"]); +const WARMUP = 3; +const ITERS = 20; +for (let i = 0; i < WARMUP; i++) gbObj.max(); +const t0 = 
performance.now(); +for (let i = 0; i < ITERS; i++) gbObj.max(); +const total = performance.now() - t0; +console.log(JSON.stringify({ function: "groupby_max", mean_ms: total / ITERS, iterations: ITERS, total_ms: total })); diff --git a/benchmarks/tsb/bench_groupby_median.ts b/benchmarks/tsb/bench_groupby_median.ts new file mode 100644 index 00000000..86cb6b2c --- /dev/null +++ b/benchmarks/tsb/bench_groupby_median.ts @@ -0,0 +1,32 @@ +/** + * Benchmark: DataFrameGroupBy.agg with custom median function on 100k-row DataFrame. + */ +import { DataFrame } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const df = DataFrame.fromColumns({ + group: Array.from({ length: ROWS }, (_, i) => i % 100), + value: Array.from({ length: ROWS }, (_, i) => (i * 1.414) % 9999), +}); + +function median(vals: readonly (number | string | boolean | null | undefined)[]): number { + const nums = vals.filter((v): v is number => typeof v === "number" && !Number.isNaN(v)); + if (nums.length === 0) return Number.NaN; + const sorted = [...nums].sort((a, b) => a - b); + const mid = Math.floor(sorted.length / 2); + return sorted.length % 2 === 0 ? 
(sorted[mid - 1] + sorted[mid]) / 2 : sorted[mid]; +} + +for (let i = 0; i < WARMUP; i++) df.groupby("group").agg({ value: median }); + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + df.groupby("group").agg({ value: median }); + times.push(performance.now() - t0); +} +const total = times.reduce((a, b) => a + b, 0); +console.log(JSON.stringify({ function: "groupby_median", mean_ms: total / ITERATIONS, iterations: ITERATIONS, total_ms: total })); diff --git a/benchmarks/tsb/bench_groupby_min.ts b/benchmarks/tsb/bench_groupby_min.ts new file mode 100644 index 00000000..7d3393db --- /dev/null +++ b/benchmarks/tsb/bench_groupby_min.ts @@ -0,0 +1,15 @@ +import { DataFrame, DataFrameGroupBy } from "tsb"; +const N = 100_000; +const keys = ["A", "B", "C", "D", "E"]; +const df = new DataFrame({ + key: Array.from({ length: N }, (_, i) => keys[i % keys.length]), + val: Array.from({ length: N }, (_, i) => i * 1.0), +}); +const gbObj = new DataFrameGroupBy(df, ["key"]); +const WARMUP = 3; +const ITERS = 20; +for (let i = 0; i < WARMUP; i++) gbObj.min(); +const t0 = performance.now(); +for (let i = 0; i < ITERS; i++) gbObj.min(); +const total = performance.now() - t0; +console.log(JSON.stringify({ function: "groupby_min", mean_ms: total / ITERS, iterations: ITERS, total_ms: total })); diff --git a/benchmarks/tsb/bench_groupby_multi_agg.ts b/benchmarks/tsb/bench_groupby_multi_agg.ts new file mode 100644 index 00000000..2a402949 --- /dev/null +++ b/benchmarks/tsb/bench_groupby_multi_agg.ts @@ -0,0 +1,34 @@ +/** + * Benchmark: GroupBy multiple aggregations on 100k-row DataFrame + */ +import { DataFrame } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; +const keys = Array.from({ length: ROWS }, (_, i) => `g${i % 100}`); +const vals = Array.from({ length: ROWS }, (_, i) => i * 0.1); +const df = DataFrame.fromColumns({ key: keys, value: vals }); + +for (let i = 0; i < WARMUP; i++) { + 
df.groupby("key").mean(); + df.groupby("key").std(); + df.groupby("key").min(); + df.groupby("key").max(); +} +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + df.groupby("key").mean(); + df.groupby("key").std(); + df.groupby("key").min(); + df.groupby("key").max(); +} +const total = performance.now() - start; +console.log( + JSON.stringify({ + function: "groupby_multi_agg", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_groupby_ngroups.ts b/benchmarks/tsb/bench_groupby_ngroups.ts new file mode 100644 index 00000000..9f5ef1a5 --- /dev/null +++ b/benchmarks/tsb/bench_groupby_ngroups.ts @@ -0,0 +1,30 @@ +/** + * Benchmark: DataFrameGroupBy.ngroups and .groupKeys property access. + */ +import { DataFrame } from "../../src/index.js"; + +const ROWS = 10_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const df = DataFrame.fromColumns({ + key: Array.from({ length: ROWS }, (_, i) => `g${i % 100}`), + val: Array.from({ length: ROWS }, (_, i) => i * 1.5), +}); +const gbk = df.groupby("key"); + +for (let i = 0; i < WARMUP; i++) { + gbk.ngroups; + gbk.groupKeys; +} + +const t0 = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + gbk.ngroups; + gbk.groupKeys; +} +const total = performance.now() - t0; + +console.log( + JSON.stringify({ function: "groupby_ngroups", mean_ms: total / ITERATIONS, iterations: ITERATIONS, total_ms: total }), +); diff --git a/benchmarks/tsb/bench_groupby_size.ts b/benchmarks/tsb/bench_groupby_size.ts new file mode 100644 index 00000000..0a7216d4 --- /dev/null +++ b/benchmarks/tsb/bench_groupby_size.ts @@ -0,0 +1,15 @@ +import { DataFrame, DataFrameGroupBy } from "tsb"; +const N = 100_000; +const keys = ["A", "B", "C", "D", "E"]; +const df = new DataFrame({ + key: Array.from({ length: N }, (_, i) => keys[i % keys.length]), + val: Array.from({ length: N }, (_, i) => i * 1.0), +}); +const gbObj = new DataFrameGroupBy(df, ["key"]); +const WARMUP 
= 3; +const ITERS = 20; +for (let i = 0; i < WARMUP; i++) gbObj.agg("size"); +const t0 = performance.now(); +for (let i = 0; i < ITERS; i++) gbObj.agg("size"); +const total = performance.now() - t0; +console.log(JSON.stringify({ function: "groupby_size", mean_ms: total / ITERS, iterations: ITERS, total_ms: total })); diff --git a/benchmarks/tsb/bench_groupby_std.ts b/benchmarks/tsb/bench_groupby_std.ts new file mode 100644 index 00000000..a6a08edb --- /dev/null +++ b/benchmarks/tsb/bench_groupby_std.ts @@ -0,0 +1,24 @@ +/** + * Benchmark: GroupBy std on 100k-row DataFrame + */ +import { DataFrame } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; +const keys = Array.from({ length: ROWS }, (_, i) => `g${i % 100}`); +const vals = Array.from({ length: ROWS }, (_, i) => i * 0.1); +const df = DataFrame.fromColumns({ key: keys, value: vals }); + +for (let i = 0; i < WARMUP; i++) df.groupby("key").std(); +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) df.groupby("key").std(); +const total = performance.now() - start; +console.log( + JSON.stringify({ + function: "groupby_std", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_groupby_std_df.ts b/benchmarks/tsb/bench_groupby_std_df.ts new file mode 100644 index 00000000..7530232e --- /dev/null +++ b/benchmarks/tsb/bench_groupby_std_df.ts @@ -0,0 +1,25 @@ +/** + * Benchmark: DataFrame.groupby(by).agg('std') on 100k-row DataFrame. 
+ */ +import { DataFrame } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const df = DataFrame.fromColumns({ + group: Array.from({ length: ROWS }, (_, i) => i % 50), + a: Array.from({ length: ROWS }, (_, i) => (i * 1.23) % 9999), + b: Array.from({ length: ROWS }, (_, i) => (i * 4.56) % 9999), +}); + +for (let i = 0; i < WARMUP; i++) df.groupby("group").agg("std"); + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + df.groupby("group").agg("std"); + times.push(performance.now() - t0); +} +const total = times.reduce((a, b) => a + b, 0); +console.log(JSON.stringify({ function: "groupby_std_df", mean_ms: total / ITERATIONS, iterations: ITERATIONS, total_ms: total })); diff --git a/benchmarks/tsb/bench_groupby_sum.ts b/benchmarks/tsb/bench_groupby_sum.ts new file mode 100644 index 00000000..9baa1be8 --- /dev/null +++ b/benchmarks/tsb/bench_groupby_sum.ts @@ -0,0 +1,16 @@ +import { DataFrame, DataFrameGroupBy } from "tsb"; +const N = 100_000; +const keys = ["A", "B", "C", "D", "E"]; +const df = new DataFrame({ + key: Array.from({ length: N }, (_, i) => keys[i % keys.length]), + val: Array.from({ length: N }, (_, i) => i * 1.0), + val2: Array.from({ length: N }, (_, i) => i % 200), +}); +const gbObj = new DataFrameGroupBy(df, ["key"]); +const WARMUP = 3; +const ITERS = 20; +for (let i = 0; i < WARMUP; i++) gbObj.sum(); +const t0 = performance.now(); +for (let i = 0; i < ITERS; i++) gbObj.sum(); +const total = performance.now() - t0; +console.log(JSON.stringify({ function: "groupby_sum", mean_ms: total / ITERS, iterations: ITERS, total_ms: total })); diff --git a/benchmarks/tsb/bench_groupby_transform.ts b/benchmarks/tsb/bench_groupby_transform.ts new file mode 100644 index 00000000..62b6b728 --- /dev/null +++ b/benchmarks/tsb/bench_groupby_transform.ts @@ -0,0 +1,24 @@ +/** + * Benchmark: GroupBy transform on 100k-row DataFrame + */ +import { DataFrame } from 
"../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; +const keys = Array.from({ length: ROWS }, (_, i) => `g${i % 100}`); +const vals = Array.from({ length: ROWS }, (_, i) => i * 0.1); +const df = DataFrame.fromColumns({ key: keys, value: vals }); + +for (let i = 0; i < WARMUP; i++) df.groupby("key").transform((v) => v.map((x) => (x as number))); +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) df.groupby("key").transform((v) => v.map((x) => (x as number))); +const total = performance.now() - start; +console.log( + JSON.stringify({ + function: "groupby_transform", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_groupby_var.ts b/benchmarks/tsb/bench_groupby_var.ts new file mode 100644 index 00000000..c43b42c9 --- /dev/null +++ b/benchmarks/tsb/bench_groupby_var.ts @@ -0,0 +1,24 @@ +/** + * Benchmark: GroupBy var on 100k-row DataFrame + */ +import { DataFrame } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; +const keys = Array.from({ length: ROWS }, (_, i) => `g${i % 100}`); +const vals = Array.from({ length: ROWS }, (_, i) => i * 0.1); +const df = DataFrame.fromColumns({ key: keys, value: vals }); + +for (let i = 0; i < WARMUP; i++) df.groupby("key").var(); +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) df.groupby("key").var(); +const total = performance.now() - start; +console.log( + JSON.stringify({ + function: "groupby_var", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_histogram.ts b/benchmarks/tsb/bench_histogram.ts new file mode 100644 index 00000000..d83f31ca --- /dev/null +++ b/benchmarks/tsb/bench_histogram.ts @@ -0,0 +1,22 @@ +/** + * Benchmark: histogram on 100k-element array + */ +import { histogram } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const 
ITERATIONS = 10; +const data = Array.from({ length: ROWS }, (_, i) => (i % 1000) * 0.1); + +for (let i = 0; i < WARMUP; i++) histogram(data, { bins: 50 }); +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) histogram(data, { bins: 50 }); +const total = performance.now() - start; +console.log( + JSON.stringify({ + function: "histogram", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_idxmin_idxmax.ts b/benchmarks/tsb/bench_idxmin_idxmax.ts new file mode 100644 index 00000000..40464f65 --- /dev/null +++ b/benchmarks/tsb/bench_idxmin_idxmax.ts @@ -0,0 +1,26 @@ +/** + * Benchmark: idxminSeries / idxmaxSeries — index of min/max on a 100k-element Series. + * Outputs JSON: {"function": "idxmin_idxmax", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series, idxminSeries, idxmaxSeries } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const data = Float64Array.from({ length: SIZE }, (_, i) => Math.sin(i * 0.01) * 1000); +const s = new Series(data); + +for (let i = 0; i < WARMUP; i++) { + idxminSeries(s); + idxmaxSeries(s); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + idxminSeries(s); + idxmaxSeries(s); +} +const total = performance.now() - start; + +console.log(JSON.stringify({ function: "idxmin_idxmax", mean_ms: total / ITERATIONS, iterations: ITERATIONS, total_ms: total })); diff --git a/benchmarks/tsb/bench_index_append.ts b/benchmarks/tsb/bench_index_append.ts new file mode 100644 index 00000000..1e4d1aa2 --- /dev/null +++ b/benchmarks/tsb/bench_index_append.ts @@ -0,0 +1,32 @@ +/** + * Benchmark: index_append — Index.append concatenating two indices + */ +import { Index } from "../../src/index.js"; + +const ROWS = 50_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const data1 = Array.from({ length: ROWS }, (_, i) => `key_${i}`); +const data2 = Array.from({ length: ROWS }, 
(_, i) => `key_${ROWS + i}`); +const idx1 = new Index(data1); +const idx2 = new Index(data2); + +for (let i = 0; i < WARMUP; i++) { + idx1.append(idx2); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + idx1.append(idx2); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "index_append", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_index_arg_sort.ts b/benchmarks/tsb/bench_index_arg_sort.ts new file mode 100644 index 00000000..2e1fd3f8 --- /dev/null +++ b/benchmarks/tsb/bench_index_arg_sort.ts @@ -0,0 +1,30 @@ +/** + * Benchmark: index_arg_sort — Index.argsort on 100k-element Index + */ +import { Index } from "../../src/index.js"; + +const SIZE = 100_000; +const WARMUP = 3; +const ITERATIONS = 20; + +const labels = Array.from({ length: SIZE }, (_, i) => SIZE - i); +const idx = new Index(labels); + +for (let i = 0; i < WARMUP; i++) { + idx.argsort(); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + idx.argsort(); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "index_arg_sort", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_index_argmin_argmax.ts b/benchmarks/tsb/bench_index_argmin_argmax.ts new file mode 100644 index 00000000..58ed248d --- /dev/null +++ b/benchmarks/tsb/bench_index_argmin_argmax.ts @@ -0,0 +1,32 @@ +/** + * Benchmark: index_argmin_argmax — Index.argmin and Index.argmax on 100k-element Index + */ +import { Index } from "../../src/index.js"; + +const SIZE = 100_000; +const WARMUP = 3; +const ITERATIONS = 20; + +const labels = Array.from({ length: SIZE }, (_, i) => i); +const idx = new Index(labels); + +for (let i = 0; i < WARMUP; i++) { + idx.argmin(); + idx.argmax(); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + idx.argmin(); 
+ idx.argmax(); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "index_argmin_argmax", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_index_contains.ts b/benchmarks/tsb/bench_index_contains.ts new file mode 100644 index 00000000..d1fde91f --- /dev/null +++ b/benchmarks/tsb/bench_index_contains.ts @@ -0,0 +1,33 @@ +/** + * Benchmark: Index.contains and isin on 100k-element Index + */ +import { Index } from "../../src/index.js"; + +const SIZE = 100_000; +const WARMUP = 3; +const ITERATIONS = 20; + +const labels = Array.from({ length: SIZE }, (_, i) => i); +const idx = new Index(labels); +const lookups = Array.from({ length: 1000 }, (_, i) => i * 100); + +for (let i = 0; i < WARMUP; i++) { + for (const lbl of lookups.slice(0, 10)) idx.contains(lbl); + idx.isin(lookups); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + for (const lbl of lookups.slice(0, 10)) idx.contains(lbl); + idx.isin(lookups); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "index_contains", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_index_copy_toarray.ts b/benchmarks/tsb/bench_index_copy_toarray.ts new file mode 100644 index 00000000..103abf3c --- /dev/null +++ b/benchmarks/tsb/bench_index_copy_toarray.ts @@ -0,0 +1,32 @@ +/** + * Benchmark: Index copy and toArray on 100k-element Index + */ +import { Index } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const values = Array.from({ length: ROWS }, (_, i) => i); +const idx = new Index(values); + +for (let i = 0; i < WARMUP; i++) { + idx.copy(); + idx.toArray(); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + idx.copy(); + idx.toArray(); +} +const total = performance.now() - start; + +console.log( + 
JSON.stringify({ + function: "index_copy_toarray", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_index_delete_drop.ts b/benchmarks/tsb/bench_index_delete_drop.ts new file mode 100644 index 00000000..cfea9198 --- /dev/null +++ b/benchmarks/tsb/bench_index_delete_drop.ts @@ -0,0 +1,32 @@ +/** + * Benchmark: index_delete_drop — Index.delete and Index.drop on 100k-element Index + */ +import { Index } from "../../src/index.js"; + +const SIZE = 100_000; +const WARMUP = 3; +const ITERATIONS = 20; + +const labels = Array.from({ length: SIZE }, (_, i) => i); +const idx = new Index(labels); + +for (let i = 0; i < WARMUP; i++) { + idx.delete(500); + idx.drop([100, 200, 300, 400, 500]); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + idx.delete(500); + idx.drop([100, 200, 300, 400, 500]); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "index_delete_drop", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_index_drop_duplicates.ts b/benchmarks/tsb/bench_index_drop_duplicates.ts new file mode 100644 index 00000000..c309b908 --- /dev/null +++ b/benchmarks/tsb/bench_index_drop_duplicates.ts @@ -0,0 +1,30 @@ +/** + * Benchmark: index_drop_duplicates — Index.dropDuplicates on 100k Index with 50% dupes + */ +import { Index } from "../../src/index.js"; + +const SIZE = 100_000; +const WARMUP = 3; +const ITERATIONS = 20; + +const labels = Array.from({ length: SIZE }, (_, i) => i % (SIZE / 2)); +const idx = new Index(labels); + +for (let i = 0; i < WARMUP; i++) { + idx.dropDuplicates(); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + idx.dropDuplicates(); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "index_drop_duplicates", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: 
total, + }), +); diff --git a/benchmarks/tsb/bench_index_duplicated.ts b/benchmarks/tsb/bench_index_duplicated.ts new file mode 100644 index 00000000..08f9c24b --- /dev/null +++ b/benchmarks/tsb/bench_index_duplicated.ts @@ -0,0 +1,30 @@ +/** + * Benchmark: index_duplicated — Index.duplicated() on 100k-element Index with duplicates + */ +import { Index } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +// Create index with ~10% duplicates +const idx = new Index(Array.from({ length: ROWS }, (_, i) => i % 90_000)); + +for (let i = 0; i < WARMUP; i++) { + idx.duplicated("first"); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + idx.duplicated("first"); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "index_duplicated", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_index_equals_identical.ts b/benchmarks/tsb/bench_index_equals_identical.ts new file mode 100644 index 00000000..33c5090a --- /dev/null +++ b/benchmarks/tsb/bench_index_equals_identical.ts @@ -0,0 +1,33 @@ +/** + * Benchmark: index_equals_identical — Index.equals and Index.identical on 100k-element Index + */ +import { Index } from "../../src/index.js"; + +const SIZE = 100_000; +const WARMUP = 3; +const ITERATIONS = 20; + +const labels = Array.from({ length: SIZE }, (_, i) => i); +const idx = new Index(labels); +const idx2 = new Index(labels.slice()); + +for (let i = 0; i < WARMUP; i++) { + idx.equals(idx2); + idx.identical(idx2); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + idx.equals(idx2); + idx.identical(idx2); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "index_equals_identical", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_index_fillna.ts 
b/benchmarks/tsb/bench_index_fillna.ts new file mode 100644 index 00000000..e840954c --- /dev/null +++ b/benchmarks/tsb/bench_index_fillna.ts @@ -0,0 +1,30 @@ +/** + * Benchmark: index_fillna — Index.fillna replacing null values on 100k-element index + */ +import { Index } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const data = Array.from({ length: ROWS }, (_, i) => (i % 10 === 0 ? null : `key_${i}`)); +const idx = new Index(data); + +for (let i = 0; i < WARMUP; i++) { + idx.fillna("missing"); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + idx.fillna("missing"); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "index_fillna", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_index_getindexer.ts b/benchmarks/tsb/bench_index_getindexer.ts new file mode 100644 index 00000000..ec3d1bd9 --- /dev/null +++ b/benchmarks/tsb/bench_index_getindexer.ts @@ -0,0 +1,30 @@ +/** + * Benchmark: index_getindexer — Index.getIndexer(target) on 10k-element Index + */ +import { Series } from "../../src/index.js"; + +const ROWS = 10_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const base = new Series(Float64Array.from({ length: ROWS }, (_, i) => i)); +const target = new Series(Float64Array.from({ length: 1000 }, (_, i) => i * 10)); + +for (let i = 0; i < WARMUP; i++) { + base.index.getIndexer(target.index); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + base.index.getIndexer(target.index); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "index_getindexer", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_index_getloc.ts b/benchmarks/tsb/bench_index_getloc.ts new file mode 100644 index 00000000..3891e112 --- /dev/null +++ 
b/benchmarks/tsb/bench_index_getloc.ts @@ -0,0 +1,26 @@ +/** + * Benchmark: Index.getLoc — locate positions of a label in an index. + */ +import { Index } from "../../src/index.js"; + +const SIZE = 10_000; +const WARMUP = 5; +const ITERATIONS = 30; + +// Index with unique labels +const labels = Array.from({ length: SIZE }, (_, i) => i); +const idx = new Index(labels); + +for (let i = 0; i < WARMUP; i++) { + idx.getLoc(5000); +} + +const t0 = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + idx.getLoc(i % SIZE); +} +const total = performance.now() - t0; + +console.log( + JSON.stringify({ function: "index_getloc", mean_ms: total / ITERATIONS, iterations: ITERATIONS, total_ms: total }), +); diff --git a/benchmarks/tsb/bench_index_insert.ts b/benchmarks/tsb/bench_index_insert.ts new file mode 100644 index 00000000..7ad331b3 --- /dev/null +++ b/benchmarks/tsb/bench_index_insert.ts @@ -0,0 +1,32 @@ +/** + * Benchmark: index_insert — Index.insert on 100k-element Index + */ +import { Index } from "../../src/index.js"; + +const SIZE = 100_000; +const WARMUP = 3; +const ITERATIONS = 20; + +const labels = Array.from({ length: SIZE }, (_, i) => i); +const idx = new Index(labels); + +for (let i = 0; i < WARMUP; i++) { + idx.insert(500, 999_999); + idx.insert(0, -1); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + idx.insert(500, 999_999); + idx.insert(0, -1); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "index_insert", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_index_isin.ts b/benchmarks/tsb/bench_index_isin.ts new file mode 100644 index 00000000..def0813a --- /dev/null +++ b/benchmarks/tsb/bench_index_isin.ts @@ -0,0 +1,30 @@ +/** + * Benchmark: index_isin — Index.isin() membership check on 100k-element Index + */ +import { Index } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const 
ITERATIONS = 10; + +const idx = new Index(Array.from({ length: ROWS }, (_, i) => i)); +const lookup = Array.from({ length: 1_000 }, (_, i) => i * 100); + +for (let i = 0; i < WARMUP; i++) { + idx.isin(lookup); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + idx.isin(lookup); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "index_isin", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_index_isna_dropna.ts b/benchmarks/tsb/bench_index_isna_dropna.ts new file mode 100644 index 00000000..a92cdfe1 --- /dev/null +++ b/benchmarks/tsb/bench_index_isna_dropna.ts @@ -0,0 +1,32 @@ +/** + * Benchmark: index_isna_dropna — Index.isna and Index.dropna on 100k-element Index with nulls + */ +import { Index } from "../../src/index.js"; + +const SIZE = 100_000; +const WARMUP = 3; +const ITERATIONS = 20; + +const labels = Array.from({ length: SIZE }, (_, i) => (i % 5 === 0 ? 
null : i)); +const idx = new Index(labels); + +for (let i = 0; i < WARMUP; i++) { + idx.isna(); + idx.dropna(); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + idx.isna(); + idx.dropna(); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "index_isna_dropna", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_index_min_max.ts b/benchmarks/tsb/bench_index_min_max.ts new file mode 100644 index 00000000..ce923399 --- /dev/null +++ b/benchmarks/tsb/bench_index_min_max.ts @@ -0,0 +1,32 @@ +/** + * Benchmark: index_min_max — Index.min and Index.max on 100k-element Index + */ +import { Index } from "../../src/index.js"; + +const SIZE = 100_000; +const WARMUP = 3; +const ITERATIONS = 20; + +const labels = Array.from({ length: SIZE }, (_, i) => i); +const idx = new Index(labels); + +for (let i = 0; i < WARMUP; i++) { + idx.min(); + idx.max(); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + idx.min(); + idx.max(); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "index_min_max", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_index_monotonic.ts b/benchmarks/tsb/bench_index_monotonic.ts new file mode 100644 index 00000000..027680f9 --- /dev/null +++ b/benchmarks/tsb/bench_index_monotonic.ts @@ -0,0 +1,33 @@ +/** + * Benchmark: Index.isMonotonicIncreasing, isMonotonicDecreasing, isUnique on 100k-element Index + */ +import { Index } from "../../src/index.js"; + +const N = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; +const incData = Array.from({ length: N }, (_, i) => i); +const decData = Array.from({ length: N }, (_, i) => N - i); +const idxInc = new Index(incData); +const idxDec = new Index(decData); + +for (let i = 0; i < WARMUP; i++) { + idxInc.isMonotonicIncreasing; + 
idxDec.isMonotonicDecreasing; + idxInc.isUnique; +} +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + idxInc.isMonotonicIncreasing; + idxDec.isMonotonicDecreasing; + idxInc.isUnique; +} +const total = performance.now() - start; +console.log( + JSON.stringify({ + function: "index_monotonic", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_index_nunique.ts b/benchmarks/tsb/bench_index_nunique.ts new file mode 100644 index 00000000..c9aa725e --- /dev/null +++ b/benchmarks/tsb/bench_index_nunique.ts @@ -0,0 +1,30 @@ +/** + * Benchmark: index_nunique — Index.nunique on 100k-element Index with 50% unique values + */ +import { Index } from "../../src/index.js"; + +const SIZE = 100_000; +const WARMUP = 3; +const ITERATIONS = 20; + +const labels = Array.from({ length: SIZE }, (_, i) => i % (SIZE / 2)); +const idx = new Index(labels); + +for (let i = 0; i < WARMUP; i++) { + idx.nunique(); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + idx.nunique(); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "index_nunique", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_index_ops.ts b/benchmarks/tsb/bench_index_ops.ts new file mode 100644 index 00000000..5e71f783 --- /dev/null +++ b/benchmarks/tsb/bench_index_ops.ts @@ -0,0 +1,36 @@ +/** + * Benchmark: Index set operations (union, intersection, difference) on 50k-element Index + */ +import { Index } from "../../src/index.js"; + +const SIZE = 50_000; +const WARMUP = 3; +const ITERATIONS = 20; + +const labelsA = Array.from({ length: SIZE }, (_, i) => i); +const labelsB = Array.from({ length: SIZE }, (_, i) => i + SIZE / 2); +const idxA = new Index(labelsA); +const idxB = new Index(labelsB); + +for (let i = 0; i < WARMUP; i++) { + idxA.union(idxB); + idxA.intersection(idxB); + 
idxA.difference(idxB); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + idxA.union(idxB); + idxA.intersection(idxB); + idxA.difference(idxB); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "index_ops", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_index_rename.ts b/benchmarks/tsb/bench_index_rename.ts new file mode 100644 index 00000000..2fa1dc2b --- /dev/null +++ b/benchmarks/tsb/bench_index_rename.ts @@ -0,0 +1,30 @@ +/** + * Benchmark: index_rename — Index.rename changing the index name + */ +import { Index } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const data = Array.from({ length: ROWS }, (_, i) => `key_${i}`); +const idx = new Index(data, "original_name"); + +for (let i = 0; i < WARMUP; i++) { + idx.rename("new_name"); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + idx.rename("new_name"); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "index_rename", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_index_slice_take.ts b/benchmarks/tsb/bench_index_slice_take.ts new file mode 100644 index 00000000..e8cd9e24 --- /dev/null +++ b/benchmarks/tsb/bench_index_slice_take.ts @@ -0,0 +1,33 @@ +/** + * Benchmark: index_slice_take — Index.slice and Index.take on 100k-element Index + */ +import { Index } from "../../src/index.js"; + +const SIZE = 100_000; +const WARMUP = 3; +const ITERATIONS = 20; + +const labels = Array.from({ length: SIZE }, (_, i) => i); +const idx = new Index(labels); +const positions = Array.from({ length: 1000 }, (_, i) => i * 100); + +for (let i = 0; i < WARMUP; i++) { + idx.slice(0, 50_000); + idx.take(positions); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + 
idx.slice(0, 50_000); + idx.take(positions); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "index_slice_take", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_index_sort.ts b/benchmarks/tsb/bench_index_sort.ts new file mode 100644 index 00000000..22b4538e --- /dev/null +++ b/benchmarks/tsb/bench_index_sort.ts @@ -0,0 +1,30 @@ +/** + * Benchmark: Index.sortValues on 100k-element Index + */ +import { Index } from "../../src/index.js"; + +const SIZE = 100_000; +const WARMUP = 3; +const ITERATIONS = 20; + +const labels = Array.from({ length: SIZE }, (_, i) => SIZE - i); +const idx = new Index(labels); + +for (let i = 0; i < WARMUP; i++) { + idx.sortValues(); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + idx.sortValues(); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "index_sort", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_index_symmetric_diff.ts b/benchmarks/tsb/bench_index_symmetric_diff.ts new file mode 100644 index 00000000..21ef6f3a --- /dev/null +++ b/benchmarks/tsb/bench_index_symmetric_diff.ts @@ -0,0 +1,29 @@ +/** + * Benchmark: Index.symmetricDifference on 10k-element integer indexes + */ +import { Index } from "../../src/index.js"; + +const N = 10_000; +const a = new Index(Array.from({ length: N }, (_, i) => i)); +const b = new Index(Array.from({ length: N }, (_, i) => i + N / 2)); + +const WARMUP = 3; +const ITERATIONS = 50; + +for (let i = 0; i < WARMUP; i++) { + a.symmetricDifference(b); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + a.symmetricDifference(b); +} +const total = performance.now() - start; +console.log( + JSON.stringify({ + function: "index_symmetric_diff", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), 
+); diff --git a/benchmarks/tsb/bench_interpolate.ts b/benchmarks/tsb/bench_interpolate.ts new file mode 100644 index 00000000..cc1dc495 --- /dev/null +++ b/benchmarks/tsb/bench_interpolate.ts @@ -0,0 +1,34 @@ +/** + * Benchmark: Series.interpolate() — linear interpolation over NaN values. + * Outputs JSON: {"function": "interpolate", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const data = Array.from({ length: SIZE }, (_, i) => (i % 5 === 0 ? Number.NaN : i * 1.0)); +const s = new Series({ data }); + +for (let i = 0; i < WARMUP; i++) { + s.interpolate({ method: "linear" }); +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const start = performance.now(); + s.interpolate({ method: "linear" }); + times.push(performance.now() - start); +} + +const totalMs = times.reduce((a, b) => a + b, 0); +const meanMs = totalMs / ITERATIONS; +console.log( + JSON.stringify({ + function: "interpolate", + mean_ms: Math.round(meanMs * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(totalMs * 1000) / 1000, + }), +); diff --git a/benchmarks/tsb/bench_interval.ts b/benchmarks/tsb/bench_interval.ts new file mode 100644 index 00000000..2a04c369 --- /dev/null +++ b/benchmarks/tsb/bench_interval.ts @@ -0,0 +1,44 @@ +/** + * Benchmark: Interval / IntervalIndex — closed/open intervals. 
+ * Outputs JSON: {"function": "interval", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Interval, IntervalIndex } from "../../src/index.ts"; + +const SIZE = 10_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const intervals = Array.from({ length: SIZE }, (_, i) => new Interval(i, i + 1)); +const breaks = Array.from({ length: 1_001 }, (_, i) => i); + +for (let i = 0; i < WARMUP; i++) { + for (const iv of intervals.slice(0, 100)) { + void iv.contains(iv.mid); + void iv.length; + void iv.toString(); + } + IntervalIndex.fromBreaks(breaks); +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const start = performance.now(); + for (const iv of intervals) { + void iv.contains(iv.mid); + void iv.length; + void iv.toString(); + } + IntervalIndex.fromBreaks(breaks); + times.push(performance.now() - start); +} + +const totalMs = times.reduce((a, b) => a + b, 0); +const meanMs = totalMs / ITERATIONS; +console.log( + JSON.stringify({ + function: "interval", + mean_ms: Math.round(meanMs * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(totalMs * 1000) / 1000, + }), +); diff --git a/benchmarks/tsb/bench_isin.ts b/benchmarks/tsb/bench_isin.ts new file mode 100644 index 00000000..2f3741b1 --- /dev/null +++ b/benchmarks/tsb/bench_isin.ts @@ -0,0 +1,34 @@ +/** + * Benchmark: Series.isin() — membership test. 
+ * Outputs JSON: {"function": "isin", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const s = new Series({ data: Array.from({ length: SIZE }, (_, i) => i % 5000) }); +const testSet = Array.from({ length: 2500 }, (_, i) => i); + +for (let i = 0; i < WARMUP; i++) { + s.isin(testSet); +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const start = performance.now(); + s.isin(testSet); + times.push(performance.now() - start); +} + +const totalMs = times.reduce((a, b) => a + b, 0); +const meanMs = totalMs / ITERATIONS; +console.log( + JSON.stringify({ + function: "isin", + mean_ms: Math.round(meanMs * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(totalMs * 1000) / 1000, + }), +); diff --git a/benchmarks/tsb/bench_json_normalize.ts b/benchmarks/tsb/bench_json_normalize.ts new file mode 100644 index 00000000..cd733c05 --- /dev/null +++ b/benchmarks/tsb/bench_json_normalize.ts @@ -0,0 +1,38 @@ +/** + * Benchmark: jsonNormalize — flatten nested JSON to a flat DataFrame. 
+ * Outputs JSON: {"function": "json_normalize", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { jsonNormalize } from "../../src/index.ts"; + +const SIZE = 1_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const records = Array.from({ length: SIZE }, (_, i) => ({ + id: i, + name: `user_${i}`, + address: { city: `city_${i % 10}`, zip: `${10000 + i}` }, + scores: [i, i + 1, i + 2], +})); + +for (let i = 0; i < WARMUP; i++) { + jsonNormalize(records, { maxLevel: 2 }); +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const start = performance.now(); + jsonNormalize(records, { maxLevel: 2 }); + times.push(performance.now() - start); +} + +const totalMs = times.reduce((a, b) => a + b, 0); +const meanMs = totalMs / ITERATIONS; +console.log( + JSON.stringify({ + function: "json_normalize", + mean_ms: Math.round(meanMs * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(totalMs * 1000) / 1000, + }), +); diff --git a/benchmarks/tsb/bench_make_formatter.ts b/benchmarks/tsb/bench_make_formatter.ts new file mode 100644 index 00000000..59849eac --- /dev/null +++ b/benchmarks/tsb/bench_make_formatter.ts @@ -0,0 +1,16 @@ +import { makeFloatFormatter, makePercentFormatter, makeCurrencyFormatter } from "tsb"; +const WARMUP = 3; +const ITERS = 10_000; +for (let i = 0; i < WARMUP; i++) { + makeFloatFormatter(2); + makePercentFormatter(1); + makeCurrencyFormatter("$", 2); +} +const t0 = performance.now(); +for (let i = 0; i < ITERS; i++) { + makeFloatFormatter(2); + makePercentFormatter(1); + makeCurrencyFormatter("$", 2); +} +const total = performance.now() - t0; +console.log(JSON.stringify({ function: "make_formatter", mean_ms: total / ITERS, iterations: ITERS, total_ms: total })); diff --git a/benchmarks/tsb/bench_mask.ts b/benchmarks/tsb/bench_mask.ts new file mode 100644 index 00000000..646748ac --- /dev/null +++ b/benchmarks/tsb/bench_mask.ts @@ -0,0 +1,13 @@ +import { Series } from "tsb"; + +const rng = (seed: number) 
=> { let s = seed; return () => { s = (s * 1664525 + 1013904223) & 0xffffffff; return ((s >>> 0) / 0xffffffff) * 2 - 1; }; }; +const rand = rng(42); +const data = Array.from({ length: 100_000 }, () => rand() * 3); +const s = new Series(data); +const cond = s.map((v: number) => v < 0); +for (let i = 0; i < 3; i++) s.mask(cond, 0.0); +const N = 100; +const t0 = performance.now(); +for (let i = 0; i < N; i++) s.mask(cond, 0.0); +const elapsed = performance.now() - t0; +console.log(JSON.stringify({ function: "mask", mean_ms: elapsed / N, iterations: N, total_ms: elapsed })); diff --git a/benchmarks/tsb/bench_melt.ts b/benchmarks/tsb/bench_melt.ts new file mode 100644 index 00000000..f30243ac --- /dev/null +++ b/benchmarks/tsb/bench_melt.ts @@ -0,0 +1,32 @@ +/** + * Benchmark: melt (wide to long) on 10k-row DataFrame + */ +import { DataFrame, melt } from "../../src/index.js"; + +const ROWS = 10_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const a = Float64Array.from({ length: ROWS }, (_, i) => i * 0.1); +const b = Float64Array.from({ length: ROWS }, (_, i) => i * 0.2); +const c = Float64Array.from({ length: ROWS }, (_, i) => i * 0.3); +const df = new DataFrame({ A: a, B: b, C: c }); + +for (let i = 0; i < WARMUP; i++) { + melt(df, { value_vars: ["A", "B", "C"] }); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + melt(df, { value_vars: ["A", "B", "C"] }); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "melt", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_merge_inner.ts b/benchmarks/tsb/bench_merge_inner.ts new file mode 100644 index 00000000..392e6443 --- /dev/null +++ b/benchmarks/tsb/bench_merge_inner.ts @@ -0,0 +1,28 @@ +/** + * Benchmark: merge(left, right, { how: "inner" }) on 50k-row DataFrames. 
+ */ +import { DataFrame, merge } from "../../src/index.js"; + +const ROWS = 50_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const left = DataFrame.fromColumns({ + id: Array.from({ length: ROWS }, (_, i) => i), + val: Array.from({ length: ROWS }, (_, i) => i * 1.5), +}); +const right = DataFrame.fromColumns({ + id: Array.from({ length: ROWS }, (_, i) => i + 10000), + extra: Array.from({ length: ROWS }, (_, i) => i * 2.0), +}); + +for (let i = 0; i < WARMUP; i++) merge(left, right, { on: "id", how: "inner" }); + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + merge(left, right, { on: "id", how: "inner" }); + times.push(performance.now() - t0); +} +const total = times.reduce((a, b) => a + b, 0); +console.log(JSON.stringify({ function: "merge_inner", mean_ms: total / ITERATIONS, iterations: ITERATIONS, total_ms: total })); diff --git a/benchmarks/tsb/bench_merge_left.ts b/benchmarks/tsb/bench_merge_left.ts new file mode 100644 index 00000000..7d180a07 --- /dev/null +++ b/benchmarks/tsb/bench_merge_left.ts @@ -0,0 +1,28 @@ +/** + * Benchmark: merge(left, right, { how: "left" }) on 50k-row DataFrames. 
+ */ +import { DataFrame, merge } from "../../src/index.js"; + +const ROWS = 50_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const left = DataFrame.fromColumns({ + id: Array.from({ length: ROWS }, (_, i) => i), + val: Array.from({ length: ROWS }, (_, i) => i * 1.5), +}); +const right = DataFrame.fromColumns({ + id: Array.from({ length: ROWS }, (_, i) => i % (ROWS / 2)), + extra: Array.from({ length: ROWS }, (_, i) => i * 2.0), +}); + +for (let i = 0; i < WARMUP; i++) merge(left, right, { on: "id", how: "left" }); + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + merge(left, right, { on: "id", how: "left" }); + times.push(performance.now() - t0); +} +const total = times.reduce((a, b) => a + b, 0); +console.log(JSON.stringify({ function: "merge_left", mean_ms: total / ITERATIONS, iterations: ITERATIONS, total_ms: total })); diff --git a/benchmarks/tsb/bench_merge_left_on_right_on.ts b/benchmarks/tsb/bench_merge_left_on_right_on.ts new file mode 100644 index 00000000..b62e69d3 --- /dev/null +++ b/benchmarks/tsb/bench_merge_left_on_right_on.ts @@ -0,0 +1,31 @@ +/** + * Benchmark: merge with left_on/right_on — join on differently-named columns. 
+ */ +import { DataFrame, merge } from "../../src/index.js"; + +const ROWS = 20_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const left = DataFrame.fromColumns({ + emp_id: Array.from({ length: ROWS }, (_, i) => i), + salary: Array.from({ length: ROWS }, (_, i) => 30000 + i * 10), +}); +const right = DataFrame.fromColumns({ + id: Array.from({ length: ROWS / 2 }, (_, i) => i), + dept: Array.from({ length: ROWS / 2 }, (_, i) => `dept${i % 10}`), +}); + +for (let i = 0; i < WARMUP; i++) { + merge(left, right, { left_on: "emp_id", right_on: "id" }); +} + +const t0 = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + merge(left, right, { left_on: "emp_id", right_on: "id" }); +} +const total = performance.now() - t0; + +console.log( + JSON.stringify({ function: "merge_left_on_right_on", mean_ms: total / ITERATIONS, iterations: ITERATIONS, total_ms: total }), +); diff --git a/benchmarks/tsb/bench_merge_outer.ts b/benchmarks/tsb/bench_merge_outer.ts new file mode 100644 index 00000000..3ac80768 --- /dev/null +++ b/benchmarks/tsb/bench_merge_outer.ts @@ -0,0 +1,28 @@ +/** + * Benchmark: merge(left, right, { how: "outer" }) on 30k-row DataFrames. 
+ */ +import { DataFrame, merge } from "../../src/index.js"; + +const ROWS = 30_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const left = DataFrame.fromColumns({ + id: Array.from({ length: ROWS }, (_, i) => i), + val: Array.from({ length: ROWS }, (_, i) => i * 1.5), +}); +const right = DataFrame.fromColumns({ + id: Array.from({ length: ROWS }, (_, i) => i + ROWS / 2), + extra: Array.from({ length: ROWS }, (_, i) => i * 2.0), +}); + +for (let i = 0; i < WARMUP; i++) merge(left, right, { on: "id", how: "outer" }); + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + merge(left, right, { on: "id", how: "outer" }); + times.push(performance.now() - t0); +} +const total = times.reduce((a, b) => a + b, 0); +console.log(JSON.stringify({ function: "merge_outer", mean_ms: total / ITERATIONS, iterations: ITERATIONS, total_ms: total })); diff --git a/benchmarks/tsb/bench_merge_right.ts b/benchmarks/tsb/bench_merge_right.ts new file mode 100644 index 00000000..248cd99c --- /dev/null +++ b/benchmarks/tsb/bench_merge_right.ts @@ -0,0 +1,28 @@ +/** + * Benchmark: merge(left, right, { how: "right" }) on 50k-row DataFrames. 
+ */ +import { DataFrame, merge } from "../../src/index.js"; + +const ROWS = 50_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const left = DataFrame.fromColumns({ + id: Array.from({ length: ROWS }, (_, i) => i % (ROWS / 2)), + val: Array.from({ length: ROWS }, (_, i) => i * 1.5), +}); +const right = DataFrame.fromColumns({ + id: Array.from({ length: ROWS }, (_, i) => i), + extra: Array.from({ length: ROWS }, (_, i) => i * 2.0), +}); + +for (let i = 0; i < WARMUP; i++) merge(left, right, { on: "id", how: "right" }); + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + merge(left, right, { on: "id", how: "right" }); + times.push(performance.now() - t0); +} +const total = times.reduce((a, b) => a + b, 0); +console.log(JSON.stringify({ function: "merge_right", mean_ms: total / ITERATIONS, iterations: ITERATIONS, total_ms: total })); diff --git a/benchmarks/tsb/bench_min_max_normalize.ts b/benchmarks/tsb/bench_min_max_normalize.ts new file mode 100644 index 00000000..35267b26 --- /dev/null +++ b/benchmarks/tsb/bench_min_max_normalize.ts @@ -0,0 +1,30 @@ +/** + * Benchmark: min-max normalization on 100k-element Series + */ +import { Series, minMaxNormalize } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const data = Float64Array.from({ length: ROWS }, (_, i) => Math.sin(i * 0.01) * 100 + 50); +const s = new Series(data); + +for (let i = 0; i < WARMUP; i++) { + minMaxNormalize(s); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + minMaxNormalize(s); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "min_max_normalize", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_mode_series.ts b/benchmarks/tsb/bench_mode_series.ts new file mode 100644 index 00000000..5ecb4ba5 --- /dev/null +++ b/benchmarks/tsb/bench_mode_series.ts @@ 
-0,0 +1,25 @@ +/** + * Benchmark: modeSeries — mode of a 10k-element integer Series. + * Outputs JSON: {"function": "mode_series", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series, modeSeries } from "../../src/index.ts"; + +const SIZE = 10_000; +const WARMUP = 5; +const ITERATIONS = 50; + +// 10k integers with bounded range to create repeated values +const data = Array.from({ length: SIZE }, (_, i) => i % 200); +const s = new Series(data); + +for (let i = 0; i < WARMUP; i++) { + modeSeries(s); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + modeSeries(s); +} +const total = performance.now() - start; + +console.log(JSON.stringify({ function: "mode_series", mean_ms: total / ITERATIONS, iterations: ITERATIONS, total_ms: total })); diff --git a/benchmarks/tsb/bench_move_column.ts b/benchmarks/tsb/bench_move_column.ts new file mode 100644 index 00000000..ad925070 --- /dev/null +++ b/benchmarks/tsb/bench_move_column.ts @@ -0,0 +1,25 @@ +/** + * Benchmark: moveColumn on a 100k-row DataFrame + */ +import { DataFrame, moveColumn } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; +const a = Array.from({ length: ROWS }, (_, i) => i); +const b = Array.from({ length: ROWS }, (_, i) => i * 2); +const c = Array.from({ length: ROWS }, (_, i) => i * 3); +const df = DataFrame.fromColumns({ a, b, c }); + +for (let i = 0; i < WARMUP; i++) moveColumn(df, "c", 0); +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) moveColumn(df, "c", 0); +const total = performance.now() - start; +console.log( + JSON.stringify({ + function: "move_column", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_multi_index.ts b/benchmarks/tsb/bench_multi_index.ts new file mode 100644 index 00000000..64294b7d --- /dev/null +++ b/benchmarks/tsb/bench_multi_index.ts @@ -0,0 +1,24 @@ +/** + * Benchmark: MultiIndex 
construction on 100k pairs + */ +import { MultiIndex } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; +const a = Array.from({ length: ROWS }, (_, i) => `a${i % 100}`); +const b = Array.from({ length: ROWS }, (_, i) => i % 1000); +const tuples: [string, number][] = a.map((v, i) => [v, b[i]]); + +for (let i = 0; i < WARMUP; i++) new MultiIndex({ tuples }); +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) new MultiIndex({ tuples }); +const total = performance.now() - start; +console.log( + JSON.stringify({ + function: "multi_index", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_multi_index_contains.ts b/benchmarks/tsb/bench_multi_index_contains.ts new file mode 100644 index 00000000..f8c54516 --- /dev/null +++ b/benchmarks/tsb/bench_multi_index_contains.ts @@ -0,0 +1,26 @@ +/** + * Benchmark: MultiIndex.contains — check if a tuple key exists in the index. 
+ */ +import { MultiIndex } from "../../src/index.js"; + +const SIZE = 5_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const arr1 = Array.from({ length: SIZE }, (_, i) => `a${i % 50}`); +const arr2 = Array.from({ length: SIZE }, (_, i) => i % 100); +const mi = MultiIndex.fromArrays([arr1, arr2]); + +for (let i = 0; i < WARMUP; i++) { + mi.contains(["a0", 0]); +} + +const t0 = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + mi.contains([`a${i % 50}`, i % 100]); +} +const total = performance.now() - t0; + +console.log( + JSON.stringify({ function: "multi_index_contains", mean_ms: total / ITERATIONS, iterations: ITERATIONS, total_ms: total }), +); diff --git a/benchmarks/tsb/bench_multi_index_droplevel.ts b/benchmarks/tsb/bench_multi_index_droplevel.ts new file mode 100644 index 00000000..007aab29 --- /dev/null +++ b/benchmarks/tsb/bench_multi_index_droplevel.ts @@ -0,0 +1,37 @@ +/** + * Benchmark: MultiIndex droplevel, reorderLevels, and setNames + */ +import { MultiIndex } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const a = Array.from({ length: ROWS }, (_, i) => `a${i % 100}`); +const b = Array.from({ length: ROWS }, (_, i) => i % 1000); +const c = Array.from({ length: ROWS }, (_, i) => i % 50); +const tuples: [string, number, number][] = a.map((v, i) => [v, b[i], c[i]]); +const mi = new MultiIndex({ tuples, names: ["x", "y", "z"] }); + +for (let i = 0; i < WARMUP; i++) { + mi.droplevel(0); + mi.reorderLevels([2, 1, 0]); + mi.setNames(["a", "b", "c"]); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + mi.droplevel(0); + mi.reorderLevels([2, 1, 0]); + mi.setNames(["a", "b", "c"]); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "multi_index_droplevel", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_multi_index_duplicated.ts 
b/benchmarks/tsb/bench_multi_index_duplicated.ts new file mode 100644 index 00000000..ef070e23 --- /dev/null +++ b/benchmarks/tsb/bench_multi_index_duplicated.ts @@ -0,0 +1,35 @@ +/** + * Benchmark: MultiIndex.duplicated() and dropDuplicates() on 100k-pair MultiIndex + */ +import { MultiIndex } from "../../src/index.ts"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +// Create a MultiIndex with duplicates (pairs cycle with period 1,000: 1,000 unique pairs, each repeated 100 times) +const a = Array.from({ length: ROWS }, (_, i) => `a${i % 100}`); +const b = Array.from({ length: ROWS }, (_, i) => i % 1000); +const tuples: [string, number][] = a.map((v, i) => [v, b[i]]); +const mi = new MultiIndex({ tuples }); + +for (let i = 0; i < WARMUP; i++) { + mi.duplicated(); + mi.dropDuplicates(); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + mi.duplicated(); + mi.dropDuplicates(); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "multi_index_duplicated", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_multi_index_fromarrays.ts b/benchmarks/tsb/bench_multi_index_fromarrays.ts new file mode 100644 index 00000000..c3ebaa90 --- /dev/null +++ b/benchmarks/tsb/bench_multi_index_fromarrays.ts @@ -0,0 +1,25 @@ +/** + * Benchmark: MultiIndex.fromArrays — build from separate level arrays. 
+ */ +import { MultiIndex } from "../../src/index.js"; + +const SIZE = 5_000; +const WARMUP = 3; +const ITERATIONS = 20; + +const arr1 = Array.from({ length: SIZE }, (_, i) => `a${i % 50}`); +const arr2 = Array.from({ length: SIZE }, (_, i) => i % 100); + +for (let i = 0; i < WARMUP; i++) { + MultiIndex.fromArrays([arr1, arr2]); +} + +const t0 = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + MultiIndex.fromArrays([arr1, arr2]); +} +const total = performance.now() - t0; + +console.log( + JSON.stringify({ function: "multi_index_fromarrays", mean_ms: total / ITERATIONS, iterations: ITERATIONS, total_ms: total }), +); diff --git a/benchmarks/tsb/bench_multi_index_fromproduct.ts b/benchmarks/tsb/bench_multi_index_fromproduct.ts new file mode 100644 index 00000000..7dbef8e2 --- /dev/null +++ b/benchmarks/tsb/bench_multi_index_fromproduct.ts @@ -0,0 +1,24 @@ +/** + * Benchmark: MultiIndex.fromProduct — build from Cartesian product. + */ +import { MultiIndex } from "../../src/index.js"; + +const WARMUP = 3; +const ITERATIONS = 30; + +const level1 = Array.from({ length: 50 }, (_, i) => `a${i}`); +const level2 = Array.from({ length: 100 }, (_, i) => i); + +for (let i = 0; i < WARMUP; i++) { + MultiIndex.fromProduct([level1, level2]); +} + +const t0 = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + MultiIndex.fromProduct([level1, level2]); +} +const total = performance.now() - t0; + +console.log( + JSON.stringify({ function: "multi_index_fromproduct", mean_ms: total / ITERATIONS, iterations: ITERATIONS, total_ms: total }), +); diff --git a/benchmarks/tsb/bench_multi_index_getloc.ts b/benchmarks/tsb/bench_multi_index_getloc.ts new file mode 100644 index 00000000..fdcdf787 --- /dev/null +++ b/benchmarks/tsb/bench_multi_index_getloc.ts @@ -0,0 +1,33 @@ +/** + * Benchmark: MultiIndex.getLoc key lookup + */ +import { MultiIndex } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const a = Array.from({ 
length: ROWS }, (_, i) => `a${i % 100}`); +const b = Array.from({ length: ROWS }, (_, i) => i % 1000); +const tuples: [string, number][] = a.map((v, i) => [v, b[i]]); +const mi = new MultiIndex({ tuples }); +const key: [string, number] = ["a50", 500]; + +for (let i = 0; i < WARMUP; i++) { + mi.getLoc(key); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + mi.getLoc(key); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "multi_index_getloc", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_multi_index_isin.ts b/benchmarks/tsb/bench_multi_index_isin.ts new file mode 100644 index 00000000..eaae8eaa --- /dev/null +++ b/benchmarks/tsb/bench_multi_index_isin.ts @@ -0,0 +1,30 @@ +/** + * Benchmark: MultiIndex.isin() on 100k-pair MultiIndex + */ +import { MultiIndex } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; +const a = Array.from({ length: ROWS }, (_, i) => `a${i % 100}`); +const b = Array.from({ length: ROWS }, (_, i) => i % 1000); +const tuples: [string, number][] = a.map((v, i) => [v, b[i] as number]); +const mi = new MultiIndex({ tuples }); +// 1000 tuples to look up +const lookupTuples: [string, number][] = Array.from({ length: 1000 }, (_, i) => [ + `a${i % 100}`, + i % 1000, +]); + +for (let i = 0; i < WARMUP; i++) mi.isin(lookupTuples); +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) mi.isin(lookupTuples); +const total = performance.now() - start; +console.log( + JSON.stringify({ + function: "multi_index_isin", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_multi_index_isna_dropna.ts b/benchmarks/tsb/bench_multi_index_isna_dropna.ts new file mode 100644 index 00000000..1ea8b880 --- /dev/null +++ b/benchmarks/tsb/bench_multi_index_isna_dropna.ts @@ -0,0 +1,37 @@ +/** + * 
Benchmark: MultiIndex.isna(), notna(), and dropna() on 100k-pair MultiIndex with some nulls + */ +import { MultiIndex } from "../../src/index.ts"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +// Create a MultiIndex with some null values +const a = Array.from({ length: ROWS }, (_, i) => (i % 10 === 0 ? null : `a${i % 100}`)); +const b = Array.from({ length: ROWS }, (_, i) => (i % 20 === 0 ? null : i % 1000)); +const tuples: [string | null, number | null][] = a.map((v, i) => [v, b[i]]); +const mi = new MultiIndex({ tuples }); + +for (let i = 0; i < WARMUP; i++) { + mi.isna(); + mi.notna(); + mi.dropna(); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + mi.isna(); + mi.notna(); + mi.dropna(); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "multi_index_isna_dropna", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_multi_index_reorder_levels.ts b/benchmarks/tsb/bench_multi_index_reorder_levels.ts new file mode 100644 index 00000000..1efaccc7 --- /dev/null +++ b/benchmarks/tsb/bench_multi_index_reorder_levels.ts @@ -0,0 +1,26 @@ +/** + * Benchmark: MultiIndex.reorderLevels() on 100k-pair MultiIndex + */ +import { MultiIndex } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; +const a = Array.from({ length: ROWS }, (_, i) => `a${i % 100}`); +const b = Array.from({ length: ROWS }, (_, i) => i % 1000); +const c = Array.from({ length: ROWS }, (_, i) => i % 50); +const tuples: [string, number, number][] = a.map((v, i) => [v, b[i] as number, c[i] as number]); +const mi = new MultiIndex({ tuples }); + +for (let i = 0; i < WARMUP; i++) mi.reorderLevels([2, 0, 1]); +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) mi.reorderLevels([2, 0, 1]); +const total = performance.now() - start; +console.log( + JSON.stringify({ + function: 
"multi_index_reorder_levels", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_multi_index_set_names.ts b/benchmarks/tsb/bench_multi_index_set_names.ts new file mode 100644 index 00000000..bd3e6f56 --- /dev/null +++ b/benchmarks/tsb/bench_multi_index_set_names.ts @@ -0,0 +1,25 @@ +/** + * Benchmark: MultiIndex.setNames() on 100k-pair MultiIndex + */ +import { MultiIndex } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; +const a = Array.from({ length: ROWS }, (_, i) => `a${i % 100}`); +const b = Array.from({ length: ROWS }, (_, i) => i % 1000); +const tuples: [string, number][] = a.map((v, i) => [v, b[i] as number]); +const mi = new MultiIndex({ tuples }); + +for (let i = 0; i < WARMUP; i++) mi.setNames(["level0", "level1"]); +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) mi.setNames(["level0", "level1"]); +const total = performance.now() - start; +console.log( + JSON.stringify({ + function: "multi_index_set_names", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_multi_index_setops.ts b/benchmarks/tsb/bench_multi_index_setops.ts new file mode 100644 index 00000000..77f685b5 --- /dev/null +++ b/benchmarks/tsb/bench_multi_index_setops.ts @@ -0,0 +1,42 @@ +/** + * Benchmark: MultiIndex set operations (union, intersection, difference) + */ +import { MultiIndex } from "../../src/index.js"; + +const ROWS = 50_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const a1 = Array.from({ length: ROWS }, (_, i) => `a${i % 100}`); +const b1 = Array.from({ length: ROWS }, (_, i) => i % 1000); +const tuples1: [string, number][] = a1.map((v, i) => [v, b1[i]]); + +const a2 = Array.from({ length: ROWS }, (_, i) => `a${(i + 50) % 100}`); +const b2 = Array.from({ length: ROWS }, (_, i) => (i + 500) % 1000); +const tuples2: [string, number][] = a2.map((v, i) => [v, b2[i]]); + 
+const mi1 = new MultiIndex({ tuples: tuples1 }); +const mi2 = new MultiIndex({ tuples: tuples2 }); + +for (let i = 0; i < WARMUP; i++) { + mi1.union(mi2); + mi1.intersection(mi2); + mi1.difference(mi2); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + mi1.union(mi2); + mi1.intersection(mi2); + mi1.difference(mi2); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "multi_index_setops", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_multi_index_sort_equals.ts b/benchmarks/tsb/bench_multi_index_sort_equals.ts new file mode 100644 index 00000000..1fe646e4 --- /dev/null +++ b/benchmarks/tsb/bench_multi_index_sort_equals.ts @@ -0,0 +1,35 @@ +/** + * Benchmark: MultiIndex.sortValues() and equals() on 100k-pair MultiIndex + */ +import { MultiIndex } from "../../src/index.ts"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const a = Array.from({ length: ROWS }, (_, i) => `a${i % 100}`); +const b = Array.from({ length: ROWS }, (_, i) => i % 1000); +const tuples: [string, number][] = a.map((v, i) => [v, b[i]]); +const mi = new MultiIndex({ tuples }); +const mi2 = new MultiIndex({ tuples: tuples.slice() }); + +for (let i = 0; i < WARMUP; i++) { + mi.sortValues(); + mi.equals(mi2); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + mi.sortValues(); + mi.equals(mi2); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "multi_index_sort_equals", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_multi_index_swaplevel.ts b/benchmarks/tsb/bench_multi_index_swaplevel.ts new file mode 100644 index 00000000..ec3cf61b --- /dev/null +++ b/benchmarks/tsb/bench_multi_index_swaplevel.ts @@ -0,0 +1,25 @@ +/** + * Benchmark: MultiIndex.swaplevel() on 100k-pair MultiIndex + */ +import { 
MultiIndex } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; +const a = Array.from({ length: ROWS }, (_, i) => `a${i % 100}`); +const b = Array.from({ length: ROWS }, (_, i) => i % 1000); +const tuples: [string, number][] = a.map((v, i) => [v, b[i]]); +const mi = new MultiIndex({ tuples }); + +for (let i = 0; i < WARMUP; i++) mi.swaplevel(); +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) mi.swaplevel(); +const total = performance.now() - start; +console.log( + JSON.stringify({ + function: "multi_index_swaplevel", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_multi_index_to_array.ts b/benchmarks/tsb/bench_multi_index_to_array.ts new file mode 100644 index 00000000..fa2d8122 --- /dev/null +++ b/benchmarks/tsb/bench_multi_index_to_array.ts @@ -0,0 +1,25 @@ +/** + * Benchmark: MultiIndex.toArray() on 100k-pair MultiIndex + */ +import { MultiIndex } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; +const a = Array.from({ length: ROWS }, (_, i) => `a${i % 100}`); +const b = Array.from({ length: ROWS }, (_, i) => i % 1000); +const tuples: [string, number][] = a.map((v, i) => [v, b[i] as number]); +const mi = new MultiIndex({ tuples }); + +for (let i = 0; i < WARMUP; i++) mi.toArray(); +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) mi.toArray(); +const total = performance.now() - start; +console.log( + JSON.stringify({ + function: "multi_index_to_array", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_nancumops.ts b/benchmarks/tsb/bench_nancumops.ts new file mode 100644 index 00000000..64b12fac --- /dev/null +++ b/benchmarks/tsb/bench_nancumops.ts @@ -0,0 +1,36 @@ +/** + * Benchmark: nansum / nanmean / nanvar / nanstd / nanmin / nanmax — nan-ignoring aggregates on a 100k-element array. 
+ * Outputs JSON: {"function": "nancumops", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { nansum, nanmean, nanvar, nanstd, nanmin, nanmax } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 50; + +// Array with ~10% NaN values +const data: (number | null)[] = Array.from({ length: SIZE }, (_, i) => + i % 10 === 0 ? null : Math.sin(i * 0.01) * 100, +); + +for (let i = 0; i < WARMUP; i++) { + nansum(data); + nanmean(data); + nanvar(data); + nanstd(data); + nanmin(data); + nanmax(data); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + nansum(data); + nanmean(data); + nanvar(data); + nanstd(data); + nanmin(data); + nanmax(data); +} +const total = performance.now() - start; + +console.log(JSON.stringify({ function: "nancumops", mean_ms: total / ITERATIONS, iterations: ITERATIONS, total_ms: total })); diff --git a/benchmarks/tsb/bench_nat_sort.ts b/benchmarks/tsb/bench_nat_sort.ts new file mode 100644 index 00000000..8057e148 --- /dev/null +++ b/benchmarks/tsb/bench_nat_sort.ts @@ -0,0 +1,35 @@ +/** + * Benchmark: natSorted / natCompare / natArgSort — natural sort. 
+ * Outputs JSON: {"function": "nat_sort", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { natSorted, natCompare, natArgSort } from "../../src/index.ts"; + +const SIZE = 10_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const data = Array.from({ length: SIZE }, (_, i) => `item${i % 1000}_v${i % 10}`); + +for (let i = 0; i < WARMUP; i++) { + natSorted(data); + natArgSort(data); +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const start = performance.now(); + natSorted(data); + natArgSort(data); + times.push(performance.now() - start); +} + +const totalMs = times.reduce((a, b) => a + b, 0); +const meanMs = totalMs / ITERATIONS; +console.log( + JSON.stringify({ + function: "nat_sort", + mean_ms: Math.round(meanMs * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(totalMs * 1000) / 1000, + }), +); diff --git a/benchmarks/tsb/bench_nlargest.ts b/benchmarks/tsb/bench_nlargest.ts new file mode 100644 index 00000000..d7d55cca --- /dev/null +++ b/benchmarks/tsb/bench_nlargest.ts @@ -0,0 +1,37 @@ +/** + * Benchmark: Series nlargest + * + * Returns the N largest values from a large numeric Series. 
+ * Outputs JSON: {"function": "nlargest", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ + +import { Series, nlargestSeries } from "../../src/index.ts"; + +const SIZE = 100_000; +const N = 100; +const WARMUP = 5; +const ITERATIONS = 50; + +const s = new Series({ data: Array.from({ length: SIZE }, (_, i) => (i * 7919) % SIZE) }); + +for (let i = 0; i < WARMUP; i++) { + nlargestSeries(s, N); +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const start = performance.now(); + nlargestSeries(s, N); + const end = performance.now(); + times.push(end - start); +} + +const totalMs = times.reduce((a, b) => a + b, 0); +const meanMs = totalMs / ITERATIONS; + +console.log(JSON.stringify({ + function: "nlargest", + mean_ms: Math.round(meanMs * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(totalMs * 1000) / 1000, +})); diff --git a/benchmarks/tsb/bench_notna_isna.ts b/benchmarks/tsb/bench_notna_isna.ts new file mode 100644 index 00000000..bdb237b6 --- /dev/null +++ b/benchmarks/tsb/bench_notna_isna.ts @@ -0,0 +1,31 @@ +/** + * Benchmark: notna/isna on 100k-element Series with NaN + */ +import { Series, notna, isna } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; +const data: (number | null)[] = Array.from({ length: ROWS }, (_, i) => + i % 5 === 0 ? 
null : i * 0.1, +); +const s = new Series({ data }); + +for (let i = 0; i < WARMUP; i++) { + notna(s); + isna(s); +} +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + notna(s); + isna(s); +} +const total = performance.now() - start; +console.log( + JSON.stringify({ + function: "notna_isna", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_nsmallest.ts b/benchmarks/tsb/bench_nsmallest.ts new file mode 100644 index 00000000..fe5114b4 --- /dev/null +++ b/benchmarks/tsb/bench_nsmallest.ts @@ -0,0 +1,12 @@ +import { Series } from "tsb"; + +const rng = (seed: number) => { let s = seed; return () => { s = (s * 1664525 + 1013904223) & 0xffffffff; return ((s >>> 0) / 0xffffffff) * 2 - 1; }; }; +const rand = rng(42); +const data = Array.from({ length: 100_000 }, () => rand() * 3); +const s = new Series(data); +for (let i = 0; i < 3; i++) s.nsmallest(10); +const N = 100; +const t0 = performance.now(); +for (let i = 0; i < N; i++) s.nsmallest(10); +const elapsed = performance.now() - t0; +console.log(JSON.stringify({ function: "nsmallest", mean_ms: elapsed / N, iterations: N, total_ms: elapsed })); diff --git a/benchmarks/tsb/bench_pct_change.ts b/benchmarks/tsb/bench_pct_change.ts new file mode 100644 index 00000000..5c142bc7 --- /dev/null +++ b/benchmarks/tsb/bench_pct_change.ts @@ -0,0 +1,33 @@ +/** + * Benchmark: Series.pct_change() — percentage change between elements. 
+ * Outputs JSON: {"function": "pct_change", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const s = new Series({ data: Array.from({ length: SIZE }, (_, i) => i * 1.1 + 1.0) }); + +for (let i = 0; i < WARMUP; i++) { + s.pct_change(); +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const start = performance.now(); + s.pct_change(); + times.push(performance.now() - start); +} + +const totalMs = times.reduce((a, b) => a + b, 0); +const meanMs = totalMs / ITERATIONS; +console.log( + JSON.stringify({ + function: "pct_change", + mean_ms: Math.round(meanMs * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(totalMs * 1000) / 1000, + }), +); diff --git a/benchmarks/tsb/bench_pctchange_df.ts b/benchmarks/tsb/bench_pctchange_df.ts new file mode 100644 index 00000000..55c11759 --- /dev/null +++ b/benchmarks/tsb/bench_pctchange_df.ts @@ -0,0 +1,39 @@ +/** + * Benchmark: pctChangeDataFrame — percentage change across DataFrame columns. 
+ * Outputs JSON: {"function": "pctchange_df", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { pctChangeDataFrame, DataFrame } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const df = new DataFrame({ + a: Array.from({ length: SIZE }, (_, i) => i * 1.1 + 1), + b: Array.from({ length: SIZE }, (_, i) => i * 0.5 + 2), + c: Array.from({ length: SIZE }, (_, i) => i * 2.3 + 3), +}); + +for (let i = 0; i < WARMUP; i++) { + pctChangeDataFrame(df); + pctChangeDataFrame(df, { periods: 3 }); +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const start = performance.now(); + pctChangeDataFrame(df); + pctChangeDataFrame(df, { periods: 3 }); + times.push(performance.now() - start); +} + +const totalMs = times.reduce((a, b) => a + b, 0); +const meanMs = totalMs / ITERATIONS; +console.log( + JSON.stringify({ + function: "pctchange_df", + mean_ms: Math.round(meanMs * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(totalMs * 1000) / 1000, + }), +); diff --git a/benchmarks/tsb/bench_pearson_corr.ts b/benchmarks/tsb/bench_pearson_corr.ts new file mode 100644 index 00000000..6563c5a5 --- /dev/null +++ b/benchmarks/tsb/bench_pearson_corr.ts @@ -0,0 +1,32 @@ +/** + * Benchmark: Pearson correlation between two 100k-element Series + */ +import { Series, pearsonCorr } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const a = Float64Array.from({ length: ROWS }, (_, i) => Math.sin(i * 0.01)); +const b = Float64Array.from({ length: ROWS }, (_, i) => Math.cos(i * 0.01)); +const sa = new Series(a); +const sb = new Series(b); + +for (let i = 0; i < WARMUP; i++) { + pearsonCorr(sa, sb); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + pearsonCorr(sa, sb); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "pearson_corr", + mean_ms: total / ITERATIONS, + iterations: 
ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_percentile_of_score.ts b/benchmarks/tsb/bench_percentile_of_score.ts new file mode 100644 index 00000000..7e2e001b --- /dev/null +++ b/benchmarks/tsb/bench_percentile_of_score.ts @@ -0,0 +1,10 @@ +import { percentileOfScore } from "tsb"; +const N = 100_000; +const data = Array.from({ length: N }, (_, i) => (i % 1000) * 0.1); +const WARMUP = 3; +const ITERS = 20; +for (let i = 0; i < WARMUP; i++) percentileOfScore(data, 50.0); +const t0 = performance.now(); +for (let i = 0; i < ITERS; i++) percentileOfScore(data, 50.0); +const total = performance.now() - t0; +console.log(JSON.stringify({ function: "percentile_of_score", mean_ms: total / ITERS, iterations: ITERS, total_ms: total })); diff --git a/benchmarks/tsb/bench_period.ts b/benchmarks/tsb/bench_period.ts new file mode 100644 index 00000000..5c67c540 --- /dev/null +++ b/benchmarks/tsb/bench_period.ts @@ -0,0 +1,48 @@ +/** + * Benchmark: Period / PeriodIndex — fixed-frequency time spans. 
+ * Outputs JSON: {"function": "period", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Period, PeriodIndex } from "../../src/index.ts"; + +const SIZE = 10_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const baseDate = new Date(Date.UTC(2020, 0, 1)); +const periods = Array.from({ length: SIZE }, (_, i) => { + const d = new Date(baseDate.getTime() + i * 86_400_000); + return Period.fromDate(d, "D"); +}); + +const startQ = Period.fromDate(new Date(Date.UTC(2000, 0, 1)), "Q"); +const endQ = Period.fromDate(new Date(Date.UTC(2024, 11, 31)), "Q"); + +for (let i = 0; i < WARMUP; i++) { + for (const p of periods.slice(0, 100)) { + void p.toString(); + p.add(1); + } + PeriodIndex.fromRange(startQ, endQ); +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const start = performance.now(); + for (const p of periods) { + void p.toString(); + p.add(1); + } + PeriodIndex.fromRange(startQ, endQ); + times.push(performance.now() - start); +} + +const totalMs = times.reduce((a, b) => a + b, 0); +const meanMs = totalMs / ITERATIONS; +console.log( + JSON.stringify({ + function: "period", + mean_ms: Math.round(meanMs * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(totalMs * 1000) / 1000, + }), +); diff --git a/benchmarks/tsb/bench_pipe_bench.ts b/benchmarks/tsb/bench_pipe_bench.ts new file mode 100644 index 00000000..9949b831 --- /dev/null +++ b/benchmarks/tsb/bench_pipe_bench.ts @@ -0,0 +1,27 @@ +/** + * Benchmark: pipe with 3 transforms on a 100k-element Series + */ +import { Series, pipe } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; +const data = Array.from({ length: ROWS }, (_, i) => i * 0.5); +const s = new Series({ data }); + +const double = (x: Series) => x.mul(2); +const addOne = (x: Series) => x.add(1); +const absVal = (x: Series) => x.abs(); + +for (let i = 0; i < WARMUP; i++) pipe(s, double, addOne, absVal); +const start = performance.now(); +for (let i = 0; 
i < ITERATIONS; i++) pipe(s, double, addOne, absVal); +const total = performance.now() - start; +console.log( + JSON.stringify({ + function: "pipe_bench", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_pivot.ts b/benchmarks/tsb/bench_pivot.ts new file mode 100644 index 00000000..3d68ee3e --- /dev/null +++ b/benchmarks/tsb/bench_pivot.ts @@ -0,0 +1,23 @@ +import { DataFrame } from "tsb"; + +const rows = 100; +const cols = 20; +const rng = (seed: number) => { let s = seed; return () => { s = (s * 1664525 + 1013904223) & 0xffffffff; return ((s >>> 0) / 0xffffffff) * 2 - 1; }; }; +const rand = rng(42); +const rowArr: number[] = []; +const colArr: number[] = []; +const valArr: number[] = []; +for (let r = 0; r < rows; r++) { + for (let c = 0; c < cols; c++) { + rowArr.push(r); + colArr.push(c); + valArr.push(rand() * 3); + } +} +const df = new DataFrame({ row: rowArr, col: colArr, val: valArr }); +for (let i = 0; i < 3; i++) df.pivot({ index: "row", columns: "col", values: "val" }); +const N = 100; +const t0 = performance.now(); +for (let i = 0; i < N; i++) df.pivot({ index: "row", columns: "col", values: "val" }); +const elapsed = performance.now() - t0; +console.log(JSON.stringify({ function: "pivot", mean_ms: elapsed / N, iterations: N, total_ms: elapsed })); diff --git a/benchmarks/tsb/bench_pop_column.ts b/benchmarks/tsb/bench_pop_column.ts new file mode 100644 index 00000000..6d78efc1 --- /dev/null +++ b/benchmarks/tsb/bench_pop_column.ts @@ -0,0 +1,25 @@ +/** + * Benchmark: popColumn on a 100k-row DataFrame + */ +import { DataFrame, popColumn } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; +const a = Array.from({ length: ROWS }, (_, i) => i); +const b = Array.from({ length: ROWS }, (_, i) => i * 2); +const c = Array.from({ length: ROWS }, (_, i) => i * 3); +const df = DataFrame.fromColumns({ a, b, c }); + +for (let i = 0; i < WARMUP; i++) 
popColumn(df, "b"); +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) popColumn(df, "b"); +const total = performance.now() - start; +console.log( + JSON.stringify({ + function: "pop_column", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_quantile.ts b/benchmarks/tsb/bench_quantile.ts new file mode 100644 index 00000000..fe7bcc66 --- /dev/null +++ b/benchmarks/tsb/bench_quantile.ts @@ -0,0 +1,18 @@ +import { quantile } from "tsb"; +const N = 100_000; +const sorted = Array.from({ length: N }, (_, i) => i * 0.001); +const WARMUP = 3; +const ITERS = 20; +for (let i = 0; i < WARMUP; i++) { + quantile(sorted, 0.25); + quantile(sorted, 0.5); + quantile(sorted, 0.75); +} +const t0 = performance.now(); +for (let i = 0; i < ITERS; i++) { + quantile(sorted, 0.25); + quantile(sorted, 0.5); + quantile(sorted, 0.75); +} +const total = performance.now() - t0; +console.log(JSON.stringify({ function: "quantile", mean_ms: total / ITERS, iterations: ITERS, total_ms: total })); diff --git a/benchmarks/tsb/bench_range_index.ts b/benchmarks/tsb/bench_range_index.ts new file mode 100644 index 00000000..a090118d --- /dev/null +++ b/benchmarks/tsb/bench_range_index.ts @@ -0,0 +1,31 @@ +/** + * Benchmark: RangeIndex construction, toArray(), slice(), contains() + */ +import { RangeIndex } from "../../src/index.js"; + +const N = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +for (let i = 0; i < WARMUP; i++) { + const r = new RangeIndex(N); + r.toArray(); + r.slice(1000, 5000); + r.contains(50_000); +} +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + const r = new RangeIndex(N); + r.toArray(); + r.slice(1000, 5000); + r.contains(50_000); +} +const total = performance.now() - start; +console.log( + JSON.stringify({ + function: "range_index", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_rank.ts 
b/benchmarks/tsb/bench_rank.ts new file mode 100644 index 00000000..a1c36b8b --- /dev/null +++ b/benchmarks/tsb/bench_rank.ts @@ -0,0 +1,40 @@ +/** + * Benchmark: Series rank + * + * Ranks a large numeric Series using average tie-breaking. + * Outputs JSON: {"function": "rank", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ + +import { Series, rankSeries } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 50; + +function makeData(): readonly number[] { + return Array.from({ length: SIZE }, (_, i) => Math.floor(i / 3) * 1.5); +} + +const s = new Series({ data: Array.from(makeData()) }); + +for (let i = 0; i < WARMUP; i++) { + rankSeries(s, { method: "average" }); +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const start = performance.now(); + rankSeries(s, { method: "average" }); + const end = performance.now(); + times.push(end - start); +} + +const totalMs = times.reduce((a, b) => a + b, 0); +const meanMs = totalMs / ITERATIONS; + +console.log(JSON.stringify({ + function: "rank", + mean_ms: Math.round(meanMs * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(totalMs * 1000) / 1000, +})); diff --git a/benchmarks/tsb/bench_read_json.ts b/benchmarks/tsb/bench_read_json.ts new file mode 100644 index 00000000..f916d9a5 --- /dev/null +++ b/benchmarks/tsb/bench_read_json.ts @@ -0,0 +1,46 @@ +/** + * Benchmark: DataFrame readJson + * + * Parses a JSON string into a DataFrame (records orient). 
+ * Outputs JSON: {"function": "read_json", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ + +import { readJson } from "../../src/index.ts"; + +const ROWS = 5_000; +const WARMUP = 5; +const ITERATIONS = 50; + +function makeJsonString(): string { + const records = Array.from({ length: ROWS }, (_, i) => ({ + id: i, + x: i * 1.1, + y: i * 2.2, + label: `item_${i % 100}`, + })); + return JSON.stringify(records); +} + +const jsonStr = makeJsonString(); + +for (let i = 0; i < WARMUP; i++) { + readJson(jsonStr); +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const start = performance.now(); + readJson(jsonStr); + const end = performance.now(); + times.push(end - start); +} + +const totalMs = times.reduce((a, b) => a + b, 0); +const meanMs = totalMs / ITERATIONS; + +console.log(JSON.stringify({ + function: "read_json", + mean_ms: Math.round(meanMs * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(totalMs * 1000) / 1000, +})); diff --git a/benchmarks/tsb/bench_reorder_columns.ts b/benchmarks/tsb/bench_reorder_columns.ts new file mode 100644 index 00000000..c0940605 --- /dev/null +++ b/benchmarks/tsb/bench_reorder_columns.ts @@ -0,0 +1,25 @@ +/** + * Benchmark: reorderColumns on a 100k-row DataFrame + */ +import { DataFrame, reorderColumns } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; +const a = Array.from({ length: ROWS }, (_, i) => i); +const b = Array.from({ length: ROWS }, (_, i) => i * 2); +const c = Array.from({ length: ROWS }, (_, i) => i * 3); +const df = DataFrame.fromColumns({ a, b, c }); + +for (let i = 0; i < WARMUP; i++) reorderColumns(df, ["c", "a", "b"]); +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) reorderColumns(df, ["c", "a", "b"]); +const total = performance.now() - start; +console.log( + JSON.stringify({ + function: "reorder_columns", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff 
--git a/benchmarks/tsb/bench_replace_dataframe.ts b/benchmarks/tsb/bench_replace_dataframe.ts new file mode 100644 index 00000000..9ca458f9 --- /dev/null +++ b/benchmarks/tsb/bench_replace_dataframe.ts @@ -0,0 +1,42 @@ +/** + * Benchmark: replaceDataFrame — replace values in a DataFrame. + * Outputs JSON: {"function": "replace_dataframe", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { replaceDataFrame, DataFrame } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const df = new DataFrame({ + a: Array.from({ length: SIZE }, (_, i) => i % 10), + b: Array.from({ length: SIZE }, (_, i) => i % 5), + c: Array.from({ length: SIZE }, (_, i) => ["x", "y", "z"][i % 3]), +}); +const mapping = new Map([ + [0, 100], + [1, 200], + [2, 300], +]); + +for (let i = 0; i < WARMUP; i++) { + replaceDataFrame(df, mapping); +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const start = performance.now(); + replaceDataFrame(df, mapping); + times.push(performance.now() - start); +} + +const totalMs = times.reduce((a, b) => a + b, 0); +const meanMs = totalMs / ITERATIONS; +console.log( + JSON.stringify({ + function: "replace_dataframe", + mean_ms: Math.round(meanMs * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(totalMs * 1000) / 1000, + }), +); diff --git a/benchmarks/tsb/bench_resample.ts b/benchmarks/tsb/bench_resample.ts new file mode 100644 index 00000000..3f962abd --- /dev/null +++ b/benchmarks/tsb/bench_resample.ts @@ -0,0 +1,15 @@ +import { Series } from "tsb"; + +// minute-resolution timestamps for 100k points starting 2020-01-01 +const base = new Date("2020-01-01T00:00:00Z").getTime(); +const idx = Array.from({ length: 100_000 }, (_, i) => new Date(base + i * 60_000)); +const rng = (seed: number) => { let s = seed; return () => { s = (s * 1664525 + 1013904223) & 0xffffffff; return ((s >>> 0) / 0xffffffff) * 2 - 1; }; }; +const rand = rng(42); +const data = Array.from({ 
length: 100_000 }, () => rand() * 3); +const s = new Series(data, { index: idx }); +for (let i = 0; i < 3; i++) s.resample("1h").mean(); +const N = 50; +const t0 = performance.now(); +for (let i = 0; i < N; i++) s.resample("1h").mean(); +const elapsed = performance.now() - t0; +console.log(JSON.stringify({ function: "resample", mean_ms: elapsed / N, iterations: N, total_ms: elapsed })); diff --git a/benchmarks/tsb/bench_rolling_agg.ts b/benchmarks/tsb/bench_rolling_agg.ts new file mode 100644 index 00000000..0b3283c2 --- /dev/null +++ b/benchmarks/tsb/bench_rolling_agg.ts @@ -0,0 +1,34 @@ +/** + * Benchmark: rollingAgg (multi-aggregation rolling window) on 100k-element Series + */ +import { Series, rollingAgg } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const data = Float64Array.from({ length: ROWS }, (_, i) => Math.sin(i * 0.01)); +const s = new Series(data); +const fns = { + mean: (v: readonly number[]) => v.reduce((a, b) => a + b, 0) / v.length, + sum: (v: readonly number[]) => v.reduce((a, b) => a + b, 0), +}; + +for (let i = 0; i < WARMUP; i++) { + rollingAgg(s, 10, fns); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + rollingAgg(s, 10, fns); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "rolling_agg", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_rolling_apply.ts b/benchmarks/tsb/bench_rolling_apply.ts new file mode 100644 index 00000000..1d640052 --- /dev/null +++ b/benchmarks/tsb/bench_rolling_apply.ts @@ -0,0 +1,24 @@ +/** + * Benchmark: rollingApply on 10k-element Series + */ +import { Series, rollingApply } from "../../src/index.js"; + +const ROWS = 10_000; +const WARMUP = 3; +const ITERATIONS = 10; +const data = Array.from({ length: ROWS }, (_, i) => i * 0.1); +const s = new Series({ data }); +const mean = (window: number[]) => window.reduce((a, b) 
=> a + b, 0) / window.length; + +for (let i = 0; i < WARMUP; i++) rollingApply(s, 10, mean); +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) rollingApply(s, 10, mean); +const total = performance.now() - start; +console.log( + JSON.stringify({ + function: "rolling_apply", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_rolling_std.ts b/benchmarks/tsb/bench_rolling_std.ts new file mode 100644 index 00000000..2cd7d8cc --- /dev/null +++ b/benchmarks/tsb/bench_rolling_std.ts @@ -0,0 +1,30 @@ +/** + * Benchmark: rolling standard deviation with window=100 on 100k-element Series + */ +import { Series } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const data = Float64Array.from({ length: ROWS }, (_, i) => Math.sin(i * 0.01)); +const s = new Series(data); + +for (let i = 0; i < WARMUP; i++) { + s.rolling(100).std(); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + s.rolling(100).std(); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "rolling_std", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_rolling_sum.ts b/benchmarks/tsb/bench_rolling_sum.ts new file mode 100644 index 00000000..e5104998 --- /dev/null +++ b/benchmarks/tsb/bench_rolling_sum.ts @@ -0,0 +1,30 @@ +/** + * Benchmark: rolling sum with window=100 on 100k-element Series + */ +import { Series } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const data = Float64Array.from({ length: ROWS }, (_, i) => Math.sin(i * 0.01)); +const s = new Series(data); + +for (let i = 0; i < WARMUP; i++) { + s.rolling(100).sum(); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + s.rolling(100).sum(); +} +const total = performance.now() - start; + +console.log( + 
JSON.stringify({ + function: "rolling_sum", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_sample.ts b/benchmarks/tsb/bench_sample.ts new file mode 100644 index 00000000..4935485f --- /dev/null +++ b/benchmarks/tsb/bench_sample.ts @@ -0,0 +1,12 @@ +import { Series } from "tsb"; + +const rng = (seed: number) => { let s = seed; return () => { s = (s * 1664525 + 1013904223) & 0xffffffff; return ((s >>> 0) / 0xffffffff) * 2 - 1; }; }; +const rand = rng(42); +const data = Array.from({ length: 100_000 }, () => rand() * 3); +const s = new Series(data); +for (let i = 0; i < 3; i++) s.sample(1000); +const N = 100; +const t0 = performance.now(); +for (let i = 0; i < N; i++) s.sample(1000); +const elapsed = performance.now() - t0; +console.log(JSON.stringify({ function: "sample", mean_ms: elapsed / N, iterations: N, total_ms: elapsed })); diff --git a/benchmarks/tsb/bench_searchsorted.ts b/benchmarks/tsb/bench_searchsorted.ts new file mode 100644 index 00000000..71a4410b --- /dev/null +++ b/benchmarks/tsb/bench_searchsorted.ts @@ -0,0 +1,36 @@ +/** + * Benchmark: searchsorted / searchsortedMany — binary search on sorted arrays. 
+ * Outputs JSON: {"function": "searchsorted", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { searchsorted, searchsortedMany } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const sorted = Array.from({ length: SIZE }, (_, i) => i * 2); // even numbers 0..199998 +const needles = Array.from({ length: 1_000 }, (_, i) => i * 200); + +for (let i = 0; i < WARMUP; i++) { + searchsorted(sorted, 50_000); + searchsortedMany(sorted, needles); +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const start = performance.now(); + searchsorted(sorted, 50_000); + searchsortedMany(sorted, needles); + times.push(performance.now() - start); +} + +const totalMs = times.reduce((a, b) => a + b, 0); +const meanMs = totalMs / ITERATIONS; +console.log( + JSON.stringify({ + function: "searchsorted", + mean_ms: Math.round(meanMs * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(totalMs * 1000) / 1000, + }), +); diff --git a/benchmarks/tsb/bench_select_dtypes.ts b/benchmarks/tsb/bench_select_dtypes.ts new file mode 100644 index 00000000..3a3879a4 --- /dev/null +++ b/benchmarks/tsb/bench_select_dtypes.ts @@ -0,0 +1,44 @@ +/** + * Benchmark: selectDtypes — filter DataFrame columns by dtype. 
+ * Outputs JSON: {"function": "select_dtypes", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { selectDtypes, DataFrame } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const df = new DataFrame({ + a: Array.from({ length: SIZE }, (_, i) => i), + b: Array.from({ length: SIZE }, (_, i) => i * 1.5), + c: Array.from({ length: SIZE }, (_, i) => `str${i % 1000}`), + d: Array.from({ length: SIZE }, (_, i) => i % 2 === 0), + e: Array.from({ length: SIZE }, (_, i) => i * 2), + f: Array.from({ length: SIZE }, (_, i) => `label${i % 100}`), +}); + +for (let i = 0; i < WARMUP; i++) { + selectDtypes(df, { include: ["number"] }); + selectDtypes(df, { include: ["string"] }); + selectDtypes(df, { exclude: ["boolean"] }); +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const start = performance.now(); + selectDtypes(df, { include: ["number"] }); + selectDtypes(df, { include: ["string"] }); + selectDtypes(df, { exclude: ["boolean"] }); + times.push(performance.now() - start); +} + +const totalMs = times.reduce((a, b) => a + b, 0); +const meanMs = totalMs / ITERATIONS; +console.log( + JSON.stringify({ + function: "select_dtypes", + mean_ms: Math.round(meanMs * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(totalMs * 1000) / 1000, + }), +); diff --git a/benchmarks/tsb/bench_sem_var.ts b/benchmarks/tsb/bench_sem_var.ts new file mode 100644 index 00000000..dad01aee --- /dev/null +++ b/benchmarks/tsb/bench_sem_var.ts @@ -0,0 +1,26 @@ +/** + * Benchmark: varSeries / semSeries — variance and standard error of mean on a 100k-element Series. 
+ * Outputs JSON: {"function": "sem_var", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series, varSeries, semSeries } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const data = Float64Array.from({ length: SIZE }, (_, i) => Math.sin(i * 0.01) * 100); +const s = new Series(data); + +for (let i = 0; i < WARMUP; i++) { + varSeries(s); + semSeries(s); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + varSeries(s); + semSeries(s); +} +const total = performance.now() - start; + +console.log(JSON.stringify({ function: "sem_var", mean_ms: total / ITERATIONS, iterations: ITERATIONS, total_ms: total })); diff --git a/benchmarks/tsb/bench_series_abs.ts b/benchmarks/tsb/bench_series_abs.ts new file mode 100644 index 00000000..1034fdba --- /dev/null +++ b/benchmarks/tsb/bench_series_abs.ts @@ -0,0 +1,33 @@ +/** + * Benchmark: Series.abs() — element-wise absolute value. + * Outputs JSON: {"function": "series_abs", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const s = new Series({ data: Array.from({ length: SIZE }, (_, i) => (i - 50000) * 1.0) }); + +for (let i = 0; i < WARMUP; i++) { + s.abs(); +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const start = performance.now(); + s.abs(); + times.push(performance.now() - start); +} + +const totalMs = times.reduce((a, b) => a + b, 0); +const meanMs = totalMs / ITERATIONS; +console.log( + JSON.stringify({ + function: "series_abs", + mean_ms: Math.round(meanMs * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(totalMs * 1000) / 1000, + }), +); diff --git a/benchmarks/tsb/bench_series_apply.ts b/benchmarks/tsb/bench_series_apply.ts new file mode 100644 index 00000000..023995ed --- /dev/null +++ b/benchmarks/tsb/bench_series_apply.ts @@ -0,0 +1,23 @@ +/** + * 
Benchmark: seriesApply on 100k-element Series + */ +import { Series, seriesApply } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; +const data = Array.from({ length: ROWS }, (_, i) => i * 0.1); +const s = new Series({ data }); + +for (let i = 0; i < WARMUP; i++) seriesApply(s, (v) => (v as number) * 2 + 1); +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) seriesApply(s, (v) => (v as number) * 2 + 1); +const total = performance.now() - start; +console.log( + JSON.stringify({ + function: "series_apply", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_series_at_iat.ts b/benchmarks/tsb/bench_series_at_iat.ts new file mode 100644 index 00000000..fbf94a12 --- /dev/null +++ b/benchmarks/tsb/bench_series_at_iat.ts @@ -0,0 +1,32 @@ +/** + * Benchmark: series_at_iat — Series.at(label) and Series.iat(i) point access on 100k-element Series + */ +import { Series } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const data = Float64Array.from({ length: ROWS }, (_, i) => i * 1.5); +const s = new Series(data); + +for (let i = 0; i < WARMUP; i++) { + for (let j = 0; j < 1000; j++) s.iat(j); + for (let j = 0; j < 1000; j++) s.at(j); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + for (let j = 0; j < 1000; j++) s.iat(j); + for (let j = 0; j < 1000; j++) s.at(j); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "series_at_iat", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_series_compare.ts b/benchmarks/tsb/bench_series_compare.ts new file mode 100644 index 00000000..8ba75bba --- /dev/null +++ b/benchmarks/tsb/bench_series_compare.ts @@ -0,0 +1,41 @@ +/** + * Benchmark: Series comparison operators (eq, ne, lt, gt, le, ge) on 100k Series + */ +import 
{ Series } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 20; + +const data = Array.from({ length: ROWS }, (_, i) => i * 0.1); +const s = new Series({ data }); +const threshold = ROWS * 0.05; + +for (let i = 0; i < WARMUP; i++) { + s.eq(threshold); + s.ne(threshold); + s.lt(threshold); + s.gt(threshold); + s.le(threshold); + s.ge(threshold); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + s.eq(threshold); + s.ne(threshold); + s.lt(threshold); + s.gt(threshold); + s.le(threshold); + s.ge(threshold); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "series_compare", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_series_copy.ts b/benchmarks/tsb/bench_series_copy.ts new file mode 100644 index 00000000..24c3839d --- /dev/null +++ b/benchmarks/tsb/bench_series_copy.ts @@ -0,0 +1,21 @@ +/** + * Benchmark: Series.copy() on 100k Series. + */ +import { Series } from "../../src/index.js"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const s = new Series({ data: Array.from({ length: SIZE }, (_, i) => i * 0.5), name: "original" }); + +for (let i = 0; i < WARMUP; i++) s.copy(); + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + s.copy(); + times.push(performance.now() - t0); +} +const total = times.reduce((a, b) => a + b, 0); +console.log(JSON.stringify({ function: "series_copy", mean_ms: total / ITERATIONS, iterations: ITERATIONS, total_ms: total })); diff --git a/benchmarks/tsb/bench_series_corr.ts b/benchmarks/tsb/bench_series_corr.ts new file mode 100644 index 00000000..0cf7c87e --- /dev/null +++ b/benchmarks/tsb/bench_series_corr.ts @@ -0,0 +1,22 @@ +/** + * Benchmark: Series.corr(other) Pearson correlation on 100k-element Series. 
+ */ +import { Series } from "../../src/index.js"; + +const SIZE = 100_000; +const WARMUP = 3; +const ITERATIONS = 20; + +const a = new Series({ data: Array.from({ length: SIZE }, (_, i) => i * 0.1) }); +const b = new Series({ data: Array.from({ length: SIZE }, (_, i) => i * 0.2 + Math.random()) }); + +for (let i = 0; i < WARMUP; i++) a.corr(b); + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + a.corr(b); + times.push(performance.now() - t0); +} +const total = times.reduce((a, b) => a + b, 0); +console.log(JSON.stringify({ function: "series_corr", mean_ms: total / ITERATIONS, iterations: ITERATIONS, total_ms: total })); diff --git a/benchmarks/tsb/bench_series_count.ts b/benchmarks/tsb/bench_series_count.ts new file mode 100644 index 00000000..d7bd163f --- /dev/null +++ b/benchmarks/tsb/bench_series_count.ts @@ -0,0 +1,21 @@ +/** + * Benchmark: Series.count() — non-NA count on 100k Series with some NAs. + */ +import { Series } from "../../src/index.js"; + +const SIZE = 100_000; +const WARMUP = 10; +const ITERATIONS = 100; + +const s = new Series({ data: Array.from({ length: SIZE }, (_, i) => i % 5 === 0 ? null : i) }); + +for (let i = 0; i < WARMUP; i++) s.count(); + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + s.count(); + times.push(performance.now() - t0); +} +const total = times.reduce((a, b) => a + b, 0); +console.log(JSON.stringify({ function: "series_count", mean_ms: total / ITERATIONS, iterations: ITERATIONS, total_ms: total })); diff --git a/benchmarks/tsb/bench_series_describe.ts b/benchmarks/tsb/bench_series_describe.ts new file mode 100644 index 00000000..9dfddc61 --- /dev/null +++ b/benchmarks/tsb/bench_series_describe.ts @@ -0,0 +1,21 @@ +/** + * Benchmark: describe(s) — summary statistics function on 100k Series. 
+ */ +import { Series, describe } from "../../src/index.js"; + +const SIZE = 100_000; +const WARMUP = 3; +const ITERATIONS = 20; + +const s = new Series({ data: Array.from({ length: SIZE }, (_, i) => (i * 1.1) % 9999) }); + +for (let i = 0; i < WARMUP; i++) describe(s); + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + describe(s); + times.push(performance.now() - t0); +} +const total = times.reduce((a, b) => a + b, 0); +console.log(JSON.stringify({ function: "series_describe", mean_ms: total / ITERATIONS, iterations: ITERATIONS, total_ms: total })); diff --git a/benchmarks/tsb/bench_series_digitize.ts b/benchmarks/tsb/bench_series_digitize.ts new file mode 100644 index 00000000..0d5bcf36 --- /dev/null +++ b/benchmarks/tsb/bench_series_digitize.ts @@ -0,0 +1,24 @@ +/** + * Benchmark: seriesDigitize on 100k-element Series + */ +import { Series, seriesDigitize } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; +const data = Array.from({ length: ROWS }, (_, i) => i * 0.001); +const s = new Series({ data }); +const bins = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]; + +for (let i = 0; i < WARMUP; i++) seriesDigitize(s, bins); +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) seriesDigitize(s, bins); +const total = performance.now() - start; +console.log( + JSON.stringify({ + function: "series_digitize", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_series_dropna.ts b/benchmarks/tsb/bench_series_dropna.ts new file mode 100644 index 00000000..3500ac83 --- /dev/null +++ b/benchmarks/tsb/bench_series_dropna.ts @@ -0,0 +1,21 @@ +/** + * Benchmark: Series.dropna() on 100k Series with ~20% NAs. 
+ */ +import { Series } from "../../src/index.js"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 30; + +const s = new Series({ data: Array.from({ length: SIZE }, (_, i) => i % 5 === 0 ? null : i * 1.0) }); + +for (let i = 0; i < WARMUP; i++) s.dropna(); + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + s.dropna(); + times.push(performance.now() - t0); +} +const total = times.reduce((a, b) => a + b, 0); +console.log(JSON.stringify({ function: "series_dropna", mean_ms: total / ITERATIONS, iterations: ITERATIONS, total_ms: total })); diff --git a/benchmarks/tsb/bench_series_dt_strftime.ts b/benchmarks/tsb/bench_series_dt_strftime.ts new file mode 100644 index 00000000..2fb1ed23 --- /dev/null +++ b/benchmarks/tsb/bench_series_dt_strftime.ts @@ -0,0 +1,13 @@ +import { Series } from "tsb"; +const N = 100_000; +const base = new Date("2020-01-01").getTime(); +const day = 24 * 60 * 60 * 1000; +const dates = Array.from({ length: N }, (_, i) => new Date(base + i * day)); +const s = new Series(dates); +const WARMUP = 3; +const ITERS = 20; +for (let i = 0; i < WARMUP; i++) s.dt.strftime("%Y-%m-%d"); +const t0 = performance.now(); +for (let i = 0; i < ITERS; i++) s.dt.strftime("%Y-%m-%d"); +const total = performance.now() - t0; +console.log(JSON.stringify({ function: "series_dt_strftime", mean_ms: total / ITERS, iterations: ITERS, total_ms: total })); diff --git a/benchmarks/tsb/bench_series_filter.ts b/benchmarks/tsb/bench_series_filter.ts new file mode 100644 index 00000000..3f9f978c --- /dev/null +++ b/benchmarks/tsb/bench_series_filter.ts @@ -0,0 +1,22 @@ +/** + * Benchmark: Series.filter(mask) — boolean selection on 100k Series. 
+ */ +import { Series } from "../../src/index.js"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const s = new Series({ data: Array.from({ length: SIZE }, (_, i) => i) }); +const mask = new Series({ data: Array.from({ length: SIZE }, (_, i) => i % 2 === 0) }); + +for (let i = 0; i < WARMUP; i++) s.filter(mask); + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + s.filter(mask); + times.push(performance.now() - t0); +} +const total = times.reduce((a, b) => a + b, 0); +console.log(JSON.stringify({ function: "series_filter", mean_ms: total / ITERATIONS, iterations: ITERATIONS, total_ms: total })); diff --git a/benchmarks/tsb/bench_series_floordiv_mod_pow.ts b/benchmarks/tsb/bench_series_floordiv_mod_pow.ts new file mode 100644 index 00000000..db302be5 --- /dev/null +++ b/benchmarks/tsb/bench_series_floordiv_mod_pow.ts @@ -0,0 +1,34 @@ +/** + * Benchmark: Series floordiv, mod, and pow operators on 100k Series + */ +import { Series } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 20; + +const data = Array.from({ length: ROWS }, (_, i) => (i + 1) * 0.5); +const s = new Series({ data }); + +for (let i = 0; i < WARMUP; i++) { + s.floordiv(3); + s.mod(7); + s.pow(2); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + s.floordiv(3); + s.mod(7); + s.pow(2); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "series_floordiv_mod_pow", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_series_from_object.ts b/benchmarks/tsb/bench_series_from_object.ts new file mode 100644 index 00000000..20fdbb1d --- /dev/null +++ b/benchmarks/tsb/bench_series_from_object.ts @@ -0,0 +1,23 @@ +/** + * Benchmark: Series.fromObject() on 10k-key object + */ +import { Series } from "../../src/index.js"; + +const ROWS = 10_000; +const WARMUP = 
3; +const ITERATIONS = 10; +const obj: Record = {}; +for (let i = 0; i < ROWS; i++) obj[`key_${i}`] = i * 1.5; + +for (let i = 0; i < WARMUP; i++) Series.fromObject(obj); +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) Series.fromObject(obj); +const total = performance.now() - start; +console.log( + JSON.stringify({ + function: "series_from_object", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_series_groupby.ts b/benchmarks/tsb/bench_series_groupby.ts new file mode 100644 index 00000000..840923eb --- /dev/null +++ b/benchmarks/tsb/bench_series_groupby.ts @@ -0,0 +1,22 @@ +/** + * Benchmark: Series.groupby(by).agg('sum') on 100k Series. + */ +import { Series } from "../../src/index.js"; + +const SIZE = 100_000; +const WARMUP = 3; +const ITERATIONS = 20; + +const s = new Series({ data: Array.from({ length: SIZE }, (_, i) => (i * 1.5) % 9999) }); +const by = new Series({ data: Array.from({ length: SIZE }, (_, i) => i % 100) }); + +for (let i = 0; i < WARMUP; i++) s.groupby(by).agg("sum"); + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + s.groupby(by).agg("sum"); + times.push(performance.now() - t0); +} +const total = times.reduce((a, b) => a + b, 0); +console.log(JSON.stringify({ function: "series_groupby", mean_ms: total / ITERATIONS, iterations: ITERATIONS, total_ms: total })); diff --git a/benchmarks/tsb/bench_series_groupby_apply.ts b/benchmarks/tsb/bench_series_groupby_apply.ts new file mode 100644 index 00000000..c69ce0a9 --- /dev/null +++ b/benchmarks/tsb/bench_series_groupby_apply.ts @@ -0,0 +1,30 @@ +/** + * Benchmark: SeriesGroupBy.apply — apply a function to each group. 
+ */ +import { Series } from "../../src/index.js"; + +const ROWS = 10_000; +const WARMUP = 3; +const ITERATIONS = 20; + +const data = Array.from({ length: ROWS }, (_, i) => i * 0.5); +const by = new Series({ data: Array.from({ length: ROWS }, (_, i) => i % 100) }); +const s = new Series({ data }); + +for (let i = 0; i < WARMUP; i++) { + s.groupby(by).apply((g) => g); +} + +const t0 = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + s.groupby(by).apply((g) => { + const vals = g.toArray() as number[]; + const mean = vals.reduce((a, b) => a + b, 0) / vals.length; + return new Series({ data: vals.map((v) => v - mean) }); + }); +} +const total = performance.now() - t0; + +console.log( + JSON.stringify({ function: "series_groupby_apply", mean_ms: total / ITERATIONS, iterations: ITERATIONS, total_ms: total }), +); diff --git a/benchmarks/tsb/bench_series_groupby_filter.ts b/benchmarks/tsb/bench_series_groupby_filter.ts new file mode 100644 index 00000000..06329485 --- /dev/null +++ b/benchmarks/tsb/bench_series_groupby_filter.ts @@ -0,0 +1,26 @@ +/** + * Benchmark: SeriesGroupBy.filter — keep groups matching a predicate. 
+ */ +import { Series } from "../../src/index.js"; + +const ROWS = 10_000; +const WARMUP = 3; +const ITERATIONS = 20; + +const data = Array.from({ length: ROWS }, (_, i) => i * 1.0); +const by = new Series({ data: Array.from({ length: ROWS }, (_, i) => i % 100) }); +const s = new Series({ data }); + +for (let i = 0; i < WARMUP; i++) { + s.groupby(by).filter((g) => (g.toArray() as number[]).reduce((a, b) => a + b, 0) > 1000); +} + +const t0 = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + s.groupby(by).filter((g) => (g.toArray() as number[]).reduce((a, b) => a + b, 0) > 1000); +} +const total = performance.now() - t0; + +console.log( + JSON.stringify({ function: "series_groupby_filter", mean_ms: total / ITERATIONS, iterations: ITERATIONS, total_ms: total }), +); diff --git a/benchmarks/tsb/bench_series_groupby_transform.ts b/benchmarks/tsb/bench_series_groupby_transform.ts new file mode 100644 index 00000000..604d1311 --- /dev/null +++ b/benchmarks/tsb/bench_series_groupby_transform.ts @@ -0,0 +1,37 @@ +/** + * Benchmark: SeriesGroupBy.transform on 100k Series + */ +import { Series } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const data = Array.from({ length: ROWS }, (_, i) => (i * 1.5) % 9999); +const by = new Series({ data: Array.from({ length: ROWS }, (_, i) => i % 50) }); +const s = new Series({ data }); + +for (let i = 0; i < WARMUP; i++) { + s.groupby(by).transform((vals) => { + const m = (vals as number[]).reduce((a, b) => a + b, 0) / vals.length; + return (vals as number[]).map((v) => v - m); + }); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + s.groupby(by).transform((vals) => { + const m = (vals as number[]).reduce((a, b) => a + b, 0) / vals.length; + return (vals as number[]).map((v) => v - m); + }); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "series_groupby_transform", + mean_ms: total / ITERATIONS, + 
iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_series_iloc.ts b/benchmarks/tsb/bench_series_iloc.ts new file mode 100644 index 00000000..81d1ee4f --- /dev/null +++ b/benchmarks/tsb/bench_series_iloc.ts @@ -0,0 +1,22 @@ +/** + * Benchmark: Series.iloc(positions[]) — integer position selection on 100k Series. + */ +import { Series } from "../../src/index.js"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 30; + +const s = new Series({ data: Array.from({ length: SIZE }, (_, i) => i * 3.0) }); +const positions = Array.from({ length: 1000 }, (_, i) => i * 100); + +for (let i = 0; i < WARMUP; i++) s.iloc(positions); + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + s.iloc(positions); + times.push(performance.now() - t0); +} +const total = times.reduce((a, b) => a + b, 0); +console.log(JSON.stringify({ function: "series_iloc", mean_ms: total / ITERATIONS, iterations: ITERATIONS, total_ms: total })); diff --git a/benchmarks/tsb/bench_series_isin.ts b/benchmarks/tsb/bench_series_isin.ts new file mode 100644 index 00000000..6e361446 --- /dev/null +++ b/benchmarks/tsb/bench_series_isin.ts @@ -0,0 +1,22 @@ +/** + * Benchmark: Series.isin(values) on 100k Series with 100-element lookup set. 
+ */ +import { Series } from "../../src/index.js"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 30; + +const s = new Series({ data: Array.from({ length: SIZE }, (_, i) => i % 500) }); +const lookupSet = Array.from({ length: 100 }, (_, i) => i * 5); + +for (let i = 0; i < WARMUP; i++) s.isin(lookupSet); + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + s.isin(lookupSet); + times.push(performance.now() - t0); +} +const total = times.reduce((a, b) => a + b, 0); +console.log(JSON.stringify({ function: "series_isin", mean_ms: total / ITERATIONS, iterations: ITERATIONS, total_ms: total })); diff --git a/benchmarks/tsb/bench_series_isna_notna.ts b/benchmarks/tsb/bench_series_isna_notna.ts new file mode 100644 index 00000000..1a51159e --- /dev/null +++ b/benchmarks/tsb/bench_series_isna_notna.ts @@ -0,0 +1,22 @@ +/** + * Benchmark: Series.isna() and Series.notna() on 100k Series with NAs. + */ +import { Series } from "../../src/index.js"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const s = new Series({ data: Array.from({ length: SIZE }, (_, i) => i % 3 === 0 ? null : i * 1.0) }); + +for (let i = 0; i < WARMUP; i++) { s.isna(); s.notna(); } + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + s.isna(); + s.notna(); + times.push(performance.now() - t0); +} +const total = times.reduce((a, b) => a + b, 0); +console.log(JSON.stringify({ function: "series_isna_notna", mean_ms: total / ITERATIONS, iterations: ITERATIONS, total_ms: total })); diff --git a/benchmarks/tsb/bench_series_loc.ts b/benchmarks/tsb/bench_series_loc.ts new file mode 100644 index 00000000..a8b367a3 --- /dev/null +++ b/benchmarks/tsb/bench_series_loc.ts @@ -0,0 +1,23 @@ +/** + * Benchmark: Series.loc(labels[]) — label-based selection on 100k Series. 
+ */ +import { Series, Index } from "../../src/index.js"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 30; + +const labels = Array.from({ length: SIZE }, (_, i) => i); +const s = new Series({ data: Array.from({ length: SIZE }, (_, i) => i * 2.0), index: new Index(labels) }); +const selectLabels = Array.from({ length: 1000 }, (_, i) => i * 100); + +for (let i = 0; i < WARMUP; i++) s.loc(selectLabels); + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + s.loc(selectLabels); + times.push(performance.now() - t0); +} +const total = times.reduce((a, b) => a + b, 0); +console.log(JSON.stringify({ function: "series_loc", mean_ms: total / ITERATIONS, iterations: ITERATIONS, total_ms: total })); diff --git a/benchmarks/tsb/bench_series_map.ts b/benchmarks/tsb/bench_series_map.ts new file mode 100644 index 00000000..899cd0bb --- /dev/null +++ b/benchmarks/tsb/bench_series_map.ts @@ -0,0 +1,34 @@ +/** + * Benchmark: Series.map() with a dictionary lookup. 
+ * Outputs JSON: {"function": "series_map", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const s = new Series({ data: Array.from({ length: SIZE }, (_, i) => i % 1000) }); +const lookup = new Map(Array.from({ length: 1000 }, (_, i) => [i, i * 2.5])); + +for (let i = 0; i < WARMUP; i++) { + s.map(lookup); +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const start = performance.now(); + s.map(lookup); + times.push(performance.now() - start); +} + +const totalMs = times.reduce((a, b) => a + b, 0); +const meanMs = totalMs / ITERATIONS; +console.log( + JSON.stringify({ + function: "series_map", + mean_ms: Math.round(meanMs * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(totalMs * 1000) / 1000, + }), +); diff --git a/benchmarks/tsb/bench_series_median.ts b/benchmarks/tsb/bench_series_median.ts new file mode 100644 index 00000000..1178a036 --- /dev/null +++ b/benchmarks/tsb/bench_series_median.ts @@ -0,0 +1,21 @@ +/** + * Benchmark: Series.median() on 100k-element numeric Series. 
+ */ +import { Series } from "../../src/index.js"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 20; + +const s = new Series({ data: Array.from({ length: SIZE }, (_, i) => (i * 1.7) % 9999) }); + +for (let i = 0; i < WARMUP; i++) s.median(); + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + s.median(); + times.push(performance.now() - t0); +} +const total = times.reduce((a, b) => a + b, 0); +console.log(JSON.stringify({ function: "series_median", mean_ms: total / ITERATIONS, iterations: ITERATIONS, total_ms: total })); diff --git a/benchmarks/tsb/bench_series_min_max.ts b/benchmarks/tsb/bench_series_min_max.ts new file mode 100644 index 00000000..03099032 --- /dev/null +++ b/benchmarks/tsb/bench_series_min_max.ts @@ -0,0 +1,21 @@ +/** + * Benchmark: Series.min() and Series.max() on 100k numeric Series. + */ +import { Series } from "../../src/index.js"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const s = new Series({ data: Array.from({ length: SIZE }, (_, i) => (i * 3.14) % 5000) }); + +for (let i = 0; i < WARMUP; i++) { s.min(); s.max(); } + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + s.min(); s.max(); + times.push(performance.now() - t0); +} +const total = times.reduce((a, b) => a + b, 0); +console.log(JSON.stringify({ function: "series_min_max", mean_ms: total / ITERATIONS, iterations: ITERATIONS, total_ms: total })); diff --git a/benchmarks/tsb/bench_series_nlargest.ts b/benchmarks/tsb/bench_series_nlargest.ts new file mode 100644 index 00000000..faab5fd3 --- /dev/null +++ b/benchmarks/tsb/bench_series_nlargest.ts @@ -0,0 +1,30 @@ +/** + * Benchmark: nlargest on 100k-element Series (top 1000) + */ +import { Series, nlargestSeries } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const data = Float64Array.from({ length: ROWS }, (_, i) => Math.sin(i * 0.01) 
* 1000); +const s = new Series(data); + +for (let i = 0; i < WARMUP; i++) { + nlargestSeries(s, 1000); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + nlargestSeries(s, 1000); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "series_nlargest", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_series_nunique.ts b/benchmarks/tsb/bench_series_nunique.ts new file mode 100644 index 00000000..3a40da23 --- /dev/null +++ b/benchmarks/tsb/bench_series_nunique.ts @@ -0,0 +1,33 @@ +/** + * Benchmark: Series.nunique() — count unique values. + * Outputs JSON: {"function": "series_nunique", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const s = new Series({ data: Array.from({ length: SIZE }, (_, i) => i % 1000) }); + +for (let i = 0; i < WARMUP; i++) { + s.nunique(); +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const start = performance.now(); + s.nunique(); + times.push(performance.now() - start); +} + +const totalMs = times.reduce((a, b) => a + b, 0); +const meanMs = totalMs / ITERATIONS; +console.log( + JSON.stringify({ + function: "series_nunique", + mean_ms: Math.round(meanMs * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(totalMs * 1000) / 1000, + }), +); diff --git a/benchmarks/tsb/bench_series_properties.ts b/benchmarks/tsb/bench_series_properties.ts new file mode 100644 index 00000000..20030660 --- /dev/null +++ b/benchmarks/tsb/bench_series_properties.ts @@ -0,0 +1,28 @@ +/** + * Benchmark: Series property access — shape, ndim, size, empty, values, dtype, name + */ +import { Series } from "../../src/index.js"; + +const N = 100_000; +const s = new Series({ data: Array.from({ length: N }, (_, i) => i * 1.0), name: "x" }); + +const WARMUP = 3; +const 
ITERATIONS = 100_000; + +for (let i = 0; i < WARMUP; i++) { + s.shape; s.ndim; s.size; s.empty; s.values; s.dtype; s.name; +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + s.shape; s.ndim; s.size; s.empty; s.values; s.dtype; s.name; +} +const total = performance.now() - start; +console.log( + JSON.stringify({ + function: "series_properties", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_series_quantile.ts b/benchmarks/tsb/bench_series_quantile.ts new file mode 100644 index 00000000..4dbfcf61 --- /dev/null +++ b/benchmarks/tsb/bench_series_quantile.ts @@ -0,0 +1,22 @@ +/** + * Benchmark: Series.quantile(q) on 100k numeric Series. + */ +import { Series } from "../../src/index.js"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 20; + +const s = new Series({ data: Array.from({ length: SIZE }, (_, i) => (i * 1.41) % 10000) }); + +for (let i = 0; i < WARMUP; i++) { s.quantile(0.25); s.quantile(0.75); } + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + s.quantile(0.25); + s.quantile(0.75); + times.push(performance.now() - t0); +} +const total = times.reduce((a, b) => a + b, 0); +console.log(JSON.stringify({ function: "series_quantile", mean_ms: total / ITERATIONS, iterations: ITERATIONS, total_ms: total })); diff --git a/benchmarks/tsb/bench_series_rank.ts b/benchmarks/tsb/bench_series_rank.ts new file mode 100644 index 00000000..10b05127 --- /dev/null +++ b/benchmarks/tsb/bench_series_rank.ts @@ -0,0 +1,30 @@ +/** + * Benchmark: Series rank on 100k-element Series + */ +import { Series, rankSeries } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const data = Float64Array.from({ length: ROWS }, (_, i) => Math.sin(i * 0.01) * 1000); +const s = new Series(data); + +for (let i = 0; i < WARMUP; i++) { + rankSeries(s); +} + +const start = 
performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + rankSeries(s); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "series_rank", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_series_rename.ts b/benchmarks/tsb/bench_series_rename.ts new file mode 100644 index 00000000..8bfee0be --- /dev/null +++ b/benchmarks/tsb/bench_series_rename.ts @@ -0,0 +1,21 @@ +/** + * Benchmark: Series.rename(name) on 100k Series. + */ +import { Series } from "../../src/index.js"; + +const SIZE = 100_000; +const WARMUP = 10; +const ITERATIONS = 100; + +const s = new Series({ data: Array.from({ length: SIZE }, (_, i) => i), name: "old_name" }); + +for (let i = 0; i < WARMUP; i++) s.rename("new_name"); + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + s.rename("new_name"); + times.push(performance.now() - t0); +} +const total = times.reduce((a, b) => a + b, 0); +console.log(JSON.stringify({ function: "series_rename", mean_ms: total / ITERATIONS, iterations: ITERATIONS, total_ms: total })); diff --git a/benchmarks/tsb/bench_series_replace.ts b/benchmarks/tsb/bench_series_replace.ts new file mode 100644 index 00000000..60d1b655 --- /dev/null +++ b/benchmarks/tsb/bench_series_replace.ts @@ -0,0 +1,13 @@ +import { Series } from "tsb"; + +const rng = (seed: number) => { let s = seed; return () => { s = (s * 1664525 + 1013904223) & 0xffffffff; return (s >>> 0) / 0xffffffff; }; }; +const rand = rng(42); +const data = Array.from({ length: 100_000 }, () => Math.floor(rand() * 10)); +const s = new Series(data); +const mapping = new Map(Array.from({ length: 10 }, (_, i) => [i, i * 10] as [number, number])); +for (let i = 0; i < 3; i++) s.replace(mapping); +const N = 50; +const t0 = performance.now(); +for (let i = 0; i < N; i++) s.replace(mapping); +const elapsed = performance.now() - t0; +console.log(JSON.stringify({ 
function: "series_replace", mean_ms: elapsed / N, iterations: N, total_ms: elapsed })); diff --git a/benchmarks/tsb/bench_series_resetindex.ts b/benchmarks/tsb/bench_series_resetindex.ts new file mode 100644 index 00000000..7636c4ed --- /dev/null +++ b/benchmarks/tsb/bench_series_resetindex.ts @@ -0,0 +1,22 @@ +/** + * Benchmark: Series.resetIndex() on 100k Series. + */ +import { Series, Index } from "../../src/index.js"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 30; + +const labels = Array.from({ length: SIZE }, (_, i) => `key_${i}`); +const s = new Series({ data: Array.from({ length: SIZE }, (_, i) => i), index: new Index(labels) }); + +for (let i = 0; i < WARMUP; i++) s.resetIndex(); + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + s.resetIndex(); + times.push(performance.now() - t0); +} +const total = times.reduce((a, b) => a + b, 0); +console.log(JSON.stringify({ function: "series_resetindex", mean_ms: total / ITERATIONS, iterations: ITERATIONS, total_ms: total })); diff --git a/benchmarks/tsb/bench_series_setindex.ts b/benchmarks/tsb/bench_series_setindex.ts new file mode 100644 index 00000000..bedd8d7a --- /dev/null +++ b/benchmarks/tsb/bench_series_setindex.ts @@ -0,0 +1,31 @@ +/** + * Benchmark: series_setindex — Series.setIndex(index) on a 100k-element Series + */ +import { Index, Series } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const data = Float64Array.from({ length: ROWS }, (_, i) => i * 1.5); +const s = new Series(data); +const newIndex = new Index(Array.from({ length: ROWS }, (_, i) => `key${i}`)); + +for (let i = 0; i < WARMUP; i++) { + s.setIndex(newIndex); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + s.setIndex(newIndex); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "series_setindex", + mean_ms: total / ITERATIONS, + iterations: 
ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_series_sort_index.ts b/benchmarks/tsb/bench_series_sort_index.ts new file mode 100644 index 00000000..76eb38c6 --- /dev/null +++ b/benchmarks/tsb/bench_series_sort_index.ts @@ -0,0 +1,22 @@ +/** + * Benchmark: Series.sortIndex() on 100k Series with string labels. + */ +import { Series, Index } from "../../src/index.js"; + +const SIZE = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const labels = Array.from({ length: SIZE }, (_, i) => `lbl_${(SIZE - i).toString().padStart(6, "0")}`); +const s = new Series({ data: Array.from({ length: SIZE }, (_, i) => i), index: new Index(labels) }); + +for (let i = 0; i < WARMUP; i++) s.sortIndex(); + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + s.sortIndex(); + times.push(performance.now() - t0); +} +const total = times.reduce((a, b) => a + b, 0); +console.log(JSON.stringify({ function: "series_sort_index", mean_ms: total / ITERATIONS, iterations: ITERATIONS, total_ms: total })); diff --git a/benchmarks/tsb/bench_series_std_var.ts b/benchmarks/tsb/bench_series_std_var.ts new file mode 100644 index 00000000..41bdef09 --- /dev/null +++ b/benchmarks/tsb/bench_series_std_var.ts @@ -0,0 +1,21 @@ +/** + * Benchmark: Series.std() and Series.var() on 100k numeric Series. 
+ */ +import { Series } from "../../src/index.js"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 30; + +const s = new Series({ data: Array.from({ length: SIZE }, (_, i) => (i * 2.71) % 10000) }); + +for (let i = 0; i < WARMUP; i++) { s.std(); s.var(); } + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + s.std(); s.var(); + times.push(performance.now() - t0); +} +const total = times.reduce((a, b) => a + b, 0); +console.log(JSON.stringify({ function: "series_std_var", mean_ms: total / ITERATIONS, iterations: ITERATIONS, total_ms: total })); diff --git a/benchmarks/tsb/bench_series_str_replace.ts b/benchmarks/tsb/bench_series_str_replace.ts new file mode 100644 index 00000000..be4d0041 --- /dev/null +++ b/benchmarks/tsb/bench_series_str_replace.ts @@ -0,0 +1,30 @@ +/** + * Benchmark: series_str_replace — str.replace on 100k strings + */ +import { Series } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const data = Array.from({ length: ROWS }, (_, i) => `hello_world_${i % 200}`); +const s = new Series({ data }); + +for (let i = 0; i < WARMUP; i++) { + s.str.replace("world", "there"); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + s.str.replace("world", "there"); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "series_str_replace", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_series_sum_mean.ts b/benchmarks/tsb/bench_series_sum_mean.ts new file mode 100644 index 00000000..a5e3ec62 --- /dev/null +++ b/benchmarks/tsb/bench_series_sum_mean.ts @@ -0,0 +1,21 @@ +/** + * Benchmark: Series.sum() and Series.mean() on 100k numeric Series. 
+ */ +import { Series } from "../../src/index.js"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const s = new Series({ data: Array.from({ length: SIZE }, (_, i) => i * 0.001) }); + +for (let i = 0; i < WARMUP; i++) { s.sum(); s.mean(); } + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + s.sum(); s.mean(); + times.push(performance.now() - t0); +} +const total = times.reduce((a, b) => a + b, 0); +console.log(JSON.stringify({ function: "series_sum_mean", mean_ms: total / ITERATIONS, iterations: ITERATIONS, total_ms: total })); diff --git a/benchmarks/tsb/bench_series_to_string.ts b/benchmarks/tsb/bench_series_to_string.ts new file mode 100644 index 00000000..5dc91253 --- /dev/null +++ b/benchmarks/tsb/bench_series_to_string.ts @@ -0,0 +1,23 @@ +/** + * Benchmark: seriesToString on 1k-element Series + */ +import { Series, seriesToString } from "../../src/index.js"; + +const N = 1_000; +const WARMUP = 3; +const ITERATIONS = 10; +const data = Array.from({ length: N }, (_, i) => i * 0.1); +const s = new Series({ data }); + +for (let i = 0; i < WARMUP; i++) seriesToString(s); +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) seriesToString(s); +const total = performance.now() - start; +console.log( + JSON.stringify({ + function: "series_to_string", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_series_toarray_tolist.ts b/benchmarks/tsb/bench_series_toarray_tolist.ts new file mode 100644 index 00000000..61409904 --- /dev/null +++ b/benchmarks/tsb/bench_series_toarray_tolist.ts @@ -0,0 +1,32 @@ +/** + * Benchmark: Series toArray and toList on 100k-element Series + */ +import { Series } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const data = Float64Array.from({ length: ROWS }, (_, i) => i * 0.5); +const s = new Series(data); + +for (let i = 0; i < 
WARMUP; i++) { + s.toArray(); + s.toList(); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + s.toArray(); + s.toList(); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "series_toarray_tolist", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_series_toobject.ts b/benchmarks/tsb/bench_series_toobject.ts new file mode 100644 index 00000000..d3aa6094 --- /dev/null +++ b/benchmarks/tsb/bench_series_toobject.ts @@ -0,0 +1,21 @@ +/** + * Benchmark: Series.toObject() — convert to {label: value} record on 100k Series. + */ +import { Series } from "../../src/index.js"; + +const SIZE = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const s = new Series({ data: Array.from({ length: SIZE }, (_, i) => i * 1.5) }); + +for (let i = 0; i < WARMUP; i++) s.toObject(); + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + s.toObject(); + times.push(performance.now() - t0); +} +const total = times.reduce((a, b) => a + b, 0); +console.log(JSON.stringify({ function: "series_toobject", mean_ms: total / ITERATIONS, iterations: ITERATIONS, total_ms: total })); diff --git a/benchmarks/tsb/bench_series_transform.ts b/benchmarks/tsb/bench_series_transform.ts new file mode 100644 index 00000000..a5833f57 --- /dev/null +++ b/benchmarks/tsb/bench_series_transform.ts @@ -0,0 +1,23 @@ +/** + * Benchmark: seriesTransform on 100k-element Series + */ +import { Series, seriesTransform } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; +const data = Array.from({ length: ROWS }, (_, i) => i * 0.1); +const s = new Series({ data }); + +for (let i = 0; i < WARMUP; i++) seriesTransform(s, (v) => (v as number) ** 2); +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) seriesTransform(s, (v) => (v as number) ** 2); +const total = 
performance.now() - start; +console.log( + JSON.stringify({ + function: "series_transform", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_series_unique.ts b/benchmarks/tsb/bench_series_unique.ts new file mode 100644 index 00000000..8643bc5e --- /dev/null +++ b/benchmarks/tsb/bench_series_unique.ts @@ -0,0 +1,21 @@ +/** + * Benchmark: Series.unique() on 100k-element Series with 1000 distinct values. + */ +import { Series } from "../../src/index.js"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 30; + +const s = new Series({ data: Array.from({ length: SIZE }, (_, i) => i % 1000) }); + +for (let i = 0; i < WARMUP; i++) s.unique(); + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + s.unique(); + times.push(performance.now() - t0); +} +const total = times.reduce((a, b) => a + b, 0); +console.log(JSON.stringify({ function: "series_unique", mean_ms: total / ITERATIONS, iterations: ITERATIONS, total_ms: total })); diff --git a/benchmarks/tsb/bench_series_with_values.ts b/benchmarks/tsb/bench_series_with_values.ts new file mode 100644 index 00000000..472f7a11 --- /dev/null +++ b/benchmarks/tsb/bench_series_with_values.ts @@ -0,0 +1,24 @@ +/** + * Benchmark: Series.withValues() on 100k-element Series + */ +import { Series } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; +const data = Array.from({ length: ROWS }, (_, i) => i * 1.0); +const newData = Array.from({ length: ROWS }, (_, i) => i * 2.0); +const s = new Series({ data, name: "x" }); + +for (let i = 0; i < WARMUP; i++) s.withValues(newData); +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) s.withValues(newData); +const total = performance.now() - start; +console.log( + JSON.stringify({ + function: "series_with_values", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); 
diff --git a/benchmarks/tsb/bench_skew_kurt.ts b/benchmarks/tsb/bench_skew_kurt.ts new file mode 100644 index 00000000..cb47e27e --- /dev/null +++ b/benchmarks/tsb/bench_skew_kurt.ts @@ -0,0 +1,26 @@ +/** + * Benchmark: skewSeries / kurtSeries — skewness and kurtosis on a 100k-element Series. + * Outputs JSON: {"function": "skew_kurt", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series, skewSeries, kurtSeries } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const data = Float64Array.from({ length: SIZE }, (_, i) => Math.sin(i * 0.01) * 100); +const s = new Series(data); + +for (let i = 0; i < WARMUP; i++) { + skewSeries(s); + kurtSeries(s); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + skewSeries(s); + kurtSeries(s); +} +const total = performance.now() - start; + +console.log(JSON.stringify({ function: "skew_kurt", mean_ms: total / ITERATIONS, iterations: ITERATIONS, total_ms: total })); diff --git a/benchmarks/tsb/bench_str_byte_length.ts b/benchmarks/tsb/bench_str_byte_length.ts new file mode 100644 index 00000000..d0e7ce23 --- /dev/null +++ b/benchmarks/tsb/bench_str_byte_length.ts @@ -0,0 +1,13 @@ +import { strByteLength } from "tsb"; +import { Series } from "tsb"; +const N = 100_000; +const words = ["hello", "world", "typescript", "benchmark", "tsb"]; +const data = Array.from({ length: N }, (_, i) => words[i % words.length]); +const s = new Series(data); +const WARMUP = 3; +const ITERS = 20; +for (let i = 0; i < WARMUP; i++) strByteLength(s); +const t0 = performance.now(); +for (let i = 0; i < ITERS; i++) strByteLength(s); +const total = performance.now() - t0; +console.log(JSON.stringify({ function: "str_byte_length", mean_ms: total / ITERS, iterations: ITERS, total_ms: total })); diff --git a/benchmarks/tsb/bench_str_case.ts b/benchmarks/tsb/bench_str_case.ts new file mode 100644 index 00000000..6bf2140a --- /dev/null +++ 
b/benchmarks/tsb/bench_str_case.ts @@ -0,0 +1,34 @@ +/** + * Benchmark: str_case — str.title, str.capitalize, str.swapcase on 100k strings + */ +import { Series } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const data = Array.from({ length: ROWS }, (_, i) => `hello world ${i}`); +const s = new Series({ data }); + +for (let i = 0; i < WARMUP; i++) { + s.str.title(); + s.str.capitalize(); + s.str.swapcase(); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + s.str.title(); + s.str.capitalize(); + s.str.swapcase(); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "str_case", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_str_cat.ts b/benchmarks/tsb/bench_str_cat.ts new file mode 100644 index 00000000..8b333786 --- /dev/null +++ b/benchmarks/tsb/bench_str_cat.ts @@ -0,0 +1,31 @@ +/** + * Benchmark: str_cat — str.cat concatenating a Series with another array on 100k strings + */ +import { Series } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const data = Array.from({ length: ROWS }, (_, i) => `hello_${i % 200}`); +const other = Array.from({ length: ROWS }, (_, i) => `_world_${i % 100}`); +const s = new Series({ data }); + +for (let i = 0; i < WARMUP; i++) { + s.str.cat([other], "-"); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + s.str.cat([other], "-"); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "str_cat", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_str_char_width.ts b/benchmarks/tsb/bench_str_char_width.ts new file mode 100644 index 00000000..d4b1fe50 --- /dev/null +++ b/benchmarks/tsb/bench_str_char_width.ts @@ -0,0 +1,13 @@ +import { strCharWidth } from "tsb"; 
+import { Series } from "tsb"; +const N = 100_000; +const words = ["hello", "world", "café", "résumé", "naïve"]; +const data = Array.from({ length: N }, (_, i) => words[i % words.length]); +const s = new Series(data); +const WARMUP = 3; +const ITERS = 20; +for (let i = 0; i < WARMUP; i++) strCharWidth(s); +const t0 = performance.now(); +for (let i = 0; i < ITERS; i++) strCharWidth(s); +const total = performance.now() - t0; +console.log(JSON.stringify({ function: "str_char_width", mean_ms: total / ITERS, iterations: ITERS, total_ms: total })); diff --git a/benchmarks/tsb/bench_str_count.ts b/benchmarks/tsb/bench_str_count.ts new file mode 100644 index 00000000..5332f8d2 --- /dev/null +++ b/benchmarks/tsb/bench_str_count.ts @@ -0,0 +1,30 @@ +/** + * Benchmark: str_count — str.count occurrences of pattern on 100k strings + */ +import { Series } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const data = Array.from({ length: ROWS }, (_, i) => `abc abc abc ${i % 5 === 0 ? 
"abc" : "xyz"}`); +const s = new Series({ data }); + +for (let i = 0; i < WARMUP; i++) { + s.str.count("abc"); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + s.str.count("abc"); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "str_count", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_str_dedent.ts b/benchmarks/tsb/bench_str_dedent.ts new file mode 100644 index 00000000..b57b0018 --- /dev/null +++ b/benchmarks/tsb/bench_str_dedent.ts @@ -0,0 +1,22 @@ +/** + * Benchmark: strDedent on 50k multi-line strings + */ +import { strDedent } from "../../src/index.js"; + +const N = 50_000; +const WARMUP = 3; +const ITERATIONS = 10; +const data = Array.from({ length: N }, (_, i) => ` line1 ${i}\n line2 ${i}\n line3 ${i}`); + +for (let i = 0; i < WARMUP; i++) data.map((s) => strDedent(s)); +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) data.map((s) => strDedent(s)); +const total = performance.now() - start; +console.log( + JSON.stringify({ + function: "str_dedent", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_str_encode.ts b/benchmarks/tsb/bench_str_encode.ts new file mode 100644 index 00000000..ed4e65ac --- /dev/null +++ b/benchmarks/tsb/bench_str_encode.ts @@ -0,0 +1,30 @@ +/** + * Benchmark: str_encode — str.encode byte-length encoding on 100k strings + */ +import { Series } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const data = Array.from({ length: ROWS }, (_, i) => `hello world ${i}`); +const s = new Series({ data }); + +for (let i = 0; i < WARMUP; i++) { + s.str.encode(); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + s.str.encode(); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "str_encode", 
+ mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_str_extract_all.ts b/benchmarks/tsb/bench_str_extract_all.ts new file mode 100644 index 00000000..47a3ff25 --- /dev/null +++ b/benchmarks/tsb/bench_str_extract_all.ts @@ -0,0 +1,23 @@ +/** + * Benchmark: strExtractAll on 10k-element string Series + */ +import { Series, strExtractAll } from "../../src/index.js"; + +const ROWS = 10_000; +const WARMUP = 3; +const ITERATIONS = 10; +const data = Array.from({ length: ROWS }, (_, i) => `val${i} num${i * 2} extra${i}`); +const s = new Series({ data }); + +for (let i = 0; i < WARMUP; i++) strExtractAll(s, /\d+/g); +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) strExtractAll(s, /\d+/g); +const total = performance.now() - start; +console.log( + JSON.stringify({ + function: "str_extract_all", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_str_extract_groups.ts b/benchmarks/tsb/bench_str_extract_groups.ts new file mode 100644 index 00000000..f26c25ae --- /dev/null +++ b/benchmarks/tsb/bench_str_extract_groups.ts @@ -0,0 +1,23 @@ +/** + * Benchmark: strExtractGroups on 10k-element string Series + */ +import { Series, strExtractGroups } from "../../src/index.js"; + +const ROWS = 10_000; +const WARMUP = 3; +const ITERATIONS = 10; +const data = Array.from({ length: ROWS }, (_, i) => `user_${i}_score_${i % 100}`); +const s = new Series({ data }); + +for (let i = 0; i < WARMUP; i++) strExtractGroups(s, /user_(\d+)_score_(\d+)/); +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) strExtractGroups(s, /user_(\d+)_score_(\d+)/); +const total = performance.now() - start; +console.log( + JSON.stringify({ + function: "str_extract_groups", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_str_find.ts b/benchmarks/tsb/bench_str_find.ts new 
file mode 100644 index 00000000..f6f835cb --- /dev/null +++ b/benchmarks/tsb/bench_str_find.ts @@ -0,0 +1,32 @@ +/** + * Benchmark: str_find — str.find and str.rfind on 100k strings + */ +import { Series } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const data = Array.from({ length: ROWS }, (_, i) => `hello_world_${i % 200}_end`); +const s = new Series({ data }); + +for (let i = 0; i < WARMUP; i++) { + s.str.find("world"); + s.str.rfind("_"); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + s.str.find("world"); + s.str.rfind("_"); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "str_find", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_str_fullmatch.ts b/benchmarks/tsb/bench_str_fullmatch.ts new file mode 100644 index 00000000..854a009d --- /dev/null +++ b/benchmarks/tsb/bench_str_fullmatch.ts @@ -0,0 +1,30 @@ +/** + * Benchmark: str_fullmatch — str.fullmatch (regex full match) on 100k strings + */ +import { Series } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const data = Array.from({ length: ROWS }, (_, i) => `item_${i % 200}`); +const s = new Series({ data }); + +for (let i = 0; i < WARMUP; i++) { + s.str.fullmatch("item_\\d+"); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + s.str.fullmatch("item_\\d+"); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "str_fullmatch", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_str_get_dummies.ts b/benchmarks/tsb/bench_str_get_dummies.ts new file mode 100644 index 00000000..5e1cacab --- /dev/null +++ b/benchmarks/tsb/bench_str_get_dummies.ts @@ -0,0 +1,23 @@ +/** + * Benchmark: strGetDummies on 10k-element string Series + */ 
+import { Series, strGetDummies } from "../../src/index.js"; + +const ROWS = 10_000; +const WARMUP = 3; +const ITERATIONS = 10; +const data = Array.from({ length: ROWS }, (_, i) => `a|b|${String.fromCharCode(97 + (i % 5))}`); +const s = new Series({ data }); + +for (let i = 0; i < WARMUP; i++) strGetDummies(s, { sep: "|" }); +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) strGetDummies(s, { sep: "|" }); +const total = performance.now() - start; +console.log( + JSON.stringify({ + function: "str_get_dummies", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_str_indent.ts b/benchmarks/tsb/bench_str_indent.ts new file mode 100644 index 00000000..782128ce --- /dev/null +++ b/benchmarks/tsb/bench_str_indent.ts @@ -0,0 +1,22 @@ +/** + * Benchmark: strIndent on 50k multi-line strings + */ +import { strIndent } from "../../src/index.js"; + +const N = 50_000; +const WARMUP = 3; +const ITERATIONS = 10; +const data = Array.from({ length: N }, (_, i) => `line1 ${i}\nline2 ${i}\nline3 ${i}`); + +for (let i = 0; i < WARMUP; i++) data.map((s) => strIndent(s, { prefix: " " })); +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) data.map((s) => strIndent(s, { prefix: " " })); +const total = performance.now() - start; +console.log( + JSON.stringify({ + function: "str_indent", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_str_is_alpha_digit.ts b/benchmarks/tsb/bench_str_is_alpha_digit.ts new file mode 100644 index 00000000..53667e06 --- /dev/null +++ b/benchmarks/tsb/bench_str_is_alpha_digit.ts @@ -0,0 +1,32 @@ +/** + * Benchmark: str_is_alpha_digit — str.isalpha and str.isdigit on 100k strings + */ +import { Series } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const data = Array.from({ length: ROWS }, (_, i) => (i % 2 === 0 ? 
`hello` : `12345`)); +const s = new Series({ data }); + +for (let i = 0; i < WARMUP; i++) { + s.str.isalpha(); + s.str.isdigit(); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + s.str.isalpha(); + s.str.isdigit(); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "str_is_alpha_digit", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_str_isalnum_isnumeric.ts b/benchmarks/tsb/bench_str_isalnum_isnumeric.ts new file mode 100644 index 00000000..426fc1fe --- /dev/null +++ b/benchmarks/tsb/bench_str_isalnum_isnumeric.ts @@ -0,0 +1,32 @@ +/** + * Benchmark: str_isalnum_isnumeric — str.isalnum and str.isnumeric on 100k strings + */ +import { Series } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const data = Array.from({ length: ROWS }, (_, i) => (i % 3 === 0 ? `abc123` : i % 3 === 1 ? `12345` : `hello!`)); +const s = new Series({ data }); + +for (let i = 0; i < WARMUP; i++) { + s.str.isalnum(); + s.str.isnumeric(); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + s.str.isalnum(); + s.str.isnumeric(); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "str_isalnum_isnumeric", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_str_islower_isupper.ts b/benchmarks/tsb/bench_str_islower_isupper.ts new file mode 100644 index 00000000..af5cc2a7 --- /dev/null +++ b/benchmarks/tsb/bench_str_islower_isupper.ts @@ -0,0 +1,32 @@ +/** + * Benchmark: str_islower_isupper — str.islower and str.isupper on 100k strings + */ +import { Series } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const data = Array.from({ length: ROWS }, (_, i) => (i % 2 === 0 ? 
`hello` : `WORLD`)); +const s = new Series({ data }); + +for (let i = 0; i < WARMUP; i++) { + s.str.islower(); + s.str.isupper(); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + s.str.islower(); + s.str.isupper(); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "str_islower_isupper", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_str_istitle_isspace.ts b/benchmarks/tsb/bench_str_istitle_isspace.ts new file mode 100644 index 00000000..2e8540b2 --- /dev/null +++ b/benchmarks/tsb/bench_str_istitle_isspace.ts @@ -0,0 +1,32 @@ +/** + * Benchmark: str_istitle_isspace — str.istitle and str.isspace on 100k strings + */ +import { Series } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const data = Array.from({ length: ROWS }, (_, i) => (i % 3 === 0 ? `Hello World` : i % 3 === 1 ? ` ` : `hello world`)); +const s = new Series({ data }); + +for (let i = 0; i < WARMUP; i++) { + s.str.istitle(); + s.str.isspace(); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + s.str.istitle(); + s.str.isspace(); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "str_istitle_isspace", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_str_join.ts b/benchmarks/tsb/bench_str_join.ts new file mode 100644 index 00000000..d1cfbb04 --- /dev/null +++ b/benchmarks/tsb/bench_str_join.ts @@ -0,0 +1,31 @@ +/** + * Benchmark: str_join — str.join on 100k list-of-strings Series values + */ +import { Series } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +// Series where each element is a list of strings (already split) +const data = Array.from({ length: ROWS }, (_, i) => [`a${i % 10}`, `b${i % 5}`, `c${i % 3}`]); +const 
s = new Series({ data }); + +for (let i = 0; i < WARMUP; i++) { + s.str.join("-"); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + s.str.join("-"); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "str_join", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_str_len.ts b/benchmarks/tsb/bench_str_len.ts new file mode 100644 index 00000000..b84df06a --- /dev/null +++ b/benchmarks/tsb/bench_str_len.ts @@ -0,0 +1,23 @@ +/** + * Benchmark: Series.str.len() on 100k-element string Series + */ +import { Series } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; +const data = Array.from({ length: ROWS }, (_, i) => `item_${i}_value`); +const s = new Series({ data, name: "text" }); + +for (let i = 0; i < WARMUP; i++) s.str.len(); +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) s.str.len(); +const total = performance.now() - start; +console.log( + JSON.stringify({ + function: "str_len", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_str_lower_upper.ts b/benchmarks/tsb/bench_str_lower_upper.ts new file mode 100644 index 00000000..48f0c13f --- /dev/null +++ b/benchmarks/tsb/bench_str_lower_upper.ts @@ -0,0 +1,32 @@ +/** + * Benchmark: str_lower_upper — str.lower and str.upper on 100k strings + */ +import { Series } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const data = Array.from({ length: ROWS }, (_, i) => `Hello_World_${i % 200}`); +const s = new Series({ data }); + +for (let i = 0; i < WARMUP; i++) { + s.str.lower(); + s.str.upper(); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + s.str.lower(); + s.str.upper(); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: 
"str_lower_upper", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_str_match.ts b/benchmarks/tsb/bench_str_match.ts new file mode 100644 index 00000000..35be4f5a --- /dev/null +++ b/benchmarks/tsb/bench_str_match.ts @@ -0,0 +1,30 @@ +/** + * Benchmark: str_match — str.match regex matching on 100k strings + */ +import { Series } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const data = Array.from({ length: ROWS }, (_, i) => `item_${i % 500}_abc`); +const s = new Series({ data }); + +for (let i = 0; i < WARMUP; i++) { + s.str.match(/^item_\d+/); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + s.str.match(/^item_\d+/); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "str_match", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_str_multi_replace.ts b/benchmarks/tsb/bench_str_multi_replace.ts new file mode 100644 index 00000000..56e15e2b --- /dev/null +++ b/benchmarks/tsb/bench_str_multi_replace.ts @@ -0,0 +1,28 @@ +/** + * Benchmark: strMultiReplace on 100k-element string Series + */ +import { Series, strMultiReplace } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; +const data = Array.from({ length: ROWS }, (_, i) => `foo bar baz ${i}`); +const s = new Series({ data }); +const pairs: [string, string][] = [ + ["foo", "alpha"], + ["bar", "beta"], + ["baz", "gamma"], +]; + +for (let i = 0; i < WARMUP; i++) strMultiReplace(s, pairs); +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) strMultiReplace(s, pairs); +const total = performance.now() - start; +console.log( + JSON.stringify({ + function: "str_multi_replace", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git 
a/benchmarks/tsb/bench_str_normalize.ts b/benchmarks/tsb/bench_str_normalize.ts new file mode 100644 index 00000000..07496d91 --- /dev/null +++ b/benchmarks/tsb/bench_str_normalize.ts @@ -0,0 +1,23 @@ +/** + * Benchmark: strNormalize on 100k-element string Series + */ +import { Series, strNormalize } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; +const data = Array.from({ length: ROWS }, (_, i) => `caf\u00e9 ${i}`); +const s = new Series({ data }); + +for (let i = 0; i < WARMUP; i++) strNormalize(s, "NFC"); +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) strNormalize(s, "NFC"); +const total = performance.now() - start; +console.log( + JSON.stringify({ + function: "str_normalize", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_str_pad.ts b/benchmarks/tsb/bench_str_pad.ts new file mode 100644 index 00000000..06c2e648 --- /dev/null +++ b/benchmarks/tsb/bench_str_pad.ts @@ -0,0 +1,34 @@ +/** + * Benchmark: str_pad — str.pad, str.ljust, str.rjust, str.zfill on 100k strings + */ +import { Series } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const data = Array.from({ length: ROWS }, (_, i) => `hello_${i % 200}`); +const s = new Series({ data }); + +for (let i = 0; i < WARMUP; i++) { + s.str.pad(20); + s.str.ljust(20); + s.str.rjust(20); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + s.str.pad(20); + s.str.ljust(20); + s.str.rjust(20); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "str_pad", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_str_partition.ts b/benchmarks/tsb/bench_str_partition.ts new file mode 100644 index 00000000..73fce95f --- /dev/null +++ b/benchmarks/tsb/bench_str_partition.ts @@ -0,0 +1,23 @@ +/** + * 
Benchmark: strPartition on 100k-element string Series + */ +import { Series, strPartition } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; +const data = Array.from({ length: ROWS }, (_, i) => `prefix_${i}_suffix`); +const s = new Series({ data }); + +for (let i = 0; i < WARMUP; i++) strPartition(s, "_"); +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) strPartition(s, "_"); +const total = performance.now() - start; +console.log( + JSON.stringify({ + function: "str_partition", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_str_remove_prefix.ts b/benchmarks/tsb/bench_str_remove_prefix.ts new file mode 100644 index 00000000..e863ad18 --- /dev/null +++ b/benchmarks/tsb/bench_str_remove_prefix.ts @@ -0,0 +1,23 @@ +/** + * Benchmark: strRemovePrefix on 100k-element string Series + */ +import { Series, strRemovePrefix } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; +const data = Array.from({ length: ROWS }, (_, i) => `prefix_value_${i}`); +const s = new Series({ data }); + +for (let i = 0; i < WARMUP; i++) strRemovePrefix(s, "prefix_"); +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) strRemovePrefix(s, "prefix_"); +const total = performance.now() - start; +console.log( + JSON.stringify({ + function: "str_remove_prefix", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_str_remove_suffix.ts b/benchmarks/tsb/bench_str_remove_suffix.ts new file mode 100644 index 00000000..990bd845 --- /dev/null +++ b/benchmarks/tsb/bench_str_remove_suffix.ts @@ -0,0 +1,23 @@ +/** + * Benchmark: strRemoveSuffix on 100k-element string Series + */ +import { Series, strRemoveSuffix } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; +const data = Array.from({ length: 
ROWS }, (_, i) => `value_${i}_suffix`); +const s = new Series({ data }); + +for (let i = 0; i < WARMUP; i++) strRemoveSuffix(s, "_suffix"); +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) strRemoveSuffix(s, "_suffix"); +const total = performance.now() - start; +console.log( + JSON.stringify({ + function: "str_remove_suffix", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_str_repeat.ts b/benchmarks/tsb/bench_str_repeat.ts new file mode 100644 index 00000000..7de3e8b0 --- /dev/null +++ b/benchmarks/tsb/bench_str_repeat.ts @@ -0,0 +1,30 @@ +/** + * Benchmark: str_repeat — str.repeat on 100k strings + */ +import { Series } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const data = Array.from({ length: ROWS }, (_, i) => `ab_${i % 100}`); +const s = new Series({ data }); + +for (let i = 0; i < WARMUP; i++) { + s.str.repeat(3); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + s.str.repeat(3); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "str_repeat", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_str_rpartition.ts b/benchmarks/tsb/bench_str_rpartition.ts new file mode 100644 index 00000000..676900c6 --- /dev/null +++ b/benchmarks/tsb/bench_str_rpartition.ts @@ -0,0 +1,23 @@ +/** + * Benchmark: strRPartition on 100k-element string Series + */ +import { Series, strRPartition } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; +const data = Array.from({ length: ROWS }, (_, i) => `prefix_${i}_suffix`); +const s = new Series({ data }); + +for (let i = 0; i < WARMUP; i++) strRPartition(s, "_"); +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) strRPartition(s, "_"); +const total = performance.now() - start; 
+console.log( + JSON.stringify({ + function: "str_rpartition", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_str_rsplit.ts b/benchmarks/tsb/bench_str_rsplit.ts new file mode 100644 index 00000000..b9f65a80 --- /dev/null +++ b/benchmarks/tsb/bench_str_rsplit.ts @@ -0,0 +1,30 @@ +/** + * Benchmark: str_rsplit — StringAccessor rsplit() on 100k strings + */ +import { Series } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const data = Array.from({ length: ROWS }, (_, i) => `part_${i % 100}_b_c_d`); +const s = new Series({ data }); + +for (let i = 0; i < WARMUP; i++) { + s.str.rsplit("_", undefined, 2); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + s.str.rsplit("_", undefined, 2); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "str_rsplit", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_str_slice_get.ts b/benchmarks/tsb/bench_str_slice_get.ts new file mode 100644 index 00000000..2df2c5ca --- /dev/null +++ b/benchmarks/tsb/bench_str_slice_get.ts @@ -0,0 +1,32 @@ +/** + * Benchmark: str_slice_get — str.slice and str.get character extraction on 100k strings + */ +import { Series } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const data = Array.from({ length: ROWS }, (_, i) => `hello_world_${i}`); +const s = new Series({ data }); + +for (let i = 0; i < WARMUP; i++) { + s.str.slice(0, 5); + s.str.get(0); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + s.str.slice(0, 5); + s.str.get(0); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "str_slice_get", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git 
a/benchmarks/tsb/bench_str_slice_replace.ts b/benchmarks/tsb/bench_str_slice_replace.ts new file mode 100644 index 00000000..4bb342a2 --- /dev/null +++ b/benchmarks/tsb/bench_str_slice_replace.ts @@ -0,0 +1,30 @@ +/** + * Benchmark: str_slice_replace — StringAccessor sliceReplace() on 100k strings + */ +import { Series } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const data = Array.from({ length: ROWS }, (_, i) => `hello_world_${i % 1000}`); +const s = new Series({ data }); + +for (let i = 0; i < WARMUP; i++) { + s.str.sliceReplace(0, 5, "goodbye"); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + s.str.sliceReplace(0, 5, "goodbye"); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "str_slice_replace", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_str_split_expand.ts b/benchmarks/tsb/bench_str_split_expand.ts new file mode 100644 index 00000000..65557cbd --- /dev/null +++ b/benchmarks/tsb/bench_str_split_expand.ts @@ -0,0 +1,23 @@ +/** + * Benchmark: strSplitExpand on 10k-element string Series + */ +import { Series, strSplitExpand } from "../../src/index.js"; + +const ROWS = 10_000; +const WARMUP = 3; +const ITERATIONS = 10; +const data = Array.from({ length: ROWS }, (_, i) => `a_${i}_b_${i * 2}_c`); +const s = new Series({ data }); + +for (let i = 0; i < WARMUP; i++) strSplitExpand(s, "_"); +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) strSplitExpand(s, "_"); +const total = performance.now() - start; +console.log( + JSON.stringify({ + function: "str_split_expand", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_str_startswith_endswith.ts b/benchmarks/tsb/bench_str_startswith_endswith.ts new file mode 100644 index 00000000..dd97855c --- /dev/null +++ 
b/benchmarks/tsb/bench_str_startswith_endswith.ts @@ -0,0 +1,32 @@ +/** + * Benchmark: str_startswith_endswith — str.startswith and str.endswith on 100k strings + */ +import { Series } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const data = Array.from({ length: ROWS }, (_, i) => `hello_world_${i % 200}_suffix`); +const s = new Series({ data }); + +for (let i = 0; i < WARMUP; i++) { + s.str.startswith("hello"); + s.str.endswith("suffix"); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + s.str.startswith("hello"); + s.str.endswith("suffix"); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "str_startswith_endswith", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_str_strip.ts b/benchmarks/tsb/bench_str_strip.ts new file mode 100644 index 00000000..fa90c9d1 --- /dev/null +++ b/benchmarks/tsb/bench_str_strip.ts @@ -0,0 +1,34 @@ +/** + * Benchmark: str_strip — str.strip, str.lstrip, str.rstrip on 100k strings + */ +import { Series } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const data = Array.from({ length: ROWS }, (_, i) => ` hello_world_${i % 200} `); +const s = new Series({ data }); + +for (let i = 0; i < WARMUP; i++) { + s.str.strip(); + s.str.lstrip(); + s.str.rstrip(); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + s.str.strip(); + s.str.lstrip(); + s.str.rstrip(); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "str_strip", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_str_translate.ts b/benchmarks/tsb/bench_str_translate.ts new file mode 100644 index 00000000..14fd7608 --- /dev/null +++ b/benchmarks/tsb/bench_str_translate.ts @@ -0,0 +1,24 @@ +/** + * 
Benchmark: strTranslate on 100k-element string Series + */ +import { Series, strTranslate } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; +const data = Array.from({ length: ROWS }, (_, i) => `hello world ${i}`); +const s = new Series({ data }); +const table: Record<string, string> = { h: "H", w: "W", o: "0" }; + +for (let i = 0; i < WARMUP; i++) strTranslate(s, table); +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) strTranslate(s, table); +const total = performance.now() - start; +console.log( + JSON.stringify({ + function: "str_translate", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_str_wrap.ts b/benchmarks/tsb/bench_str_wrap.ts new file mode 100644 index 00000000..fea2922d --- /dev/null +++ b/benchmarks/tsb/bench_str_wrap.ts @@ -0,0 +1,30 @@ +/** + * Benchmark: str_wrap — str.wrap word wrapping on 100k strings + */ +import { Series } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const data = Array.from({ length: ROWS }, () => `the quick brown fox jumps over the lazy dog`); +const s = new Series({ data }); + +for (let i = 0; i < WARMUP; i++) { + s.str.wrap(20); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + s.str.wrap(20); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "str_wrap", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_str_zfill_center_ljust_rjust.ts b/benchmarks/tsb/bench_str_zfill_center_ljust_rjust.ts new file mode 100644 index 00000000..5d17a3ca --- /dev/null +++ b/benchmarks/tsb/bench_str_zfill_center_ljust_rjust.ts @@ -0,0 +1,36 @@ +/** + * Benchmark: str_zfill_center_ljust_rjust — padding operations on 100k strings + */ +import { Series } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const
ITERATIONS = 10; + +const data = Array.from({ length: ROWS }, (_, i) => `${i}`); +const s = new Series({ data }); + +for (let i = 0; i < WARMUP; i++) { + s.str.zfill(10); + s.str.center(10); + s.str.ljust(10); + s.str.rjust(10); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + s.str.zfill(10); + s.str.center(10); + s.str.ljust(10); + s.str.rjust(10); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "str_zfill_center_ljust_rjust", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_string_contains.ts b/benchmarks/tsb/bench_string_contains.ts new file mode 100644 index 00000000..33eb0305 --- /dev/null +++ b/benchmarks/tsb/bench_string_contains.ts @@ -0,0 +1,13 @@ +import { Series } from "tsb"; + +const words = ["apple", "banana", "cherry", "date", "elderberry"]; +const rng = (seed: number) => { let s = seed; return () => { s = (s * 1664525 + 1013904223) & 0xffffffff; return (s >>> 0) / 0xffffffff; }; }; +const rand = rng(42); +const data = Array.from({ length: 100_000 }, () => words[Math.floor(rand() * 5)]); +const s = new Series(data); +for (let i = 0; i < 3; i++) s.str.contains("an"); +const N = 50; +const t0 = performance.now(); +for (let i = 0; i < N; i++) s.str.contains("an"); +const elapsed = performance.now() - t0; +console.log(JSON.stringify({ function: "string_contains", mean_ms: elapsed / N, iterations: N, total_ms: elapsed })); diff --git a/benchmarks/tsb/bench_timedelta.ts b/benchmarks/tsb/bench_timedelta.ts new file mode 100644 index 00000000..25761570 --- /dev/null +++ b/benchmarks/tsb/bench_timedelta.ts @@ -0,0 +1,45 @@ +/** + * Benchmark: Timedelta — construction and arithmetic. 
+ * Outputs JSON: {"function": "timedelta", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Timedelta } from "../../src/index.ts"; + +const SIZE = 10_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const td1 = Timedelta.fromComponents({ days: 1, hours: 2, minutes: 30 }); +const td2 = Timedelta.fromComponents({ hours: 3, minutes: 45, seconds: 10 }); +const deltas = Array.from({ length: SIZE }, (_, i) => Timedelta.fromComponents({ days: i % 365, hours: i % 24 })); + +for (let i = 0; i < WARMUP; i++) { + for (const d of deltas) { + d.add(td1); + d.subtract(td2); + void d.totalHours; + void d.totalSeconds; + } +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const start = performance.now(); + for (const d of deltas) { + d.add(td1); + d.subtract(td2); + void d.totalHours; + void d.totalSeconds; + } + times.push(performance.now() - start); +} + +const totalMs = times.reduce((a, b) => a + b, 0); +const meanMs = totalMs / ITERATIONS; +console.log( + JSON.stringify({ + function: "timedelta", + mean_ms: Math.round(meanMs * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(totalMs * 1000) / 1000, + }), +); diff --git a/benchmarks/tsb/bench_timestamp.ts b/benchmarks/tsb/bench_timestamp.ts new file mode 100644 index 00000000..92041577 --- /dev/null +++ b/benchmarks/tsb/bench_timestamp.ts @@ -0,0 +1,43 @@ +/** + * Benchmark: Timestamp — construction and component accessors. 
+ * Outputs JSON: {"function": "timestamp", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Timestamp } from "../../src/index.ts"; + +const SIZE = 10_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const dates = Array.from({ length: SIZE }, (_, i) => new Date(Date.UTC(2020, 0, 1) + i * 86_400_000)); + +for (let i = 0; i < WARMUP; i++) { + for (const d of dates) { + const ts = new Timestamp(d); + void ts.year; + void ts.month; + void ts.dayofweek; + } +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const start = performance.now(); + for (const d of dates) { + const ts = new Timestamp(d); + void ts.year; + void ts.month; + void ts.dayofweek; + } + times.push(performance.now() - start); +} + +const totalMs = times.reduce((a, b) => a + b, 0); +const meanMs = totalMs / ITERATIONS; +console.log( + JSON.stringify({ + function: "timestamp", + mean_ms: Math.round(meanMs * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(totalMs * 1000) / 1000, + }), +); diff --git a/benchmarks/tsb/bench_to_csv.ts b/benchmarks/tsb/bench_to_csv.ts new file mode 100644 index 00000000..fb1ce422 --- /dev/null +++ b/benchmarks/tsb/bench_to_csv.ts @@ -0,0 +1,33 @@ +/** + * Benchmark: toCsv — serialize a 10k-row DataFrame to CSV string + */ +import { DataFrame, toCsv } from "../../src/index.js"; + +const ROWS = 10_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const df = new DataFrame({ + id: Float64Array.from({ length: ROWS }, (_, i) => i), + value: Float64Array.from({ length: ROWS }, (_, i) => i * 1.1), + score: Float64Array.from({ length: ROWS }, (_, i) => Math.sin(i * 0.01)), +}); + +for (let i = 0; i < WARMUP; i++) { + toCsv(df); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + toCsv(df); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "to_csv", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git 
a/benchmarks/tsb/bench_to_datetime.ts b/benchmarks/tsb/bench_to_datetime.ts new file mode 100644 index 00000000..c854006a --- /dev/null +++ b/benchmarks/tsb/bench_to_datetime.ts @@ -0,0 +1,41 @@ +/** + * Benchmark: toDatetime — parse scalar/array values to Date. + * Outputs JSON: {"function": "to_datetime", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { toDatetime } from "../../src/index.ts"; + +const SIZE = 10_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const base = new Date("2020-01-01").getTime(); +const msPerDay = 86_400_000; +const dateStrings = Array.from({ length: SIZE }, (_, i) => { + const d = new Date(base + i * msPerDay); + return `${d.getFullYear()}-${String(d.getMonth() + 1).padStart(2, "0")}-${String(d.getDate()).padStart(2, "0")}`; +}); +const timestamps = Array.from({ length: SIZE }, (_, i) => base + i * msPerDay); + +for (let i = 0; i < WARMUP; i++) { + toDatetime(dateStrings); + toDatetime(timestamps); +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const start = performance.now(); + toDatetime(dateStrings); + toDatetime(timestamps); + times.push(performance.now() - start); +} + +const totalMs = times.reduce((a, b) => a + b, 0); +const meanMs = totalMs / ITERATIONS; +console.log( + JSON.stringify({ + function: "to_datetime", + mean_ms: Math.round(meanMs * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(totalMs * 1000) / 1000, + }), +); diff --git a/benchmarks/tsb/bench_to_dict_oriented_all.ts b/benchmarks/tsb/bench_to_dict_oriented_all.ts new file mode 100644 index 00000000..0565d7e5 --- /dev/null +++ b/benchmarks/tsb/bench_to_dict_oriented_all.ts @@ -0,0 +1,33 @@ +/** + * Benchmark: toDictOriented with records, list, split, dict orientations on 10k-row DataFrame + */ +import { DataFrame, toDictOriented } from "../../src/index.js"; + +const ROWS = 10_000; +const WARMUP = 3; +const ITERATIONS = 10; +const a = Array.from({ length: ROWS }, (_, i) => i); +const b = Array.from({ 
length: ROWS }, (_, i) => i * 1.5); +const c = Array.from({ length: ROWS }, (_, i) => `s${i}`); +const df = new DataFrame({ columns: { a, b, c } }); + +for (let i = 0; i < WARMUP; i++) { + toDictOriented(df, "records"); + toDictOriented(df, "list"); + toDictOriented(df, "split"); +} +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + toDictOriented(df, "records"); + toDictOriented(df, "list"); + toDictOriented(df, "split"); +} +const total = performance.now() - start; +console.log( + JSON.stringify({ + function: "to_dict_oriented_all", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_to_json.ts b/benchmarks/tsb/bench_to_json.ts new file mode 100644 index 00000000..ed8c22a2 --- /dev/null +++ b/benchmarks/tsb/bench_to_json.ts @@ -0,0 +1,33 @@ +/** + * Benchmark: toJson — serialize a 10k-row DataFrame to JSON string + */ +import { DataFrame, toJson } from "../../src/index.js"; + +const ROWS = 10_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const df = new DataFrame({ + id: Float64Array.from({ length: ROWS }, (_, i) => i), + value: Float64Array.from({ length: ROWS }, (_, i) => i * 1.1), + score: Float64Array.from({ length: ROWS }, (_, i) => Math.sin(i * 0.01)), +}); + +for (let i = 0; i < WARMUP; i++) { + toJson(df); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + toJson(df); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "to_json", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_to_numeric.ts b/benchmarks/tsb/bench_to_numeric.ts new file mode 100644 index 00000000..aea1b05a --- /dev/null +++ b/benchmarks/tsb/bench_to_numeric.ts @@ -0,0 +1,36 @@ +/** + * Benchmark: toNumericArray / toNumericSeries — coerce values to numeric. 
+ * Outputs JSON: {"function": "to_numeric", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { toNumericArray, toNumericSeries, Series } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const strNums = Array.from({ length: SIZE }, (_, i) => String(i * 1.5)); +const s = new Series({ data: strNums }); + +for (let i = 0; i < WARMUP; i++) { + toNumericArray(strNums, { errors: "coerce" }); + toNumericSeries(s, { errors: "coerce" }); +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const start = performance.now(); + toNumericArray(strNums, { errors: "coerce" }); + toNumericSeries(s, { errors: "coerce" }); + times.push(performance.now() - start); +} + +const totalMs = times.reduce((a, b) => a + b, 0); +const meanMs = totalMs / ITERATIONS; +console.log( + JSON.stringify({ + function: "to_numeric", + mean_ms: Math.round(meanMs * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(totalMs * 1000) / 1000, + }), +); diff --git a/benchmarks/tsb/bench_type_checks.ts b/benchmarks/tsb/bench_type_checks.ts new file mode 100644 index 00000000..4a19574d --- /dev/null +++ b/benchmarks/tsb/bench_type_checks.ts @@ -0,0 +1,33 @@ +/** + * Benchmark: isScalar, isListLike, isArrayLike, isDictLike, isIterator on mixed values + */ +import { isScalar, isListLike, isArrayLike, isDictLike, isIterator } from "../../src/index.js"; + +const ITERATIONS = 100_000; +const WARMUP = 3; +const MEASURED = 10; + +const values = [42, "hello", null, [1, 2, 3], { a: 1 }, new Set([1, 2]), new Map()]; + +function runChecks(): void { + for (const v of values) { + isScalar(v); + isListLike(v); + isArrayLike(v); + isDictLike(v); + isIterator(v); + } +} + +for (let i = 0; i < WARMUP; i++) for (let j = 0; j < ITERATIONS; j++) runChecks(); +const start = performance.now(); +for (let i = 0; i < MEASURED; i++) for (let j = 0; j < ITERATIONS; j++) runChecks(); +const total = performance.now() - start; +console.log( + 
JSON.stringify({ + function: "type_checks", + mean_ms: total / MEASURED, + iterations: MEASURED, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_unstack.ts b/benchmarks/tsb/bench_unstack.ts new file mode 100644 index 00000000..9bebfac0 --- /dev/null +++ b/benchmarks/tsb/bench_unstack.ts @@ -0,0 +1,39 @@ +/** + * Benchmark: Series.unstack() — pivot innermost MultiIndex level to columns. + * Outputs JSON: {"function": "unstack", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series } from "../../src/index.ts"; + +const ROWS = 500; +const COLS = 10; +const WARMUP = 5; +const ITERATIONS = 50; + +const data = Array.from({ length: ROWS * COLS }, (_, i) => i * 1.0); +const index = Array.from( + { length: ROWS * COLS }, + (_, i) => [Math.floor(i / COLS), i % COLS] as [number, number], +); +const s = new Series({ data, index }); + +for (let i = 0; i < WARMUP; i++) { + s.unstack(); +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const start = performance.now(); + s.unstack(); + times.push(performance.now() - start); +} + +const totalMs = times.reduce((a, b) => a + b, 0); +const meanMs = totalMs / ITERATIONS; +console.log( + JSON.stringify({ + function: "unstack", + mean_ms: Math.round(meanMs * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(totalMs * 1000) / 1000, + }), +); diff --git a/benchmarks/tsb/bench_value_type_checks.ts b/benchmarks/tsb/bench_value_type_checks.ts new file mode 100644 index 00000000..c95912d3 --- /dev/null +++ b/benchmarks/tsb/bench_value_type_checks.ts @@ -0,0 +1,53 @@ +/** + * Benchmark: extended value type predicates — isNumber, isBool, isStringValue, + * isFloat, isInteger, isBigInt, isRegExp, isReCompilable, isMissing, isHashable, isDate + */ +import { + isNumber, + isBool, + isStringValue, + isFloat, + isInteger, + isBigInt, + isRegExp, + isReCompilable, + isMissing, + isHashable, + isDate, +} from "../../src/index.js"; + +const WARMUP = 3; +const ITERATIONS = 10_000; 
+ +const mixed = [42, 3.14, true, "hello", null, undefined, BigInt("9007199254740993"), /abc/i, new Date(), { a: 1 }]; + +function runChecks(): void { + for (const v of mixed) { + isNumber(v); + isBool(v); + isStringValue(v); + isFloat(v); + isInteger(v); + isBigInt(v); + isRegExp(v); + isReCompilable(v); + isMissing(v); + isHashable(v); + isDate(v); + } +} + +for (let i = 0; i < WARMUP; i++) runChecks(); + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) runChecks(); +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "value_type_checks", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_where.ts b/benchmarks/tsb/bench_where.ts new file mode 100644 index 00000000..14843151 --- /dev/null +++ b/benchmarks/tsb/bench_where.ts @@ -0,0 +1,34 @@ +/** + * Benchmark: Series.where() — conditional replacement. + * Outputs JSON: {"function": "where", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const s = new Series({ data: Array.from({ length: SIZE }, (_, i) => i * 1.0) }); +const cond = s.gt(50000.0); + +for (let i = 0; i < WARMUP; i++) { + s.where(cond, 0.0); +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const start = performance.now(); + s.where(cond, 0.0); + times.push(performance.now() - start); +} + +const totalMs = times.reduce((a, b) => a + b, 0); +const meanMs = totalMs / ITERATIONS; +console.log( + JSON.stringify({ + function: "where", + mean_ms: Math.round(meanMs * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(totalMs * 1000) / 1000, + }), +); diff --git a/benchmarks/tsb/bench_zscore.ts b/benchmarks/tsb/bench_zscore.ts new file mode 100644 index 00000000..6e856325 --- /dev/null +++ b/benchmarks/tsb/bench_zscore.ts @@ -0,0 +1,30 @@ +/** + * Benchmark: zscore 
normalization on 100k-element Series + */ +import { Series, zscore } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const data = Float64Array.from({ length: ROWS }, (_, i) => Math.sin(i * 0.01) * 100 + 50); +const s = new Series(data); + +for (let i = 0; i < WARMUP; i++) { + zscore(s); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + zscore(s); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "zscore", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +);