From 0258e6f450519b0fba15dedb7aea35f704df8e32 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 17 Apr 2026 17:46:59 +0000 Subject: [PATCH 1/5] Iteration 159: 5 new benchmark pairs (513 total, +5 vs best 508) Added benchmarks for: - series_sign: seriesSign element-wise sign function (numpy.sign equivalent) - groupby_groups_props: DataFrameGroupBy .groups/.groupKeys/.ngroups properties - merge_sort: merge with sort=true option (pd.merge sort=True equivalent) - series_groupby_groups: SeriesGroupBy .groups/.groupKeys/.ngroups properties - pipe_fn: pipe functional composition operator (Series.pipe equivalent) Run: https://github.com/githubnext/tsessebe/actions/runs/24577736975 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../pandas/bench_groupby_groups_props.py | 36 +++++++++++++++ benchmarks/pandas/bench_merge_sort.py | 33 ++++++++++++++ benchmarks/pandas/bench_pipe_fn.py | 38 ++++++++++++++++ .../pandas/bench_series_groupby_groups.py | 34 ++++++++++++++ benchmarks/pandas/bench_series_sign.py | 26 +++++++++++ benchmarks/tsb/bench_groupby_groups_props.ts | 45 +++++++++++++++++++ benchmarks/tsb/bench_merge_sort.ts | 42 +++++++++++++++++ benchmarks/tsb/bench_pipe_fn.ts | 45 +++++++++++++++++++ benchmarks/tsb/bench_series_groupby_groups.ts | 43 ++++++++++++++++++ benchmarks/tsb/bench_series_sign.ts | 35 +++++++++++++++ 10 files changed, 377 insertions(+) create mode 100644 benchmarks/pandas/bench_groupby_groups_props.py create mode 100644 benchmarks/pandas/bench_merge_sort.py create mode 100644 benchmarks/pandas/bench_pipe_fn.py create mode 100644 benchmarks/pandas/bench_series_groupby_groups.py create mode 100644 benchmarks/pandas/bench_series_sign.py create mode 100644 benchmarks/tsb/bench_groupby_groups_props.ts create mode 100644 benchmarks/tsb/bench_merge_sort.ts create mode 100644 benchmarks/tsb/bench_pipe_fn.ts create mode 100644 
benchmarks/tsb/bench_series_groupby_groups.ts create mode 100644 benchmarks/tsb/bench_series_sign.ts diff --git a/benchmarks/pandas/bench_groupby_groups_props.py b/benchmarks/pandas/bench_groupby_groups_props.py new file mode 100644 index 00000000..5f8d42c4 --- /dev/null +++ b/benchmarks/pandas/bench_groupby_groups_props.py @@ -0,0 +1,36 @@ +"""Benchmark: DataFrameGroupBy .groups / .ngroups properties on 100k-row DataFrame.""" +import json +import time +import numpy as np +import pandas as pd + +SIZE = 100_000 +WARMUP = 5 +ITERATIONS = 50 + +depts = ["eng", "hr", "sales", "finance", "ops", "legal", "mkt", "it", "rd", "ops2"] +df = pd.DataFrame({ + "dept": [depts[i % len(depts)] for i in range(SIZE)], + "salary": [50_000 + (i % 100) * 1000 for i in range(SIZE)], + "score": [(i % 100) * 0.01 for i in range(SIZE)], +}) + +gb = df.groupby("dept") + +for _ in range(WARMUP): + _g = gb.groups + _k = list(gb.groups.keys()) + _n = gb.ngroups + +times = [] +for _ in range(ITERATIONS): + start = time.perf_counter() + _g = gb.groups + _k = list(gb.groups.keys()) + _n = gb.ngroups + times.append((time.perf_counter() - start) * 1000) + +total_ms = sum(times) +mean_ms = total_ms / ITERATIONS + +print(json.dumps({"function": "groupby_groups_props", "mean_ms": mean_ms, "iterations": ITERATIONS, "total_ms": total_ms})) diff --git a/benchmarks/pandas/bench_merge_sort.py b/benchmarks/pandas/bench_merge_sort.py new file mode 100644 index 00000000..6ad62f24 --- /dev/null +++ b/benchmarks/pandas/bench_merge_sort.py @@ -0,0 +1,33 @@ +"""Benchmark: merge with sort=True — sort result by join-key on 50k-row DataFrames.""" +import json +import time +import numpy as np +import pandas as pd + +ROWS = 50_000 +WARMUP = 3 +ITERATIONS = 20 + +left = pd.DataFrame({ + "id": np.arange(ROWS) % (ROWS // 2), + "val_l": np.arange(ROWS) * 1.5, +}) + +right = pd.DataFrame({ + "id": np.arange(ROWS // 2), + "val_r": np.arange(ROWS // 2) * 2.0, +}) + +for _ in range(WARMUP): + pd.merge(left, right, on="id", 
how="inner", sort=True) + +times = [] +for _ in range(ITERATIONS): + start = time.perf_counter() + pd.merge(left, right, on="id", how="inner", sort=True) + times.append((time.perf_counter() - start) * 1000) + +total_ms = sum(times) +mean_ms = total_ms / ITERATIONS + +print(json.dumps({"function": "merge_sort", "mean_ms": mean_ms, "iterations": ITERATIONS, "total_ms": total_ms})) diff --git a/benchmarks/pandas/bench_pipe_fn.py b/benchmarks/pandas/bench_pipe_fn.py new file mode 100644 index 00000000..5143f5f0 --- /dev/null +++ b/benchmarks/pandas/bench_pipe_fn.py @@ -0,0 +1,38 @@ +"""Benchmark: pipe — functional pipeline composition via pandas Series.pipe on 100k-element Series.""" +import json +import time +import numpy as np +import pandas as pd + +SIZE = 100_000 +WARMUP = 5 +ITERATIONS = 50 + +s = pd.Series((np.arange(SIZE) % 200) - 100.0) +df = pd.DataFrame({ + "a": (np.arange(SIZE) % 100) - 50.0, + "b": np.sin(np.arange(SIZE) * 0.01) * 100, +}) + +def double(x: pd.Series) -> pd.Series: + return x * 2 + +def add_hundred(x: pd.Series) -> pd.Series: + return x + 100 + +def abs_series(x: pd.Series) -> pd.Series: + return x.abs() + +for _ in range(WARMUP): + s.pipe(abs_series).pipe(double).pipe(add_hundred) + +times = [] +for _ in range(ITERATIONS): + start = time.perf_counter() + s.pipe(abs_series).pipe(double).pipe(add_hundred) + times.append((time.perf_counter() - start) * 1000) + +total_ms = sum(times) +mean_ms = total_ms / ITERATIONS + +print(json.dumps({"function": "pipe_fn", "mean_ms": mean_ms, "iterations": ITERATIONS, "total_ms": total_ms})) diff --git a/benchmarks/pandas/bench_series_groupby_groups.py b/benchmarks/pandas/bench_series_groupby_groups.py new file mode 100644 index 00000000..3538002a --- /dev/null +++ b/benchmarks/pandas/bench_series_groupby_groups.py @@ -0,0 +1,34 @@ +"""Benchmark: SeriesGroupBy .groups / .ngroups properties on 100k-element Series.""" +import json +import time +import numpy as np +import pandas as pd + +SIZE = 100_000 +WARMUP 
= 5 +ITERATIONS = 50 + +categories = ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j"] +data = np.arange(SIZE) * 0.1 +by = [categories[i % len(categories)] for i in range(SIZE)] + +s = pd.Series(data) +gb = s.groupby(by) + +for _ in range(WARMUP): + _g = gb.groups + _k = list(gb.groups.keys()) + _n = gb.ngroups + +times = [] +for _ in range(ITERATIONS): + start = time.perf_counter() + _g = gb.groups + _k = list(gb.groups.keys()) + _n = gb.ngroups + times.append((time.perf_counter() - start) * 1000) + +total_ms = sum(times) +mean_ms = total_ms / ITERATIONS + +print(json.dumps({"function": "series_groupby_groups", "mean_ms": mean_ms, "iterations": ITERATIONS, "total_ms": total_ms})) diff --git a/benchmarks/pandas/bench_series_sign.py b/benchmarks/pandas/bench_series_sign.py new file mode 100644 index 00000000..363681d8 --- /dev/null +++ b/benchmarks/pandas/bench_series_sign.py @@ -0,0 +1,26 @@ +"""Benchmark: seriesSign — element-wise sign via numpy.sign on 100k-element Series.""" +import json +import time +import numpy as np +import pandas as pd + +SIZE = 100_000 +WARMUP = 5 +ITERATIONS = 50 + +data = np.sin(np.arange(SIZE) * 0.01) * 1000 +s = pd.Series(data) + +for _ in range(WARMUP): + np.sign(s) + +times = [] +for _ in range(ITERATIONS): + start = time.perf_counter() + np.sign(s) + times.append((time.perf_counter() - start) * 1000) + +total_ms = sum(times) +mean_ms = total_ms / ITERATIONS + +print(json.dumps({"function": "series_sign", "mean_ms": mean_ms, "iterations": ITERATIONS, "total_ms": total_ms})) diff --git a/benchmarks/tsb/bench_groupby_groups_props.ts b/benchmarks/tsb/bench_groupby_groups_props.ts new file mode 100644 index 00000000..6bdc7fa5 --- /dev/null +++ b/benchmarks/tsb/bench_groupby_groups_props.ts @@ -0,0 +1,45 @@ +/** + * Benchmark: DataFrameGroupBy .groups / .groupKeys / .ngroups properties on 100k rows. 
+ * Outputs JSON: {"function": "groupby_groups_props", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { DataFrame, DataFrameGroupBy } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const depts = ["eng", "hr", "sales", "finance", "ops", "legal", "mkt", "it", "rd", "ops2"]; +const df = DataFrame.fromColumns({ + dept: Array.from({ length: SIZE }, (_, i) => depts[i % depts.length]), + salary: Array.from({ length: SIZE }, (_, i) => 50_000 + (i % 100) * 1000), + score: Array.from({ length: SIZE }, (_, i) => (i % 100) * 0.01), +}); + +const gb = new DataFrameGroupBy(df, ["dept"]); + +for (let i = 0; i < WARMUP; i++) { + const _g = gb.groups; + const _k = gb.groupKeys; + const _n = gb.ngroups; +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const start = performance.now(); + const _g = gb.groups; + const _k = gb.groupKeys; + const _n = gb.ngroups; + times.push(performance.now() - start); +} + +const totalMs = times.reduce((a, b) => a + b, 0); +const meanMs = totalMs / ITERATIONS; + +console.log( + JSON.stringify({ + function: "groupby_groups_props", + mean_ms: meanMs, + iterations: ITERATIONS, + total_ms: totalMs, + }), +); diff --git a/benchmarks/tsb/bench_merge_sort.ts b/benchmarks/tsb/bench_merge_sort.ts new file mode 100644 index 00000000..4f2db140 --- /dev/null +++ b/benchmarks/tsb/bench_merge_sort.ts @@ -0,0 +1,42 @@ +/** + * Benchmark: merge with sort=true — sort result by join-key values on 50k-row DataFrames. 
+ * Outputs JSON: {"function": "merge_sort", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { DataFrame, merge } from "../../src/index.ts"; + +const ROWS = 50_000; +const WARMUP = 3; +const ITERATIONS = 20; + +const left = DataFrame.fromColumns({ + id: Array.from({ length: ROWS }, (_, i) => i % (ROWS / 2)), + val_l: Array.from({ length: ROWS }, (_, i) => i * 1.5), +}); + +const right = DataFrame.fromColumns({ + id: Array.from({ length: ROWS / 2 }, (_, i) => i), + val_r: Array.from({ length: ROWS / 2 }, (_, i) => i * 2.0), +}); + +for (let i = 0; i < WARMUP; i++) { + merge(left, right, { on: "id", how: "inner", sort: true }); +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const start = performance.now(); + merge(left, right, { on: "id", how: "inner", sort: true }); + times.push(performance.now() - start); +} + +const totalMs = times.reduce((a, b) => a + b, 0); +const meanMs = totalMs / ITERATIONS; + +console.log( + JSON.stringify({ + function: "merge_sort", + mean_ms: meanMs, + iterations: ITERATIONS, + total_ms: totalMs, + }), +); diff --git a/benchmarks/tsb/bench_pipe_fn.ts b/benchmarks/tsb/bench_pipe_fn.ts new file mode 100644 index 00000000..aaa0d1f6 --- /dev/null +++ b/benchmarks/tsb/bench_pipe_fn.ts @@ -0,0 +1,45 @@ +/** + * Benchmark: pipe — functional pipeline composition operator on a 100k-element Series and a scalar. 
+ * Outputs JSON: {"function": "pipe_fn", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series, DataFrame, pipe, seriesAbs, seriesMul, seriesAdd } from "../../src/index.ts"; +import type { Scalar } from "../../src/types.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const s = new Series({ data: Array.from({ length: SIZE }, (_, i) => (i % 200) - 100.0) }); +const df = DataFrame.fromColumns({ + a: Array.from({ length: SIZE }, (_, i) => (i % 100) - 50.0), + b: Array.from({ length: SIZE }, (_, i) => Math.sin(i * 0.01) * 100), +}); + +const double = (x: Series) => seriesMul(x, 2); +const addHundred = (x: Series) => seriesAdd(x, 100); +const abs = (x: Series) => seriesAbs(x); + +for (let i = 0; i < WARMUP; i++) { + pipe(s, abs, double, addHundred); + pipe(42, (x: number) => x * 2, (x: number) => x + 1); +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const start = performance.now(); + pipe(s, abs, double, addHundred); + pipe(42, (x: number) => x * 2, (x: number) => x + 1); + times.push(performance.now() - start); +} + +const totalMs = times.reduce((a, b) => a + b, 0); +const meanMs = totalMs / ITERATIONS; + +console.log( + JSON.stringify({ + function: "pipe_fn", + mean_ms: meanMs, + iterations: ITERATIONS, + total_ms: totalMs, + }), +); diff --git a/benchmarks/tsb/bench_series_groupby_groups.ts b/benchmarks/tsb/bench_series_groupby_groups.ts new file mode 100644 index 00000000..3344f77a --- /dev/null +++ b/benchmarks/tsb/bench_series_groupby_groups.ts @@ -0,0 +1,43 @@ +/** + * Benchmark: SeriesGroupBy .groups / .groupKeys / .ngroups properties on 100k-element Series. 
+ * Outputs JSON: {"function": "series_groupby_groups", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series, SeriesGroupBy } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const categories = ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j"]; +const data = Array.from({ length: SIZE }, (_, i) => i * 0.1); +const by = Array.from({ length: SIZE }, (_, i) => categories[i % categories.length]); + +const s = new Series({ data }); +const gb = new SeriesGroupBy(s, by); + +for (let i = 0; i < WARMUP; i++) { + const _g = gb.groups; + const _k = gb.groupKeys; + const _n = gb.ngroups; +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const start = performance.now(); + const _g = gb.groups; + const _k = gb.groupKeys; + const _n = gb.ngroups; + times.push(performance.now() - start); +} + +const totalMs = times.reduce((a, b) => a + b, 0); +const meanMs = totalMs / ITERATIONS; + +console.log( + JSON.stringify({ + function: "series_groupby_groups", + mean_ms: meanMs, + iterations: ITERATIONS, + total_ms: totalMs, + }), +); diff --git a/benchmarks/tsb/bench_series_sign.ts b/benchmarks/tsb/bench_series_sign.ts new file mode 100644 index 00000000..b4b4a815 --- /dev/null +++ b/benchmarks/tsb/bench_series_sign.ts @@ -0,0 +1,35 @@ +/** + * Benchmark: seriesSign — element-wise sign on 100k-element Series. 
+ * Outputs JSON: {"function": "series_sign", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series, seriesSign } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const data = Array.from({ length: SIZE }, (_, i) => Math.sin(i * 0.01) * 1000); +const s = new Series({ data }); + +for (let i = 0; i < WARMUP; i++) { + seriesSign(s); +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const start = performance.now(); + seriesSign(s); + times.push(performance.now() - start); +} + +const totalMs = times.reduce((a, b) => a + b, 0); +const meanMs = totalMs / ITERATIONS; + +console.log( + JSON.stringify({ + function: "series_sign", + mean_ms: meanMs, + iterations: ITERATIONS, + total_ms: totalMs, + }), +); From 4edd60113ac55c67a65ea0a55d60297597003017 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 17 Apr 2026 19:04:35 +0000 Subject: [PATCH 2/5] Iteration 161: 5 new benchmark pairs (513 total, +5 vs best 508) Run: https://github.com/githubnext/tsessebe/actions/runs/24581386899 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../bench_clip_dataframe_with_bounds.py | 39 +++++++++++++++ .../pandas/bench_clip_series_with_bounds.py | 35 ++++++++++++++ benchmarks/pandas/bench_dataframe_pipe_to.py | 42 +++++++++++++++++ .../pandas/bench_qcut_interval_index.py | 32 +++++++++++++ benchmarks/pandas/bench_series_log2_log10.py | 44 +++++++++++++++++ .../tsb/bench_clip_dataframe_with_bounds.ts | 40 ++++++++++++++++ .../tsb/bench_clip_series_with_bounds.ts | 39 +++++++++++++++ benchmarks/tsb/bench_dataframe_pipe_to.ts | 47 +++++++++++++++++++ benchmarks/tsb/bench_qcut_interval_index.ts | 34 ++++++++++++++ benchmarks/tsb/bench_series_log2_log10.ts | 45 ++++++++++++++++++ 10 files changed, 397 insertions(+) create mode 100644 benchmarks/pandas/bench_clip_dataframe_with_bounds.py create mode 100644 
benchmarks/pandas/bench_clip_series_with_bounds.py create mode 100644 benchmarks/pandas/bench_dataframe_pipe_to.py create mode 100644 benchmarks/pandas/bench_qcut_interval_index.py create mode 100644 benchmarks/pandas/bench_series_log2_log10.py create mode 100644 benchmarks/tsb/bench_clip_dataframe_with_bounds.ts create mode 100644 benchmarks/tsb/bench_clip_series_with_bounds.ts create mode 100644 benchmarks/tsb/bench_dataframe_pipe_to.ts create mode 100644 benchmarks/tsb/bench_qcut_interval_index.ts create mode 100644 benchmarks/tsb/bench_series_log2_log10.ts diff --git a/benchmarks/pandas/bench_clip_dataframe_with_bounds.py b/benchmarks/pandas/bench_clip_dataframe_with_bounds.py new file mode 100644 index 00000000..af38dd96 --- /dev/null +++ b/benchmarks/pandas/bench_clip_dataframe_with_bounds.py @@ -0,0 +1,39 @@ +""" +Benchmark: pandas DataFrame.clip with Series bounds (axis=0) on 100k-row DataFrame. +Outputs JSON: {"function": "clip_dataframe_with_bounds", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import numpy as np +import pandas as pd + +SIZE = 100_000 +WARMUP = 5 +ITERATIONS = 30 + +df = pd.DataFrame({ + "a": [(i % 200) - 100 for i in range(SIZE)], + "b": [(i % 150) - 75 for i in range(SIZE)], + "c": [(i % 100) - 50 for i in range(SIZE)], +}) + +lower_bounds = pd.Series([(i % 40) - 20 for i in range(SIZE)]) +upper_bounds = pd.Series([(i % 40) + 20 for i in range(SIZE)]) + +for _ in range(WARMUP): + df.clip(lower=lower_bounds, upper=upper_bounds, axis=0) + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + df.clip(lower=lower_bounds, upper=upper_bounds, axis=0) + times.append((time.perf_counter() - t0) * 1000) + +total_ms = sum(times) +mean_ms = total_ms / ITERATIONS +print(json.dumps({ + "function": "clip_dataframe_with_bounds", + "mean_ms": mean_ms, + "iterations": ITERATIONS, + "total_ms": total_ms, +})) diff --git a/benchmarks/pandas/bench_clip_series_with_bounds.py 
b/benchmarks/pandas/bench_clip_series_with_bounds.py new file mode 100644 index 00000000..5ad3a06b --- /dev/null +++ b/benchmarks/pandas/bench_clip_series_with_bounds.py @@ -0,0 +1,35 @@ +""" +Benchmark: pandas Series.clip with per-element Series bounds on 100k values. +Outputs JSON: {"function": "clip_series_with_bounds", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import numpy as np +import pandas as pd + +SIZE = 100_000 +WARMUP = 5 +ITERATIONS = 30 + +data = [(i % 200) - 100 for i in range(SIZE)] +lower = pd.Series([(i % 50) - 30 for i in range(SIZE)]) +upper = pd.Series([(i % 50) + 20 for i in range(SIZE)]) +series = pd.Series(data) + +for _ in range(WARMUP): + series.clip(lower=lower, upper=upper) + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + series.clip(lower=lower, upper=upper) + times.append((time.perf_counter() - t0) * 1000) + +total_ms = sum(times) +mean_ms = total_ms / ITERATIONS +print(json.dumps({ + "function": "clip_series_with_bounds", + "mean_ms": mean_ms, + "iterations": ITERATIONS, + "total_ms": total_ms, +})) diff --git a/benchmarks/pandas/bench_dataframe_pipe_to.py b/benchmarks/pandas/bench_dataframe_pipe_to.py new file mode 100644 index 00000000..740d36bd --- /dev/null +++ b/benchmarks/pandas/bench_dataframe_pipe_to.py @@ -0,0 +1,42 @@ +""" +Benchmark: pandas DataFrame.pipe with positional target argument on 100k-row DataFrame. +Mirrors tsb's dataFramePipeTo — inserting the DataFrame at a specific arg position. 
+Outputs JSON: {"function": "dataframe_pipe_to", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import numpy as np +import pandas as pd + +SIZE = 100_000 +WARMUP = 5 +ITERATIONS = 50 + + +def filter_above(threshold: float, df: pd.DataFrame) -> pd.DataFrame: + return df[df["val"] > threshold] + + +left = pd.DataFrame({ + "key": [i % 1000 for i in range(SIZE)], + "val": [i * 1.5 for i in range(SIZE)], +}) + +for _ in range(WARMUP): + # pandas' tuple form pipe((fn, 'data_keyword')) passes the frame by keyword only, so a lambda puts it in the positional slot + left.pipe(lambda df: filter_above(50_000, df)) + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + left.pipe(lambda df: filter_above(50_000, df)) + times.append((time.perf_counter() - t0) * 1000) + +total_ms = sum(times) +mean_ms = total_ms / ITERATIONS +print(json.dumps({ + "function": "dataframe_pipe_to", + "mean_ms": mean_ms, + "iterations": ITERATIONS, + "total_ms": total_ms, +})) diff --git a/benchmarks/pandas/bench_qcut_interval_index.py b/benchmarks/pandas/bench_qcut_interval_index.py new file mode 100644 index 00000000..a1cdffe2 --- /dev/null +++ b/benchmarks/pandas/bench_qcut_interval_index.py @@ -0,0 +1,32 @@ +""" +Benchmark: pandas qcut with IntervalIndex output on 100k values. 
+Outputs JSON: {"function": "qcut_interval_index", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import numpy as np +import pandas as pd + +SIZE = 100_000 +WARMUP = 5 +ITERATIONS = 30 + +data = (np.arange(SIZE) * 1.1) % 1000 + +for _ in range(WARMUP): + pd.qcut(data, q=10, duplicates="drop", retbins=True) + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + pd.qcut(data, q=10, duplicates="drop", retbins=True) + times.append((time.perf_counter() - t0) * 1000) + +total_ms = sum(times) +mean_ms = total_ms / ITERATIONS +print(json.dumps({ + "function": "qcut_interval_index", + "mean_ms": mean_ms, + "iterations": ITERATIONS, + "total_ms": total_ms, +})) diff --git a/benchmarks/pandas/bench_series_log2_log10.py b/benchmarks/pandas/bench_series_log2_log10.py new file mode 100644 index 00000000..1d7344e6 --- /dev/null +++ b/benchmarks/pandas/bench_series_log2_log10.py @@ -0,0 +1,44 @@ +""" +Benchmark: pandas Series/DataFrame log2 / log10 on 100k values. 
+Outputs JSON: {"function": "series_log2_log10", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import numpy as np +import pandas as pd + +SIZE = 100_000 +WARMUP = 5 +ITERATIONS = 30 + +data = (np.arange(1, SIZE + 1) * 0.01) +s = pd.Series(data) +df = pd.DataFrame({ + "a": data, + "b": np.arange(1, SIZE + 1) * 0.02, + "c": np.arange(1, SIZE + 1) * 0.03, +}) + +for _ in range(WARMUP): + np.log2(s) + np.log10(s) + np.log2(df) + np.log10(df) + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + np.log2(s) + np.log10(s) + np.log2(df) + np.log10(df) + times.append((time.perf_counter() - t0) * 1000) + +total_ms = sum(times) +mean_ms = total_ms / ITERATIONS +print(json.dumps({ + "function": "series_log2_log10", + "mean_ms": mean_ms, + "iterations": ITERATIONS, + "total_ms": total_ms, +})) diff --git a/benchmarks/tsb/bench_clip_dataframe_with_bounds.ts b/benchmarks/tsb/bench_clip_dataframe_with_bounds.ts new file mode 100644 index 00000000..83d87145 --- /dev/null +++ b/benchmarks/tsb/bench_clip_dataframe_with_bounds.ts @@ -0,0 +1,40 @@ +/** + * Benchmark: clipDataFrameWithBounds with Series bounds (axis=0) on 100k-row DataFrame. 
+ * Outputs JSON: {"function": "clip_dataframe_with_bounds", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series, DataFrame, clipDataFrameWithBounds } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 30; + +const df = DataFrame.fromColumns({ + a: Array.from({ length: SIZE }, (_, i) => (i % 200) - 100), + b: Array.from({ length: SIZE }, (_, i) => (i % 150) - 75), + c: Array.from({ length: SIZE }, (_, i) => (i % 100) - 50), +}); + +const lowerBounds = new Series({ data: Array.from({ length: SIZE }, (_, i) => (i % 40) - 20) }); +const upperBounds = new Series({ data: Array.from({ length: SIZE }, (_, i) => (i % 40) + 20) }); + +for (let i = 0; i < WARMUP; i++) { + clipDataFrameWithBounds(df, { lower: lowerBounds, upper: upperBounds, axis: 0 }); +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + clipDataFrameWithBounds(df, { lower: lowerBounds, upper: upperBounds, axis: 0 }); + times.push(performance.now() - t0); +} + +const totalMs = times.reduce((a, b) => a + b, 0); +const meanMs = totalMs / ITERATIONS; +console.log( + JSON.stringify({ + function: "clip_dataframe_with_bounds", + mean_ms: meanMs, + iterations: ITERATIONS, + total_ms: totalMs, + }), +); diff --git a/benchmarks/tsb/bench_clip_series_with_bounds.ts b/benchmarks/tsb/bench_clip_series_with_bounds.ts new file mode 100644 index 00000000..9b8b05db --- /dev/null +++ b/benchmarks/tsb/bench_clip_series_with_bounds.ts @@ -0,0 +1,39 @@ +/** + * Benchmark: clipSeriesWithBounds with per-element Series bounds on 100k values. 
+ * Outputs JSON: {"function": "clip_series_with_bounds", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series, clipSeriesWithBounds } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 30; + +const data = Array.from({ length: SIZE }, (_, i) => (i % 200) - 100); +const lower = Array.from({ length: SIZE }, (_, i) => (i % 50) - 30); +const upper = Array.from({ length: SIZE }, (_, i) => (i % 50) + 20); + +const series = new Series({ data }); +const lowerSeries = new Series({ data: lower }); +const upperSeries = new Series({ data: upper }); + +for (let i = 0; i < WARMUP; i++) { + clipSeriesWithBounds(series, { lower: lowerSeries, upper: upperSeries }); +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + clipSeriesWithBounds(series, { lower: lowerSeries, upper: upperSeries }); + times.push(performance.now() - t0); +} + +const totalMs = times.reduce((a, b) => a + b, 0); +const meanMs = totalMs / ITERATIONS; +console.log( + JSON.stringify({ + function: "clip_series_with_bounds", + mean_ms: meanMs, + iterations: ITERATIONS, + total_ms: totalMs, + }), +); diff --git a/benchmarks/tsb/bench_dataframe_pipe_to.ts b/benchmarks/tsb/bench_dataframe_pipe_to.ts new file mode 100644 index 00000000..876f9fe5 --- /dev/null +++ b/benchmarks/tsb/bench_dataframe_pipe_to.ts @@ -0,0 +1,47 @@ +/** + * Benchmark: dataFramePipeTo — insert DataFrame at a specific argument position in a pipeline. 
+ * Outputs JSON: {"function": "dataframe_pipe_to", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { DataFrame, dataFramePipeTo } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const left = DataFrame.fromColumns({ + key: Array.from({ length: SIZE }, (_, i) => i % 1000), + val: Array.from({ length: SIZE }, (_, i) => i * 1.5), +}); + +const right = DataFrame.fromColumns({ + key: Array.from({ length: 1000 }, (_, i) => i), + label: Array.from({ length: 1000 }, (_, i) => `item_${i}`), +}); + +// A simple transform: filter df rows where col > threshold +function filterAbove(threshold: number, df: DataFrame): DataFrame { + return df.filter((row) => (row["val"] as number) > threshold); +} + +for (let i = 0; i < WARMUP; i++) { + // dataFramePipeTo inserts `left` at position 1: filterAbove(threshold, left) + dataFramePipeTo(left, 1, filterAbove, 50_000); +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + dataFramePipeTo(left, 1, filterAbove, 50_000); + times.push(performance.now() - t0); +} + +const totalMs = times.reduce((a, b) => a + b, 0); +const meanMs = totalMs / ITERATIONS; +console.log( + JSON.stringify({ + function: "dataframe_pipe_to", + mean_ms: meanMs, + iterations: ITERATIONS, + total_ms: totalMs, + }), +); diff --git a/benchmarks/tsb/bench_qcut_interval_index.ts b/benchmarks/tsb/bench_qcut_interval_index.ts new file mode 100644 index 00000000..2bd412bf --- /dev/null +++ b/benchmarks/tsb/bench_qcut_interval_index.ts @@ -0,0 +1,34 @@ +/** + * Benchmark: qcutIntervalIndex — compute quantile-based IntervalIndex from 100k values. 
+ * Outputs JSON: {"function": "qcut_interval_index", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { qcutIntervalIndex } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 30; + +const data = Array.from({ length: SIZE }, (_, i) => (i * 1.1) % 1000); + +// Quantile-based binning into 10 equal-frequency bins +for (let i = 0; i < WARMUP; i++) { + qcutIntervalIndex(data, 10); +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + qcutIntervalIndex(data, 10); + times.push(performance.now() - t0); +} + +const totalMs = times.reduce((a, b) => a + b, 0); +const meanMs = totalMs / ITERATIONS; +console.log( + JSON.stringify({ + function: "qcut_interval_index", + mean_ms: meanMs, + iterations: ITERATIONS, + total_ms: totalMs, + }), +); diff --git a/benchmarks/tsb/bench_series_log2_log10.ts b/benchmarks/tsb/bench_series_log2_log10.ts new file mode 100644 index 00000000..8ee2348e --- /dev/null +++ b/benchmarks/tsb/bench_series_log2_log10.ts @@ -0,0 +1,45 @@ +/** + * Benchmark: seriesLog2 / seriesLog10 / dataFrameLog2 / dataFrameLog10 on 100k values. 
+ * Outputs JSON: {"function": "series_log2_log10", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series, DataFrame, seriesLog2, seriesLog10, dataFrameLog2, dataFrameLog10 } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 30; + +const data = Array.from({ length: SIZE }, (_, i) => (i + 1) * 0.01); +const s = new Series({ data }); +const df = DataFrame.fromColumns({ + a: data, + b: Array.from({ length: SIZE }, (_, i) => (i + 1) * 0.02), + c: Array.from({ length: SIZE }, (_, i) => (i + 1) * 0.03), +}); + +for (let i = 0; i < WARMUP; i++) { + seriesLog2(s); + seriesLog10(s); + dataFrameLog2(df); + dataFrameLog10(df); +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + seriesLog2(s); + seriesLog10(s); + dataFrameLog2(df); + dataFrameLog10(df); + times.push(performance.now() - t0); +} + +const totalMs = times.reduce((a, b) => a + b, 0); +const meanMs = totalMs / ITERATIONS; +console.log( + JSON.stringify({ + function: "series_log2_log10", + mean_ms: meanMs, + iterations: ITERATIONS, + total_ms: totalMs, + }), +); From 94a76b2e1ad35ffd96051f4f8aa0e6902a626f4f Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 17 Apr 2026 20:03:05 +0000 Subject: [PATCH 3/5] Iteration 163: 6 new benchmark pairs (514 total, +1 vs best 513) Added: series_to_array, dataframe_has_col_get, series_var_method, series_min_max_method, dataframe_var_method, dataframe_median_method. 
Run: https://github.com/githubnext/tsessebe/actions/runs/24583783780 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../pandas/bench_dataframe_has_col_get.py | 25 ++++++++++++ .../pandas/bench_dataframe_median_method.py | 20 ++++++++++ .../pandas/bench_dataframe_var_method.py | 20 ++++++++++ .../pandas/bench_series_min_max_method.py | 24 +++++++++++ benchmarks/pandas/bench_series_to_array.py | 24 +++++++++++ benchmarks/pandas/bench_series_var_method.py | 20 ++++++++++ benchmarks/tsb/bench_dataframe_has_col_get.ts | 40 +++++++++++++++++++ .../tsb/bench_dataframe_median_method.ts | 34 ++++++++++++++++ benchmarks/tsb/bench_dataframe_var_method.ts | 34 ++++++++++++++++ benchmarks/tsb/bench_series_min_max_method.ts | 34 ++++++++++++++++ benchmarks/tsb/bench_series_to_array.ts | 34 ++++++++++++++++ benchmarks/tsb/bench_series_var_method.ts | 30 ++++++++++++++ 12 files changed, 339 insertions(+) create mode 100644 benchmarks/pandas/bench_dataframe_has_col_get.py create mode 100644 benchmarks/pandas/bench_dataframe_median_method.py create mode 100644 benchmarks/pandas/bench_dataframe_var_method.py create mode 100644 benchmarks/pandas/bench_series_min_max_method.py create mode 100644 benchmarks/pandas/bench_series_to_array.py create mode 100644 benchmarks/pandas/bench_series_var_method.py create mode 100644 benchmarks/tsb/bench_dataframe_has_col_get.ts create mode 100644 benchmarks/tsb/bench_dataframe_median_method.ts create mode 100644 benchmarks/tsb/bench_dataframe_var_method.ts create mode 100644 benchmarks/tsb/bench_series_min_max_method.ts create mode 100644 benchmarks/tsb/bench_series_to_array.ts create mode 100644 benchmarks/tsb/bench_series_var_method.ts diff --git a/benchmarks/pandas/bench_dataframe_has_col_get.py b/benchmarks/pandas/bench_dataframe_has_col_get.py new file mode 100644 index 00000000..1b678c18 --- /dev/null +++ b/benchmarks/pandas/bench_dataframe_has_col_get.py @@ -0,0 +1,25 @@ +"""Benchmark: DataFrame column presence and 
access ("in df.columns", [], .get()) on 100k-row DataFrame.""" +import json, time +import pandas as pd + +SIZE = 100_000 +WARMUP = 10 +ITERATIONS = 100 + +df = pd.DataFrame({"a": list(range(SIZE)), "b": [i * 2.0 for i in range(SIZE)], "c": [str(i) for i in range(SIZE)]}) + +for _ in range(WARMUP): + "a" in df.columns + df["b"] + df.get("c") + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + "a" in df.columns + df["b"] + df.get("c") + times.append((time.perf_counter() - t0) * 1000) + +total = sum(times) +print(json.dumps({"function": "dataframe_has_col_get", "mean_ms": round(total / ITERATIONS, 3), "iterations": ITERATIONS, "total_ms": round(total, 3)})) diff --git a/benchmarks/pandas/bench_dataframe_median_method.py b/benchmarks/pandas/bench_dataframe_median_method.py new file mode 100644 index 00000000..5e7acfaf --- /dev/null +++ b/benchmarks/pandas/bench_dataframe_median_method.py @@ -0,0 +1,20 @@ +"""Benchmark: DataFrame.median() — column-wise median on 100k-row DataFrame.""" +import json, time +import pandas as pd + +SIZE = 100_000 +WARMUP = 5 +ITERATIONS = 50 + +df = pd.DataFrame({"a": [i * 1.1 for i in range(SIZE)], "b": [i * 2.2 for i in range(SIZE)], "c": [i * 3.3 for i in range(SIZE)]}) + +for _ in range(WARMUP): df.median() + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + df.median() + times.append((time.perf_counter() - t0) * 1000) + +total = sum(times) +print(json.dumps({"function": "dataframe_median_method", "mean_ms": round(total / ITERATIONS, 3), "iterations": ITERATIONS, "total_ms": round(total, 3)})) diff --git a/benchmarks/pandas/bench_dataframe_var_method.py b/benchmarks/pandas/bench_dataframe_var_method.py new file mode 100644 index 00000000..f809e18b --- /dev/null +++ b/benchmarks/pandas/bench_dataframe_var_method.py @@ -0,0 +1,20 @@ +"""Benchmark: DataFrame.var() — column-wise variance on 100k-row DataFrame.""" +import json, time +import pandas as pd + +SIZE = 100_000 +WARMUP = 10 +ITERATIONS = 100 +
+df = pd.DataFrame({"a": [i * 1.1 for i in range(SIZE)], "b": [i * 2.2 for i in range(SIZE)], "c": [i * 3.3 for i in range(SIZE)]}) + +for _ in range(WARMUP): df.var() + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + df.var() + times.append((time.perf_counter() - t0) * 1000) + +total = sum(times) +print(json.dumps({"function": "dataframe_var_method", "mean_ms": round(total / ITERATIONS, 3), "iterations": ITERATIONS, "total_ms": round(total, 3)})) diff --git a/benchmarks/pandas/bench_series_min_max_method.py b/benchmarks/pandas/bench_series_min_max_method.py new file mode 100644 index 00000000..1675a423 --- /dev/null +++ b/benchmarks/pandas/bench_series_min_max_method.py @@ -0,0 +1,24 @@ +"""Benchmark: Series.min() and .max() — min/max on 100k numeric Series.""" +import json, time +import math +import pandas as pd + +SIZE = 100_000 +WARMUP = 10 +ITERATIONS = 100 + +s = pd.Series([math.sin(i) * 1000 for i in range(SIZE)]) + +for _ in range(WARMUP): + s.min() + s.max() + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + s.min() + s.max() + times.append((time.perf_counter() - t0) * 1000) + +total = sum(times) +print(json.dumps({"function": "series_min_max_method", "mean_ms": round(total / ITERATIONS, 3), "iterations": ITERATIONS, "total_ms": round(total, 3)})) diff --git a/benchmarks/pandas/bench_series_to_array.py b/benchmarks/pandas/bench_series_to_array.py new file mode 100644 index 00000000..b9577123 --- /dev/null +++ b/benchmarks/pandas/bench_series_to_array.py @@ -0,0 +1,24 @@ +"""Benchmark: Series.to_numpy() and .tolist() — convert 100k-element Series to plain arrays.""" +import json, time +import pandas as pd +import numpy as np + +SIZE = 100_000 +WARMUP = 10 +ITERATIONS = 100 + +s = pd.Series([i * 2.5 for i in range(SIZE)]) + +for _ in range(WARMUP): + s.to_numpy() + s.tolist() + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + s.to_numpy() + s.tolist() + times.append((time.perf_counter() - t0) * 
1000) + +total = sum(times) +print(json.dumps({"function": "series_to_array", "mean_ms": round(total / ITERATIONS, 3), "iterations": ITERATIONS, "total_ms": round(total, 3)})) diff --git a/benchmarks/pandas/bench_series_var_method.py b/benchmarks/pandas/bench_series_var_method.py new file mode 100644 index 00000000..813ef65c --- /dev/null +++ b/benchmarks/pandas/bench_series_var_method.py @@ -0,0 +1,20 @@ +"""Benchmark: Series.var() — variance on 100k numeric Series.""" +import json, time +import pandas as pd + +SIZE = 100_000 +WARMUP = 10 +ITERATIONS = 100 + +s = pd.Series([i * 0.5 for i in range(SIZE)]) + +for _ in range(WARMUP): s.var() + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + s.var() + times.append((time.perf_counter() - t0) * 1000) + +total = sum(times) +print(json.dumps({"function": "series_var_method", "mean_ms": round(total / ITERATIONS, 3), "iterations": ITERATIONS, "total_ms": round(total, 3)})) diff --git a/benchmarks/tsb/bench_dataframe_has_col_get.ts b/benchmarks/tsb/bench_dataframe_has_col_get.ts new file mode 100644 index 00000000..f1647cfd --- /dev/null +++ b/benchmarks/tsb/bench_dataframe_has_col_get.ts @@ -0,0 +1,40 @@ +/** + * Benchmark: DataFrame.has(), .col(), .get() — column presence and access on 100k-row DataFrame. 
+ * Outputs JSON: {"function": "dataframe_has_col_get", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { DataFrame } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 10; +const ITERATIONS = 100; + +const df = new DataFrame({ + a: Array.from({ length: SIZE }, (_, i) => i), + b: Array.from({ length: SIZE }, (_, i) => i * 2.0), + c: Array.from({ length: SIZE }, (_, i) => String(i)), +}); + +for (let i = 0; i < WARMUP; i++) { + df.has("a"); + df.col("b"); + df.get("c"); +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + df.has("a"); + df.col("b"); + df.get("c"); + times.push(performance.now() - t0); +} + +const total = times.reduce((a, b) => a + b, 0); +console.log( + JSON.stringify({ + function: "dataframe_has_col_get", + mean_ms: Math.round((total / ITERATIONS) * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(total * 1000) / 1000, + }), +); diff --git a/benchmarks/tsb/bench_dataframe_median_method.ts b/benchmarks/tsb/bench_dataframe_median_method.ts new file mode 100644 index 00000000..eba97d53 --- /dev/null +++ b/benchmarks/tsb/bench_dataframe_median_method.ts @@ -0,0 +1,34 @@ +/** + * Benchmark: DataFrame.median() — column-wise median on 100k-row DataFrame. 
+ * Outputs JSON: {"function": "dataframe_median_method", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { DataFrame } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const df = new DataFrame({ + a: Array.from({ length: SIZE }, (_, i) => i * 1.1), + b: Array.from({ length: SIZE }, (_, i) => i * 2.2), + c: Array.from({ length: SIZE }, (_, i) => i * 3.3), +}); + +for (let i = 0; i < WARMUP; i++) df.median(); + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + df.median(); + times.push(performance.now() - t0); +} + +const total = times.reduce((a, b) => a + b, 0); +console.log( + JSON.stringify({ + function: "dataframe_median_method", + mean_ms: Math.round((total / ITERATIONS) * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(total * 1000) / 1000, + }), +); diff --git a/benchmarks/tsb/bench_dataframe_var_method.ts b/benchmarks/tsb/bench_dataframe_var_method.ts new file mode 100644 index 00000000..5c119f47 --- /dev/null +++ b/benchmarks/tsb/bench_dataframe_var_method.ts @@ -0,0 +1,34 @@ +/** + * Benchmark: DataFrame.var() — column-wise variance on 100k-row DataFrame. 
+ * Outputs JSON: {"function": "dataframe_var_method", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { DataFrame } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 10; +const ITERATIONS = 100; + +const df = new DataFrame({ + a: Array.from({ length: SIZE }, (_, i) => i * 1.1), + b: Array.from({ length: SIZE }, (_, i) => i * 2.2), + c: Array.from({ length: SIZE }, (_, i) => i * 3.3), +}); + +for (let i = 0; i < WARMUP; i++) df.var(); + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + df.var(); + times.push(performance.now() - t0); +} + +const total = times.reduce((a, b) => a + b, 0); +console.log( + JSON.stringify({ + function: "dataframe_var_method", + mean_ms: Math.round((total / ITERATIONS) * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(total * 1000) / 1000, + }), +); diff --git a/benchmarks/tsb/bench_series_min_max_method.ts b/benchmarks/tsb/bench_series_min_max_method.ts new file mode 100644 index 00000000..a540b0f2 --- /dev/null +++ b/benchmarks/tsb/bench_series_min_max_method.ts @@ -0,0 +1,34 @@ +/** + * Benchmark: Series.min() and .max() — min/max on 100k numeric Series. 
+ * Outputs JSON: {"function": "series_min_max_method", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 10; +const ITERATIONS = 100; + +const s = new Series({ data: Array.from({ length: SIZE }, (_, i) => Math.sin(i) * 1000) }); + +for (let i = 0; i < WARMUP; i++) { + s.min(); + s.max(); +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + s.min(); + s.max(); + times.push(performance.now() - t0); +} + +const total = times.reduce((a, b) => a + b, 0); +console.log( + JSON.stringify({ + function: "series_min_max_method", + mean_ms: Math.round((total / ITERATIONS) * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(total * 1000) / 1000, + }), +); diff --git a/benchmarks/tsb/bench_series_to_array.ts b/benchmarks/tsb/bench_series_to_array.ts new file mode 100644 index 00000000..d61d5eec --- /dev/null +++ b/benchmarks/tsb/bench_series_to_array.ts @@ -0,0 +1,34 @@ +/** + * Benchmark: Series.toArray() and .toList() — convert 100k-element Series to plain arrays. 
+ * Outputs JSON: {"function": "series_to_array", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 10; +const ITERATIONS = 100; + +const s = new Series({ data: Array.from({ length: SIZE }, (_, i) => i * 2.5) }); + +for (let i = 0; i < WARMUP; i++) { + s.toArray(); + s.toList(); +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + s.toArray(); + s.toList(); + times.push(performance.now() - t0); +} + +const total = times.reduce((a, b) => a + b, 0); +console.log( + JSON.stringify({ + function: "series_to_array", + mean_ms: Math.round((total / ITERATIONS) * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(total * 1000) / 1000, + }), +); diff --git a/benchmarks/tsb/bench_series_var_method.ts b/benchmarks/tsb/bench_series_var_method.ts new file mode 100644 index 00000000..d56c673e --- /dev/null +++ b/benchmarks/tsb/bench_series_var_method.ts @@ -0,0 +1,30 @@ +/** + * Benchmark: Series.var() — variance on 100k numeric Series. 
+ * Outputs JSON: {"function": "series_var_method", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 10; +const ITERATIONS = 100; + +const s = new Series({ data: Array.from({ length: SIZE }, (_, i) => i * 0.5) }); + +for (let i = 0; i < WARMUP; i++) s.var(); + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + s.var(); + times.push(performance.now() - t0); +} + +const total = times.reduce((a, b) => a + b, 0); +console.log( + JSON.stringify({ + function: "series_var_method", + mean_ms: Math.round((total / ITERATIONS) * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(total * 1000) / 1000, + }), +); From a770db3ec6cb5ef5a587af5f568989b98df4a514 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 17 Apr 2026 21:06:14 +0000 Subject: [PATCH 4/5] Iteration 165: 5 new benchmark pairs (513 total, +5 vs branch 508) Added benchmark pairs for: - series_set_reset_index: Series.setIndex() and Series.resetIndex() - melt_id_vars: melt() with id_vars, var_name, value_name options - concat_series_axis0: concat of 5 Series along axis=0 - stack_options: stack() with dropna=true/false options - sample_frac: sampleSeries/sampleDataFrame with frac option Run: https://github.com/githubnext/tsessebe/actions/runs/24585962377 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../pandas/bench_concat_series_axis0.py | 30 ++++++++++++ benchmarks/pandas/bench_melt_id_vars.py | 34 ++++++++++++++ benchmarks/pandas/bench_sample_frac.py | 36 +++++++++++++++ .../pandas/bench_series_set_reset_index.py | 30 ++++++++++++ benchmarks/pandas/bench_stack_options.py | 37 +++++++++++++++ benchmarks/tsb/bench_concat_series_axis0.ts | 35 ++++++++++++++ benchmarks/tsb/bench_melt_id_vars.ts | 46 +++++++++++++++++++ benchmarks/tsb/bench_sample_frac.ts | 42 
+++++++++++++++++ .../tsb/bench_series_set_reset_index.ts | 35 ++++++++++++++ benchmarks/tsb/bench_stack_options.ts | 43 +++++++++++++++++ 10 files changed, 368 insertions(+) create mode 100644 benchmarks/pandas/bench_concat_series_axis0.py create mode 100644 benchmarks/pandas/bench_melt_id_vars.py create mode 100644 benchmarks/pandas/bench_sample_frac.py create mode 100644 benchmarks/pandas/bench_series_set_reset_index.py create mode 100644 benchmarks/pandas/bench_stack_options.py create mode 100644 benchmarks/tsb/bench_concat_series_axis0.ts create mode 100644 benchmarks/tsb/bench_melt_id_vars.ts create mode 100644 benchmarks/tsb/bench_sample_frac.ts create mode 100644 benchmarks/tsb/bench_series_set_reset_index.ts create mode 100644 benchmarks/tsb/bench_stack_options.ts diff --git a/benchmarks/pandas/bench_concat_series_axis0.py b/benchmarks/pandas/bench_concat_series_axis0.py new file mode 100644 index 00000000..bcb7c99e --- /dev/null +++ b/benchmarks/pandas/bench_concat_series_axis0.py @@ -0,0 +1,30 @@ +"""Benchmark: pd.concat of multiple Series along axis=0 — vertical stacking +of 5 Series of 20k elements each.""" +import json, time +import numpy as np +import pandas as pd + +CHUNK = 20_000 +WARMUP = 5 +ITERATIONS = 30 + +s1 = pd.Series(np.arange(CHUNK, dtype=float) * 1.0) +s2 = pd.Series(np.arange(CHUNK, dtype=float) * 2.0) +s3 = pd.Series(np.arange(CHUNK, dtype=float) * 3.0) +s4 = pd.Series(np.arange(CHUNK, dtype=float) * 4.0) +s5 = pd.Series(np.arange(CHUNK, dtype=float) * 5.0) + +for _ in range(WARMUP): + pd.concat([s1, s2, s3, s4, s5]) + +start = time.perf_counter() +for _ in range(ITERATIONS): + pd.concat([s1, s2, s3, s4, s5]) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "concat_series_axis0", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_melt_id_vars.py b/benchmarks/pandas/bench_melt_id_vars.py new file mode 100644 index 00000000..c0196d50 
--- /dev/null +++ b/benchmarks/pandas/bench_melt_id_vars.py @@ -0,0 +1,34 @@ +"""Benchmark: pd.melt with id_vars — unpivot keeping identifier columns fixed, +with custom var_name and value_name on a 10k-row DataFrame.""" +import json, time +import numpy as np +import pandas as pd + +ROWS = 10_000 +WARMUP = 5 +ITERATIONS = 30 + +ids = [f"id_{i}" for i in range(ROWS)] +category = ["A", "B", "C"][ : ROWS] +category = [["A", "B", "C"][i % 3] for i in range(ROWS)] +q1 = np.arange(ROWS, dtype=float) +q2 = np.arange(ROWS, dtype=float) * 1.1 +q3 = np.arange(ROWS, dtype=float) * 1.2 +q4 = np.arange(ROWS, dtype=float) * 1.3 + +df = pd.DataFrame({"id": ids, "category": category, "Q1": q1, "Q2": q2, "Q3": q3, "Q4": q4}) + +for _ in range(WARMUP): + pd.melt(df, id_vars=["id", "category"], var_name="quarter", value_name="revenue") + +start = time.perf_counter() +for _ in range(ITERATIONS): + pd.melt(df, id_vars=["id", "category"], var_name="quarter", value_name="revenue") +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "melt_id_vars", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_sample_frac.py b/benchmarks/pandas/bench_sample_frac.py new file mode 100644 index 00000000..6e5f9992 --- /dev/null +++ b/benchmarks/pandas/bench_sample_frac.py @@ -0,0 +1,36 @@ +"""Benchmark: Series.sample(frac=...) and DataFrame.sample(frac=...) 
— +fractional sampling (10% of 100k elements) with and without replacement.""" +import json, time +import numpy as np +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 20 + +data = np.arange(ROWS, dtype=float) * 1.5 +s = pd.Series(data) +df = pd.DataFrame({ + "a": np.arange(ROWS, dtype=float), + "b": np.arange(ROWS, dtype=float) * 2.0, + "c": np.arange(ROWS, dtype=float) * 3.0, +}) + +for _ in range(WARMUP): + s.sample(frac=0.1) + s.sample(frac=0.05, replace=True) + df.sample(frac=0.1) + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.sample(frac=0.1) + s.sample(frac=0.05, replace=True) + df.sample(frac=0.1) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "sample_frac", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_series_set_reset_index.py b/benchmarks/pandas/bench_series_set_reset_index.py new file mode 100644 index 00000000..951340ea --- /dev/null +++ b/benchmarks/pandas/bench_series_set_reset_index.py @@ -0,0 +1,30 @@ +"""Benchmark: Series.set_axis() and Series.reset_index() — reassign or reset the +row-index of a 100k-element Series.""" +import json, time +import numpy as np +import pandas as pd + +SIZE = 100_000 +WARMUP = 5 +ITERATIONS = 50 + +data = np.arange(SIZE, dtype=float) * 1.5 +s = pd.Series(data) +new_index = pd.Index(np.arange(SIZE) * 2) + +for _ in range(WARMUP): + s.set_axis(new_index) + s.reset_index(drop=True) + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.set_axis(new_index) + s.reset_index(drop=True) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "series_set_reset_index", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_stack_options.py b/benchmarks/pandas/bench_stack_options.py new file mode 100644 index 00000000..fc18edc9 --- /dev/null +++ 
b/benchmarks/pandas/bench_stack_options.py @@ -0,0 +1,37 @@ +"""Benchmark: DataFrame.stack with dropna=True/False options — includes null values +in the output on a 2k-row x 5-column DataFrame.""" +import json, time +import numpy as np +import pandas as pd + +ROWS = 2_000 +WARMUP = 5 +ITERATIONS = 30 + +def make_col(mul: float) -> list: + return [None if i % 10 == 0 else float(i) * mul for i in range(ROWS)] + +df = pd.DataFrame({ + "a": make_col(1.0), + "b": make_col(1.1), + "c": make_col(1.2), + "d": make_col(1.3), + "e": make_col(1.4), +}) + +for _ in range(WARMUP): + df.stack(dropna=True) + df.stack(dropna=False) + +start = time.perf_counter() +for _ in range(ITERATIONS): + df.stack(dropna=True) + df.stack(dropna=False) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "stack_options", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/tsb/bench_concat_series_axis0.ts b/benchmarks/tsb/bench_concat_series_axis0.ts new file mode 100644 index 00000000..9ae088b8 --- /dev/null +++ b/benchmarks/tsb/bench_concat_series_axis0.ts @@ -0,0 +1,35 @@ +/** + * Benchmark: concat of multiple Series objects along axis=0 — vertical stacking + * of 5 Series of 20k elements each. 
+ * Outputs JSON: {"function": "concat_series_axis0", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series, concat } from "../../src/index.ts"; + +const CHUNK = 20_000; +const WARMUP = 5; +const ITERATIONS = 30; + +const s1 = new Series({ data: Array.from({ length: CHUNK }, (_, i) => i * 1.0) }); +const s2 = new Series({ data: Array.from({ length: CHUNK }, (_, i) => i * 2.0) }); +const s3 = new Series({ data: Array.from({ length: CHUNK }, (_, i) => i * 3.0) }); +const s4 = new Series({ data: Array.from({ length: CHUNK }, (_, i) => i * 4.0) }); +const s5 = new Series({ data: Array.from({ length: CHUNK }, (_, i) => i * 5.0) }); + +for (let i = 0; i < WARMUP; i++) { + concat([s1, s2, s3, s4, s5]); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + concat([s1, s2, s3, s4, s5]); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "concat_series_axis0", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_melt_id_vars.ts b/benchmarks/tsb/bench_melt_id_vars.ts new file mode 100644 index 00000000..8b9bf35d --- /dev/null +++ b/benchmarks/tsb/bench_melt_id_vars.ts @@ -0,0 +1,46 @@ +/** + * Benchmark: melt with id_vars — unpivot a wide DataFrame keeping identifier + * columns fixed, with custom var_name and value_name on a 10k-row DataFrame. 
+ * Outputs JSON: {"function": "melt_id_vars", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { DataFrame, melt } from "../../src/index.ts"; + +const ROWS = 10_000; +const WARMUP = 5; +const ITERATIONS = 30; + +const ids = Array.from({ length: ROWS }, (_, i) => `id_${i}`); +const category = Array.from({ length: ROWS }, (_, i) => ["A", "B", "C"][i % 3]); +const q1 = Array.from({ length: ROWS }, (_, i) => i * 1.0); +const q2 = Array.from({ length: ROWS }, (_, i) => i * 1.1); +const q3 = Array.from({ length: ROWS }, (_, i) => i * 1.2); +const q4 = Array.from({ length: ROWS }, (_, i) => i * 1.3); + +const df = DataFrame.fromColumns({ id: ids, category, Q1: q1, Q2: q2, Q3: q3, Q4: q4 }); + +for (let i = 0; i < WARMUP; i++) { + melt(df, { + id_vars: ["id", "category"], + var_name: "quarter", + value_name: "revenue", + }); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + melt(df, { + id_vars: ["id", "category"], + var_name: "quarter", + value_name: "revenue", + }); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "melt_id_vars", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_sample_frac.ts b/benchmarks/tsb/bench_sample_frac.ts new file mode 100644 index 00000000..0bbf1b6e --- /dev/null +++ b/benchmarks/tsb/bench_sample_frac.ts @@ -0,0 +1,42 @@ +/** + * Benchmark: sampleSeries with frac option and sampleDataFrame with frac option. + * Fractional sampling (10% of 100k elements) with and without replacement. 
+ * Outputs JSON: {"function": "sample_frac", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series, DataFrame, sampleSeries, sampleDataFrame } from "../../src/index.ts"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 20; + +const data = Array.from({ length: ROWS }, (_, i) => i * 1.5); +const s = new Series({ data }); + +const df = DataFrame.fromColumns({ + a: Array.from({ length: ROWS }, (_, i) => i * 1.0), + b: Array.from({ length: ROWS }, (_, i) => i * 2.0), + c: Array.from({ length: ROWS }, (_, i) => i * 3.0), +}); + +for (let i = 0; i < WARMUP; i++) { + sampleSeries(s, { frac: 0.1 }); + sampleSeries(s, { frac: 0.05, replace: true }); + sampleDataFrame(df, { frac: 0.1 }); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + sampleSeries(s, { frac: 0.1 }); + sampleSeries(s, { frac: 0.05, replace: true }); + sampleDataFrame(df, { frac: 0.1 }); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "sample_frac", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_series_set_reset_index.ts b/benchmarks/tsb/bench_series_set_reset_index.ts new file mode 100644 index 00000000..428262be --- /dev/null +++ b/benchmarks/tsb/bench_series_set_reset_index.ts @@ -0,0 +1,35 @@ +/** + * Benchmark: Series.setIndex() and Series.resetIndex() — reassign or reset the + * row-index of a 100k-element Series. 
+ * Outputs JSON: {"function": "series_set_reset_index", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Index, Series } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const data = Array.from({ length: SIZE }, (_, i) => i * 1.5); +const s = new Series({ data }); +const newIndex = new Index(Array.from({ length: SIZE }, (_, i) => i * 2)); + +for (let i = 0; i < WARMUP; i++) { + s.setIndex(newIndex); + s.resetIndex(); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + s.setIndex(newIndex); + s.resetIndex(); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "series_set_reset_index", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_stack_options.ts b/benchmarks/tsb/bench_stack_options.ts new file mode 100644 index 00000000..9ddbe77f --- /dev/null +++ b/benchmarks/tsb/bench_stack_options.ts @@ -0,0 +1,43 @@ +/** + * Benchmark: stack with dropna=true/false options — dropna=false includes null values in the output + * on a 2k-row x 5-column DataFrame (10k total cells including nulls). + * Outputs JSON: {"function": "stack_options", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { DataFrame, stack } from "../../src/index.ts"; + +const ROWS = 2_000; +const WARMUP = 5; +const ITERATIONS = 30; + +// Create a DataFrame with some null values (every 10th element is null) +const makeCol = (mul: number) => + Array.from({ length: ROWS }, (_, i) => (i % 10 === 0 ? 
null : i * mul)); + +const df = DataFrame.fromColumns({ + a: makeCol(1.0), + b: makeCol(1.1), + c: makeCol(1.2), + d: makeCol(1.3), + e: makeCol(1.4), +}); + +for (let i = 0; i < WARMUP; i++) { + stack(df, { dropna: true }); + stack(df, { dropna: false }); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + stack(df, { dropna: true }); + stack(df, { dropna: false }); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "stack_options", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); From 1df361d96ce6aaa9a315807026b3d8e9c5e682d5 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 17 Apr 2026 21:48:45 +0000 Subject: [PATCH 5/5] Iteration 166: 5 new benchmark pairs (534 total, +5 vs best 529) Adds benchmarks for: str_split_method, categorical_index_modify, applySeries_fn, dataframe_apply_stats, dataframe_from_columns. Cherry-picked iters 159-165 from diverged branch (+21) plus these 5 brings canonical branch from 508 to 534 pairs. 
Run: https://github.com/githubnext/tsessebe/actions/runs/24587057857 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- benchmarks/pandas/bench_applySeries_fn.py | 34 ++++++++++++++ .../pandas/bench_categorical_index_modify.py | 46 ++++++++++++++++++ .../pandas/bench_dataframe_apply_stats.py | 41 ++++++++++++++++ .../pandas/bench_dataframe_from_columns.py | 36 ++++++++++++++ benchmarks/pandas/bench_str_split_method.py | 34 ++++++++++++++ benchmarks/tsb/bench_applySeries_fn.ts | 36 ++++++++++++++ .../tsb/bench_categorical_index_modify.ts | 47 +++++++++++++++++++ benchmarks/tsb/bench_dataframe_apply_stats.ts | 43 +++++++++++++++++ .../tsb/bench_dataframe_from_columns.ts | 36 ++++++++++++++ benchmarks/tsb/bench_str_split_method.ts | 36 ++++++++++++++ 10 files changed, 389 insertions(+) create mode 100644 benchmarks/pandas/bench_applySeries_fn.py create mode 100644 benchmarks/pandas/bench_categorical_index_modify.py create mode 100644 benchmarks/pandas/bench_dataframe_apply_stats.py create mode 100644 benchmarks/pandas/bench_dataframe_from_columns.py create mode 100644 benchmarks/pandas/bench_str_split_method.py create mode 100644 benchmarks/tsb/bench_applySeries_fn.ts create mode 100644 benchmarks/tsb/bench_categorical_index_modify.ts create mode 100644 benchmarks/tsb/bench_dataframe_apply_stats.ts create mode 100644 benchmarks/tsb/bench_dataframe_from_columns.ts create mode 100644 benchmarks/tsb/bench_str_split_method.ts diff --git a/benchmarks/pandas/bench_applySeries_fn.py b/benchmarks/pandas/bench_applySeries_fn.py new file mode 100644 index 00000000..60358abf --- /dev/null +++ b/benchmarks/pandas/bench_applySeries_fn.py @@ -0,0 +1,34 @@ +""" +Benchmark: pandas Series.apply() with (value) lambda — 100k-element Series. +Mirrors tsb's applySeries (stats/apply.ts) behavior. 
+Outputs JSON: {"function": "applySeries_fn", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import pandas as pd + +SIZE = 100_000 +WARMUP = 5 +ITERATIONS = 30 + +s = pd.Series([i * 0.5 for i in range(SIZE)]) + +fn = lambda v: v * 2 + 1 # noqa: E731 + +for _ in range(WARMUP): + s.apply(fn) + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + s.apply(fn) + times.append((time.perf_counter() - t0) * 1000) + +total_ms = sum(times) +mean_ms = total_ms / ITERATIONS +print(json.dumps({ + "function": "applySeries_fn", + "mean_ms": mean_ms, + "iterations": ITERATIONS, + "total_ms": total_ms, +})) diff --git a/benchmarks/pandas/bench_categorical_index_modify.py b/benchmarks/pandas/bench_categorical_index_modify.py new file mode 100644 index 00000000..ba2e2157 --- /dev/null +++ b/benchmarks/pandas/bench_categorical_index_modify.py @@ -0,0 +1,46 @@ +""" +Benchmark: pandas CategoricalIndex modification — rename_categories, reorder_categories, +remove_categories, set_categories, remove_unused_categories on a 10k-element index. 
+Outputs JSON: {"function": "categorical_index_modify", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import pandas as pd + +SIZE = 10_000 +WARMUP = 5 +ITERATIONS = 50 + +CATS = ["alpha", "beta", "gamma", "delta", "epsilon"] +labels = [CATS[i % len(CATS)] for i in range(SIZE)] +ci = pd.CategoricalIndex(labels) + +for _ in range(WARMUP): + ci.rename_categories(["A", "B", "C", "D", "E"]) + ci.reorder_categories(["epsilon", "delta", "gamma", "beta", "alpha"]) + ci.remove_categories(["epsilon"]) + ci.set_categories(["alpha", "beta", "gamma"]) + ci.remove_unused_categories() + ci.as_ordered() + ci.as_unordered() + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + ci.rename_categories(["A", "B", "C", "D", "E"]) + ci.reorder_categories(["epsilon", "delta", "gamma", "beta", "alpha"]) + ci.remove_categories(["epsilon"]) + ci.set_categories(["alpha", "beta", "gamma"]) + ci.remove_unused_categories() + ci.as_ordered() + ci.as_unordered() + times.append((time.perf_counter() - t0) * 1000) + +total_ms = sum(times) +mean_ms = total_ms / ITERATIONS +print(json.dumps({ + "function": "categorical_index_modify", + "mean_ms": mean_ms, + "iterations": ITERATIONS, + "total_ms": total_ms, +})) diff --git a/benchmarks/pandas/bench_dataframe_apply_stats.py b/benchmarks/pandas/bench_dataframe_apply_stats.py new file mode 100644 index 00000000..c50846fd --- /dev/null +++ b/benchmarks/pandas/bench_dataframe_apply_stats.py @@ -0,0 +1,41 @@ +""" +Benchmark: pandas DataFrame.apply() — apply fn to each column (axis=0) and row (axis=1). +Mirrors tsb's dataFrameApply (stats/apply.ts) behavior. 
+Outputs JSON: {"function": "dataframe_apply_stats", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import pandas as pd +import numpy as np + +SIZE = 10_000 +WARMUP = 3 +ITERATIONS = 20 + +df = pd.DataFrame({ + "a": (np.arange(SIZE) * 1.0), + "b": (np.arange(SIZE) * 2.0), + "c": (np.arange(SIZE) * 3.0), +}) + +sum_fn = lambda col: col.mean() # noqa: E731 + +for _ in range(WARMUP): + df.apply(sum_fn, axis=0) + df.apply(sum_fn, axis=1) + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + df.apply(sum_fn, axis=0) + df.apply(sum_fn, axis=1) + times.append((time.perf_counter() - t0) * 1000) + +total_ms = sum(times) +mean_ms = total_ms / ITERATIONS +print(json.dumps({ + "function": "dataframe_apply_stats", + "mean_ms": mean_ms, + "iterations": ITERATIONS, + "total_ms": total_ms, +})) diff --git a/benchmarks/pandas/bench_dataframe_from_columns.py b/benchmarks/pandas/bench_dataframe_from_columns.py new file mode 100644 index 00000000..5be8f3f5 --- /dev/null +++ b/benchmarks/pandas/bench_dataframe_from_columns.py @@ -0,0 +1,36 @@ +""" +Benchmark: pandas DataFrame() construction — create 100k-row DataFrame from column arrays. +Mirrors tsb's DataFrame.fromColumns() behavior. 
+Outputs JSON: {"function": "dataframe_from_columns", "mean_ms": ..., "iterations": ..., "total_ms": ...}
+"""
+import json
+import time
+import pandas as pd
+import numpy as np
+
+SIZE = 100_000
+WARMUP = 5
+ITERATIONS = 30
+
+values_a = np.arange(SIZE, dtype=float)
+values_b = np.arange(SIZE, dtype=float) * 2.5
+values_c = np.arange(SIZE) % 1000
+values_d = np.sin(np.arange(SIZE) * 0.001)
+
+for _ in range(WARMUP):  # untimed warm-up passes
+    pd.DataFrame({"a": values_a, "b": values_b, "c": values_c, "d": values_d})
+
+samples_ms = []
+for _ in range(ITERATIONS):
+    started = time.perf_counter()
+    pd.DataFrame({"a": values_a, "b": values_b, "c": values_c, "d": values_d})
+    samples_ms.append((time.perf_counter() - started) * 1000)  # seconds -> milliseconds
+
+total_ms = sum(samples_ms)
+report = {
+    "function": "dataframe_from_columns",
+    "mean_ms": total_ms / ITERATIONS,
+    "iterations": ITERATIONS,
+    "total_ms": total_ms,
+}
+print(json.dumps(report))
diff --git a/benchmarks/pandas/bench_str_split_method.py b/benchmarks/pandas/bench_str_split_method.py
new file mode 100644
index 00000000..448bf2e4
--- /dev/null
+++ b/benchmarks/pandas/bench_str_split_method.py
@@ -0,0 +1,34 @@
+"""
+Benchmark: pandas Series.str.split() — split strings by delimiter on 100k strings.
+Outputs JSON: {"function": "str_split_method", "mean_ms": ..., "iterations": ..., "total_ms": ...}
+"""
+import json
+import time
+import pandas as pd
+
+SIZE = 100_000
+WARMUP = 5
+ITERATIONS = 30
+
+strings = [f"part{k % 100}_b{k % 50}_c{k % 25}" for k in range(SIZE)]
+series = pd.Series(strings)
+
+for _ in range(WARMUP):  # untimed warm-up passes
+    series.str.split("_")
+    series.str.split("_", n=2)
+
+samples_ms = []
+for _ in range(ITERATIONS):
+    started = time.perf_counter()
+    series.str.split("_")
+    series.str.split("_", n=2)
+    samples_ms.append((time.perf_counter() - started) * 1000)  # seconds -> milliseconds
+
+total_ms = sum(samples_ms)
+report = {
+    "function": "str_split_method",
+    "mean_ms": total_ms / ITERATIONS,
+    "iterations": ITERATIONS,
+    "total_ms": total_ms,
+}
+print(json.dumps(report))
diff --git a/benchmarks/tsb/bench_applySeries_fn.ts b/benchmarks/tsb/bench_applySeries_fn.ts
new file mode 100644
index 00000000..a5f6035b
--- /dev/null
+++ b/benchmarks/tsb/bench_applySeries_fn.ts
@@ -0,0 +1,36 @@
+/**
+ * Benchmark: applySeries (stats/apply.ts) — element-wise fn receiving (value, label) on 100k-element Series.
+ * This is the standalone stats version, distinct from seriesApply (core/pipe_apply.ts).
+ * Outputs JSON: {"function": "applySeries_fn", "mean_ms": ..., "iterations": ..., "total_ms": ...}
+ */
+import { Series, applySeries } from "../../src/index.ts";
+import type { Scalar, Label } from "../../src/types.ts";
+
+const SIZE = 100_000;
+const WARMUP = 5;
+const ITERATIONS = 30;
+
+const series = new Series({ data: Array.from({ length: SIZE }, (_, idx) => idx * 0.5) });
+
+const affine = (value: Scalar, _label: Label): Scalar => (value as number) * 2 + 1;
+
+for (let warm = 0; warm < WARMUP; warm++) {
+  applySeries(series, affine); // untimed warm-up pass
+}
+
+const samples: number[] = [];
+for (let run = 0; run < ITERATIONS; run++) {
+  const start = performance.now();
+  applySeries(series, affine);
+  samples.push(performance.now() - start);
+}
+
+const totalMs = samples.reduce((sum, ms) => sum + ms, 0);
+console.log(
+  JSON.stringify({
+    function: "applySeries_fn",
+    mean_ms: totalMs / ITERATIONS,
+    iterations: ITERATIONS,
+    total_ms: totalMs,
+  }),
+);
diff --git a/benchmarks/tsb/bench_categorical_index_modify.ts b/benchmarks/tsb/bench_categorical_index_modify.ts
new file mode 100644
index 00000000..62e1bdf3
--- /dev/null
+++ b/benchmarks/tsb/bench_categorical_index_modify.ts
@@ -0,0 +1,47 @@
+/**
+ * Benchmark: CategoricalIndex modification — renameCategories, reorderCategories, removeCategories,
+ * setCategories, removeUnusedCategories, asOrdered/asUnordered on a 10k-element index.
+ * Outputs JSON: {"function": "categorical_index_modify", "mean_ms": ..., "iterations": ..., "total_ms": ...}
+ */
+import { CategoricalIndex } from "../../src/index.ts";
+
+const SIZE = 10_000;
+const WARMUP = 5;
+const ITERATIONS = 50;
+
+const CATS = ["alpha", "beta", "gamma", "delta", "epsilon"];
+const cycledLabels = Array.from({ length: SIZE }, (_, idx) => CATS[idx % CATS.length]);
+const catIndex = CategoricalIndex.fromArray(cycledLabels);
+
+for (let warm = 0; warm < WARMUP; warm++) {
+  catIndex.renameCategories(["A", "B", "C", "D", "E"]);
+  catIndex.reorderCategories(["epsilon", "delta", "gamma", "beta", "alpha"]);
+  catIndex.removeCategories(["epsilon"]);
+  catIndex.setCategories(["alpha", "beta", "gamma"]);
+  catIndex.removeUnusedCategories();
+  catIndex.asOrdered();
+  catIndex.asUnordered();
+}
+
+const samples: number[] = [];
+for (let run = 0; run < ITERATIONS; run++) {
+  const start = performance.now();
+  catIndex.renameCategories(["A", "B", "C", "D", "E"]);
+  catIndex.reorderCategories(["epsilon", "delta", "gamma", "beta", "alpha"]);
+  catIndex.removeCategories(["epsilon"]);
+  catIndex.setCategories(["alpha", "beta", "gamma"]);
+  catIndex.removeUnusedCategories();
+  catIndex.asOrdered();
+  catIndex.asUnordered();
+  samples.push(performance.now() - start);
+}
+
+const totalMs = samples.reduce((sum, ms) => sum + ms, 0);
+console.log(
+  JSON.stringify({
+    function: "categorical_index_modify",
+    mean_ms: totalMs / ITERATIONS,
+    iterations: ITERATIONS,
+    total_ms: totalMs,
+  }),
+);
diff --git a/benchmarks/tsb/bench_dataframe_apply_stats.ts b/benchmarks/tsb/bench_dataframe_apply_stats.ts
new file mode 100644
index 00000000..52d76819
--- /dev/null
+++ b/benchmarks/tsb/bench_dataframe_apply_stats.ts
@@ -0,0 +1,43 @@
+/**
+ * Benchmark: dataFrameApply (stats/apply.ts) — apply fn to each column (axis=0) and each row (axis=1)
+ * on a 10k-row DataFrame. This is the standalone stats function, not df.apply().
+ * Outputs JSON: {"function": "dataframe_apply_stats", "mean_ms": ..., "iterations": ..., "total_ms": ...}
+ */
+import { DataFrame, Series, dataFrameApply } from "../../src/index.ts";
+import type { Scalar } from "../../src/types.ts";
+
+const SIZE = 10_000;
+const WARMUP = 3;
+const ITERATIONS = 20;
+
+const frame = DataFrame.fromColumns({
+  a: Array.from({ length: SIZE }, (_, idx) => idx * 1.0),
+  b: Array.from({ length: SIZE }, (_, idx) => idx * 2.0),
+  c: Array.from({ length: SIZE }, (_, idx) => idx * 3.0),
+});
+
+const meanFn = (slice: Series) =>
+  slice.values.reduce((sum, value) => sum + (value as number), 0) / slice.length;
+
+for (let warm = 0; warm < WARMUP; warm++) {
+  dataFrameApply(frame, meanFn, { axis: 0 });
+  dataFrameApply(frame, meanFn, { axis: 1 });
+}
+
+const samples: number[] = [];
+for (let run = 0; run < ITERATIONS; run++) {
+  const start = performance.now();
+  dataFrameApply(frame, meanFn, { axis: 0 });
+  dataFrameApply(frame, meanFn, { axis: 1 });
+  samples.push(performance.now() - start);
+}
+
+const totalMs = samples.reduce((sum, ms) => sum + ms, 0);
+console.log(
+  JSON.stringify({
+    function: "dataframe_apply_stats",
+    mean_ms: totalMs / ITERATIONS,
+    iterations: ITERATIONS,
+    total_ms: totalMs,
+  }),
+);
diff --git a/benchmarks/tsb/bench_dataframe_from_columns.ts b/benchmarks/tsb/bench_dataframe_from_columns.ts
new file mode 100644
index 00000000..33305f6d
--- /dev/null
+++ b/benchmarks/tsb/bench_dataframe_from_columns.ts
@@ -0,0 +1,36 @@
+/**
+ * Benchmark: DataFrame.fromColumns() — construct a 100k-row DataFrame from column arrays.
+ * Tests the performance of the most common DataFrame construction path.
+ * Outputs JSON: {"function": "dataframe_from_columns", "mean_ms": ..., "iterations": ..., "total_ms": ...}
+ */
+import { DataFrame } from "../../src/index.ts";
+
+const SIZE = 100_000;
+const WARMUP = 5;
+const ITERATIONS = 30;
+
+const valuesA = Array.from({ length: SIZE }, (_, idx) => idx * 1.0);
+const valuesB = Array.from({ length: SIZE }, (_, idx) => idx * 2.5);
+const valuesC = Array.from({ length: SIZE }, (_, idx) => idx % 1000);
+const valuesD = Array.from({ length: SIZE }, (_, idx) => Math.sin(idx * 0.001));
+
+for (let warm = 0; warm < WARMUP; warm++) {
+  DataFrame.fromColumns({ a: valuesA, b: valuesB, c: valuesC, d: valuesD });
+}
+
+const samples: number[] = [];
+for (let run = 0; run < ITERATIONS; run++) {
+  const start = performance.now();
+  DataFrame.fromColumns({ a: valuesA, b: valuesB, c: valuesC, d: valuesD });
+  samples.push(performance.now() - start);
+}
+
+const totalMs = samples.reduce((sum, ms) => sum + ms, 0);
+console.log(
+  JSON.stringify({
+    function: "dataframe_from_columns",
+    mean_ms: totalMs / ITERATIONS,
+    iterations: ITERATIONS,
+    total_ms: totalMs,
+  }),
+);
diff --git a/benchmarks/tsb/bench_str_split_method.ts b/benchmarks/tsb/bench_str_split_method.ts
new file mode 100644
index 00000000..83cbb929
--- /dev/null
+++ b/benchmarks/tsb/bench_str_split_method.ts
@@ -0,0 +1,36 @@
+/**
+ * Benchmark: StringAccessor.split() — s.str.split(pat, n) on 100k strings.
+ * Distinct from strSplitExpand (which uses the standalone function).
+ * Outputs JSON: {"function": "str_split_method", "mean_ms": ..., "iterations": ..., "total_ms": ...}
+ */
+import { Series } from "../../src/index.ts";
+
+const SIZE = 100_000;
+const WARMUP = 5;
+const ITERATIONS = 30;
+
+const strings = Array.from({ length: SIZE }, (_, idx) => `part${idx % 100}_b${idx % 50}_c${idx % 25}`);
+const series = new Series({ data: strings });
+
+for (let warm = 0; warm < WARMUP; warm++) {
+  series.str.split("_");
+  series.str.split("_", undefined, 2);
+}
+
+const samples: number[] = [];
+for (let run = 0; run < ITERATIONS; run++) {
+  const start = performance.now();
+  series.str.split("_");
+  series.str.split("_", undefined, 2);
+  samples.push(performance.now() - start);
+}
+
+const totalMs = samples.reduce((sum, ms) => sum + ms, 0);
+console.log(
+  JSON.stringify({
+    function: "str_split_method",
+    mean_ms: totalMs / ITERATIONS,
+    iterations: ITERATIONS,
+    total_ms: totalMs,
+  }),
+);