From 46f4b4c3ae7b9a30cc0a4f01d89e06ba3e411bb7 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 16 Apr 2026 22:05:20 +0000 Subject: [PATCH 01/19] Iteration 135: Add 8 benchmark pairs (388 total, +8 vs best 380) Added benchmark pairs for: dataframe_shift_diff, dataframe_pow_mod, clip_series_bounds, reindex, dataframe_compare, series_add_sub_mul_div, numeric_ops_math, dataframe_add_sub_mul_div. Run: https://github.com/githubnext/tsessebe/actions/runs/24535650224 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- benchmarks/pandas/bench_clip_series_bounds.py | 41 ++++++++++++++++ .../pandas/bench_dataframe_add_sub_mul_div.py | 39 +++++++++++++++ benchmarks/pandas/bench_dataframe_compare.py | 39 +++++++++++++++ benchmarks/pandas/bench_dataframe_pow_mod.py | 37 +++++++++++++++ .../pandas/bench_dataframe_shift_diff.py | 36 ++++++++++++++ benchmarks/pandas/bench_numeric_ops_math.py | 38 +++++++++++++++ benchmarks/pandas/bench_reindex.py | 36 ++++++++++++++ .../pandas/bench_series_add_sub_mul_div.py | 36 ++++++++++++++ benchmarks/tsb/bench_clip_series_bounds.ts | 47 +++++++++++++++++++ .../tsb/bench_dataframe_add_sub_mul_div.ts | 46 ++++++++++++++++++ benchmarks/tsb/bench_dataframe_compare.ts | 40 ++++++++++++++++ benchmarks/tsb/bench_dataframe_pow_mod.ts | 38 +++++++++++++++ benchmarks/tsb/bench_dataframe_shift_diff.ts | 36 ++++++++++++++ benchmarks/tsb/bench_numeric_ops_math.ts | 46 ++++++++++++++++++ benchmarks/tsb/bench_reindex.ts | 46 ++++++++++++++++++ .../tsb/bench_series_add_sub_mul_div.ts | 37 +++++++++++++++ 16 files changed, 638 insertions(+) create mode 100644 benchmarks/pandas/bench_clip_series_bounds.py create mode 100644 benchmarks/pandas/bench_dataframe_add_sub_mul_div.py create mode 100644 benchmarks/pandas/bench_dataframe_compare.py create mode 100644 benchmarks/pandas/bench_dataframe_pow_mod.py create mode 100644 benchmarks/pandas/bench_dataframe_shift_diff.py 
create mode 100644 benchmarks/pandas/bench_numeric_ops_math.py create mode 100644 benchmarks/pandas/bench_reindex.py create mode 100644 benchmarks/pandas/bench_series_add_sub_mul_div.py create mode 100644 benchmarks/tsb/bench_clip_series_bounds.ts create mode 100644 benchmarks/tsb/bench_dataframe_add_sub_mul_div.ts create mode 100644 benchmarks/tsb/bench_dataframe_compare.ts create mode 100644 benchmarks/tsb/bench_dataframe_pow_mod.ts create mode 100644 benchmarks/tsb/bench_dataframe_shift_diff.ts create mode 100644 benchmarks/tsb/bench_numeric_ops_math.ts create mode 100644 benchmarks/tsb/bench_reindex.ts create mode 100644 benchmarks/tsb/bench_series_add_sub_mul_div.ts diff --git a/benchmarks/pandas/bench_clip_series_bounds.py b/benchmarks/pandas/bench_clip_series_bounds.py new file mode 100644 index 00000000..312b989a --- /dev/null +++ b/benchmarks/pandas/bench_clip_series_bounds.py @@ -0,0 +1,41 @@ +""" +Benchmark: Series.clip(lower=, upper=) / DataFrame.clip(lower=, upper=) — element-wise clip bounds. 
+Outputs JSON: {"function": "clip_series_bounds", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import numpy as np +import pandas as pd + +SIZE = 100_000 +WARMUP = 5 +ITERATIONS = 50 + +data = np.arange(SIZE) - SIZE / 2 +s = pd.Series(data) +lower_s = pd.Series(np.full(SIZE, -10000.0)) +upper_s = pd.Series(np.full(SIZE, 10000.0)) + +df = pd.DataFrame({ + "a": np.arange(SIZE) - SIZE / 2, + "b": np.sin(np.arange(SIZE) * 0.01) * 100, +}) +lower_df = pd.DataFrame({"a": np.full(SIZE, -10000.0), "b": np.full(SIZE, -50.0)}) +upper_df = pd.DataFrame({"a": np.full(SIZE, 10000.0), "b": np.full(SIZE, 50.0)}) + +for _ in range(WARMUP): + s.clip(lower=lower_s, upper=upper_s) + df.clip(lower=lower_df, upper=upper_df) + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.clip(lower=lower_s, upper=upper_s) + df.clip(lower=lower_df, upper=upper_df) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "clip_series_bounds", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_dataframe_add_sub_mul_div.py b/benchmarks/pandas/bench_dataframe_add_sub_mul_div.py new file mode 100644 index 00000000..7778cfa2 --- /dev/null +++ b/benchmarks/pandas/bench_dataframe_add_sub_mul_div.py @@ -0,0 +1,39 @@ +""" +Benchmark: DataFrame.add / sub / mul / div — standalone arithmetic on 50k-row DataFrame. 
+Outputs JSON: {"function": "dataframe_add_sub_mul_div", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import numpy as np +import pandas as pd + +SIZE = 50_000 +WARMUP = 5 +ITERATIONS = 50 + +df = pd.DataFrame({ + "a": np.arange(SIZE) * 1.5, + "b": np.arange(SIZE) * 2.0, + "c": (np.arange(SIZE) % 100) + 1, +}) + +for _ in range(WARMUP): + df.add(10) + df.sub(5) + df.mul(2) + df.div(3) + +start = time.perf_counter() +for _ in range(ITERATIONS): + df.add(10) + df.sub(5) + df.mul(2) + df.div(3) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "dataframe_add_sub_mul_div", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_dataframe_compare.py b/benchmarks/pandas/bench_dataframe_compare.py new file mode 100644 index 00000000..3167e398 --- /dev/null +++ b/benchmarks/pandas/bench_dataframe_compare.py @@ -0,0 +1,39 @@ +""" +Benchmark: DataFrame == / != / < / > — element-wise comparison. 
+Outputs JSON: {"function": "dataframe_compare", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import numpy as np +import pandas as pd + +SIZE = 50_000 +WARMUP = 5 +ITERATIONS = 50 + +df = pd.DataFrame({ + "a": np.arange(SIZE), + "b": np.arange(SIZE) * 2, + "c": np.arange(SIZE) % 100, +}) + +for _ in range(WARMUP): + df.eq(50) + df.ne(50) + df.lt(50) + df.gt(50) + +start = time.perf_counter() +for _ in range(ITERATIONS): + df.eq(50) + df.ne(50) + df.lt(50) + df.gt(50) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "dataframe_compare", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_dataframe_pow_mod.py b/benchmarks/pandas/bench_dataframe_pow_mod.py new file mode 100644 index 00000000..0f854e5e --- /dev/null +++ b/benchmarks/pandas/bench_dataframe_pow_mod.py @@ -0,0 +1,37 @@ +""" +Benchmark: DataFrame ** / % / // — power, modulo, floor division on DataFrame. 
+Outputs JSON: {"function": "dataframe_pow_mod", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import numpy as np +import pandas as pd + +SIZE = 50_000 +WARMUP = 5 +ITERATIONS = 50 + +df = pd.DataFrame({ + "a": (np.arange(SIZE) % 10) + 1, + "b": (np.arange(SIZE) % 7) + 1, + "c": (np.arange(SIZE) % 5) + 1, +}) + +for _ in range(WARMUP): + df.pow(2) + df.mod(3) + df.floordiv(2) + +start = time.perf_counter() +for _ in range(ITERATIONS): + df.pow(2) + df.mod(3) + df.floordiv(2) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "dataframe_pow_mod", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_dataframe_shift_diff.py b/benchmarks/pandas/bench_dataframe_shift_diff.py new file mode 100644 index 00000000..d85600bd --- /dev/null +++ b/benchmarks/pandas/bench_dataframe_shift_diff.py @@ -0,0 +1,36 @@ +""" +Benchmark: DataFrame.shift / DataFrame.diff — shift and diff on a 50k-row DataFrame. 
+Outputs JSON: {"function": "dataframe_shift_diff", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import numpy as np +import pandas as pd + +SIZE = 50_000 +WARMUP = 5 +ITERATIONS = 50 + +rng = np.random.default_rng(42) +df = pd.DataFrame({ + "a": np.arange(SIZE) * 1.5, + "b": np.sin(np.arange(SIZE) * 0.01) * 100, + "c": np.arange(SIZE) % 200, +}) + +for _ in range(WARMUP): + df.shift(1) + df.diff(1) + +start = time.perf_counter() +for _ in range(ITERATIONS): + df.shift(1) + df.diff(1) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "dataframe_shift_diff", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_numeric_ops_math.py b/benchmarks/pandas/bench_numeric_ops_math.py new file mode 100644 index 00000000..6c5945cf --- /dev/null +++ b/benchmarks/pandas/bench_numeric_ops_math.py @@ -0,0 +1,38 @@ +""" +Benchmark: np.floor / np.ceil / np.trunc / np.sqrt / np.log — math operations on 100k Series. 
+Outputs JSON: {"function": "numeric_ops_math", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import numpy as np +import pandas as pd + +SIZE = 100_000 +WARMUP = 5 +ITERATIONS = 50 + +data = (np.arange(SIZE) + 1) * 0.1 +s = pd.Series(data) + +for _ in range(WARMUP): + np.floor(s) + np.ceil(s) + np.trunc(s) + np.sqrt(s) + np.log(s) + +start = time.perf_counter() +for _ in range(ITERATIONS): + np.floor(s) + np.ceil(s) + np.trunc(s) + np.sqrt(s) + np.log(s) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "numeric_ops_math", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_reindex.py b/benchmarks/pandas/bench_reindex.py new file mode 100644 index 00000000..c6b6dffd --- /dev/null +++ b/benchmarks/pandas/bench_reindex.py @@ -0,0 +1,36 @@ +""" +Benchmark: Series.reindex / DataFrame.reindex — realign to a new index. +Outputs JSON: {"function": "reindex", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import numpy as np +import pandas as pd + +SIZE = 10_000 +WARMUP = 5 +ITERATIONS = 50 + +orig_labels = np.arange(SIZE) * 2 # 0, 2, 4, ..., 2*(SIZE-1) +data = np.arange(SIZE) * 1.5 +s = pd.Series(data, index=orig_labels) +new_index = np.arange(SIZE + 1000) # 0..SIZE+999 + +df = pd.DataFrame({"a": data, "b": data * 2}, index=orig_labels) + +for _ in range(WARMUP): + s.reindex(new_index) + df.reindex(new_index) + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.reindex(new_index) + df.reindex(new_index) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "reindex", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_series_add_sub_mul_div.py b/benchmarks/pandas/bench_series_add_sub_mul_div.py new file mode 100644 index 00000000..5c36b968 --- /dev/null +++ 
b/benchmarks/pandas/bench_series_add_sub_mul_div.py @@ -0,0 +1,36 @@ +""" +Benchmark: Series.add / sub / mul / div — standalone arithmetic on 100k-element Series. +Outputs JSON: {"function": "series_add_sub_mul_div", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import numpy as np +import pandas as pd + +SIZE = 100_000 +WARMUP = 5 +ITERATIONS = 50 + +a = pd.Series(np.arange(SIZE) * 1.5) +b = pd.Series((np.arange(SIZE) % 1000) + 1) + +for _ in range(WARMUP): + a.add(b) + a.sub(b) + a.mul(2) + a.div(b) + +start = time.perf_counter() +for _ in range(ITERATIONS): + a.add(b) + a.sub(b) + a.mul(2) + a.div(b) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "series_add_sub_mul_div", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/tsb/bench_clip_series_bounds.ts b/benchmarks/tsb/bench_clip_series_bounds.ts new file mode 100644 index 00000000..dc2c10ac --- /dev/null +++ b/benchmarks/tsb/bench_clip_series_bounds.ts @@ -0,0 +1,47 @@ +/** + * Benchmark: clipSeriesWithBounds / clipDataFrameWithBounds — clip with lower/upper bounds. 
+ * Outputs JSON: {"function": "clip_series_bounds", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series, DataFrame, clipSeriesWithBounds, clipDataFrameWithBounds } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const s = new Series({ data: Array.from({ length: SIZE }, (_, i) => i - SIZE / 2) }); +const lower = new Series({ data: Array.from({ length: SIZE }, () => -10000) }); +const upper = new Series({ data: Array.from({ length: SIZE }, () => 10000) }); + +const df = new DataFrame({ + a: Array.from({ length: SIZE }, (_, i) => i - SIZE / 2), + b: Array.from({ length: SIZE }, (_, i) => Math.sin(i * 0.01) * 100), +}); +const dfLower = new DataFrame({ + a: Array.from({ length: SIZE }, () => -10000), + b: Array.from({ length: SIZE }, () => -50), +}); +const dfUpper = new DataFrame({ + a: Array.from({ length: SIZE }, () => 10000), + b: Array.from({ length: SIZE }, () => 50), +}); + +for (let i = 0; i < WARMUP; i++) { + clipSeriesWithBounds(s, lower, upper); + clipDataFrameWithBounds(df, dfLower, dfUpper); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + clipSeriesWithBounds(s, lower, upper); + clipDataFrameWithBounds(df, dfLower, dfUpper); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "clip_series_bounds", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_dataframe_add_sub_mul_div.ts b/benchmarks/tsb/bench_dataframe_add_sub_mul_div.ts new file mode 100644 index 00000000..8bf39535 --- /dev/null +++ b/benchmarks/tsb/bench_dataframe_add_sub_mul_div.ts @@ -0,0 +1,46 @@ +/** + * Benchmark: dataFrameAdd / dataFrameSub / dataFrameMul / dataFrameDiv — standalone DataFrame arithmetic. 
+ * Outputs JSON: {"function": "dataframe_add_sub_mul_div", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { + DataFrame, + dataFrameAdd, + dataFrameSub, + dataFrameMul, + dataFrameDiv, +} from "../../src/index.ts"; + +const SIZE = 50_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const df = new DataFrame({ + a: Array.from({ length: SIZE }, (_, i) => i * 1.5), + b: Array.from({ length: SIZE }, (_, i) => i * 2.0), + c: Array.from({ length: SIZE }, (_, i) => (i % 100) + 1), +}); + +for (let i = 0; i < WARMUP; i++) { + dataFrameAdd(df, 10); + dataFrameSub(df, 5); + dataFrameMul(df, 2); + dataFrameDiv(df, 3); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + dataFrameAdd(df, 10); + dataFrameSub(df, 5); + dataFrameMul(df, 2); + dataFrameDiv(df, 3); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "dataframe_add_sub_mul_div", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_dataframe_compare.ts b/benchmarks/tsb/bench_dataframe_compare.ts new file mode 100644 index 00000000..8b5de0ee --- /dev/null +++ b/benchmarks/tsb/bench_dataframe_compare.ts @@ -0,0 +1,40 @@ +/** + * Benchmark: dataFrameEq / dataFrameNe / dataFrameLt / dataFrameGt — element-wise compare. 
+ * Outputs JSON: {"function": "dataframe_compare", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { DataFrame, dataFrameEq, dataFrameNe, dataFrameLt, dataFrameGt } from "../../src/index.ts"; + +const SIZE = 50_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const df = new DataFrame({ + a: Array.from({ length: SIZE }, (_, i) => i), + b: Array.from({ length: SIZE }, (_, i) => i * 2), + c: Array.from({ length: SIZE }, (_, i) => i % 100), +}); + +for (let i = 0; i < WARMUP; i++) { + dataFrameEq(df, 50); + dataFrameNe(df, 50); + dataFrameLt(df, 50); + dataFrameGt(df, 50); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + dataFrameEq(df, 50); + dataFrameNe(df, 50); + dataFrameLt(df, 50); + dataFrameGt(df, 50); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "dataframe_compare", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_dataframe_pow_mod.ts b/benchmarks/tsb/bench_dataframe_pow_mod.ts new file mode 100644 index 00000000..7e882ae8 --- /dev/null +++ b/benchmarks/tsb/bench_dataframe_pow_mod.ts @@ -0,0 +1,38 @@ +/** + * Benchmark: dataFramePow / dataFrameMod / dataFrameFloorDiv — power, modulo, floor division on DataFrame. 
+ * Outputs JSON: {"function": "dataframe_pow_mod", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { DataFrame, dataFramePow, dataFrameMod, dataFrameFloorDiv } from "../../src/index.ts"; + +const SIZE = 50_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const df = new DataFrame({ + a: Array.from({ length: SIZE }, (_, i) => (i % 10) + 1), + b: Array.from({ length: SIZE }, (_, i) => (i % 7) + 1), + c: Array.from({ length: SIZE }, (_, i) => (i % 5) + 1), +}); + +for (let i = 0; i < WARMUP; i++) { + dataFramePow(df, 2); + dataFrameMod(df, 3); + dataFrameFloorDiv(df, 2); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + dataFramePow(df, 2); + dataFrameMod(df, 3); + dataFrameFloorDiv(df, 2); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "dataframe_pow_mod", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_dataframe_shift_diff.ts b/benchmarks/tsb/bench_dataframe_shift_diff.ts new file mode 100644 index 00000000..59d59e32 --- /dev/null +++ b/benchmarks/tsb/bench_dataframe_shift_diff.ts @@ -0,0 +1,36 @@ +/** + * Benchmark: dataFrameShift / dataFrameDiff — shift and diff on a 50k-row DataFrame. 
+ * Outputs JSON: {"function": "dataframe_shift_diff", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { DataFrame, dataFrameShift, dataFrameDiff } from "../../src/index.ts"; + +const SIZE = 50_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const df = new DataFrame({ + a: Array.from({ length: SIZE }, (_, i) => i * 1.5), + b: Array.from({ length: SIZE }, (_, i) => Math.sin(i * 0.01) * 100), + c: Array.from({ length: SIZE }, (_, i) => i % 200), +}); + +for (let i = 0; i < WARMUP; i++) { + dataFrameShift(df, 1); + dataFrameDiff(df, 1); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + dataFrameShift(df, 1); + dataFrameDiff(df, 1); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "dataframe_shift_diff", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_numeric_ops_math.ts b/benchmarks/tsb/bench_numeric_ops_math.ts new file mode 100644 index 00000000..bccded35 --- /dev/null +++ b/benchmarks/tsb/bench_numeric_ops_math.ts @@ -0,0 +1,46 @@ +/** + * Benchmark: seriesFloor / seriesCeil / seriesTrunc / seriesSqrt / seriesLog — math operations. 
+ * Outputs JSON: {"function": "numeric_ops_math", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { + Series, + seriesFloor, + seriesCeil, + seriesTrunc, + seriesSqrt, + seriesLog, +} from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 50; + +// Positive values for sqrt/log +const s = new Series({ data: Array.from({ length: SIZE }, (_, i) => (i + 1) * 0.1) }); + +for (let i = 0; i < WARMUP; i++) { + seriesFloor(s); + seriesCeil(s); + seriesTrunc(s); + seriesSqrt(s); + seriesLog(s); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + seriesFloor(s); + seriesCeil(s); + seriesTrunc(s); + seriesSqrt(s); + seriesLog(s); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "numeric_ops_math", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_reindex.ts b/benchmarks/tsb/bench_reindex.ts new file mode 100644 index 00000000..3853af2e --- /dev/null +++ b/benchmarks/tsb/bench_reindex.ts @@ -0,0 +1,46 @@ +/** + * Benchmark: reindexSeries / reindexDataFrame — realign to a new index. 
+ * Outputs JSON: {"function": "reindex", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series, DataFrame, Index, reindexSeries, reindexDataFrame } from "../../src/index.ts"; + +const SIZE = 10_000; +const WARMUP = 5; +const ITERATIONS = 50; + +// Original: even indices 0, 2, 4, ..., 2*(SIZE-1) +const origLabels = Array.from({ length: SIZE }, (_, i) => i * 2); +const data = Array.from({ length: SIZE }, (_, i) => i * 1.5); +const s = new Series({ data, index: new Index(origLabels) }); + +// New index: 0..SIZE+1000 (some match, some are new) +const newIndex = Array.from({ length: SIZE + 1000 }, (_, i) => i); + +const df = new DataFrame( + { + a: data, + b: data.map((v) => v * 2), + }, + new Index(origLabels), +); + +for (let i = 0; i < WARMUP; i++) { + reindexSeries(s, newIndex); + reindexDataFrame(df, { index: newIndex }); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + reindexSeries(s, newIndex); + reindexDataFrame(df, { index: newIndex }); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "reindex", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_series_add_sub_mul_div.ts b/benchmarks/tsb/bench_series_add_sub_mul_div.ts new file mode 100644 index 00000000..891b471f --- /dev/null +++ b/benchmarks/tsb/bench_series_add_sub_mul_div.ts @@ -0,0 +1,37 @@ +/** + * Benchmark: seriesAdd / seriesSub / seriesMul / seriesDiv — standalone arithmetic functions. 
+ * Outputs JSON: {"function": "series_add_sub_mul_div", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series, seriesAdd, seriesSub, seriesMul, seriesDiv } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const a = new Series({ data: Array.from({ length: SIZE }, (_, i) => i * 1.5) }); +const b = new Series({ data: Array.from({ length: SIZE }, (_, i) => (i % 1000) + 1) }); + +for (let i = 0; i < WARMUP; i++) { + seriesAdd(a, b); + seriesSub(a, b); + seriesMul(a, 2); + seriesDiv(a, b); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + seriesAdd(a, b); + seriesSub(a, b); + seriesMul(a, 2); + seriesDiv(a, b); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "series_add_sub_mul_div", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); From 9191c697b7a702ca171deb2e2030286b024114a0 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 16 Apr 2026 22:29:53 +0000 Subject: [PATCH 02/19] Iteration 136: Add 8 benchmark pairs (396 total, +8 vs 388) - bench_series_any_all: anySeries / allSeries boolean reductions - bench_dataframe_any_all: anyDataFrame / allDataFrame boolean reductions - bench_dataframe_nunique: nuniqueDataFrame per-column unique counts - bench_series_crosstab: seriesCrosstab two-series cross-tabulation - bench_bdate_range: bdate_range business-day DatetimeIndex generation - bench_series_radd_rsub: seriesRadd / seriesRsub / seriesRmul / seriesRdiv reverse arithmetic - bench_dataframe_radd_rsub: dataFrameRadd / dataFrameRsub / dataFrameRmul / dataFrameRdiv reverse arithmetic - bench_series_exp_log: seriesExp / seriesLog2 / seriesLog10 / seriesSign extended math Run: https://github.com/githubnext/tsessebe/actions/runs/24536797293 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- 
benchmarks/pandas/bench_bdate_range.py | 25 ++++++++++ benchmarks/pandas/bench_dataframe_any_all.py | 35 ++++++++++++++ benchmarks/pandas/bench_dataframe_nunique.py | 33 +++++++++++++ .../pandas/bench_dataframe_radd_rsub.py | 38 +++++++++++++++ benchmarks/pandas/bench_series_any_all.py | 31 ++++++++++++ benchmarks/pandas/bench_series_crosstab.py | 33 +++++++++++++ benchmarks/pandas/bench_series_exp_log.py | 35 ++++++++++++++ benchmarks/pandas/bench_series_radd_rsub.py | 35 ++++++++++++++ benchmarks/tsb/bench_bdate_range.ts | 27 +++++++++++ benchmarks/tsb/bench_dataframe_any_all.ts | 38 +++++++++++++++ benchmarks/tsb/bench_dataframe_nunique.ts | 36 ++++++++++++++ benchmarks/tsb/bench_dataframe_radd_rsub.ts | 48 +++++++++++++++++++ benchmarks/tsb/bench_series_any_all.ts | 32 +++++++++++++ benchmarks/tsb/bench_series_crosstab.ts | 40 ++++++++++++++++ benchmarks/tsb/bench_series_exp_log.ts | 37 ++++++++++++++ benchmarks/tsb/bench_series_radd_rsub.ts | 36 ++++++++++++++ 16 files changed, 559 insertions(+) create mode 100644 benchmarks/pandas/bench_bdate_range.py create mode 100644 benchmarks/pandas/bench_dataframe_any_all.py create mode 100644 benchmarks/pandas/bench_dataframe_nunique.py create mode 100644 benchmarks/pandas/bench_dataframe_radd_rsub.py create mode 100644 benchmarks/pandas/bench_series_any_all.py create mode 100644 benchmarks/pandas/bench_series_crosstab.py create mode 100644 benchmarks/pandas/bench_series_exp_log.py create mode 100644 benchmarks/pandas/bench_series_radd_rsub.py create mode 100644 benchmarks/tsb/bench_bdate_range.ts create mode 100644 benchmarks/tsb/bench_dataframe_any_all.ts create mode 100644 benchmarks/tsb/bench_dataframe_nunique.ts create mode 100644 benchmarks/tsb/bench_dataframe_radd_rsub.ts create mode 100644 benchmarks/tsb/bench_series_any_all.ts create mode 100644 benchmarks/tsb/bench_series_crosstab.ts create mode 100644 benchmarks/tsb/bench_series_exp_log.ts create mode 100644 benchmarks/tsb/bench_series_radd_rsub.ts diff 
--git a/benchmarks/pandas/bench_bdate_range.py b/benchmarks/pandas/bench_bdate_range.py new file mode 100644 index 00000000..33b3cb9b --- /dev/null +++ b/benchmarks/pandas/bench_bdate_range.py @@ -0,0 +1,25 @@ +""" +Benchmark: pd.bdate_range — generate business-day DatetimeIndex with 1000 periods. +Outputs JSON: {"function": "bdate_range", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import pandas as pd + +WARMUP = 5 +ITERATIONS = 100 + +for _ in range(WARMUP): + pd.bdate_range(start="2020-01-01", periods=1000) + +start = time.perf_counter() +for _ in range(ITERATIONS): + pd.bdate_range(start="2020-01-01", periods=1000) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "bdate_range", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_dataframe_any_all.py b/benchmarks/pandas/bench_dataframe_any_all.py new file mode 100644 index 00000000..6458d290 --- /dev/null +++ b/benchmarks/pandas/bench_dataframe_any_all.py @@ -0,0 +1,35 @@ +""" +Benchmark: DataFrame.any() / all() — boolean reductions on 100k-row DataFrame. 
+Outputs JSON: {"function": "dataframe_any_all", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import numpy as np +import pandas as pd + +SIZE = 100_000 +WARMUP = 5 +ITERATIONS = 50 + +df = pd.DataFrame({ + "a": np.arange(SIZE) % 2 == 0, + "b": np.arange(SIZE) % 3 != 0, + "c": np.arange(SIZE) > 0, +}) + +for _ in range(WARMUP): + df.any() + df.all() + +start = time.perf_counter() +for _ in range(ITERATIONS): + df.any() + df.all() +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "dataframe_any_all", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_dataframe_nunique.py b/benchmarks/pandas/bench_dataframe_nunique.py new file mode 100644 index 00000000..9babcfae --- /dev/null +++ b/benchmarks/pandas/bench_dataframe_nunique.py @@ -0,0 +1,33 @@ +""" +Benchmark: DataFrame.nunique() — count unique values per column on 100k-row DataFrame. +Outputs JSON: {"function": "dataframe_nunique", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import numpy as np +import pandas as pd + +SIZE = 100_000 +WARMUP = 5 +ITERATIONS = 20 + +df = pd.DataFrame({ + "cat": np.arange(SIZE) % 100, + "val": np.arange(SIZE) % 500, + "grp": np.arange(SIZE) % 10, +}) + +for _ in range(WARMUP): + df.nunique() + +start = time.perf_counter() +for _ in range(ITERATIONS): + df.nunique() +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "dataframe_nunique", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_dataframe_radd_rsub.py b/benchmarks/pandas/bench_dataframe_radd_rsub.py new file mode 100644 index 00000000..25b5ea1d --- /dev/null +++ b/benchmarks/pandas/bench_dataframe_radd_rsub.py @@ -0,0 +1,38 @@ +""" +Benchmark: DataFrame.radd / rsub / rmul / rdiv — reverse arithmetic on 100k-row DataFrame. 
+Outputs JSON: {"function": "dataframe_radd_rsub", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import numpy as np +import pandas as pd + +SIZE = 100_000 +WARMUP = 5 +ITERATIONS = 50 + +df = pd.DataFrame({ + "x": (np.arange(SIZE) % 1000 + 1).astype(float), + "y": (np.arange(SIZE) % 500 + 0.5), +}) + +for _ in range(WARMUP): + df.radd(100) + df.rsub(100) + df.rmul(2) + df.rdiv(1000) + +start = time.perf_counter() +for _ in range(ITERATIONS): + df.radd(100) + df.rsub(100) + df.rmul(2) + df.rdiv(1000) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "dataframe_radd_rsub", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_series_any_all.py b/benchmarks/pandas/bench_series_any_all.py new file mode 100644 index 00000000..83993b80 --- /dev/null +++ b/benchmarks/pandas/bench_series_any_all.py @@ -0,0 +1,31 @@ +""" +Benchmark: Series.any() / all() — boolean reductions on 100k-element Series. +Outputs JSON: {"function": "series_any_all", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import numpy as np +import pandas as pd + +SIZE = 100_000 +WARMUP = 5 +ITERATIONS = 50 + +s = pd.Series(np.arange(SIZE) % 2 == 0) + +for _ in range(WARMUP): + s.any() + s.all() + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.any() + s.all() +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "series_any_all", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_series_crosstab.py b/benchmarks/pandas/bench_series_crosstab.py new file mode 100644 index 00000000..ee79aa99 --- /dev/null +++ b/benchmarks/pandas/bench_series_crosstab.py @@ -0,0 +1,33 @@ +""" +Benchmark: pd.crosstab — cross-tabulation of two categorical Series. 
+Outputs JSON: {"function": "series_crosstab", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import numpy as np +import pandas as pd + +SIZE = 10_000 +WARMUP = 5 +ITERATIONS = 20 + +categories_a = ["apple", "banana", "cherry", "date", "elderberry"] +categories_b = ["north", "south", "east", "west"] + +a = pd.Series([categories_a[i % len(categories_a)] for i in range(SIZE)], name="product") +b = pd.Series([categories_b[i % len(categories_b)] for i in range(SIZE)], name="region") + +for _ in range(WARMUP): + pd.crosstab(a, b) + +start = time.perf_counter() +for _ in range(ITERATIONS): + pd.crosstab(a, b) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "series_crosstab", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_series_exp_log.py b/benchmarks/pandas/bench_series_exp_log.py new file mode 100644 index 00000000..e3288f0a --- /dev/null +++ b/benchmarks/pandas/bench_series_exp_log.py @@ -0,0 +1,35 @@ +""" +Benchmark: Series.map(np.exp) / log2 / log10 / sign — extended math on 100k-element Series. 
+Outputs JSON: {"function": "series_exp_log", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import numpy as np +import pandas as pd + +SIZE = 100_000 +WARMUP = 5 +ITERATIONS = 50 + +s = pd.Series((np.arange(SIZE) % 1000 + 1).astype(float)) + +for _ in range(WARMUP): + np.exp(s) + np.log2(s) + np.log10(s) + np.sign(s) + +start = time.perf_counter() +for _ in range(ITERATIONS): + np.exp(s) + np.log2(s) + np.log10(s) + np.sign(s) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "series_exp_log", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_series_radd_rsub.py b/benchmarks/pandas/bench_series_radd_rsub.py new file mode 100644 index 00000000..437ef684 --- /dev/null +++ b/benchmarks/pandas/bench_series_radd_rsub.py @@ -0,0 +1,35 @@ +""" +Benchmark: Series.radd / rsub / rmul / rdiv — reverse arithmetic on 100k-element Series. +Outputs JSON: {"function": "series_radd_rsub", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import numpy as np +import pandas as pd + +SIZE = 100_000 +WARMUP = 5 +ITERATIONS = 50 + +s = pd.Series((np.arange(SIZE) % 1000) + 1, dtype=float) + +for _ in range(WARMUP): + s.radd(100) + s.rsub(100) + s.rmul(2) + s.rdiv(1000) + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.radd(100) + s.rsub(100) + s.rmul(2) + s.rdiv(1000) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "series_radd_rsub", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/tsb/bench_bdate_range.ts b/benchmarks/tsb/bench_bdate_range.ts new file mode 100644 index 00000000..b4b2c18f --- /dev/null +++ b/benchmarks/tsb/bench_bdate_range.ts @@ -0,0 +1,27 @@ +/** + * Benchmark: bdate_range — generate business-day DatetimeIndex with 1000 periods. 
+ * Outputs JSON: {"function": "bdate_range", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { bdate_range } from "../../src/index.ts"; + +const WARMUP = 5; +const ITERATIONS = 100; + +for (let i = 0; i < WARMUP; i++) { + bdate_range({ start: "2020-01-01", periods: 1000 }); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + bdate_range({ start: "2020-01-01", periods: 1000 }); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "bdate_range", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_dataframe_any_all.ts b/benchmarks/tsb/bench_dataframe_any_all.ts new file mode 100644 index 00000000..c9935eb2 --- /dev/null +++ b/benchmarks/tsb/bench_dataframe_any_all.ts @@ -0,0 +1,38 @@ +/** + * Benchmark: anyDataFrame / allDataFrame — boolean reductions on 100k-row DataFrame. + * Outputs JSON: {"function": "dataframe_any_all", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { DataFrame, Series, anyDataFrame, allDataFrame } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const df = new DataFrame({ + columns: new Map([ + ["a", new Series({ data: Array.from({ length: SIZE }, (_, i) => i % 2 === 0) })], + ["b", new Series({ data: Array.from({ length: SIZE }, (_, i) => i % 3 !== 0) })], + ["c", new Series({ data: Array.from({ length: SIZE }, (_, i) => i > 0) })], + ]), +}); + +for (let i = 0; i < WARMUP; i++) { + anyDataFrame(df); + allDataFrame(df); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + anyDataFrame(df); + allDataFrame(df); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "dataframe_any_all", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_dataframe_nunique.ts 
b/benchmarks/tsb/bench_dataframe_nunique.ts new file mode 100644 index 00000000..454914dc --- /dev/null +++ b/benchmarks/tsb/bench_dataframe_nunique.ts @@ -0,0 +1,36 @@ +/** + * Benchmark: nuniqueDataFrame — count unique values per column on 100k-row DataFrame. + * Outputs JSON: {"function": "dataframe_nunique", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { DataFrame, Series, nuniqueDataFrame } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 20; + +const df = new DataFrame({ + columns: new Map([ + ["cat", new Series({ data: Array.from({ length: SIZE }, (_, i) => i % 100) })], + ["val", new Series({ data: Array.from({ length: SIZE }, (_, i) => i % 500) })], + ["grp", new Series({ data: Array.from({ length: SIZE }, (_, i) => i % 10) })], + ]), +}); + +for (let i = 0; i < WARMUP; i++) { + nuniqueDataFrame(df); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + nuniqueDataFrame(df); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "dataframe_nunique", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_dataframe_radd_rsub.ts b/benchmarks/tsb/bench_dataframe_radd_rsub.ts new file mode 100644 index 00000000..62ed3b7c --- /dev/null +++ b/benchmarks/tsb/bench_dataframe_radd_rsub.ts @@ -0,0 +1,48 @@ +/** + * Benchmark: dataFrameRadd / dataFrameRsub / dataFrameRmul / dataFrameRdiv — reverse arithmetic on 100k-row DataFrame. 
+ * Outputs JSON: {"function": "dataframe_radd_rsub", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { + DataFrame, + Series, + dataFrameRadd, + dataFrameRsub, + dataFrameRmul, + dataFrameRdiv, +} from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const df = new DataFrame({ + columns: new Map([ + ["x", new Series({ data: Array.from({ length: SIZE }, (_, i) => (i % 1000) + 1) })], + ["y", new Series({ data: Array.from({ length: SIZE }, (_, i) => (i % 500) + 0.5) })], + ]), +}); + +for (let i = 0; i < WARMUP; i++) { + dataFrameRadd(df, 100); + dataFrameRsub(df, 100); + dataFrameRmul(df, 2); + dataFrameRdiv(df, 1000); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + dataFrameRadd(df, 100); + dataFrameRsub(df, 100); + dataFrameRmul(df, 2); + dataFrameRdiv(df, 1000); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "dataframe_radd_rsub", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_series_any_all.ts b/benchmarks/tsb/bench_series_any_all.ts new file mode 100644 index 00000000..524d811b --- /dev/null +++ b/benchmarks/tsb/bench_series_any_all.ts @@ -0,0 +1,32 @@ +/** + * Benchmark: anySeries / allSeries — boolean reductions on 100k-element Series. 
+ * Outputs JSON: {"function": "series_any_all", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series, anySeries, allSeries } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const s = new Series({ data: Array.from({ length: SIZE }, (_, i) => i % 2 === 0) }); + +for (let i = 0; i < WARMUP; i++) { + anySeries(s); + allSeries(s); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + anySeries(s); + allSeries(s); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "series_any_all", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_series_crosstab.ts b/benchmarks/tsb/bench_series_crosstab.ts new file mode 100644 index 00000000..9441cde3 --- /dev/null +++ b/benchmarks/tsb/bench_series_crosstab.ts @@ -0,0 +1,40 @@ +/** + * Benchmark: seriesCrosstab — cross-tabulation of two categorical Series. 
+ * Outputs JSON: {"function": "series_crosstab", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series, seriesCrosstab } from "../../src/index.ts"; + +const SIZE = 10_000; +const WARMUP = 5; +const ITERATIONS = 20; + +const CATEGORIES_A = ["apple", "banana", "cherry", "date", "elderberry"]; +const CATEGORIES_B = ["north", "south", "east", "west"]; + +const a = new Series({ + data: Array.from({ length: SIZE }, (_, i) => CATEGORIES_A[i % CATEGORIES_A.length]), + name: "product", +}); +const b = new Series({ + data: Array.from({ length: SIZE }, (_, i) => CATEGORIES_B[i % CATEGORIES_B.length]), + name: "region", +}); + +for (let i = 0; i < WARMUP; i++) { + seriesCrosstab(a, b); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + seriesCrosstab(a, b); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "series_crosstab", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_series_exp_log.ts b/benchmarks/tsb/bench_series_exp_log.ts new file mode 100644 index 00000000..0b8d2c7e --- /dev/null +++ b/benchmarks/tsb/bench_series_exp_log.ts @@ -0,0 +1,37 @@ +/** + * Benchmark: seriesExp / seriesLog2 / seriesLog10 / seriesSign — extended math on 100k-element Series. 
+ * Outputs JSON: {"function": "series_exp_log", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series, seriesExp, seriesLog2, seriesLog10, seriesSign } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 50; + +// Positive values for log operations +const s = new Series({ data: Array.from({ length: SIZE }, (_, i) => (i % 1000) + 1) }); + +for (let i = 0; i < WARMUP; i++) { + seriesExp(s); + seriesLog2(s); + seriesLog10(s); + seriesSign(s); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + seriesExp(s); + seriesLog2(s); + seriesLog10(s); + seriesSign(s); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "series_exp_log", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_series_radd_rsub.ts b/benchmarks/tsb/bench_series_radd_rsub.ts new file mode 100644 index 00000000..d149deaa --- /dev/null +++ b/benchmarks/tsb/bench_series_radd_rsub.ts @@ -0,0 +1,36 @@ +/** + * Benchmark: seriesRadd / seriesRsub / seriesRmul / seriesRdiv — reverse arithmetic on 100k-element Series. 
+ * Outputs JSON: {"function": "series_radd_rsub", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series, seriesRadd, seriesRsub, seriesRmul, seriesRdiv } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const s = new Series({ data: Array.from({ length: SIZE }, (_, i) => (i % 1000) + 1) }); + +for (let i = 0; i < WARMUP; i++) { + seriesRadd(s, 100); + seriesRsub(s, 100); + seriesRmul(s, 2); + seriesRdiv(s, 1000); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + seriesRadd(s, 100); + seriesRsub(s, 100); + seriesRmul(s, 2); + seriesRdiv(s, 1000); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "series_radd_rsub", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); From 3165c28cc3eb423c1e42581221856cec88d8967e Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 16 Apr 2026 23:05:18 +0000 Subject: [PATCH 03/19] Iteration 137: Add 8 benchmark pairs (404 total, +8 vs 396) Added pairs: infer_dtype, value_counts_binned, categorical_index, tz_localize_convert, align_series, align_dataframe, memory_usage, named_agg. Covers dtype inference, binned value counts, CategoricalIndex ops, timezone operations, Series/DataFrame alignment, memory estimation, and named aggregation (lost in iter 133's missing branch). 
Run: https://github.com/githubnext/tsessebe/actions/runs/24537885791 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- benchmarks/pandas/bench_align_dataframe.py | 43 +++++++++++++++ benchmarks/pandas/bench_align_series.py | 36 +++++++++++++ benchmarks/pandas/bench_categorical_index.py | 41 ++++++++++++++ benchmarks/pandas/bench_infer_dtype.py | 39 ++++++++++++++ benchmarks/pandas/bench_memory_usage.py | 42 +++++++++++++++ benchmarks/pandas/bench_named_agg.py | 49 +++++++++++++++++ .../pandas/bench_tz_localize_convert.py | 32 +++++++++++ .../pandas/bench_value_counts_binned.py | 32 +++++++++++ benchmarks/tsb/bench_align_dataframe.ts | 53 +++++++++++++++++++ benchmarks/tsb/bench_align_series.ts | 40 ++++++++++++++ benchmarks/tsb/bench_categorical_index.ts | 45 ++++++++++++++++ benchmarks/tsb/bench_infer_dtype.ts | 39 ++++++++++++++ benchmarks/tsb/bench_memory_usage.ts | 44 +++++++++++++++ benchmarks/tsb/bench_named_agg.ts | 50 +++++++++++++++++ benchmarks/tsb/bench_tz_localize_convert.ts | 34 ++++++++++++ benchmarks/tsb/bench_value_counts_binned.ts | 33 ++++++++++++ 16 files changed, 652 insertions(+) create mode 100644 benchmarks/pandas/bench_align_dataframe.py create mode 100644 benchmarks/pandas/bench_align_series.py create mode 100644 benchmarks/pandas/bench_categorical_index.py create mode 100644 benchmarks/pandas/bench_infer_dtype.py create mode 100644 benchmarks/pandas/bench_memory_usage.py create mode 100644 benchmarks/pandas/bench_named_agg.py create mode 100644 benchmarks/pandas/bench_tz_localize_convert.py create mode 100644 benchmarks/pandas/bench_value_counts_binned.py create mode 100644 benchmarks/tsb/bench_align_dataframe.ts create mode 100644 benchmarks/tsb/bench_align_series.ts create mode 100644 benchmarks/tsb/bench_categorical_index.ts create mode 100644 benchmarks/tsb/bench_infer_dtype.ts create mode 100644 benchmarks/tsb/bench_memory_usage.ts create mode 100644 benchmarks/tsb/bench_named_agg.ts create mode 100644 
benchmarks/tsb/bench_tz_localize_convert.ts create mode 100644 benchmarks/tsb/bench_value_counts_binned.ts diff --git a/benchmarks/pandas/bench_align_dataframe.py b/benchmarks/pandas/bench_align_dataframe.py new file mode 100644 index 00000000..b0f13984 --- /dev/null +++ b/benchmarks/pandas/bench_align_dataframe.py @@ -0,0 +1,43 @@ +""" +Benchmark: DataFrame.align — align two 10k-row DataFrames on inner/outer/left join. +Outputs JSON: {"function": "align_dataframe", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import numpy as np +import pandas as pd + +SIZE = 10_000 +WARMUP = 5 +ITERATIONS = 30 + +idx_a = [i * 2 for i in range(SIZE)] +idx_b = [i * 3 for i in range(SIZE)] + +df_a = pd.DataFrame( + {"x": [i * 1.0 for i in range(SIZE)], "y": [i * 2.0 for i in range(SIZE)], "z": [i * 3.0 for i in range(SIZE)]}, + index=idx_a, +) +df_b = pd.DataFrame( + {"y": [i * 10.0 for i in range(SIZE)], "z": [i * 20.0 for i in range(SIZE)], "w": [i * 30.0 for i in range(SIZE)]}, + index=idx_b, +) + +for _ in range(WARMUP): + df_a.align(df_b, join="inner") + df_a.align(df_b, join="outer") + df_a.align(df_b, join="left") + +start = time.perf_counter() +for _ in range(ITERATIONS): + df_a.align(df_b, join="inner") + df_a.align(df_b, join="outer") + df_a.align(df_b, join="left") +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "align_dataframe", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_align_series.py b/benchmarks/pandas/bench_align_series.py new file mode 100644 index 00000000..b5e5eda7 --- /dev/null +++ b/benchmarks/pandas/bench_align_series.py @@ -0,0 +1,36 @@ +""" +Benchmark: Series.align — align two 50k-element Series on inner/outer/left join. 
+Outputs JSON: {"function": "align_series", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import numpy as np +import pandas as pd + +SIZE = 50_000 +WARMUP = 5 +ITERATIONS = 30 + +idx_a = [i * 2 for i in range(SIZE)] +idx_b = [i * 3 for i in range(SIZE)] +s_a = pd.Series([i * 1.0 for i in range(SIZE)], index=idx_a) +s_b = pd.Series([i * 2.0 for i in range(SIZE)], index=idx_b) + +for _ in range(WARMUP): + s_a.align(s_b, join="inner") + s_a.align(s_b, join="outer") + s_a.align(s_b, join="left") + +start = time.perf_counter() +for _ in range(ITERATIONS): + s_a.align(s_b, join="inner") + s_a.align(s_b, join="outer") + s_a.align(s_b, join="left") +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "align_series", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_categorical_index.py b/benchmarks/pandas/bench_categorical_index.py new file mode 100644 index 00000000..dd504c72 --- /dev/null +++ b/benchmarks/pandas/bench_categorical_index.py @@ -0,0 +1,41 @@ +""" +Benchmark: pandas.CategoricalIndex — creation, get_loc, add_categories, set operations on 100k elements. 
+Outputs JSON: {"function": "categorical_index", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import numpy as np +import pandas as pd + +SIZE = 100_000 +WARMUP = 5 +ITERATIONS = 30 + +CATS = ["alpha", "beta", "gamma", "delta", "epsilon"] +labels = [CATS[i % len(CATS)] for i in range(SIZE)] +ci = pd.CategoricalIndex(labels) +labels2 = [CATS[(i + 2) % len(CATS)] for i in range(SIZE // 2)] +ci2 = pd.CategoricalIndex(labels2) + +for _ in range(WARMUP): + pd.CategoricalIndex(labels) + ci.get_loc("beta") + ci.add_categories(["zeta"]) + ci.union(ci2) + ci.intersection(ci2) + +start = time.perf_counter() +for _ in range(ITERATIONS): + pd.CategoricalIndex(labels) + ci.get_loc("beta") + ci.add_categories(["zeta"]) + ci.union(ci2) + ci.intersection(ci2) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "categorical_index", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_infer_dtype.py b/benchmarks/pandas/bench_infer_dtype.py new file mode 100644 index 00000000..9f95bff7 --- /dev/null +++ b/benchmarks/pandas/bench_infer_dtype.py @@ -0,0 +1,39 @@ +""" +Benchmark: infer_dtype — pandas.api.types.infer_dtype on 100k-element arrays. 
+Outputs JSON: {"function": "infer_dtype", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import numpy as np +import pandas as pd +from pandas.api.types import infer_dtype + +SIZE = 100_000 +WARMUP = 5 +ITERATIONS = 50 + +int_arr = list(range(SIZE)) +float_arr = [i * 0.5 for i in range(SIZE)] +str_arr = [f"val_{i}" for i in range(SIZE)] +mixed_arr = [f"s{i}" if i % 3 == 0 else i for i in range(SIZE)] + +for _ in range(WARMUP): + infer_dtype(int_arr, skipna=True) + infer_dtype(float_arr, skipna=True) + infer_dtype(str_arr, skipna=True) + infer_dtype(mixed_arr, skipna=True) + +start = time.perf_counter() +for _ in range(ITERATIONS): + infer_dtype(int_arr, skipna=True) + infer_dtype(float_arr, skipna=True) + infer_dtype(str_arr, skipna=True) + infer_dtype(mixed_arr, skipna=True) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "infer_dtype", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_memory_usage.py b/benchmarks/pandas/bench_memory_usage.py new file mode 100644 index 00000000..6e46fe19 --- /dev/null +++ b/benchmarks/pandas/bench_memory_usage.py @@ -0,0 +1,42 @@ +""" +Benchmark: Series.memory_usage / DataFrame.memory_usage — memory estimation. 
+Outputs JSON: {"function": "memory_usage", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import numpy as np +import pandas as pd + +SIZE = 100_000 +WARMUP = 5 +ITERATIONS = 50 + +num_series = pd.Series([i * 1.0 for i in range(SIZE)]) +str_series = pd.Series([f"label_{i % 100}" for i in range(SIZE)]) +df = pd.DataFrame({ + "a": [i * 1.0 for i in range(SIZE)], + "b": [i * 2.0 for i in range(SIZE)], + "c": [f"cat_{i % 50}" for i in range(SIZE)], + "d": [i % 2 == 0 for i in range(SIZE)], +}) + +for _ in range(WARMUP): + num_series.memory_usage() + str_series.memory_usage(deep=True) + df.memory_usage() + df.memory_usage(deep=True) + +start = time.perf_counter() +for _ in range(ITERATIONS): + num_series.memory_usage() + str_series.memory_usage(deep=True) + df.memory_usage() + df.memory_usage(deep=True) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "memory_usage", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_named_agg.py b/benchmarks/pandas/bench_named_agg.py new file mode 100644 index 00000000..2b20ff7c --- /dev/null +++ b/benchmarks/pandas/bench_named_agg.py @@ -0,0 +1,49 @@ +""" +Benchmark: DataFrameGroupBy.agg with named aggregations (pandas.NamedAgg) on 100k rows. 
+Outputs JSON: {"function": "named_agg", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import numpy as np +import pandas as pd + +SIZE = 100_000 +WARMUP = 5 +ITERATIONS = 20 + +depts = ["eng", "hr", "sales", "finance", "ops"] +df = pd.DataFrame({ + "dept": [depts[i % len(depts)] for i in range(SIZE)], + "salary": [50_000 + (i % 100) * 1000 for i in range(SIZE)], + "headcount": [1 + (i % 5) for i in range(SIZE)], + "score": [(i % 100) * 0.1 for i in range(SIZE)], +}) + +gb = df.groupby("dept") + +for _ in range(WARMUP): + gb.agg( + total_salary=pd.NamedAgg(column="salary", aggfunc="sum"), + avg_salary=pd.NamedAgg(column="salary", aggfunc="mean"), + max_salary=pd.NamedAgg(column="salary", aggfunc="max"), + employees=pd.NamedAgg(column="headcount", aggfunc="count"), + avg_score=pd.NamedAgg(column="score", aggfunc="mean"), + ) + +start = time.perf_counter() +for _ in range(ITERATIONS): + gb.agg( + total_salary=pd.NamedAgg(column="salary", aggfunc="sum"), + avg_salary=pd.NamedAgg(column="salary", aggfunc="mean"), + max_salary=pd.NamedAgg(column="salary", aggfunc="max"), + employees=pd.NamedAgg(column="headcount", aggfunc="count"), + avg_score=pd.NamedAgg(column="score", aggfunc="mean"), + ) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "named_agg", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_tz_localize_convert.py b/benchmarks/pandas/bench_tz_localize_convert.py new file mode 100644 index 00000000..b2d35e9d --- /dev/null +++ b/benchmarks/pandas/bench_tz_localize_convert.py @@ -0,0 +1,32 @@ +""" +Benchmark: DatetimeIndex.tz_localize / tz_convert — timezone operations on 10k-element index. 
+Outputs JSON: {"function": "tz_localize_convert", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import pandas as pd + +SIZE = 10_000 +WARMUP = 5 +ITERATIONS = 50 + +naive = pd.date_range(start="2024-01-01", periods=SIZE, freq="h") + +for _ in range(WARMUP): + utc = naive.tz_localize("UTC") + utc.tz_convert("America/New_York") + naive.tz_localize("America/New_York", ambiguous="NaT", nonexistent="NaT") + +start = time.perf_counter() +for _ in range(ITERATIONS): + utc = naive.tz_localize("UTC") + utc.tz_convert("America/New_York") + naive.tz_localize("America/New_York", ambiguous="NaT", nonexistent="NaT") +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "tz_localize_convert", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_value_counts_binned.py b/benchmarks/pandas/bench_value_counts_binned.py new file mode 100644 index 00000000..b6e3cf24 --- /dev/null +++ b/benchmarks/pandas/bench_value_counts_binned.py @@ -0,0 +1,32 @@ +""" +Benchmark: Series.value_counts(bins=N) — bin 100k values and count occurrences. 
+Outputs JSON: {"function": "value_counts_binned", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import numpy as np +import pandas as pd + +SIZE = 100_000 +WARMUP = 5 +ITERATIONS = 50 + +data = [(i % 1000) * 0.1 for i in range(SIZE)] +s = pd.Series(data) + +for _ in range(WARMUP): + s.value_counts(bins=10) + s.value_counts(bins=50) + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.value_counts(bins=10) + s.value_counts(bins=50) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "value_counts_binned", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/tsb/bench_align_dataframe.ts b/benchmarks/tsb/bench_align_dataframe.ts new file mode 100644 index 00000000..13d07f86 --- /dev/null +++ b/benchmarks/tsb/bench_align_dataframe.ts @@ -0,0 +1,53 @@ +/** + * Benchmark: alignDataFrame — align two 10k-row DataFrames on inner/outer join. + * Outputs JSON: {"function": "align_dataframe", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { DataFrame, Index, alignDataFrame } from "../../src/index.ts"; + +const SIZE = 10_000; +const WARMUP = 5; +const ITERATIONS = 30; + +const idxA = Array.from({ length: SIZE }, (_, i) => i * 2); +const idxB = Array.from({ length: SIZE }, (_, i) => i * 3); + +const dfA = new DataFrame( + { + x: Array.from({ length: SIZE }, (_, i) => i * 1.0), + y: Array.from({ length: SIZE }, (_, i) => i * 2.0), + z: Array.from({ length: SIZE }, (_, i) => i * 3.0), + }, + { index: new Index(idxA) }, +); + +const dfB = new DataFrame( + { + y: Array.from({ length: SIZE }, (_, i) => i * 10.0), + z: Array.from({ length: SIZE }, (_, i) => i * 20.0), + w: Array.from({ length: SIZE }, (_, i) => i * 30.0), + }, + { index: new Index(idxB) }, +); + +for (let i = 0; i < WARMUP; i++) { + alignDataFrame(dfA, dfB, { join: "inner" }); + alignDataFrame(dfA, dfB, { join: "outer" }); + alignDataFrame(dfA, dfB, { join: "left" 
}); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + alignDataFrame(dfA, dfB, { join: "inner" }); + alignDataFrame(dfA, dfB, { join: "outer" }); + alignDataFrame(dfA, dfB, { join: "left" }); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "align_dataframe", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_align_series.ts b/benchmarks/tsb/bench_align_series.ts new file mode 100644 index 00000000..b8cfe17d --- /dev/null +++ b/benchmarks/tsb/bench_align_series.ts @@ -0,0 +1,40 @@ +/** + * Benchmark: alignSeries — align two 50k-element Series on inner/outer join. + * Outputs JSON: {"function": "align_series", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series, Index, alignSeries } from "../../src/index.ts"; + +const SIZE = 50_000; +const WARMUP = 5; +const ITERATIONS = 30; + +// Two overlapping indexes: evens vs multiples of 3 +const idxA = Array.from({ length: SIZE }, (_, i) => i * 2); +const idxB = Array.from({ length: SIZE }, (_, i) => i * 3); +const dataA = Array.from({ length: SIZE }, (_, i) => i * 1.0); +const dataB = Array.from({ length: SIZE }, (_, i) => i * 2.0); +const seriesA = new Series(dataA, { index: new Index(idxA) }); +const seriesB = new Series(dataB, { index: new Index(idxB) }); + +for (let i = 0; i < WARMUP; i++) { + alignSeries(seriesA, seriesB, { join: "inner" }); + alignSeries(seriesA, seriesB, { join: "outer" }); + alignSeries(seriesA, seriesB, { join: "left" }); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + alignSeries(seriesA, seriesB, { join: "inner" }); + alignSeries(seriesA, seriesB, { join: "outer" }); + alignSeries(seriesA, seriesB, { join: "left" }); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "align_series", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), 
+); diff --git a/benchmarks/tsb/bench_categorical_index.ts b/benchmarks/tsb/bench_categorical_index.ts new file mode 100644 index 00000000..1fd6b4d8 --- /dev/null +++ b/benchmarks/tsb/bench_categorical_index.ts @@ -0,0 +1,45 @@ +/** + * Benchmark: CategoricalIndex — creation, getLoc, addCategories, set operations on 100k elements. + * Outputs JSON: {"function": "categorical_index", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { CategoricalIndex } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 30; + +const CATS = ["alpha", "beta", "gamma", "delta", "epsilon"]; +const labels = Array.from({ length: SIZE }, (_, i) => CATS[i % CATS.length]); +const ci = CategoricalIndex.fromArray(labels); +const ci2 = CategoricalIndex.fromArray( + Array.from({ length: SIZE / 2 }, (_, i) => CATS[(i + 2) % CATS.length]), +); + +for (let i = 0; i < WARMUP; i++) { + CategoricalIndex.fromArray(labels); + ci.getLoc("beta"); + ci.getLocsAll("gamma"); + ci.addCategories(["zeta"]); + ci.unionCategories(ci2); + ci.intersectCategories(ci2); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + CategoricalIndex.fromArray(labels); + ci.getLoc("beta"); + ci.getLocsAll("gamma"); + ci.addCategories(["zeta"]); + ci.unionCategories(ci2); + ci.intersectCategories(ci2); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "categorical_index", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_infer_dtype.ts b/benchmarks/tsb/bench_infer_dtype.ts new file mode 100644 index 00000000..2ec2a68c --- /dev/null +++ b/benchmarks/tsb/bench_infer_dtype.ts @@ -0,0 +1,39 @@ +/** + * Benchmark: inferDtype — infer dominant type from 100k-element array. 
+ * Outputs JSON: {"function": "infer_dtype", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { inferDtype } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const intArr = Array.from({ length: SIZE }, (_, i) => i); +const floatArr = Array.from({ length: SIZE }, (_, i) => i * 0.5); +const strArr = Array.from({ length: SIZE }, (_, i) => `val_${i}`); +const mixedArr = Array.from({ length: SIZE }, (_, i) => (i % 3 === 0 ? `s${i}` : i)); + +for (let i = 0; i < WARMUP; i++) { + inferDtype(intArr); + inferDtype(floatArr); + inferDtype(strArr); + inferDtype(mixedArr); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + inferDtype(intArr); + inferDtype(floatArr); + inferDtype(strArr); + inferDtype(mixedArr); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "infer_dtype", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_memory_usage.ts b/benchmarks/tsb/bench_memory_usage.ts new file mode 100644 index 00000000..e716dcc5 --- /dev/null +++ b/benchmarks/tsb/bench_memory_usage.ts @@ -0,0 +1,44 @@ +/** + * Benchmark: seriesMemoryUsage / dataFrameMemoryUsage — memory estimation. 
+ * Outputs JSON: {"function": "memory_usage", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series, DataFrame, seriesMemoryUsage, dataFrameMemoryUsage } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const numSeries = new Series(Array.from({ length: SIZE }, (_, i) => i * 1.0)); +const strSeries = new Series(Array.from({ length: SIZE }, (_, i) => `label_${i % 100}`)); + +const df = new DataFrame({ + a: Array.from({ length: SIZE }, (_, i) => i * 1.0), + b: Array.from({ length: SIZE }, (_, i) => i * 2.0), + c: Array.from({ length: SIZE }, (_, i) => `cat_${i % 50}`), + d: Array.from({ length: SIZE }, (_, i) => i % 2 === 0), +}); + +for (let i = 0; i < WARMUP; i++) { + seriesMemoryUsage(numSeries); + seriesMemoryUsage(strSeries, { deep: true }); + dataFrameMemoryUsage(df); + dataFrameMemoryUsage(df, { deep: true }); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + seriesMemoryUsage(numSeries); + seriesMemoryUsage(strSeries, { deep: true }); + dataFrameMemoryUsage(df); + dataFrameMemoryUsage(df, { deep: true }); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "memory_usage", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_named_agg.ts b/benchmarks/tsb/bench_named_agg.ts new file mode 100644 index 00000000..37d55b6f --- /dev/null +++ b/benchmarks/tsb/bench_named_agg.ts @@ -0,0 +1,50 @@ +/** + * Benchmark: DataFrameGroupBy.aggNamed — named aggregation spec with 100k rows. 
+ * Outputs JSON: {"function": "named_agg", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { DataFrame, DataFrameGroupBy, namedAgg } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 20; + +const depts = ["eng", "hr", "sales", "finance", "ops"]; +const df = new DataFrame({ + dept: Array.from({ length: SIZE }, (_, i) => depts[i % depts.length]), + salary: Array.from({ length: SIZE }, (_, i) => 50_000 + (i % 100) * 1000), + headcount: Array.from({ length: SIZE }, (_, i) => 1 + (i % 5)), + score: Array.from({ length: SIZE }, (_, i) => (i % 100) * 0.1), +}); + +const gb = new DataFrameGroupBy(df, ["dept"]); + +for (let i = 0; i < WARMUP; i++) { + gb.aggNamed({ + total_salary: namedAgg("salary", "sum"), + avg_salary: namedAgg("salary", "mean"), + max_salary: namedAgg("salary", "max"), + employees: namedAgg("headcount", "count"), + avg_score: namedAgg("score", "mean"), + }); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + gb.aggNamed({ + total_salary: namedAgg("salary", "sum"), + avg_salary: namedAgg("salary", "mean"), + max_salary: namedAgg("salary", "max"), + employees: namedAgg("headcount", "count"), + avg_score: namedAgg("score", "mean"), + }); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "named_agg", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_tz_localize_convert.ts b/benchmarks/tsb/bench_tz_localize_convert.ts new file mode 100644 index 00000000..05a0a5fc --- /dev/null +++ b/benchmarks/tsb/bench_tz_localize_convert.ts @@ -0,0 +1,34 @@ +/** + * Benchmark: tz_localize / tz_convert — timezone operations on 10k-element DatetimeIndex. 
+ * Outputs JSON: {"function": "tz_localize_convert", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { date_range, tz_localize, tz_convert } from "../../src/index.ts"; + +const SIZE = 10_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const naive = date_range({ start: "2024-01-01", periods: SIZE, freq: "h" }); + +for (let i = 0; i < WARMUP; i++) { + const utc = tz_localize(naive, "UTC"); + tz_convert(utc, "America/New_York"); + tz_localize(naive, "America/New_York"); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + const utc = tz_localize(naive, "UTC"); + tz_convert(utc, "America/New_York"); + tz_localize(naive, "America/New_York"); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "tz_localize_convert", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_value_counts_binned.ts b/benchmarks/tsb/bench_value_counts_binned.ts new file mode 100644 index 00000000..006422cf --- /dev/null +++ b/benchmarks/tsb/bench_value_counts_binned.ts @@ -0,0 +1,33 @@ +/** + * Benchmark: valueCountsBinned — bin 100k values into intervals and count. 
+ * Outputs JSON: {"function": "value_counts_binned", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series, valueCountsBinned } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const data = Array.from({ length: SIZE }, (_, i) => (i % 1000) * 0.1); +const s = new Series(data); + +for (let i = 0; i < WARMUP; i++) { + valueCountsBinned(s, 10); + valueCountsBinned(s, 50); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + valueCountsBinned(s, 10); + valueCountsBinned(s, 50); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "value_counts_binned", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); From 26437c0c12385a394ec02cdd97aebec88519c6e6 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 16 Apr 2026 23:31:47 +0000 Subject: [PATCH 04/19] Iteration 138: Add 8 benchmark pairs (412 total, +8 vs 404) Added benchmarks for: - series_ceil_floor_trunc_sqrt: seriesCeil/seriesFloor/seriesTrunc/seriesSqrt vs numpy - dataframe_ceil_floor_trunc: dataFrameCeil/Floor/Trunc/Sqrt vs numpy on DataFrame - dataframe_exp_log: dataFrameExp/Log/Log2/Log10 vs numpy on DataFrame - pivot_table_full: pivotTableFull (with margins) vs pd.pivot_table - read_excel: readExcel/xlsxSheetNames with 10k-row XLSX vs pd.read_excel/openpyxl - pipe_chain_ops: pipeChain/pipeTo/dataFramePipeChain/dataFramePipeTo vs .pipe() - nan_extended_agg: nancount/nanmedian/nanprod vs Series.count/median/prod - series_pipe_apply: pipeSeries/dataFramePipe vs Series.pipe/DataFrame.pipe Run: https://github.com/githubnext/tsessebe/actions/runs/24538933188 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../bench_dataframe_ceil_floor_trunc.py | 38 +++++ benchmarks/pandas/bench_dataframe_exp_log.py | 38 +++++ benchmarks/pandas/bench_nan_extended_agg.py | 39 
+++++ benchmarks/pandas/bench_pipe_chain_ops.py | 46 ++++++ benchmarks/pandas/bench_pivot_table_full.py | 36 +++++ benchmarks/pandas/bench_read_excel.py | 49 +++++++ .../bench_series_ceil_floor_trunc_sqrt.py | 35 +++++ benchmarks/pandas/bench_series_pipe_apply.py | 38 +++++ .../tsb/bench_dataframe_ceil_floor_trunc.ts | 38 +++++ benchmarks/tsb/bench_dataframe_exp_log.ts | 39 +++++ benchmarks/tsb/bench_nan_extended_agg.ts | 37 +++++ benchmarks/tsb/bench_pipe_chain_ops.ts | 62 ++++++++ benchmarks/tsb/bench_pivot_table_full.ts | 37 +++++ benchmarks/tsb/bench_read_excel.ts | 133 ++++++++++++++++++ .../tsb/bench_series_ceil_floor_trunc_sqrt.ts | 36 +++++ benchmarks/tsb/bench_series_pipe_apply.ts | 40 ++++++ 16 files changed, 741 insertions(+) create mode 100644 benchmarks/pandas/bench_dataframe_ceil_floor_trunc.py create mode 100644 benchmarks/pandas/bench_dataframe_exp_log.py create mode 100644 benchmarks/pandas/bench_nan_extended_agg.py create mode 100644 benchmarks/pandas/bench_pipe_chain_ops.py create mode 100644 benchmarks/pandas/bench_pivot_table_full.py create mode 100644 benchmarks/pandas/bench_read_excel.py create mode 100644 benchmarks/pandas/bench_series_ceil_floor_trunc_sqrt.py create mode 100644 benchmarks/pandas/bench_series_pipe_apply.py create mode 100644 benchmarks/tsb/bench_dataframe_ceil_floor_trunc.ts create mode 100644 benchmarks/tsb/bench_dataframe_exp_log.ts create mode 100644 benchmarks/tsb/bench_nan_extended_agg.ts create mode 100644 benchmarks/tsb/bench_pipe_chain_ops.ts create mode 100644 benchmarks/tsb/bench_pivot_table_full.ts create mode 100644 benchmarks/tsb/bench_read_excel.ts create mode 100644 benchmarks/tsb/bench_series_ceil_floor_trunc_sqrt.ts create mode 100644 benchmarks/tsb/bench_series_pipe_apply.ts diff --git a/benchmarks/pandas/bench_dataframe_ceil_floor_trunc.py b/benchmarks/pandas/bench_dataframe_ceil_floor_trunc.py new file mode 100644 index 00000000..174a9f24 --- /dev/null +++ 
b/benchmarks/pandas/bench_dataframe_ceil_floor_trunc.py @@ -0,0 +1,38 @@ +""" +Benchmark: DataFrame ceil / floor / trunc / sqrt — math rounding on 100k-row DataFrame. +Outputs JSON: {"function": "dataframe_ceil_floor_trunc", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import numpy as np +import pandas as pd + +ROWS = 100_000 +WARMUP = 5 +ITERATIONS = 30 + +df = pd.DataFrame({ + "a": (np.arange(ROWS) % 1000) * 0.7 + 0.3, + "b": (np.arange(ROWS) % 500) * 1.3 + 0.1, +}) + +for _ in range(WARMUP): + np.ceil(df) + np.floor(df) + np.trunc(df) + np.sqrt(df) + +start = time.perf_counter() +for _ in range(ITERATIONS): + np.ceil(df) + np.floor(df) + np.trunc(df) + np.sqrt(df) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "dataframe_ceil_floor_trunc", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_dataframe_exp_log.py b/benchmarks/pandas/bench_dataframe_exp_log.py new file mode 100644 index 00000000..a5206311 --- /dev/null +++ b/benchmarks/pandas/bench_dataframe_exp_log.py @@ -0,0 +1,38 @@ +""" +Benchmark: DataFrame exp / log / log2 / log10 — exponentiation/log on 100k-row DataFrame. 
+Outputs JSON: {"function": "dataframe_exp_log", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import numpy as np +import pandas as pd + +ROWS = 100_000 +WARMUP = 5 +ITERATIONS = 30 + +df = pd.DataFrame({ + "a": (np.arange(ROWS) % 1000) + 1, + "b": (np.arange(ROWS) % 500) + 1, +}) + +for _ in range(WARMUP): + np.exp(df) + np.log(df) + np.log2(df) + np.log10(df) + +start = time.perf_counter() +for _ in range(ITERATIONS): + np.exp(df) + np.log(df) + np.log2(df) + np.log10(df) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "dataframe_exp_log", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_nan_extended_agg.py b/benchmarks/pandas/bench_nan_extended_agg.py new file mode 100644 index 00000000..a2cf1a11 --- /dev/null +++ b/benchmarks/pandas/bench_nan_extended_agg.py @@ -0,0 +1,39 @@ +""" +Benchmark: count/median/prod nan-ignoring aggregates on 100k-element array. 
+Outputs JSON: {"function": "nan_extended_agg", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import numpy as np +import pandas as pd + +SIZE = 100_000 +WARMUP = 5 +ITERATIONS = 50 + +# Array with ~10% NaN values; small values to avoid prod overflow +data = np.where( + np.arange(SIZE) % 10 == 0, + np.nan, + (np.arange(SIZE) % 100) * 0.01 + 1, +) +s = pd.Series(data) + +for _ in range(WARMUP): + s.count() + s.median(skipna=True) + s.prod(skipna=True) + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.count() + s.median(skipna=True) + s.prod(skipna=True) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "nan_extended_agg", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_pipe_chain_ops.py b/benchmarks/pandas/bench_pipe_chain_ops.py new file mode 100644 index 00000000..53216352 --- /dev/null +++ b/benchmarks/pandas/bench_pipe_chain_ops.py @@ -0,0 +1,46 @@ +""" +Benchmark: pipe chaining utilities — pipeChain / pipeTo / dataFramePipeChain / dataFramePipeTo. 
+Outputs JSON: {"function": "pipe_chain_ops", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import numpy as np +import pandas as pd + +SIZE = 100_000 +WARMUP = 5 +ITERATIONS = 50 + +s = pd.Series(np.arange(SIZE) * 0.5 - SIZE * 0.25) +df = pd.DataFrame({ + "a": np.arange(SIZE) * 0.5, + "b": np.arange(SIZE) * 0.3 + 1, +}) + +def double(x): return x * 2 +def add_one(x): return x + 1 +def abs_val(x): return x.abs() + +# pandas equivalent of pipeChain: .pipe(fn1).pipe(fn2).pipe(fn3) +# pandas equivalent of pipeTo: .pipe(fn, *args) with positional arg + +for _ in range(WARMUP): + s.pipe(double).pipe(add_one).pipe(abs_val) + s.pipe(abs_val) + df.pipe(double).pipe(abs_val) + df.pipe(abs_val) + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.pipe(double).pipe(add_one).pipe(abs_val) + s.pipe(abs_val) + df.pipe(double).pipe(abs_val) + df.pipe(abs_val) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "pipe_chain_ops", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_pivot_table_full.py b/benchmarks/pandas/bench_pivot_table_full.py new file mode 100644 index 00000000..18e4905f --- /dev/null +++ b/benchmarks/pandas/bench_pivot_table_full.py @@ -0,0 +1,36 @@ +""" +Benchmark: pivot_table with margins on 50k-row DataFrame. 
+Outputs JSON: {"function": "pivot_table_full", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import numpy as np +import pandas as pd + +ROWS = 50_000 +WARMUP = 3 +ITERATIONS = 20 + +regions = ["North", "South", "East", "West"] +products = ["A", "B", "C", "D", "E"] + +df = pd.DataFrame({ + "region": [regions[i % len(regions)] for i in range(ROWS)], + "product": [products[i % len(products)] for i in range(ROWS)], + "sales": (np.arange(ROWS) % 1000) * 1.5 + 10, +}) + +for _ in range(WARMUP): + pd.pivot_table(df, values="sales", index="region", columns="product", aggfunc="mean", margins=True) + +start = time.perf_counter() +for _ in range(ITERATIONS): + pd.pivot_table(df, values="sales", index="region", columns="product", aggfunc="mean", margins=True) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "pivot_table_full", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_read_excel.py b/benchmarks/pandas/bench_read_excel.py new file mode 100644 index 00000000..a7e93244 --- /dev/null +++ b/benchmarks/pandas/bench_read_excel.py @@ -0,0 +1,49 @@ +""" +Benchmark: pd.read_excel / ExcelFile.sheet_names — parse a 10k-row XLSX file. 
+Outputs JSON: {"function": "read_excel", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import io +import numpy as np +import pandas as pd + +try: + import openpyxl +except ImportError: + import subprocess, sys + subprocess.check_call([sys.executable, "-m", "pip", "install", "openpyxl", "--quiet"]) + import openpyxl + +ROWS = 10_000 +WARMUP = 3 +ITERATIONS = 10 + +# Build an XLSX file in memory using openpyxl +wb = openpyxl.Workbook() +ws = wb.active +ws.title = "Sheet1" +ws.append(["id", "name", "value", "score"]) +for i in range(ROWS): + ws.append([i, f"item_{i % 100}", i * 1.5, float(np.sin(i * 0.01))]) + +buf = io.BytesIO() +wb.save(buf) +xlsx_bytes = buf.getvalue() + +for _ in range(WARMUP): + pd.read_excel(io.BytesIO(xlsx_bytes), engine="openpyxl") + pd.ExcelFile(io.BytesIO(xlsx_bytes), engine="openpyxl").sheet_names + +start = time.perf_counter() +for _ in range(ITERATIONS): + pd.read_excel(io.BytesIO(xlsx_bytes), engine="openpyxl") + pd.ExcelFile(io.BytesIO(xlsx_bytes), engine="openpyxl").sheet_names +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "read_excel", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_series_ceil_floor_trunc_sqrt.py b/benchmarks/pandas/bench_series_ceil_floor_trunc_sqrt.py new file mode 100644 index 00000000..f2baffe4 --- /dev/null +++ b/benchmarks/pandas/bench_series_ceil_floor_trunc_sqrt.py @@ -0,0 +1,35 @@ +""" +Benchmark: Series ceil / floor / trunc / sqrt — math rounding on 100k-element Series. 
+Outputs JSON: {"function": "series_ceil_floor_trunc_sqrt", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import numpy as np +import pandas as pd + +SIZE = 100_000 +WARMUP = 5 +ITERATIONS = 50 + +s = pd.Series((np.arange(SIZE) % 1000) * 0.7 + 0.3) + +for _ in range(WARMUP): + np.ceil(s) + np.floor(s) + np.trunc(s) + np.sqrt(s) + +start = time.perf_counter() +for _ in range(ITERATIONS): + np.ceil(s) + np.floor(s) + np.trunc(s) + np.sqrt(s) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "series_ceil_floor_trunc_sqrt", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_series_pipe_apply.py b/benchmarks/pandas/bench_series_pipe_apply.py new file mode 100644 index 00000000..a2254628 --- /dev/null +++ b/benchmarks/pandas/bench_series_pipe_apply.py @@ -0,0 +1,38 @@ +""" +Benchmark: Series.pipe / DataFrame.pipe — pipe function application utilities. +Outputs JSON: {"function": "series_pipe_apply", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import numpy as np +import pandas as pd + +SIZE = 100_000 +WARMUP = 5 +ITERATIONS = 50 + +s = pd.Series(np.arange(SIZE) * 0.5 - SIZE * 0.25) +df = pd.DataFrame({ + "a": np.arange(SIZE) * 0.5, + "b": np.arange(SIZE) * 0.3 + 1, +}) + +def abs_and_double(x): + return x.abs() * 2 + +for _ in range(WARMUP): + s.pipe(abs_and_double) + df.pipe(abs_and_double) + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.pipe(abs_and_double) + df.pipe(abs_and_double) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "series_pipe_apply", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/tsb/bench_dataframe_ceil_floor_trunc.ts b/benchmarks/tsb/bench_dataframe_ceil_floor_trunc.ts new file mode 100644 index 00000000..56dd2941 --- /dev/null +++ 
b/benchmarks/tsb/bench_dataframe_ceil_floor_trunc.ts @@ -0,0 +1,38 @@ +/** + * Benchmark: dataFrameCeil / dataFrameFloor / dataFrameTrunc / dataFrameSqrt — math rounding on 100k-row DataFrame. + * Outputs JSON: {"function": "dataframe_ceil_floor_trunc", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { DataFrame, dataFrameCeil, dataFrameFloor, dataFrameTrunc, dataFrameSqrt } from "../../src/index.ts"; + +const ROWS = 100_000; +const WARMUP = 5; +const ITERATIONS = 30; + +const a = Array.from({ length: ROWS }, (_, i) => (i % 1000) * 0.7 + 0.3); +const b = Array.from({ length: ROWS }, (_, i) => (i % 500) * 1.3 + 0.1); +const df = DataFrame.fromColumns({ a, b }); + +for (let i = 0; i < WARMUP; i++) { + dataFrameCeil(df); + dataFrameFloor(df); + dataFrameTrunc(df); + dataFrameSqrt(df); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + dataFrameCeil(df); + dataFrameFloor(df); + dataFrameTrunc(df); + dataFrameSqrt(df); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "dataframe_ceil_floor_trunc", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_dataframe_exp_log.ts b/benchmarks/tsb/bench_dataframe_exp_log.ts new file mode 100644 index 00000000..d674a471 --- /dev/null +++ b/benchmarks/tsb/bench_dataframe_exp_log.ts @@ -0,0 +1,39 @@ +/** + * Benchmark: dataFrameExp / dataFrameLog / dataFrameLog2 / dataFrameLog10 — exponentiation/log on 100k-row DataFrame. 
+ * Outputs JSON: {"function": "dataframe_exp_log", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { DataFrame, dataFrameExp, dataFrameLog, dataFrameLog2, dataFrameLog10 } from "../../src/index.ts"; + +const ROWS = 100_000; +const WARMUP = 5; +const ITERATIONS = 30; + +// Positive values to avoid NaN in log operations +const a = Array.from({ length: ROWS }, (_, i) => (i % 1000) + 1); +const b = Array.from({ length: ROWS }, (_, i) => (i % 500) + 1); +const df = DataFrame.fromColumns({ a, b }); + +for (let i = 0; i < WARMUP; i++) { + dataFrameExp(df); + dataFrameLog(df); + dataFrameLog2(df); + dataFrameLog10(df); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + dataFrameExp(df); + dataFrameLog(df); + dataFrameLog2(df); + dataFrameLog10(df); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "dataframe_exp_log", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_nan_extended_agg.ts b/benchmarks/tsb/bench_nan_extended_agg.ts new file mode 100644 index 00000000..1a3d72c1 --- /dev/null +++ b/benchmarks/tsb/bench_nan_extended_agg.ts @@ -0,0 +1,37 @@ +/** + * Benchmark: nancount / nanmedian / nanprod — nan-ignoring aggregates on a 100k-element array. + * Outputs JSON: {"function": "nan_extended_agg", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { nancount, nanmedian, nanprod } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 50; + +// Array with ~10% null values; use small values to avoid nanprod overflow +const data: (number | null)[] = Array.from({ length: SIZE }, (_, i) => + i % 10 === 0 ? 
null : (i % 100) * 0.01 + 1, +); + +for (let i = 0; i < WARMUP; i++) { + nancount(data); + nanmedian(data); + nanprod(data); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + nancount(data); + nanmedian(data); + nanprod(data); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "nan_extended_agg", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_pipe_chain_ops.ts b/benchmarks/tsb/bench_pipe_chain_ops.ts new file mode 100644 index 00000000..c4afe31d --- /dev/null +++ b/benchmarks/tsb/bench_pipe_chain_ops.ts @@ -0,0 +1,62 @@ +/** + * Benchmark: pipeChain / pipeTo / dataFramePipeChain / dataFramePipeTo — function chaining utilities. + * Outputs JSON: {"function": "pipe_chain_ops", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { + Series, + DataFrame, + pipeChain, + pipeTo, + dataFramePipeChain, + dataFramePipeTo, + seriesAdd, + seriesMul, + seriesAbs, +} from "../../src/index.ts"; +import type { Scalar } from "../../src/types.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const s = new Series({ data: Array.from({ length: SIZE }, (_, i) => i * 0.5 - SIZE * 0.25) }); +const df = DataFrame.fromColumns({ + a: Array.from({ length: SIZE }, (_, i) => i * 0.5), + b: Array.from({ length: SIZE }, (_, i) => i * 0.3 + 1), +}); + +const double = (x: Series) => seriesMul(x, 2); +const addOne = (x: Series) => seriesAdd(x, 1); +const absVal = (x: Series) => seriesAbs(x); + +const dfDouble = (d: DataFrame) => d.mul(2); +const dfAbs = (d: DataFrame) => d.abs(); + +// pipeTo: insert series at position 0 of a unary function +const identity = (x: Series) => seriesAbs(x); +const dfIdentity = (d: DataFrame) => d.abs(); + +for (let i = 0; i < WARMUP; i++) { + pipeChain(s, double, addOne, absVal); + pipeTo(s, 0, identity); + dataFramePipeChain(df, dfDouble, dfAbs); + dataFramePipeTo(df, 0, dfIdentity); +} 
+ +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + pipeChain(s, double, addOne, absVal); + pipeTo(s, 0, identity); + dataFramePipeChain(df, dfDouble, dfAbs); + dataFramePipeTo(df, 0, dfIdentity); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "pipe_chain_ops", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_pivot_table_full.ts b/benchmarks/tsb/bench_pivot_table_full.ts new file mode 100644 index 00000000..9819d1dd --- /dev/null +++ b/benchmarks/tsb/bench_pivot_table_full.ts @@ -0,0 +1,37 @@ +/** + * Benchmark: pivotTableFull — extended pivot table with margins on 50k-row DataFrame. + * Outputs JSON: {"function": "pivot_table_full", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { DataFrame, pivotTableFull } from "../../src/index.ts"; + +const ROWS = 50_000; +const WARMUP = 3; +const ITERATIONS = 20; + +const regions = ["North", "South", "East", "West"]; +const products = ["A", "B", "C", "D", "E"]; + +const region = Array.from({ length: ROWS }, (_, i) => regions[i % regions.length]); +const product = Array.from({ length: ROWS }, (_, i) => products[i % products.length]); +const sales = Array.from({ length: ROWS }, (_, i) => (i % 1000) * 1.5 + 10); + +const df = DataFrame.fromColumns({ region, product, sales }); + +for (let i = 0; i < WARMUP; i++) { + pivotTableFull(df, { values: "sales", index: "region", columns: "product", aggfunc: "mean", margins: true }); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + pivotTableFull(df, { values: "sales", index: "region", columns: "product", aggfunc: "mean", margins: true }); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "pivot_table_full", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_read_excel.ts 
b/benchmarks/tsb/bench_read_excel.ts new file mode 100644 index 00000000..0549138c --- /dev/null +++ b/benchmarks/tsb/bench_read_excel.ts @@ -0,0 +1,133 @@ +/** + * Benchmark: readExcel / xlsxSheetNames — parse a 10k-row XLSX file. + * Outputs JSON: {"function": "read_excel", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { readExcel, xlsxSheetNames } from "../../src/index.ts"; + +// ─── minimal XLSX builder (adapted from tests/io/read_excel.test.ts) ────────── + +const ENC = new TextEncoder(); + +function le16(n: number): Uint8Array { + const v = n & 0xffff; + return new Uint8Array([v & 0xff, (v >> 8) & 0xff]); +} +function le32(n: number): Uint8Array { + const v = n >>> 0; + return new Uint8Array([v & 0xff, (v >> 8) & 0xff, (v >> 16) & 0xff, (v >> 24) & 0xff]); +} +function joinBytes(...parts: Uint8Array[]): Uint8Array { + let total = 0; + for (const p of parts) total += p.length; + const out = new Uint8Array(total); + let pos = 0; + for (const p of parts) { out.set(p, pos); pos += p.length; } + return out; +} +function buildStoredZip(files: { name: string; data: Uint8Array }[]): Uint8Array { + const localParts: Uint8Array[] = []; + const localOffsets: number[] = []; + let curOffset = 0; + for (const f of files) { + const nameBytes = ENC.encode(f.name); + const lh = joinBytes( + new Uint8Array([0x50, 0x4b, 0x03, 0x04]), + le16(20), le16(0), le16(0), le16(0), le16(0), le32(0), + le32(f.data.length), le32(f.data.length), + le16(nameBytes.length), le16(0), nameBytes, f.data, + ); + localOffsets.push(curOffset); + localParts.push(lh); + curOffset += lh.length; + } + const cdParts: Uint8Array[] = []; + for (const [i, f] of files.entries()) { + const nameBytes = ENC.encode(f.name); + const off = localOffsets[i] ?? 
0; + cdParts.push(joinBytes( + new Uint8Array([0x50, 0x4b, 0x01, 0x02]), + le16(20), le16(20), le16(0), le16(0), le16(0), le16(0), le32(0), + le32(f.data.length), le32(f.data.length), + le16(nameBytes.length), le16(0), le16(0), le16(0), le16(0), le32(0), le32(off), + nameBytes, + )); + } + const cdSize = cdParts.reduce((s, p) => s + p.length, 0); + const cdOffset = curOffset; + const eocd = joinBytes( + new Uint8Array([0x50, 0x4b, 0x05, 0x06]), + le16(0), le16(0), le16(files.length), le16(files.length), + le32(cdSize), le32(cdOffset), le16(0), + ); + return joinBytes(...localParts, ...cdParts, eocd); +} +function escXml(s: string): string { + return s.replace(/&/g, "&").replace(//g, ">").replace(/"/g, """); +} +function colLetter(c: number): string { + let col = c + 1; let result = ""; + while (col > 0) { const rem = (col - 1) % 26; result = String.fromCharCode(65 + rem) + result; col = Math.floor((col - 1) / 26); } + return result; +} +function makeXlsx(headers: string[], rows: (string | number | null)[][]): Uint8Array { + const strs: string[] = []; const strIdx = new Map(); + const reg = (s: string): number => { const x = strIdx.get(s); if (x !== undefined) return x; const i = strs.length; strs.push(s); strIdx.set(s, i); return i; }; + for (const h of headers) reg(h); + for (const row of rows) for (const c of row) if (typeof c === "string") reg(c); + const sst = `\n\n${strs.map((s) => `${escXml(s)}`).join("\n")}\n`; + const hCells = headers.map((h, c) => `${reg(h)}`).join(""); + const dataCells = rows.map((row, ri) => { + const cells = row.map((cell, ci) => cell === null ? "" : typeof cell === "string" ? 
`${reg(cell)}` : `${cell}`).join(""); + return `${cells}`; + }).join("\n"); + const ws = `\n${hCells}\n${dataCells}`; + const wb = `\n`; + const wbRels = `\n`; + const rels = `\n`; + const ct = `\n`; + return buildStoredZip([ + { name: "[Content_Types].xml", data: ENC.encode(ct) }, + { name: "_rels/.rels", data: ENC.encode(rels) }, + { name: "xl/workbook.xml", data: ENC.encode(wb) }, + { name: "xl/_rels/workbook.xml.rels", data: ENC.encode(wbRels) }, + { name: "xl/sharedStrings.xml", data: ENC.encode(sst) }, + { name: "xl/worksheets/sheet1.xml", data: ENC.encode(ws) }, + ]); +} + +// ─── benchmark ──────────────────────────────────────────────────────────────── + +const ROWS = 10_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const headers = ["id", "name", "value", "score"]; +const rows: (string | number | null)[][] = Array.from({ length: ROWS }, (_, i) => [ + i, + `item_${i % 100}`, + i * 1.5, + Math.sin(i * 0.01), +]); + +const xlsx = makeXlsx(headers, rows); + +for (let i = 0; i < WARMUP; i++) { + readExcel(xlsx); + xlsxSheetNames(xlsx); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + readExcel(xlsx); + xlsxSheetNames(xlsx); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "read_excel", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_series_ceil_floor_trunc_sqrt.ts b/benchmarks/tsb/bench_series_ceil_floor_trunc_sqrt.ts new file mode 100644 index 00000000..5c783e8b --- /dev/null +++ b/benchmarks/tsb/bench_series_ceil_floor_trunc_sqrt.ts @@ -0,0 +1,36 @@ +/** + * Benchmark: seriesCeil / seriesFloor / seriesTrunc / seriesSqrt — math rounding on 100k-element Series. 
+ * Outputs JSON: {"function": "series_ceil_floor_trunc_sqrt", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series, seriesCeil, seriesFloor, seriesTrunc, seriesSqrt } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const s = new Series({ data: Array.from({ length: SIZE }, (_, i) => (i % 1000) * 0.7 + 0.3) }); + +for (let i = 0; i < WARMUP; i++) { + seriesCeil(s); + seriesFloor(s); + seriesTrunc(s); + seriesSqrt(s); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + seriesCeil(s); + seriesFloor(s); + seriesTrunc(s); + seriesSqrt(s); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "series_ceil_floor_trunc_sqrt", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_series_pipe_apply.ts b/benchmarks/tsb/bench_series_pipe_apply.ts new file mode 100644 index 00000000..564028a6 --- /dev/null +++ b/benchmarks/tsb/bench_series_pipe_apply.ts @@ -0,0 +1,40 @@ +/** + * Benchmark: pipeSeries / dataFramePipe — pipe function application utilities. 
+ * Outputs JSON: {"function": "series_pipe_apply", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series, DataFrame, pipeSeries, dataFramePipe, seriesAbs, seriesMul } from "../../src/index.ts"; +import type { Scalar } from "../../src/types.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const s = new Series({ data: Array.from({ length: SIZE }, (_, i) => i * 0.5 - SIZE * 0.25) }); +const df = DataFrame.fromColumns({ + a: Array.from({ length: SIZE }, (_, i) => i * 0.5), + b: Array.from({ length: SIZE }, (_, i) => i * 0.3 + 1), +}); + +const absAndDouble = (x: Series) => seriesMul(seriesAbs(x), 2); +const dfAbsAndDouble = (d: DataFrame) => d.abs().mul(2); + +for (let i = 0; i < WARMUP; i++) { + pipeSeries(s, absAndDouble); + dataFramePipe(df, dfAbsAndDouble); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + pipeSeries(s, absAndDouble); + dataFramePipe(df, dfAbsAndDouble); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "series_pipe_apply", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); From bae766a1db23999a1d86b68ed6abddfd95e3d3bf Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 17 Apr 2026 00:05:34 +0000 Subject: [PATCH 05/19] Iteration 139: Add 8 benchmark pairs (420 total, +8 vs 412) Added benchmark pairs: - cut_interval_index: cutIntervalIndex/qcutIntervalIndex vs pd.cut/qcut - dataframe_sign: dataFrameSign vs np.sign(df) - argsort_scalars: argsortScalars/searchsortedMany vs np.argsort/searchsorted - interval_index_ops: IntervalIndex.contains/get_loc vs pd.IntervalIndex ops - period_index_range: PeriodIndex.periodRange/fromPeriods vs pd.period_range - datetime_index_from: DatetimeIndex.fromDates/fromTimestamps vs pd.DatetimeIndex - timedelta_index: TimedeltaIndex.fromTimedeltas/fromRange/fromStrings vs pd.TimedeltaIndex - 
resolve_freq: resolveFreq vs pd.tseries.frequencies.to_offset Run: https://github.com/githubnext/tsessebe/actions/runs/24539911725 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- benchmarks/pandas/bench_argsort_scalars.py | 24 +++++++++++ benchmarks/pandas/bench_cut_interval_index.py | 24 +++++++++++ benchmarks/pandas/bench_dataframe_sign.py | 25 +++++++++++ .../pandas/bench_datetime_index_from.py | 25 +++++++++++ benchmarks/pandas/bench_interval_index_ops.py | 30 ++++++++++++++ benchmarks/pandas/bench_period_index_range.py | 25 +++++++++++ benchmarks/pandas/bench_resolve_freq.py | 24 +++++++++++ benchmarks/pandas/bench_timedelta_index.py | 29 +++++++++++++ benchmarks/tsb/bench_argsort_scalars.ts | 37 +++++++++++++++++ benchmarks/tsb/bench_cut_interval_index.ts | 33 +++++++++++++++ benchmarks/tsb/bench_dataframe_sign.ts | 34 +++++++++++++++ benchmarks/tsb/bench_datetime_index_from.ts | 36 ++++++++++++++++ benchmarks/tsb/bench_interval_index_ops.ts | 41 +++++++++++++++++++ benchmarks/tsb/bench_period_index_range.ts | 37 +++++++++++++++++ benchmarks/tsb/bench_resolve_freq.ts | 36 ++++++++++++++++ benchmarks/tsb/bench_timedelta_index.ts | 40 ++++++++++++++++++ 16 files changed, 500 insertions(+) create mode 100644 benchmarks/pandas/bench_argsort_scalars.py create mode 100644 benchmarks/pandas/bench_cut_interval_index.py create mode 100644 benchmarks/pandas/bench_dataframe_sign.py create mode 100644 benchmarks/pandas/bench_datetime_index_from.py create mode 100644 benchmarks/pandas/bench_interval_index_ops.py create mode 100644 benchmarks/pandas/bench_period_index_range.py create mode 100644 benchmarks/pandas/bench_resolve_freq.py create mode 100644 benchmarks/pandas/bench_timedelta_index.py create mode 100644 benchmarks/tsb/bench_argsort_scalars.ts create mode 100644 benchmarks/tsb/bench_cut_interval_index.ts create mode 100644 benchmarks/tsb/bench_dataframe_sign.ts create mode 100644 benchmarks/tsb/bench_datetime_index_from.ts create mode 
100644 benchmarks/tsb/bench_interval_index_ops.ts create mode 100644 benchmarks/tsb/bench_period_index_range.ts create mode 100644 benchmarks/tsb/bench_resolve_freq.ts create mode 100644 benchmarks/tsb/bench_timedelta_index.ts diff --git a/benchmarks/pandas/bench_argsort_scalars.py b/benchmarks/pandas/bench_argsort_scalars.py new file mode 100644 index 00000000..db5a11e1 --- /dev/null +++ b/benchmarks/pandas/bench_argsort_scalars.py @@ -0,0 +1,24 @@ +"""Benchmark: np.argsort / np.searchsorted — sort/search utilities on 100k-element arrays.""" +import json +import time +import numpy as np + +SIZE = 100_000 +WARMUP = 3 +ITERATIONS = 20 + +arr = np.sin(np.arange(SIZE) * 0.001) * SIZE +sorted_arr = np.sort(arr) +queries = (np.arange(1000) - 500) * SIZE / 500 + +for _ in range(WARMUP): + np.argsort(arr) + np.searchsorted(sorted_arr, queries) + +start = time.perf_counter() +for _ in range(ITERATIONS): + np.argsort(arr) + np.searchsorted(sorted_arr, queries) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({"function": "argsort_scalars", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_cut_interval_index.py b/benchmarks/pandas/bench_cut_interval_index.py new file mode 100644 index 00000000..5d4bb426 --- /dev/null +++ b/benchmarks/pandas/bench_cut_interval_index.py @@ -0,0 +1,24 @@ +"""Benchmark: cutIntervalIndex / qcutIntervalIndex — pd.cut/qcut returning IntervalIndex on 100k-element Series.""" +import json +import time +import numpy as np +import pandas as pd + +SIZE = 100_000 +WARMUP = 5 +ITERATIONS = 30 + +data = (np.arange(SIZE) % 1000) * 0.1 +s = pd.Series(data) + +for _ in range(WARMUP): + pd.cut(s, 20, retbins=False) + pd.qcut(s, 10, duplicates="drop") + +start = time.perf_counter() +for _ in range(ITERATIONS): + pd.cut(s, 20, retbins=False) + pd.qcut(s, 10, duplicates="drop") +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({"function": "cut_interval_index", 
"mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_dataframe_sign.py b/benchmarks/pandas/bench_dataframe_sign.py new file mode 100644 index 00000000..46c0155a --- /dev/null +++ b/benchmarks/pandas/bench_dataframe_sign.py @@ -0,0 +1,25 @@ +"""Benchmark: DataFrame sign operation — np.sign on 100k-row DataFrame.""" +import json +import time +import numpy as np +import pandas as pd + +ROWS = 100_000 +WARMUP = 5 +ITERATIONS = 50 + +df = pd.DataFrame({ + "a": (np.arange(ROWS) % 200) - 100, + "b": np.sin(np.arange(ROWS) * 0.01) * 1000, + "c": (np.arange(ROWS) % 3) - 1, +}) + +for _ in range(WARMUP): + np.sign(df) + +start = time.perf_counter() +for _ in range(ITERATIONS): + np.sign(df) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({"function": "dataframe_sign", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_datetime_index_from.py b/benchmarks/pandas/bench_datetime_index_from.py new file mode 100644 index 00000000..d61a64d7 --- /dev/null +++ b/benchmarks/pandas/bench_datetime_index_from.py @@ -0,0 +1,25 @@ +"""Benchmark: pd.DatetimeIndex from dates/timestamps — DatetimeIndex construction from raw data.""" +import json +import time +import numpy as np +import pandas as pd + +SIZE = 10_000 +WARMUP = 5 +ITERATIONS = 50 + +base = pd.Timestamp("2000-01-01") +dates = [base + pd.Timedelta(days=i) for i in range(SIZE)] +timestamps = np.arange(SIZE) * 86_400 * 1_000_000_000 + base.value # nanosecond timestamps + +for _ in range(WARMUP): + pd.DatetimeIndex(dates) + pd.DatetimeIndex(timestamps) + +start = time.perf_counter() +for _ in range(ITERATIONS): + pd.DatetimeIndex(dates) + pd.DatetimeIndex(timestamps) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({"function": "datetime_index_from", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total})) diff --git 
a/benchmarks/pandas/bench_interval_index_ops.py b/benchmarks/pandas/bench_interval_index_ops.py new file mode 100644 index 00000000..e6b606ea --- /dev/null +++ b/benchmarks/pandas/bench_interval_index_ops.py @@ -0,0 +1,30 @@ +"""Benchmark: IntervalIndex contains / get_loc — interval index lookup ops on 1k-interval index.""" +import json +import time +import numpy as np +import pandas as pd + +BREAKS = 1_001 # 1000 intervals +QUERIES = 10_000 +WARMUP = 5 +ITERATIONS = 50 + +breaks = np.arange(BREAKS) * 0.1 +idx = pd.IntervalIndex.from_breaks(breaks) + +# Query values spread across the range +query_values = (np.arange(QUERIES) / QUERIES) * (BREAKS - 1) * 0.1 + +for _ in range(WARMUP): + for q in query_values[:100]: + idx.contains(q) + idx.get_loc(q) + +start = time.perf_counter() +for _ in range(ITERATIONS): + for q in query_values: + idx.contains(q) + idx.get_loc(q) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({"function": "interval_index_ops", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_period_index_range.py b/benchmarks/pandas/bench_period_index_range.py new file mode 100644 index 00000000..54e16100 --- /dev/null +++ b/benchmarks/pandas/bench_period_index_range.py @@ -0,0 +1,25 @@ +"""Benchmark: pd.period_range / pd.PeriodIndex — PeriodIndex construction.""" +import json +import time +import pandas as pd + +WARMUP = 5 +ITERATIONS = 50 + +start_d = pd.Period("2000-01-01", freq="D") +start_m = pd.Period("2000-01", freq="M") +day_periods = pd.period_range(start="2000-01-01", periods=365 * 10, freq="D") + +for _ in range(WARMUP): + pd.period_range(start=start_d, periods=3650, freq="D") + pd.period_range(start=start_m, periods=120, freq="ME") + pd.PeriodIndex(day_periods[:365]) + +start = time.perf_counter() +for _ in range(ITERATIONS): + pd.period_range(start=start_d, periods=3650, freq="D") + pd.period_range(start=start_m, periods=120, freq="ME") + 
pd.PeriodIndex(day_periods[:365]) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({"function": "period_index_range", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_resolve_freq.py b/benchmarks/pandas/bench_resolve_freq.py new file mode 100644 index 00000000..212c5a4e --- /dev/null +++ b/benchmarks/pandas/bench_resolve_freq.py @@ -0,0 +1,24 @@ +"""Benchmark: pd.tseries.frequencies.to_offset — frequency string-to-offset resolution.""" +import json +import time +import pandas as pd +from pandas.tseries.frequencies import to_offset + +WARMUP = 5 +ITERATIONS = 1_000 + +freqs = ["D", "h", "min", "s", "ms", "ME", "QE", "YE", "W", "B"] + +for _ in range(WARMUP): + for f in freqs: + to_offset(f) + to_offset(f"2{f}") + +start = time.perf_counter() +for _ in range(ITERATIONS): + for f in freqs: + to_offset(f) + to_offset(f"2{f}") +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({"function": "resolve_freq", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_timedelta_index.py b/benchmarks/pandas/bench_timedelta_index.py new file mode 100644 index 00000000..b04908b4 --- /dev/null +++ b/benchmarks/pandas/bench_timedelta_index.py @@ -0,0 +1,29 @@ +"""Benchmark: pd.TimedeltaIndex construction from timedeltas/range/strings.""" +import json +import time +import numpy as np +import pandas as pd + +SIZE = 1_000 +WARMUP = 5 +ITERATIONS = 100 + +deltas = [pd.Timedelta(days=i, hours=i % 24) for i in range(SIZE)] +start_td = pd.Timedelta(days=0) +stop_td = pd.Timedelta(days=SIZE) +step_td = pd.Timedelta(days=1) +strings = [f"{i}D" for i in range(SIZE)] + +for _ in range(WARMUP): + pd.TimedeltaIndex(deltas) + pd.timedelta_range(start=start_td, end=stop_td, freq=step_td) + pd.to_timedelta(strings) + +start = time.perf_counter() +for _ in range(ITERATIONS): + pd.TimedeltaIndex(deltas) + pd.timedelta_range(start=start_td, 
end=stop_td, freq=step_td) + pd.to_timedelta(strings) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({"function": "timedelta_index", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total})) diff --git a/benchmarks/tsb/bench_argsort_scalars.ts b/benchmarks/tsb/bench_argsort_scalars.ts new file mode 100644 index 00000000..1cf71c66 --- /dev/null +++ b/benchmarks/tsb/bench_argsort_scalars.ts @@ -0,0 +1,37 @@ +/** + * Benchmark: argsortScalars / searchsortedMany — sort/search utilities on 100k-element arrays. + * Outputs JSON: {"function": "argsort_scalars", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { argsortScalars, searchsortedMany } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 3; +const ITERATIONS = 20; + +// Array of numbers to sort/search +const arr = Array.from({ length: SIZE }, (_, i) => Math.sin(i * 0.001) * SIZE); +// Sorted version for searchsortedMany +const sorted = [...arr].sort((a, b) => (a as number) - (b as number)); +// Query values for searchsortedMany +const queries = Array.from({ length: 1000 }, (_, i) => (i - 500) * SIZE / 500); + +for (let i = 0; i < WARMUP; i++) { + argsortScalars(arr); + searchsortedMany(sorted, queries); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + argsortScalars(arr); + searchsortedMany(sorted, queries); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "argsort_scalars", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_cut_interval_index.ts b/benchmarks/tsb/bench_cut_interval_index.ts new file mode 100644 index 00000000..829830ff --- /dev/null +++ b/benchmarks/tsb/bench_cut_interval_index.ts @@ -0,0 +1,33 @@ +/** + * Benchmark: cutIntervalIndex / qcutIntervalIndex — cut/qcut returning IntervalIndex on 100k-element Series. 
+ * Outputs JSON: {"function": "cut_interval_index", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series, cutIntervalIndex, qcutIntervalIndex } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 30; + +const data = Array.from({ length: SIZE }, (_, i) => (i % 1000) * 0.1); +const s = new Series({ data }); + +for (let i = 0; i < WARMUP; i++) { + cutIntervalIndex(s, 20); + qcutIntervalIndex(s, 10); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + cutIntervalIndex(s, 20); + qcutIntervalIndex(s, 10); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "cut_interval_index", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_dataframe_sign.ts b/benchmarks/tsb/bench_dataframe_sign.ts new file mode 100644 index 00000000..b1e222c0 --- /dev/null +++ b/benchmarks/tsb/bench_dataframe_sign.ts @@ -0,0 +1,34 @@ +/** + * Benchmark: dataFrameSign — sign operation on 100k-row DataFrame. 
+ * Outputs JSON: {"function": "dataframe_sign", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { DataFrame, dataFrameSign } from "../../src/index.ts"; + +const ROWS = 100_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const df = DataFrame.fromColumns({ + a: Array.from({ length: ROWS }, (_, i) => (i % 200) - 100), + b: Array.from({ length: ROWS }, (_, i) => Math.sin(i * 0.01) * 1000), + c: Array.from({ length: ROWS }, (_, i) => (i % 3) - 1), +}); + +for (let i = 0; i < WARMUP; i++) { + dataFrameSign(df); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + dataFrameSign(df); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "dataframe_sign", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_datetime_index_from.ts b/benchmarks/tsb/bench_datetime_index_from.ts new file mode 100644 index 00000000..3db85d7d --- /dev/null +++ b/benchmarks/tsb/bench_datetime_index_from.ts @@ -0,0 +1,36 @@ +/** + * Benchmark: DatetimeIndex.fromDates / DatetimeIndex.fromTimestamps — DatetimeIndex construction from raw data. 
+ * Outputs JSON: {"function": "datetime_index_from", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { DatetimeIndex } from "../../src/index.ts"; + +const SIZE = 10_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const BASE = Date.UTC(2000, 0, 1); +const DAY_MS = 86_400_000; + +const dates = Array.from({ length: SIZE }, (_, i) => new Date(BASE + i * DAY_MS)); +const timestamps = Array.from({ length: SIZE }, (_, i) => BASE + i * DAY_MS); + +for (let i = 0; i < WARMUP; i++) { + DatetimeIndex.fromDates(dates); + DatetimeIndex.fromTimestamps(timestamps); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + DatetimeIndex.fromDates(dates); + DatetimeIndex.fromTimestamps(timestamps); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "datetime_index_from", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_interval_index_ops.ts b/benchmarks/tsb/bench_interval_index_ops.ts new file mode 100644 index 00000000..ef61dce8 --- /dev/null +++ b/benchmarks/tsb/bench_interval_index_ops.ts @@ -0,0 +1,41 @@ +/** + * Benchmark: IntervalIndex.contains / IntervalIndex.get_loc — interval index lookup ops on 1k-interval index. 
+ * Outputs JSON: {"function": "interval_index_ops", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { IntervalIndex } from "../../src/index.ts"; + +const BREAKS = 1_001; // 1000 intervals +const QUERIES = 10_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const breaks = Array.from({ length: BREAKS }, (_, i) => i * 0.1); +const idx = IntervalIndex.fromBreaks(breaks); + +// Query values spread across the range +const queryValues = Array.from({ length: QUERIES }, (_, i) => (i / QUERIES) * (BREAKS - 1) * 0.1); + +for (let i = 0; i < WARMUP; i++) { + for (let q = 0; q < 100; q++) { + idx.contains(queryValues[q]!); + idx.get_loc(queryValues[q]!); + } +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + for (let q = 0; q < QUERIES; q++) { + idx.contains(queryValues[q]!); + idx.get_loc(queryValues[q]!); + } +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "interval_index_ops", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_period_index_range.ts b/benchmarks/tsb/bench_period_index_range.ts new file mode 100644 index 00000000..1d259b38 --- /dev/null +++ b/benchmarks/tsb/bench_period_index_range.ts @@ -0,0 +1,37 @@ +/** + * Benchmark: PeriodIndex.periodRange / PeriodIndex.fromPeriods — PeriodIndex construction. 
+ * Outputs JSON: {"function": "period_index_range", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Period, PeriodIndex } from "../../src/index.ts"; + +const WARMUP = 5; +const ITERATIONS = 50; + +const startPeriod = Period.fromDate(new Date(Date.UTC(2000, 0, 1)), "D"); +const startMonth = Period.fromDate(new Date(Date.UTC(2000, 0, 1)), "M"); +const dayPeriods = Array.from({ length: 365 * 10 }, (_, i) => + Period.fromDate(new Date(Date.UTC(2000, 0, 1) + i * 86_400_000), "D"), +); + +for (let i = 0; i < WARMUP; i++) { + PeriodIndex.periodRange(startPeriod, 3650); + PeriodIndex.periodRange(startMonth, 120); + PeriodIndex.fromPeriods(dayPeriods.slice(0, 365)); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + PeriodIndex.periodRange(startPeriod, 3650); + PeriodIndex.periodRange(startMonth, 120); + PeriodIndex.fromPeriods(dayPeriods.slice(0, 365)); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "period_index_range", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_resolve_freq.ts b/benchmarks/tsb/bench_resolve_freq.ts new file mode 100644 index 00000000..15ac18d8 --- /dev/null +++ b/benchmarks/tsb/bench_resolve_freq.ts @@ -0,0 +1,36 @@ +/** + * Benchmark: resolveFreq — frequency string-to-offset resolution on many inputs. 
+ * Outputs JSON: {"function": "resolve_freq", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { resolveFreq } from "../../src/index.ts"; + +const WARMUP = 5; +const ITERATIONS = 1_000; + +// Various frequency strings to resolve +const freqs = ["D", "h", "min", "s", "ms", "ME", "QE", "YE", "W", "B"] as const; + +for (let i = 0; i < WARMUP; i++) { + for (const f of freqs) { + resolveFreq(f); + resolveFreq(f, 2); + } +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + for (const f of freqs) { + resolveFreq(f); + resolveFreq(f, 2); + } +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "resolve_freq", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_timedelta_index.ts b/benchmarks/tsb/bench_timedelta_index.ts new file mode 100644 index 00000000..b3bcdd29 --- /dev/null +++ b/benchmarks/tsb/bench_timedelta_index.ts @@ -0,0 +1,40 @@ +/** + * Benchmark: TimedeltaIndex.fromTimedeltas / fromRange / fromStrings — TimedeltaIndex construction. 
+ * Outputs JSON: {"function": "timedelta_index", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Timedelta, TimedeltaIndex } from "../../src/index.ts"; + +const SIZE = 1_000; +const WARMUP = 5; +const ITERATIONS = 100; + +const deltas = Array.from({ length: SIZE }, (_, i) => + Timedelta.fromComponents({ days: i, hours: i % 24 }), +); +const startTd = Timedelta.fromComponents({ days: 0 }); +const stopTd = Timedelta.fromComponents({ days: SIZE }); +const stepTd = Timedelta.fromComponents({ days: 1 }); +const strings = Array.from({ length: SIZE }, (_, i) => `${i}D`); + +for (let i = 0; i < WARMUP; i++) { + TimedeltaIndex.fromTimedeltas(deltas); + TimedeltaIndex.fromRange(startTd, stopTd, stepTd); + TimedeltaIndex.fromStrings(strings); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + TimedeltaIndex.fromTimedeltas(deltas); + TimedeltaIndex.fromRange(startTd, stopTd, stepTd); + TimedeltaIndex.fromStrings(strings); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "timedelta_index", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); From 081cb30a610f5d8ef6676d6479df52e533f6b05c Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 17 Apr 2026 03:27:10 +0000 Subject: [PATCH 06/19] Iteration 142: Add 9 benchmark pairs (429 total, +9 vs 420) Added 9 new benchmark pairs: - groupby_multi_key: DataFrameGroupBy with multi-column keys ["dept","region"] vs pandas multi-key groupby - timestamp_static: Timestamp.fromComponents/fromisoformat/fromtimestamp vs pd.Timestamp static ctors - tz_datetime_index_ops: TZDatetimeIndex.toLocalStrings/sort/unique/filter/contains vs tz-aware DatetimeIndex ops - rolling_center_min_periods: Rolling with center=true and minPeriods options vs pandas rolling center/min_periods - cast_scalar: castScalar type coercion vs Python int()/float()/str() conversions - 
concat_options: concat with join="inner" and ignoreIndex=true vs pd.concat join/ignore_index - ewm_com_halflife: EWM with com and halflife params vs pandas ewm(com/halflife) - nat_sort_key: natSortKey tokenizer vs Python regex-based natural sort key - dataframe_iter: DataFrame.items()/iterrows() vs pandas df.items()/iterrows() Note: State file claimed best was 428 (from iters 140/141 that were not pushed to branch); actual branch had 420 pairs. This iteration rebuilds to 429 (new actual best). Run: https://github.com/githubnext/tsessebe/actions/runs/24545567127 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- benchmarks/pandas/bench_cast_scalar.py | 36 +++++++++++++ benchmarks/pandas/bench_concat_options.py | 37 +++++++++++++ benchmarks/pandas/bench_dataframe_iter.py | 44 +++++++++++++++ benchmarks/pandas/bench_ewm_com_halflife.py | 34 ++++++++++++ benchmarks/pandas/bench_groupby_multi_key.py | 35 ++++++++++++ benchmarks/pandas/bench_nat_sort_key.py | 41 ++++++++++++++ .../bench_rolling_center_min_periods.py | 32 +++++++++++ benchmarks/pandas/bench_timestamp_static.py | 41 ++++++++++++++ .../pandas/bench_tz_datetime_index_ops.py | 36 +++++++++++++ benchmarks/tsb/bench_cast_scalar.ts | 51 ++++++++++++++++++ benchmarks/tsb/bench_concat_options.ts | 45 ++++++++++++++++ benchmarks/tsb/bench_dataframe_iter.ts | 54 +++++++++++++++++++ benchmarks/tsb/bench_ewm_com_halflife.ts | 40 ++++++++++++++ benchmarks/tsb/bench_groupby_multi_key.ts | 42 +++++++++++++++ benchmarks/tsb/bench_nat_sort_key.ts | 46 ++++++++++++++++ .../tsb/bench_rolling_center_min_periods.ts | 38 +++++++++++++ benchmarks/tsb/bench_timestamp_static.ts | 47 ++++++++++++++++ benchmarks/tsb/bench_tz_datetime_index_ops.ts | 43 +++++++++++++++ 18 files changed, 742 insertions(+) create mode 100644 benchmarks/pandas/bench_cast_scalar.py create mode 100644 benchmarks/pandas/bench_concat_options.py create mode 100644 benchmarks/pandas/bench_dataframe_iter.py create mode 100644 
benchmarks/pandas/bench_ewm_com_halflife.py create mode 100644 benchmarks/pandas/bench_groupby_multi_key.py create mode 100644 benchmarks/pandas/bench_nat_sort_key.py create mode 100644 benchmarks/pandas/bench_rolling_center_min_periods.py create mode 100644 benchmarks/pandas/bench_timestamp_static.py create mode 100644 benchmarks/pandas/bench_tz_datetime_index_ops.py create mode 100644 benchmarks/tsb/bench_cast_scalar.ts create mode 100644 benchmarks/tsb/bench_concat_options.ts create mode 100644 benchmarks/tsb/bench_dataframe_iter.ts create mode 100644 benchmarks/tsb/bench_ewm_com_halflife.ts create mode 100644 benchmarks/tsb/bench_groupby_multi_key.ts create mode 100644 benchmarks/tsb/bench_nat_sort_key.ts create mode 100644 benchmarks/tsb/bench_rolling_center_min_periods.ts create mode 100644 benchmarks/tsb/bench_timestamp_static.ts create mode 100644 benchmarks/tsb/bench_tz_datetime_index_ops.ts diff --git a/benchmarks/pandas/bench_cast_scalar.py b/benchmarks/pandas/bench_cast_scalar.py new file mode 100644 index 00000000..e363912f --- /dev/null +++ b/benchmarks/pandas/bench_cast_scalar.py @@ -0,0 +1,36 @@ +""" +Benchmark: Python type coercion equivalents — int(), float(), str(), bool() conversions. 
+Outputs JSON: {"function": "cast_scalar", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time + +SIZE = 100_000 +WARMUP = 5 +ITERATIONS = 50 + +int_values = [i % 1000 for i in range(SIZE)] +float_values = [i * 0.5 for i in range(SIZE)] +str_values = [str(i % 1000) for i in range(SIZE)] +bool_values = [i % 2 == 0 for i in range(SIZE)] + +for _ in range(WARMUP): + for j in range(SIZE): + int(float_values[j]) + float(int_values[j]) + int(str_values[j]) + int(bool_values[j]) + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + for j in range(SIZE): + int(float_values[j]) + float(int_values[j]) + int(str_values[j]) + int(bool_values[j]) + times.append((time.perf_counter() - t0) * 1000) + +total_ms = sum(times) +mean_ms = total_ms / ITERATIONS +print(json.dumps({"function": "cast_scalar", "mean_ms": round(mean_ms, 3), "iterations": ITERATIONS, "total_ms": round(total_ms, 3)})) diff --git a/benchmarks/pandas/bench_concat_options.py b/benchmarks/pandas/bench_concat_options.py new file mode 100644 index 00000000..ae777d96 --- /dev/null +++ b/benchmarks/pandas/bench_concat_options.py @@ -0,0 +1,37 @@ +""" +Benchmark: pandas concat with join="inner" and ignore_index=True options. 
+Outputs JSON: {"function": "concat_options", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import pandas as pd + +ROWS = 50_000 +WARMUP = 5 +ITERATIONS = 20 + +df1 = pd.DataFrame({ + "a": [i * 1.0 for i in range(ROWS)], + "b": [i * 2.0 for i in range(ROWS)], + "c": [i * 3.0 for i in range(ROWS)], +}) +df2 = pd.DataFrame({ + "a": [i * 1.5 for i in range(ROWS)], + "b": [i * 2.5 for i in range(ROWS)], + "d": [i * 4.0 for i in range(ROWS)], +}) + +for _ in range(WARMUP): + pd.concat([df1, df2], join="inner", ignore_index=True) + pd.concat([df1, df2], join="outer", ignore_index=True) + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + pd.concat([df1, df2], join="inner", ignore_index=True) + pd.concat([df1, df2], join="outer", ignore_index=True) + times.append((time.perf_counter() - t0) * 1000) + +total_ms = sum(times) +mean_ms = total_ms / ITERATIONS +print(json.dumps({"function": "concat_options", "mean_ms": round(mean_ms, 3), "iterations": ITERATIONS, "total_ms": round(total_ms, 3)})) diff --git a/benchmarks/pandas/bench_dataframe_iter.py b/benchmarks/pandas/bench_dataframe_iter.py new file mode 100644 index 00000000..2075cb9e --- /dev/null +++ b/benchmarks/pandas/bench_dataframe_iter.py @@ -0,0 +1,44 @@ +""" +Benchmark: pandas DataFrame.items() / DataFrame.iterrows() — column and row iteration. 
+Outputs JSON: {"function": "dataframe_iter", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import pandas as pd + +ROWS = 10_000 +WARMUP = 5 +ITERATIONS = 50 + +df = pd.DataFrame({ + "a": [i * 1.0 for i in range(ROWS)], + "b": [i * 2.0 for i in range(ROWS)], + "c": [i * 3.0 for i in range(ROWS)], +}) + + +def consume_items(df: pd.DataFrame) -> None: + for _, s in df.items(): + _ = s.sum() + + +def consume_iterrows(df: pd.DataFrame) -> None: + count = 0 + for _ in df.iterrows(): + count += 1 + + +for _ in range(WARMUP): + consume_items(df) + consume_iterrows(df) + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + consume_items(df) + consume_iterrows(df) + times.append((time.perf_counter() - t0) * 1000) + +total_ms = sum(times) +mean_ms = total_ms / ITERATIONS +print(json.dumps({"function": "dataframe_iter", "mean_ms": round(mean_ms, 3), "iterations": ITERATIONS, "total_ms": round(total_ms, 3)})) diff --git a/benchmarks/pandas/bench_ewm_com_halflife.py b/benchmarks/pandas/bench_ewm_com_halflife.py new file mode 100644 index 00000000..27f7a976 --- /dev/null +++ b/benchmarks/pandas/bench_ewm_com_halflife.py @@ -0,0 +1,34 @@ +""" +Benchmark: pandas EWM with com and halflife decay parameters. 
+Outputs JSON: {"function": "ewm_com_halflife", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import pandas as pd +import numpy as np + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 + +data = [float(np.sin(i * 0.05)) for i in range(ROWS)] +s = pd.Series(data) + +for _ in range(WARMUP): + s.ewm(com=9).mean() + s.ewm(halflife=10).mean() + s.ewm(com=5).std() + s.ewm(halflife=7).var() + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + s.ewm(com=9).mean() + s.ewm(halflife=10).mean() + s.ewm(com=5).std() + s.ewm(halflife=7).var() + times.append((time.perf_counter() - t0) * 1000) + +total_ms = sum(times) +mean_ms = total_ms / ITERATIONS +print(json.dumps({"function": "ewm_com_halflife", "mean_ms": round(mean_ms, 3), "iterations": ITERATIONS, "total_ms": round(total_ms, 3)})) diff --git a/benchmarks/pandas/bench_groupby_multi_key.py b/benchmarks/pandas/bench_groupby_multi_key.py new file mode 100644 index 00000000..dd2d9b8d --- /dev/null +++ b/benchmarks/pandas/bench_groupby_multi_key.py @@ -0,0 +1,35 @@ +""" +Benchmark: pandas DataFrame groupby with multiple key columns. 
+Outputs JSON: {"function": "groupby_multi_key", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 + +depts = ["eng", "sales", "hr", "ops"] +regions = ["north", "south", "east", "west"] +dept = [depts[i % len(depts)] for i in range(ROWS)] +region = [regions[i % len(regions)] for i in range(ROWS)] +value = [i * 0.5 for i in range(ROWS)] +bonus = [i * 0.1 for i in range(ROWS)] + +df = pd.DataFrame({"dept": dept, "region": region, "value": value, "bonus": bonus}) + +for _ in range(WARMUP): + df.groupby(["dept", "region"]).sum() + df.groupby(["dept", "region"]).mean() + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + df.groupby(["dept", "region"]).sum() + df.groupby(["dept", "region"]).mean() + times.append((time.perf_counter() - t0) * 1000) + +total_ms = sum(times) +mean_ms = total_ms / ITERATIONS +print(json.dumps({"function": "groupby_multi_key", "mean_ms": round(mean_ms, 3), "iterations": ITERATIONS, "total_ms": round(total_ms, 3)})) diff --git a/benchmarks/pandas/bench_nat_sort_key.py b/benchmarks/pandas/bench_nat_sort_key.py new file mode 100644 index 00000000..6f2ec0ee --- /dev/null +++ b/benchmarks/pandas/bench_nat_sort_key.py @@ -0,0 +1,41 @@ +""" +Benchmark: Python natural sort key equivalent — natsort library or manual tokenization. +Uses natsort if available, else falls back to a simple tokenizer. 
+Outputs JSON: {"function": "nat_sort_key", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import re + +SIZE = 10_000 +WARMUP = 5 +ITERATIONS = 50 + +data = [f"file{i % 1000}_v{(i % 10) + 1}.{i % 100}" for i in range(SIZE)] +mixed_case = [f"Item{i % 500}_Part{(i % 20) + 1}" for i in range(SIZE)] + + +def nat_sort_key(s: str, ignore_case: bool = False) -> list: + """Simple natural sort key tokenizer (matches tsb natSortKey logic).""" + if ignore_case: + s = s.lower() + parts = re.split(r"(\d+)", s) + return [int(p) if p.isdigit() else p for p in parts] + + +for _ in range(WARMUP): + for j in range(SIZE): + nat_sort_key(data[j]) + nat_sort_key(mixed_case[j], ignore_case=True) + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + for j in range(SIZE): + nat_sort_key(data[j]) + nat_sort_key(mixed_case[j], ignore_case=True) + times.append((time.perf_counter() - t0) * 1000) + +total_ms = sum(times) +mean_ms = total_ms / ITERATIONS +print(json.dumps({"function": "nat_sort_key", "mean_ms": round(mean_ms, 3), "iterations": ITERATIONS, "total_ms": round(total_ms, 3)})) diff --git a/benchmarks/pandas/bench_rolling_center_min_periods.py b/benchmarks/pandas/bench_rolling_center_min_periods.py new file mode 100644 index 00000000..fd4e6378 --- /dev/null +++ b/benchmarks/pandas/bench_rolling_center_min_periods.py @@ -0,0 +1,32 @@ +""" +Benchmark: pandas Rolling with center=True and min_periods options. 
+Outputs JSON: {"function": "rolling_center_min_periods", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import pandas as pd +import numpy as np + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 + +data = [float('nan') if i % 10 == 0 else float(np.sin(i * 0.01)) for i in range(ROWS)] +s = pd.Series(data) + +for _ in range(WARMUP): + s.rolling(50, center=True).mean() + s.rolling(100, min_periods=10).sum() + s.rolling(30, center=True, min_periods=5).std() + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + s.rolling(50, center=True).mean() + s.rolling(100, min_periods=10).sum() + s.rolling(30, center=True, min_periods=5).std() + times.append((time.perf_counter() - t0) * 1000) + +total_ms = sum(times) +mean_ms = total_ms / ITERATIONS +print(json.dumps({"function": "rolling_center_min_periods", "mean_ms": round(mean_ms, 3), "iterations": ITERATIONS, "total_ms": round(total_ms, 3)})) diff --git a/benchmarks/pandas/bench_timestamp_static.py b/benchmarks/pandas/bench_timestamp_static.py new file mode 100644 index 00000000..4dbe056a --- /dev/null +++ b/benchmarks/pandas/bench_timestamp_static.py @@ -0,0 +1,41 @@ +""" +Benchmark: pandas Timestamp static constructors — fromtimestamp, fromisoformat, components. 
+Outputs JSON: {"function": "timestamp_static", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import pandas as pd + +SIZE = 10_000 +WARMUP = 5 +ITERATIONS = 50 + +import datetime + +iso_strings = [ + (datetime.datetime(2020, 1, 1) + datetime.timedelta(days=i)).isoformat() + for i in range(SIZE) +] +timestamps_s = [ + (datetime.datetime(2020, 1, 1) + datetime.timedelta(hours=i)).timestamp() + for i in range(SIZE) +] + +for _ in range(WARMUP): + for j in range(SIZE): + pd.Timestamp(year=2020, month=(j % 12) + 1, day=(j % 28) + 1) + pd.Timestamp(iso_strings[j % len(iso_strings)]) + pd.Timestamp.fromtimestamp(timestamps_s[j % len(timestamps_s)]) + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + for j in range(SIZE): + pd.Timestamp(year=2020, month=(j % 12) + 1, day=(j % 28) + 1) + pd.Timestamp(iso_strings[j % len(iso_strings)]) + pd.Timestamp.fromtimestamp(timestamps_s[j % len(timestamps_s)]) + times.append((time.perf_counter() - t0) * 1000) + +total_ms = sum(times) +mean_ms = total_ms / ITERATIONS +print(json.dumps({"function": "timestamp_static", "mean_ms": round(mean_ms, 3), "iterations": ITERATIONS, "total_ms": round(total_ms, 3)})) diff --git a/benchmarks/pandas/bench_tz_datetime_index_ops.py b/benchmarks/pandas/bench_tz_datetime_index_ops.py new file mode 100644 index 00000000..6d64926d --- /dev/null +++ b/benchmarks/pandas/bench_tz_datetime_index_ops.py @@ -0,0 +1,36 @@ +""" +Benchmark: pandas DatetimeTZDtype index methods — tz_localize, sort_values, unique, filter, isin. 
+Outputs JSON: {"function": "tz_datetime_index_ops", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import pandas as pd + +SIZE = 10_000 +WARMUP = 5 +ITERATIONS = 50 + +naive = pd.date_range(start="2024-01-01", periods=SIZE, freq="h") +tz_idx = naive.tz_localize("America/New_York") +ref_date = pd.Timestamp("2024-06-01", tz="America/New_York") + +for _ in range(WARMUP): + tz_idx.strftime("%Y-%m-%d %H:%M:%S %Z") + tz_idx.sort_values() + tz_idx.unique() + tz_idx[tz_idx >= ref_date] + ref_date in tz_idx + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + tz_idx.strftime("%Y-%m-%d %H:%M:%S %Z") + tz_idx.sort_values() + tz_idx.unique() + tz_idx[tz_idx >= ref_date] + ref_date in tz_idx + times.append((time.perf_counter() - t0) * 1000) + +total_ms = sum(times) +mean_ms = total_ms / ITERATIONS +print(json.dumps({"function": "tz_datetime_index_ops", "mean_ms": round(mean_ms, 3), "iterations": ITERATIONS, "total_ms": round(total_ms, 3)})) diff --git a/benchmarks/tsb/bench_cast_scalar.ts b/benchmarks/tsb/bench_cast_scalar.ts new file mode 100644 index 00000000..641573cf --- /dev/null +++ b/benchmarks/tsb/bench_cast_scalar.ts @@ -0,0 +1,51 @@ +/** + * Benchmark: castScalar — type coercion of scalar values to various Dtype kinds. 
+ * Outputs JSON: {"function": "cast_scalar", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { castScalar, Dtype } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const intDtype = Dtype.from("int64"); +const floatDtype = Dtype.from("float64"); +const strDtype = Dtype.from("str"); +const boolDtype = Dtype.from("bool"); + +const intValues = Array.from({ length: SIZE }, (_, i) => i % 1000); +const floatValues = Array.from({ length: SIZE }, (_, i) => i * 0.5); +const strValues = Array.from({ length: SIZE }, (_, i) => String(i % 1000)); +const boolValues = Array.from({ length: SIZE }, (_, i) => i % 2 === 0); + +for (let i = 0; i < WARMUP; i++) { + for (let j = 0; j < SIZE; j++) { + castScalar(floatValues[j], intDtype); + castScalar(intValues[j], floatDtype); + castScalar(strValues[j], intDtype); + castScalar(boolValues[j], intDtype); + } +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + for (let j = 0; j < SIZE; j++) { + castScalar(floatValues[j], intDtype); + castScalar(intValues[j], floatDtype); + castScalar(strValues[j], intDtype); + castScalar(boolValues[j], intDtype); + } + times.push(performance.now() - t0); +} + +const totalMs = times.reduce((a, b) => a + b, 0); +const meanMs = totalMs / ITERATIONS; +console.log( + JSON.stringify({ + function: "cast_scalar", + mean_ms: Math.round(meanMs * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(totalMs * 1000) / 1000, + }), +); diff --git a/benchmarks/tsb/bench_concat_options.ts b/benchmarks/tsb/bench_concat_options.ts new file mode 100644 index 00000000..77e681ab --- /dev/null +++ b/benchmarks/tsb/bench_concat_options.ts @@ -0,0 +1,45 @@ +/** + * Benchmark: concat with join="inner" and ignoreIndex=true options. 
+ * Outputs JSON: {"function": "concat_options", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { DataFrame, concat } from "../../src/index.ts"; + +const ROWS = 50_000; +const WARMUP = 5; +const ITERATIONS = 20; + +// Two DataFrames with partial column overlap (inner join drops non-shared columns) +const df1 = DataFrame.fromColumns({ + a: Array.from({ length: ROWS }, (_, i) => i * 1.0), + b: Array.from({ length: ROWS }, (_, i) => i * 2.0), + c: Array.from({ length: ROWS }, (_, i) => i * 3.0), +}); +const df2 = DataFrame.fromColumns({ + a: Array.from({ length: ROWS }, (_, i) => i * 1.5), + b: Array.from({ length: ROWS }, (_, i) => i * 2.5), + d: Array.from({ length: ROWS }, (_, i) => i * 4.0), +}); + +for (let i = 0; i < WARMUP; i++) { + concat([df1, df2], { join: "inner", ignoreIndex: true }); + concat([df1, df2], { join: "outer", ignoreIndex: true }); +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + concat([df1, df2], { join: "inner", ignoreIndex: true }); + concat([df1, df2], { join: "outer", ignoreIndex: true }); + times.push(performance.now() - t0); +} + +const totalMs = times.reduce((a, b) => a + b, 0); +const meanMs = totalMs / ITERATIONS; +console.log( + JSON.stringify({ + function: "concat_options", + mean_ms: Math.round(meanMs * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(totalMs * 1000) / 1000, + }), +); diff --git a/benchmarks/tsb/bench_dataframe_iter.ts b/benchmarks/tsb/bench_dataframe_iter.ts new file mode 100644 index 00000000..3923120c --- /dev/null +++ b/benchmarks/tsb/bench_dataframe_iter.ts @@ -0,0 +1,54 @@ +/** + * Benchmark: DataFrame.items() / DataFrame.iterrows() — column and row iteration. 
+ * Outputs JSON: {"function": "dataframe_iter", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { DataFrame } from "../../src/index.ts"; + +const ROWS = 10_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const df = DataFrame.fromColumns({ + a: Array.from({ length: ROWS }, (_, i) => i * 1.0), + b: Array.from({ length: ROWS }, (_, i) => i * 2.0), + c: Array.from({ length: ROWS }, (_, i) => i * 3.0), +}); + +function consumeItems(df: DataFrame): void { + for (const [, s] of df.items()) { + void s.sum(); + } +} + +function consumeIterrows(df: DataFrame): void { + let count = 0; + for (const _entry of df.iterrows()) { + void _entry; + count++; + } + void count; +} + +for (let i = 0; i < WARMUP; i++) { + consumeItems(df); + consumeIterrows(df); +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + consumeItems(df); + consumeIterrows(df); + times.push(performance.now() - t0); +} + +const totalMs = times.reduce((a, b) => a + b, 0); +const meanMs = totalMs / ITERATIONS; +console.log( + JSON.stringify({ + function: "dataframe_iter", + mean_ms: Math.round(meanMs * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(totalMs * 1000) / 1000, + }), +); diff --git a/benchmarks/tsb/bench_ewm_com_halflife.ts b/benchmarks/tsb/bench_ewm_com_halflife.ts new file mode 100644 index 00000000..438facf3 --- /dev/null +++ b/benchmarks/tsb/bench_ewm_com_halflife.ts @@ -0,0 +1,40 @@ +/** + * Benchmark: EWM with com and halflife decay parameters (vs existing span/alpha benches). 
+ * Outputs JSON: {"function": "ewm_com_halflife", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series } from "../../src/index.ts"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const data = Array.from({ length: ROWS }, (_, i) => Math.sin(i * 0.05)); +const s = new Series({ data }); + +for (let i = 0; i < WARMUP; i++) { + s.ewm({ com: 9 }).mean(); + s.ewm({ halflife: 10 }).mean(); + s.ewm({ com: 5 }).std(); + s.ewm({ halflife: 7 }).var(); +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + s.ewm({ com: 9 }).mean(); + s.ewm({ halflife: 10 }).mean(); + s.ewm({ com: 5 }).std(); + s.ewm({ halflife: 7 }).var(); + times.push(performance.now() - t0); +} + +const totalMs = times.reduce((a, b) => a + b, 0); +const meanMs = totalMs / ITERATIONS; +console.log( + JSON.stringify({ + function: "ewm_com_halflife", + mean_ms: Math.round(meanMs * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(totalMs * 1000) / 1000, + }), +); diff --git a/benchmarks/tsb/bench_groupby_multi_key.ts b/benchmarks/tsb/bench_groupby_multi_key.ts new file mode 100644 index 00000000..80efb824 --- /dev/null +++ b/benchmarks/tsb/bench_groupby_multi_key.ts @@ -0,0 +1,42 @@ +/** + * Benchmark: DataFrameGroupBy with multiple key columns — groupby(["dept","region"]). 
+ * Outputs JSON: {"function": "groupby_multi_key", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { DataFrame } from "../../src/index.ts"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const depts = ["eng", "sales", "hr", "ops"]; +const regions = ["north", "south", "east", "west"]; +const dept = Array.from({ length: ROWS }, (_, i) => depts[i % depts.length]); +const region = Array.from({ length: ROWS }, (_, i) => regions[i % regions.length]); +const value = Array.from({ length: ROWS }, (_, i) => i * 0.5); +const bonus = Array.from({ length: ROWS }, (_, i) => i * 0.1); + +const df = DataFrame.fromColumns({ dept, region, value, bonus }); + +for (let i = 0; i < WARMUP; i++) { + df.groupby(["dept", "region"]).sum(); + df.groupby(["dept", "region"]).mean(); +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + df.groupby(["dept", "region"]).sum(); + df.groupby(["dept", "region"]).mean(); + times.push(performance.now() - t0); +} + +const totalMs = times.reduce((a, b) => a + b, 0); +const meanMs = totalMs / ITERATIONS; +console.log( + JSON.stringify({ + function: "groupby_multi_key", + mean_ms: Math.round(meanMs * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(totalMs * 1000) / 1000, + }), +); diff --git a/benchmarks/tsb/bench_nat_sort_key.ts b/benchmarks/tsb/bench_nat_sort_key.ts new file mode 100644 index 00000000..d98e4487 --- /dev/null +++ b/benchmarks/tsb/bench_nat_sort_key.ts @@ -0,0 +1,46 @@ +/** + * Benchmark: natSortKey — compute natural-sort key tokens for strings. 
+ * Outputs JSON: {"function": "nat_sort_key", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { natSortKey } from "../../src/index.ts"; + +const SIZE = 10_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const data = Array.from( + { length: SIZE }, + (_, i) => `file${i % 1000}_v${(i % 10) + 1}.${i % 100}`, +); +const mixedCase = Array.from( + { length: SIZE }, + (_, i) => `Item${i % 500}_Part${(i % 20) + 1}`, +); + +for (let i = 0; i < WARMUP; i++) { + for (let j = 0; j < SIZE; j++) { + natSortKey(data[j]); + natSortKey(mixedCase[j], { ignoreCase: true }); + } +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + for (let j = 0; j < SIZE; j++) { + natSortKey(data[j]); + natSortKey(mixedCase[j], { ignoreCase: true }); + } + times.push(performance.now() - t0); +} + +const totalMs = times.reduce((a, b) => a + b, 0); +const meanMs = totalMs / ITERATIONS; +console.log( + JSON.stringify({ + function: "nat_sort_key", + mean_ms: Math.round(meanMs * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(totalMs * 1000) / 1000, + }), +); diff --git a/benchmarks/tsb/bench_rolling_center_min_periods.ts b/benchmarks/tsb/bench_rolling_center_min_periods.ts new file mode 100644 index 00000000..c5939604 --- /dev/null +++ b/benchmarks/tsb/bench_rolling_center_min_periods.ts @@ -0,0 +1,38 @@ +/** + * Benchmark: Rolling with center=true and minPeriods options. + * Outputs JSON: {"function": "rolling_center_min_periods", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series } from "../../src/index.ts"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const data = Array.from({ length: ROWS }, (_, i) => (i % 10 === 0 ? 
null : Math.sin(i * 0.01))); +const s = new Series({ data }); + +for (let i = 0; i < WARMUP; i++) { + s.rolling(50, { center: true }).mean(); + s.rolling(100, { minPeriods: 10 }).sum(); + s.rolling(30, { center: true, minPeriods: 5 }).std(); +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + s.rolling(50, { center: true }).mean(); + s.rolling(100, { minPeriods: 10 }).sum(); + s.rolling(30, { center: true, minPeriods: 5 }).std(); + times.push(performance.now() - t0); +} + +const totalMs = times.reduce((a, b) => a + b, 0); +const meanMs = totalMs / ITERATIONS; +console.log( + JSON.stringify({ + function: "rolling_center_min_periods", + mean_ms: Math.round(meanMs * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(totalMs * 1000) / 1000, + }), +); diff --git a/benchmarks/tsb/bench_timestamp_static.ts b/benchmarks/tsb/bench_timestamp_static.ts new file mode 100644 index 00000000..69a6d1f4 --- /dev/null +++ b/benchmarks/tsb/bench_timestamp_static.ts @@ -0,0 +1,47 @@ +/** + * Benchmark: Timestamp static constructors — fromComponents, fromisoformat, fromtimestamp, now, today. 
+ * Outputs JSON: {"function": "timestamp_static", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Timestamp } from "../../src/index.ts"; + +const SIZE = 10_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const isoStrings = Array.from({ length: SIZE }, (_, i) => { + const d = new Date(Date.UTC(2020, 0, 1) + i * 86_400_000); + return d.toISOString(); +}); +const timestamps = Array.from({ length: SIZE }, (_, i) => + Date.UTC(2020, 0, 1) + i * 3_600_000, +); + +for (let i = 0; i < WARMUP; i++) { + for (let j = 0; j < SIZE; j++) { + Timestamp.fromComponents({ year: 2020, month: (j % 12) + 1, day: (j % 28) + 1 }); + Timestamp.fromisoformat(isoStrings[j % isoStrings.length]); + Timestamp.fromtimestamp(timestamps[j % timestamps.length]); + } +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + for (let j = 0; j < SIZE; j++) { + Timestamp.fromComponents({ year: 2020, month: (j % 12) + 1, day: (j % 28) + 1 }); + Timestamp.fromisoformat(isoStrings[j % isoStrings.length]); + Timestamp.fromtimestamp(timestamps[j % timestamps.length]); + } + times.push(performance.now() - t0); +} + +const totalMs = times.reduce((a, b) => a + b, 0); +const meanMs = totalMs / ITERATIONS; +console.log( + JSON.stringify({ + function: "timestamp_static", + mean_ms: Math.round(meanMs * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(totalMs * 1000) / 1000, + }), +); diff --git a/benchmarks/tsb/bench_tz_datetime_index_ops.ts b/benchmarks/tsb/bench_tz_datetime_index_ops.ts new file mode 100644 index 00000000..cf28440d --- /dev/null +++ b/benchmarks/tsb/bench_tz_datetime_index_ops.ts @@ -0,0 +1,43 @@ +/** + * Benchmark: TZDatetimeIndex methods — toLocalStrings, sort, unique, filter, contains. 
+ * Outputs JSON: {"function": "tz_datetime_index_ops", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { date_range, tz_localize } from "../../src/index.ts"; + +const SIZE = 10_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const naive = date_range({ start: "2024-01-01", periods: SIZE, freq: "h" }); +const tzIdx = tz_localize(naive, "America/New_York"); +const refDate = new Date("2024-06-01T00:00:00Z"); + +for (let i = 0; i < WARMUP; i++) { + tzIdx.toLocalStrings(); + tzIdx.sort(); + tzIdx.unique(); + tzIdx.filter((d) => d >= refDate); + tzIdx.contains(refDate); +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + tzIdx.toLocalStrings(); + tzIdx.sort(); + tzIdx.unique(); + tzIdx.filter((d) => d >= refDate); + tzIdx.contains(refDate); + times.push(performance.now() - t0); +} + +const totalMs = times.reduce((a, b) => a + b, 0); +const meanMs = totalMs / ITERATIONS; +console.log( + JSON.stringify({ + function: "tz_datetime_index_ops", + mean_ms: Math.round(meanMs * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(totalMs * 1000) / 1000, + }), +); From a84aca2e966835acc16dcd746088bbed490f99a9 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 17 Apr 2026 04:48:23 +0000 Subject: [PATCH 07/19] Iteration 143: Add 8 benchmark pairs (437 total, +8 vs 429) Add standalone functional-form benchmarks and new operation benchmarks: - bench_quantile_fn: quantileSeries/quantileDataFrame standalone functions - bench_pct_change_fn: pctChangeSeries/pctChangeDataFrame standalone functions - bench_merge_suffixes: merge with custom suffixes option - bench_expanding_min_periods: Expanding with minPeriods option - bench_dt_isocalendar: DatetimeAccessor.isocalendar_week - bench_period_asfreq: Period.asfreq/PeriodIndex.asfreq frequency conversion - bench_sample_fn: sampleSeries/sampleDataFrame standalone functions - bench_nunique_fn: 
nuniqueSeries/nuniqueDataFrame standalone functions Run: https://github.com/githubnext/tsessebe/actions/runs/24547746540 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- benchmarks/pandas/bench_dt_isocalendar.py | 27 ++++++++++ .../pandas/bench_expanding_min_periods.py | 32 ++++++++++++ benchmarks/pandas/bench_merge_suffixes.py | 34 +++++++++++++ benchmarks/pandas/bench_nunique_fn.py | 34 +++++++++++++ benchmarks/pandas/bench_pct_change_fn.py | 32 ++++++++++++ benchmarks/pandas/bench_period_asfreq.py | 30 ++++++++++++ benchmarks/pandas/bench_quantile_fn.py | 33 +++++++++++++ benchmarks/pandas/bench_sample_fn.py | 32 ++++++++++++ benchmarks/tsb/bench_dt_isocalendar.ts | 36 ++++++++++++++ benchmarks/tsb/bench_expanding_min_periods.ts | 38 ++++++++++++++ benchmarks/tsb/bench_merge_suffixes.ts | 49 +++++++++++++++++++ benchmarks/tsb/bench_nunique_fn.ts | 49 +++++++++++++++++++ benchmarks/tsb/bench_pct_change_fn.ts | 44 +++++++++++++++++ benchmarks/tsb/bench_period_asfreq.ts | 39 +++++++++++++++ benchmarks/tsb/bench_quantile_fn.ts | 45 +++++++++++++++++ benchmarks/tsb/bench_sample_fn.ts | 45 +++++++++++++++++ 16 files changed, 599 insertions(+) create mode 100644 benchmarks/pandas/bench_dt_isocalendar.py create mode 100644 benchmarks/pandas/bench_expanding_min_periods.py create mode 100644 benchmarks/pandas/bench_merge_suffixes.py create mode 100644 benchmarks/pandas/bench_nunique_fn.py create mode 100644 benchmarks/pandas/bench_pct_change_fn.py create mode 100644 benchmarks/pandas/bench_period_asfreq.py create mode 100644 benchmarks/pandas/bench_quantile_fn.py create mode 100644 benchmarks/pandas/bench_sample_fn.py create mode 100644 benchmarks/tsb/bench_dt_isocalendar.ts create mode 100644 benchmarks/tsb/bench_expanding_min_periods.ts create mode 100644 benchmarks/tsb/bench_merge_suffixes.ts create mode 100644 benchmarks/tsb/bench_nunique_fn.ts create mode 100644 benchmarks/tsb/bench_pct_change_fn.ts create mode 100644 
benchmarks/tsb/bench_period_asfreq.ts create mode 100644 benchmarks/tsb/bench_quantile_fn.ts create mode 100644 benchmarks/tsb/bench_sample_fn.ts diff --git a/benchmarks/pandas/bench_dt_isocalendar.py b/benchmarks/pandas/bench_dt_isocalendar.py new file mode 100644 index 00000000..0680e42a --- /dev/null +++ b/benchmarks/pandas/bench_dt_isocalendar.py @@ -0,0 +1,27 @@ +""" +Benchmark: pandas DatetimeIndex.isocalendar().week on 100k dates. +Outputs JSON: {"function": "dt_isocalendar", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 + +dates = pd.date_range("2000-01-01", periods=ROWS, freq="D") +s = pd.Series(dates) + +for _ in range(WARMUP): + s.dt.isocalendar()["week"] + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + s.dt.isocalendar()["week"] + times.append((time.perf_counter() - t0) * 1000) + +total_ms = sum(times) +mean_ms = total_ms / ITERATIONS +print(json.dumps({"function": "dt_isocalendar", "mean_ms": round(mean_ms, 3), "iterations": ITERATIONS, "total_ms": round(total_ms, 3)})) diff --git a/benchmarks/pandas/bench_expanding_min_periods.py b/benchmarks/pandas/bench_expanding_min_periods.py new file mode 100644 index 00000000..f7926992 --- /dev/null +++ b/benchmarks/pandas/bench_expanding_min_periods.py @@ -0,0 +1,32 @@ +""" +Benchmark: pandas Expanding with min_periods option. 
+Outputs JSON: {"function": "expanding_min_periods", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import pandas as pd +import numpy as np + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 + +data = [float('nan') if i % 10 == 0 else float(np.sin(i * 0.01)) for i in range(ROWS)] +s = pd.Series(data) + +for _ in range(WARMUP): + s.expanding(min_periods=10).mean() + s.expanding(min_periods=50).sum() + s.expanding(min_periods=5).std() + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + s.expanding(min_periods=10).mean() + s.expanding(min_periods=50).sum() + s.expanding(min_periods=5).std() + times.append((time.perf_counter() - t0) * 1000) + +total_ms = sum(times) +mean_ms = total_ms / ITERATIONS +print(json.dumps({"function": "expanding_min_periods", "mean_ms": round(mean_ms, 3), "iterations": ITERATIONS, "total_ms": round(total_ms, 3)})) diff --git a/benchmarks/pandas/bench_merge_suffixes.py b/benchmarks/pandas/bench_merge_suffixes.py new file mode 100644 index 00000000..410152b9 --- /dev/null +++ b/benchmarks/pandas/bench_merge_suffixes.py @@ -0,0 +1,34 @@ +""" +Benchmark: pandas merge with custom suffixes option. 
+Outputs JSON: {"function": "merge_suffixes", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import pandas as pd + +ROWS = 50_000 +WARMUP = 3 +ITERATIONS = 10 + +ids = [i % 10_000 for i in range(ROWS)] +left = pd.DataFrame({"id": ids, "value": [x * 1.1 for x in ids], "score": [x * 0.5 for x in ids]}) +right = pd.DataFrame({ + "id": list(range(10_000)), + "value": [i * 2.0 for i in range(10_000)], + "rank": list(range(10_000)), +}) + +for _ in range(WARMUP): + pd.merge(left, right, on="id", suffixes=("_left", "_right")) + pd.merge(left, right, on="id", how="outer", suffixes=("_l", "_r")) + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + pd.merge(left, right, on="id", suffixes=("_left", "_right")) + pd.merge(left, right, on="id", how="outer", suffixes=("_l", "_r")) + times.append((time.perf_counter() - t0) * 1000) + +total_ms = sum(times) +mean_ms = total_ms / ITERATIONS +print(json.dumps({"function": "merge_suffixes", "mean_ms": round(mean_ms, 3), "iterations": ITERATIONS, "total_ms": round(total_ms, 3)})) diff --git a/benchmarks/pandas/bench_nunique_fn.py b/benchmarks/pandas/bench_nunique_fn.py new file mode 100644 index 00000000..558c58e1 --- /dev/null +++ b/benchmarks/pandas/bench_nunique_fn.py @@ -0,0 +1,34 @@ +""" +Benchmark: pandas nunique on Series and DataFrame (functional-form equivalent). 
+Outputs JSON: {"function": "nunique_fn", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import pandas as pd +import numpy as np + +ROWS = 100_000 +WARMUP = 5 +ITERATIONS = 20 + +low = pd.Series([i % 1000 for i in range(ROWS)]) +high = pd.Series([i % 50_000 for i in range(ROWS)]) +with_nulls = pd.Series([float('nan') if i % 100 == 0 else i % 2000 for i in range(ROWS)]) +df = pd.DataFrame({"a": low, "b": high, "c": with_nulls}) + +for _ in range(WARMUP): + low.nunique() + with_nulls.nunique(dropna=False) + df.nunique() + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + low.nunique() + with_nulls.nunique(dropna=False) + df.nunique() + times.append((time.perf_counter() - t0) * 1000) + +total_ms = sum(times) +mean_ms = total_ms / ITERATIONS +print(json.dumps({"function": "nunique_fn", "mean_ms": round(mean_ms, 3), "iterations": ITERATIONS, "total_ms": round(total_ms, 3)})) diff --git a/benchmarks/pandas/bench_pct_change_fn.py b/benchmarks/pandas/bench_pct_change_fn.py new file mode 100644 index 00000000..b8651710 --- /dev/null +++ b/benchmarks/pandas/bench_pct_change_fn.py @@ -0,0 +1,32 @@ +""" +Benchmark: pandas pct_change on Series and DataFrame. 
+Outputs JSON: {"function": "pct_change_fn", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 + +data = [i * 1.1 + 1.0 for i in range(ROWS)] +s = pd.Series(data) +df = pd.DataFrame({"a": data, "b": [x * 2 for x in data]}) + +for _ in range(WARMUP): + s.pct_change() + s.pct_change(periods=2) + df.pct_change() + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + s.pct_change() + s.pct_change(periods=2) + df.pct_change() + times.append((time.perf_counter() - t0) * 1000) + +total_ms = sum(times) +mean_ms = total_ms / ITERATIONS +print(json.dumps({"function": "pct_change_fn", "mean_ms": round(mean_ms, 3), "iterations": ITERATIONS, "total_ms": round(total_ms, 3)})) diff --git a/benchmarks/pandas/bench_period_asfreq.py b/benchmarks/pandas/bench_period_asfreq.py new file mode 100644 index 00000000..6e26c6f5 --- /dev/null +++ b/benchmarks/pandas/bench_period_asfreq.py @@ -0,0 +1,30 @@ +""" +Benchmark: pandas Period.asfreq and PeriodIndex.asfreq — frequency conversion. 
+Outputs JSON: {"function": "period_asfreq", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import pandas as pd + +SIZE = 10_000 +WARMUP = 3 +ITERATIONS = 20 + +idx = pd.period_range(start="2000-01", periods=SIZE, freq="M") + +for _ in range(WARMUP): + idx.asfreq("D", how="start") + idx.asfreq("D", how="end") + idx.asfreq("Q", how="start") + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + idx.asfreq("D", how="start") + idx.asfreq("D", how="end") + idx.asfreq("Q", how="start") + times.append((time.perf_counter() - t0) * 1000) + +total_ms = sum(times) +mean_ms = total_ms / ITERATIONS +print(json.dumps({"function": "period_asfreq", "mean_ms": round(mean_ms, 3), "iterations": ITERATIONS, "total_ms": round(total_ms, 3)})) diff --git a/benchmarks/pandas/bench_quantile_fn.py b/benchmarks/pandas/bench_quantile_fn.py new file mode 100644 index 00000000..329ef171 --- /dev/null +++ b/benchmarks/pandas/bench_quantile_fn.py @@ -0,0 +1,33 @@ +""" +Benchmark: quantileSeries / quantileDataFrame equivalent — pandas Series.quantile / DataFrame.quantile. 
+Outputs JSON: {"function": "quantile_fn", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import pandas as pd +import numpy as np + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 + +data = [(i * 1.41) % 10000 for i in range(ROWS)] +s = pd.Series(data) +df = pd.DataFrame({"a": data, "b": [x * 2 for x in data], "c": [x * 0.5 for x in data]}) + +for _ in range(WARMUP): + s.quantile(0.25) + s.quantile([0.1, 0.5, 0.9]) + df.quantile(0.5) + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + s.quantile(0.25) + s.quantile([0.1, 0.5, 0.9]) + df.quantile(0.5) + times.append((time.perf_counter() - t0) * 1000) + +total_ms = sum(times) +mean_ms = total_ms / ITERATIONS +print(json.dumps({"function": "quantile_fn", "mean_ms": round(mean_ms, 3), "iterations": ITERATIONS, "total_ms": round(total_ms, 3)})) diff --git a/benchmarks/pandas/bench_sample_fn.py b/benchmarks/pandas/bench_sample_fn.py new file mode 100644 index 00000000..c80fe34b --- /dev/null +++ b/benchmarks/pandas/bench_sample_fn.py @@ -0,0 +1,32 @@ +""" +Benchmark: pandas sample on Series and DataFrame. 
+Outputs JSON: {"function": "sample_fn", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 20 + +data = [i * 1.5 for i in range(ROWS)] +s = pd.Series(data) +df = pd.DataFrame({"a": data, "b": [x * 2 for x in data], "c": [x + 100 for x in data]}) + +for _ in range(WARMUP): + s.sample(n=1000) + s.sample(frac=0.01) + df.sample(n=500) + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + s.sample(n=1000) + s.sample(frac=0.01) + df.sample(n=500) + times.append((time.perf_counter() - t0) * 1000) + +total_ms = sum(times) +mean_ms = total_ms / ITERATIONS +print(json.dumps({"function": "sample_fn", "mean_ms": round(mean_ms, 3), "iterations": ITERATIONS, "total_ms": round(total_ms, 3)})) diff --git a/benchmarks/tsb/bench_dt_isocalendar.ts b/benchmarks/tsb/bench_dt_isocalendar.ts new file mode 100644 index 00000000..0dec2b4e --- /dev/null +++ b/benchmarks/tsb/bench_dt_isocalendar.ts @@ -0,0 +1,36 @@ +/** + * Benchmark: DatetimeAccessor.isocalendar_week on 100k datetime Series. 
+ * Outputs JSON: {"function": "dt_isocalendar", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series } from "../../src/index.ts"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +// Dates spanning ~274 years to cover all ISO week patterns +const base = new Date("2000-01-01").getTime(); +const dates = Array.from({ length: ROWS }, (_, i) => new Date(base + i * 86_400_000)); +const s = new Series({ data: dates }); + +for (let i = 0; i < WARMUP; i++) { + s.dt.isocalendar_week(); +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + s.dt.isocalendar_week(); + times.push(performance.now() - t0); +} + +const totalMs = times.reduce((a, b) => a + b, 0); +const meanMs = totalMs / ITERATIONS; +console.log( + JSON.stringify({ + function: "dt_isocalendar", + mean_ms: Math.round(meanMs * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(totalMs * 1000) / 1000, + }), +); diff --git a/benchmarks/tsb/bench_expanding_min_periods.ts b/benchmarks/tsb/bench_expanding_min_periods.ts new file mode 100644 index 00000000..9f97a5c7 --- /dev/null +++ b/benchmarks/tsb/bench_expanding_min_periods.ts @@ -0,0 +1,38 @@ +/** + * Benchmark: Expanding with minPeriods option. + * Outputs JSON: {"function": "expanding_min_periods", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series } from "../../src/index.ts"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const data = Array.from({ length: ROWS }, (_, i) => (i % 10 === 0 ? 
null : Math.sin(i * 0.01))); +const s = new Series({ data }); + +for (let i = 0; i < WARMUP; i++) { + s.expanding(10).mean(); + s.expanding(50).sum(); + s.expanding(5).std(); +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + s.expanding(10).mean(); + s.expanding(50).sum(); + s.expanding(5).std(); + times.push(performance.now() - t0); +} + +const totalMs = times.reduce((a, b) => a + b, 0); +const meanMs = totalMs / ITERATIONS; +console.log( + JSON.stringify({ + function: "expanding_min_periods", + mean_ms: Math.round(meanMs * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(totalMs * 1000) / 1000, + }), +); diff --git a/benchmarks/tsb/bench_merge_suffixes.ts b/benchmarks/tsb/bench_merge_suffixes.ts new file mode 100644 index 00000000..9e319971 --- /dev/null +++ b/benchmarks/tsb/bench_merge_suffixes.ts @@ -0,0 +1,49 @@ +/** + * Benchmark: merge with custom suffixes option. + * Outputs JSON: {"function": "merge_suffixes", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series, DataFrame, merge } from "../../src/index.ts"; + +const ROWS = 50_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const ids = Array.from({ length: ROWS }, (_, i) => i % 10_000); +const left = new DataFrame( + new Map([ + ["id", new Series({ data: ids })], + ["value", new Series({ data: ids.map((x) => x * 1.1) })], + ["score", new Series({ data: ids.map((x) => x * 0.5) })], + ]), +); +const right = new DataFrame( + new Map([ + ["id", new Series({ data: Array.from({ length: 10_000 }, (_, i) => i) })], + ["value", new Series({ data: Array.from({ length: 10_000 }, (_, i) => i * 2.0) })], + ["rank", new Series({ data: Array.from({ length: 10_000 }, (_, i) => i) })], + ]), +); + +for (let i = 0; i < WARMUP; i++) { + merge(left, right, { on: "id", suffixes: ["_left", "_right"] }); + merge(left, right, { on: "id", how: "outer", suffixes: ["_l", "_r"] }); +} + +const times: number[] = []; +for (let i = 0; i < 
ITERATIONS; i++) { + const t0 = performance.now(); + merge(left, right, { on: "id", suffixes: ["_left", "_right"] }); + merge(left, right, { on: "id", how: "outer", suffixes: ["_l", "_r"] }); + times.push(performance.now() - t0); +} + +const totalMs = times.reduce((a, b) => a + b, 0); +const meanMs = totalMs / ITERATIONS; +console.log( + JSON.stringify({ + function: "merge_suffixes", + mean_ms: Math.round(meanMs * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(totalMs * 1000) / 1000, + }), +); diff --git a/benchmarks/tsb/bench_nunique_fn.ts b/benchmarks/tsb/bench_nunique_fn.ts new file mode 100644 index 00000000..574475ee --- /dev/null +++ b/benchmarks/tsb/bench_nunique_fn.ts @@ -0,0 +1,49 @@ +/** + * Benchmark: nuniqueSeries — standalone functional nunique for Series. + * Outputs JSON: {"function": "nunique_fn", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series, DataFrame, nuniqueSeries, nuniqueDataFrame } from "../../src/index.ts"; + +const ROWS = 100_000; +const WARMUP = 5; +const ITERATIONS = 20; + +// Low-cardinality series (1000 unique values) and high-cardinality (50k unique) +const low = new Series({ data: Array.from({ length: ROWS }, (_, i) => i % 1000) }); +const high = new Series({ data: Array.from({ length: ROWS }, (_, i) => i % 50_000) }); +const withNulls = new Series({ + data: Array.from({ length: ROWS }, (_, i) => (i % 100 === 0 ? 
null : i % 2000)), +}); +const df = new DataFrame( + new Map([ + ["a", low], + ["b", high], + ["c", withNulls], + ]), +); + +for (let i = 0; i < WARMUP; i++) { + nuniqueSeries(low); + nuniqueSeries(withNulls, { dropna: false }); + nuniqueDataFrame(df); +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + nuniqueSeries(low); + nuniqueSeries(withNulls, { dropna: false }); + nuniqueDataFrame(df); + times.push(performance.now() - t0); +} + +const totalMs = times.reduce((a, b) => a + b, 0); +const meanMs = totalMs / ITERATIONS; +console.log( + JSON.stringify({ + function: "nunique_fn", + mean_ms: Math.round(meanMs * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(totalMs * 1000) / 1000, + }), +); diff --git a/benchmarks/tsb/bench_pct_change_fn.ts b/benchmarks/tsb/bench_pct_change_fn.ts new file mode 100644 index 00000000..ff2995f1 --- /dev/null +++ b/benchmarks/tsb/bench_pct_change_fn.ts @@ -0,0 +1,44 @@ +/** + * Benchmark: pctChangeSeries / pctChangeDataFrame — standalone functional pct_change. 
+ * Outputs JSON: {"function": "pct_change_fn", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series, DataFrame, pctChangeSeries, pctChangeDataFrame } from "../../src/index.ts"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const data = Array.from({ length: ROWS }, (_, i) => i * 1.1 + 1.0); +const s = new Series({ data }); +const df = new DataFrame( + new Map([ + ["a", new Series({ data })], + ["b", new Series({ data: data.map((x) => x * 2) })], + ]), +); + +for (let i = 0; i < WARMUP; i++) { + pctChangeSeries(s); + pctChangeSeries(s, { periods: 2 }); + pctChangeDataFrame(df); +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + pctChangeSeries(s); + pctChangeSeries(s, { periods: 2 }); + pctChangeDataFrame(df); + times.push(performance.now() - t0); +} + +const totalMs = times.reduce((a, b) => a + b, 0); +const meanMs = totalMs / ITERATIONS; +console.log( + JSON.stringify({ + function: "pct_change_fn", + mean_ms: Math.round(meanMs * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(totalMs * 1000) / 1000, + }), +); diff --git a/benchmarks/tsb/bench_period_asfreq.ts b/benchmarks/tsb/bench_period_asfreq.ts new file mode 100644 index 00000000..d97b7359 --- /dev/null +++ b/benchmarks/tsb/bench_period_asfreq.ts @@ -0,0 +1,39 @@ +/** + * Benchmark: Period.asfreq and PeriodIndex.asfreq — frequency conversion. 
+ * Outputs JSON: {"function": "period_asfreq", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Period, PeriodIndex } from "../../src/index.ts"; + +const SIZE = 10_000; +const WARMUP = 3; +const ITERATIONS = 20; + +// Build a PeriodIndex of monthly periods using periodRange +const startMonth = Period.fromString("2000-01", "M"); +const idx = PeriodIndex.periodRange(startMonth, SIZE); + +for (let i = 0; i < WARMUP; i++) { + idx.asfreq("D", "start"); + idx.asfreq("D", "end"); + idx.asfreq("Q", "start"); +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + idx.asfreq("D", "start"); + idx.asfreq("D", "end"); + idx.asfreq("Q", "start"); + times.push(performance.now() - t0); +} + +const totalMs = times.reduce((a, b) => a + b, 0); +const meanMs = totalMs / ITERATIONS; +console.log( + JSON.stringify({ + function: "period_asfreq", + mean_ms: Math.round(meanMs * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(totalMs * 1000) / 1000, + }), +); diff --git a/benchmarks/tsb/bench_quantile_fn.ts b/benchmarks/tsb/bench_quantile_fn.ts new file mode 100644 index 00000000..94153bf8 --- /dev/null +++ b/benchmarks/tsb/bench_quantile_fn.ts @@ -0,0 +1,45 @@ +/** + * Benchmark: quantileSeries / quantileDataFrame — standalone quantile functions. 
+ * Outputs JSON: {"function": "quantile_fn", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series, DataFrame, quantileSeries, quantileDataFrame } from "../../src/index.ts"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const data = Array.from({ length: ROWS }, (_, i) => (i * 1.41) % 10000); +const s = new Series({ data }); +const df = new DataFrame( + new Map([ + ["a", new Series({ data })], + ["b", new Series({ data: data.map((x) => x * 2) })], + ["c", new Series({ data: data.map((x) => x * 0.5) })], + ]), +); + +for (let i = 0; i < WARMUP; i++) { + quantileSeries(s, { q: 0.25 }); + quantileSeries(s, { q: [0.1, 0.5, 0.9] }); + quantileDataFrame(df, { q: 0.5 }); +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + quantileSeries(s, { q: 0.25 }); + quantileSeries(s, { q: [0.1, 0.5, 0.9] }); + quantileDataFrame(df, { q: 0.5 }); + times.push(performance.now() - t0); +} + +const totalMs = times.reduce((a, b) => a + b, 0); +const meanMs = totalMs / ITERATIONS; +console.log( + JSON.stringify({ + function: "quantile_fn", + mean_ms: Math.round(meanMs * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(totalMs * 1000) / 1000, + }), +); diff --git a/benchmarks/tsb/bench_sample_fn.ts b/benchmarks/tsb/bench_sample_fn.ts new file mode 100644 index 00000000..c7abe893 --- /dev/null +++ b/benchmarks/tsb/bench_sample_fn.ts @@ -0,0 +1,45 @@ +/** + * Benchmark: sampleSeries / sampleDataFrame — standalone functional sample. 
+ * Outputs JSON: {"function": "sample_fn", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series, DataFrame, sampleSeries, sampleDataFrame } from "../../src/index.ts"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 20; + +const data = Array.from({ length: ROWS }, (_, i) => i * 1.5); +const s = new Series({ data }); +const df = new DataFrame( + new Map([ + ["a", new Series({ data })], + ["b", new Series({ data: data.map((x) => x * 2) })], + ["c", new Series({ data: data.map((x) => x + 100) })], + ]), +); + +for (let i = 0; i < WARMUP; i++) { + sampleSeries(s, { n: 1000 }); + sampleSeries(s, { frac: 0.01 }); + sampleDataFrame(df, { n: 500 }); +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + sampleSeries(s, { n: 1000 }); + sampleSeries(s, { frac: 0.01 }); + sampleDataFrame(df, { n: 500 }); + times.push(performance.now() - t0); +} + +const totalMs = times.reduce((a, b) => a + b, 0); +const meanMs = totalMs / ITERATIONS; +console.log( + JSON.stringify({ + function: "sample_fn", + mean_ms: Math.round(meanMs * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(totalMs * 1000) / 1000, + }), +); From 5899214a9c950ed894b687ca9293bb4289d71d15 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 17 Apr 2026 06:10:10 +0000 Subject: [PATCH 08/19] Iteration 144: Add 8 benchmark pairs (445 total, +8 vs best 437) Added pairs: period_arithmetic (Period.add/diff/compareTo/contains), period_index_methods (PeriodIndex.shift/sort/unique/toDatetimeStart/toDatetimeEnd), dt_total_seconds (DatetimeAccessor.total_seconds), timedelta_index_ops (TimedeltaIndex.sort/unique/shift/filter/min/max), interval_overlaps (Interval.overlaps/IntervalIndex.overlaps), describe_opts (describe with percentiles/include options), merge_index_join (merge with left_index/right_index), to_json_orient (toJson with 
records/split/columns/values orient options). Run: https://github.com/githubnext/tsessebe/actions/runs/24549838166 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- benchmarks/pandas/bench_describe_opts.py | 31 +++++++++++++ benchmarks/pandas/bench_dt_total_seconds.py | 24 ++++++++++ benchmarks/pandas/bench_interval_overlaps.py | 28 +++++++++++ benchmarks/pandas/bench_merge_index_join.py | 27 +++++++++++ benchmarks/pandas/bench_period_arithmetic.py | 31 +++++++++++++ .../pandas/bench_period_index_methods.py | 32 +++++++++++++ .../pandas/bench_timedelta_index_ops.py | 35 ++++++++++++++ benchmarks/pandas/bench_to_json_orient.py | 32 +++++++++++++ benchmarks/tsb/bench_describe_opts.ts | 39 ++++++++++++++++ benchmarks/tsb/bench_dt_total_seconds.ts | 32 +++++++++++++ benchmarks/tsb/bench_interval_overlaps.ts | 40 ++++++++++++++++ benchmarks/tsb/bench_merge_index_join.ts | 39 ++++++++++++++++ benchmarks/tsb/bench_period_arithmetic.ts | 42 +++++++++++++++++ benchmarks/tsb/bench_period_index_methods.ts | 41 +++++++++++++++++ benchmarks/tsb/bench_timedelta_index_ops.ts | 46 +++++++++++++++++++ benchmarks/tsb/bench_to_json_orient.ts | 40 ++++++++++++++++ 16 files changed, 559 insertions(+) create mode 100644 benchmarks/pandas/bench_describe_opts.py create mode 100644 benchmarks/pandas/bench_dt_total_seconds.py create mode 100644 benchmarks/pandas/bench_interval_overlaps.py create mode 100644 benchmarks/pandas/bench_merge_index_join.py create mode 100644 benchmarks/pandas/bench_period_arithmetic.py create mode 100644 benchmarks/pandas/bench_period_index_methods.py create mode 100644 benchmarks/pandas/bench_timedelta_index_ops.py create mode 100644 benchmarks/pandas/bench_to_json_orient.py create mode 100644 benchmarks/tsb/bench_describe_opts.ts create mode 100644 benchmarks/tsb/bench_dt_total_seconds.ts create mode 100644 benchmarks/tsb/bench_interval_overlaps.ts create mode 100644 benchmarks/tsb/bench_merge_index_join.ts create mode 100644 
benchmarks/tsb/bench_period_arithmetic.ts create mode 100644 benchmarks/tsb/bench_period_index_methods.ts create mode 100644 benchmarks/tsb/bench_timedelta_index_ops.ts create mode 100644 benchmarks/tsb/bench_to_json_orient.ts diff --git a/benchmarks/pandas/bench_describe_opts.py b/benchmarks/pandas/bench_describe_opts.py new file mode 100644 index 00000000..58cd4170 --- /dev/null +++ b/benchmarks/pandas/bench_describe_opts.py @@ -0,0 +1,31 @@ +"""Benchmark: DataFrame.describe() with percentiles / include options on 100k-row DataFrame.""" +import json, time +import pandas as pd +import numpy as np + +SIZE = 100_000 +WARMUP = 3 +ITERATIONS = 20 + +df = pd.DataFrame({ + "a": np.arange(SIZE) * 1.5, + "b": (np.arange(SIZE) % 1000) * 0.7, + "label": [f"cat_{i % 10}" for i in range(SIZE)], + "flag": np.arange(SIZE) % 2 == 0, +}) + +for _ in range(WARMUP): + df.describe(percentiles=[0.1, 0.25, 0.5, 0.75, 0.9]) + df.describe(include="all") + df.describe(include=[object]) + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + df.describe(percentiles=[0.1, 0.25, 0.5, 0.75, 0.9]) + df.describe(include="all") + df.describe(include=[object]) + times.append((time.perf_counter() - t0) * 1000) + +total_ms = sum(times) +print(json.dumps({"function": "describe_opts", "mean_ms": round(total_ms / ITERATIONS, 3), "iterations": ITERATIONS, "total_ms": round(total_ms, 3)})) diff --git a/benchmarks/pandas/bench_dt_total_seconds.py b/benchmarks/pandas/bench_dt_total_seconds.py new file mode 100644 index 00000000..b98fc1e1 --- /dev/null +++ b/benchmarks/pandas/bench_dt_total_seconds.py @@ -0,0 +1,24 @@ +"""Benchmark: Series.dt.total_seconds() — epoch-second conversion on 100k datetime Series.""" +import json, time +import pandas as pd +import numpy as np + +SIZE = 100_000 +WARMUP = 5 +ITERATIONS = 50 + +base = pd.Timestamp("2020-01-01T00:00:00Z") +dates = pd.date_range(start=base, periods=SIZE, freq="min") +s = pd.Series(dates) + +for _ in range(WARMUP): + (s - 
pd.Timestamp("1970-01-01", tz="UTC")).dt.total_seconds() + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + (s - pd.Timestamp("1970-01-01", tz="UTC")).dt.total_seconds() + times.append((time.perf_counter() - t0) * 1000) + +total_ms = sum(times) +print(json.dumps({"function": "dt_total_seconds", "mean_ms": round(total_ms / ITERATIONS, 3), "iterations": ITERATIONS, "total_ms": round(total_ms, 3)})) diff --git a/benchmarks/pandas/bench_interval_overlaps.py b/benchmarks/pandas/bench_interval_overlaps.py new file mode 100644 index 00000000..5cf48514 --- /dev/null +++ b/benchmarks/pandas/bench_interval_overlaps.py @@ -0,0 +1,28 @@ +"""Benchmark: Interval.overlaps / IntervalIndex.overlaps — overlap checks on 1k intervals.""" +import json, time +import pandas as pd + +SIZE = 1_000 +WARMUP = 5 +ITERATIONS = 50 + +intervals = [pd.Interval(i, i + 2) for i in range(SIZE)] +breaks = list(range(SIZE + 1)) +idx = pd.IntervalIndex.from_breaks(breaks) +query = pd.Interval(250, 750) + +for _ in range(WARMUP): + for iv in intervals[:50]: + iv.overlaps(query) + idx.overlaps(query) + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + for iv in intervals: + iv.overlaps(query) + idx.overlaps(query) + times.append((time.perf_counter() - t0) * 1000) + +total_ms = sum(times) +print(json.dumps({"function": "interval_overlaps", "mean_ms": round(total_ms / ITERATIONS, 3), "iterations": ITERATIONS, "total_ms": round(total_ms, 3)})) diff --git a/benchmarks/pandas/bench_merge_index_join.py b/benchmarks/pandas/bench_merge_index_join.py new file mode 100644 index 00000000..cfd8323e --- /dev/null +++ b/benchmarks/pandas/bench_merge_index_join.py @@ -0,0 +1,27 @@ +"""Benchmark: merge with left_index / right_index options on 10k-row DataFrames.""" +import json, time +import pandas as pd +import numpy as np + +SIZE = 10_000 +WARMUP = 5 +ITERATIONS = 30 + +left = pd.DataFrame({"val_a": np.arange(SIZE) * 1.5}) +right = pd.DataFrame({"val_b": np.arange(SIZE) * 2.0}) 
+ +for _ in range(WARMUP): + pd.merge(left, right, left_index=True, right_index=True, how="inner") + pd.merge(left, right, left_index=True, right_index=True, how="outer") + pd.merge(left, right, left_index=True, right_index=True, how="left") + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + pd.merge(left, right, left_index=True, right_index=True, how="inner") + pd.merge(left, right, left_index=True, right_index=True, how="outer") + pd.merge(left, right, left_index=True, right_index=True, how="left") + times.append((time.perf_counter() - t0) * 1000) + +total_ms = sum(times) +print(json.dumps({"function": "merge_index_join", "mean_ms": round(total_ms / ITERATIONS, 3), "iterations": ITERATIONS, "total_ms": round(total_ms, 3)})) diff --git a/benchmarks/pandas/bench_period_arithmetic.py b/benchmarks/pandas/bench_period_arithmetic.py new file mode 100644 index 00000000..328efc85 --- /dev/null +++ b/benchmarks/pandas/bench_period_arithmetic.py @@ -0,0 +1,31 @@ +"""Benchmark: Period.add / diff / compareTo / contains — Period arithmetic on 1k periods.""" +import json, time +import pandas as pd + +SIZE = 1_000 +WARMUP = 5 +ITERATIONS = 50 + +base = pd.Period("2020-01-01", freq="D") +periods = [base + i for i in range(SIZE)] +other = base + 500 + +for _ in range(WARMUP): + for p in periods[:50]: + p + 10 + p - other + p < other + p.start_time + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + for p in periods: + p + 10 + p - other + p < other + p.start_time + times.append((time.perf_counter() - t0) * 1000) + +total_ms = sum(times) +print(json.dumps({"function": "period_arithmetic", "mean_ms": round(total_ms / ITERATIONS, 3), "iterations": ITERATIONS, "total_ms": round(total_ms, 3)})) diff --git a/benchmarks/pandas/bench_period_index_methods.py b/benchmarks/pandas/bench_period_index_methods.py new file mode 100644 index 00000000..9ecf20dd --- /dev/null +++ b/benchmarks/pandas/bench_period_index_methods.py @@ -0,0 +1,32 @@ 
+"""Benchmark: PeriodIndex.shift / sort_values / unique / to_timestamp — PeriodIndex operations on 1k periods.""" +import json, time +import pandas as pd +import numpy as np + +SIZE = 1_000 +WARMUP = 5 +ITERATIONS = 50 + +base = pd.Period("2020-01-01", freq="D") +shuffled = [base + ((i * 7) % SIZE) for i in range(SIZE)] +idx = pd.PeriodIndex(shuffled) + +for _ in range(WARMUP): + idx.shift(30) + idx.sort_values() + idx.unique() + idx.to_timestamp(how="start") + idx.to_timestamp(how="end") + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + idx.shift(30) + idx.sort_values() + idx.unique() + idx.to_timestamp(how="start") + idx.to_timestamp(how="end") + times.append((time.perf_counter() - t0) * 1000) + +total_ms = sum(times) +print(json.dumps({"function": "period_index_methods", "mean_ms": round(total_ms / ITERATIONS, 3), "iterations": ITERATIONS, "total_ms": round(total_ms, 3)})) diff --git a/benchmarks/pandas/bench_timedelta_index_ops.py b/benchmarks/pandas/bench_timedelta_index_ops.py new file mode 100644 index 00000000..b24d05ea --- /dev/null +++ b/benchmarks/pandas/bench_timedelta_index_ops.py @@ -0,0 +1,35 @@ +"""Benchmark: TimedeltaIndex sort / unique / shift / min / max on 1k-element index.""" +import json, time +import pandas as pd +import numpy as np + +SIZE = 1_000 +WARMUP = 5 +ITERATIONS = 100 + +deltas = [pd.Timedelta(days=(i * 13) % 365, hours=i % 24) for i in range(SIZE)] +idx = pd.TimedeltaIndex(deltas) +shift_by = pd.Timedelta(days=1) +threshold = pd.Timedelta(days=100) + +for _ in range(WARMUP): + idx.sort_values() + idx.unique() + idx + shift_by + idx[idx < threshold] + idx.min() + idx.max() + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + idx.sort_values() + idx.unique() + idx + shift_by + idx[idx < threshold] + idx.min() + idx.max() + times.append((time.perf_counter() - t0) * 1000) + +total_ms = sum(times) +print(json.dumps({"function": "timedelta_index_ops", "mean_ms": round(total_ms / ITERATIONS, 3), 
"iterations": ITERATIONS, "total_ms": round(total_ms, 3)})) diff --git a/benchmarks/pandas/bench_to_json_orient.py b/benchmarks/pandas/bench_to_json_orient.py new file mode 100644 index 00000000..fade77e6 --- /dev/null +++ b/benchmarks/pandas/bench_to_json_orient.py @@ -0,0 +1,32 @@ +"""Benchmark: DataFrame.to_json() with different orient options on 10k-row DataFrame.""" +import json, time +import pandas as pd +import numpy as np + +SIZE = 10_000 +WARMUP = 3 +ITERATIONS = 20 + +df = pd.DataFrame({ + "id": np.arange(SIZE), + "value": np.arange(SIZE) * 1.1, + "label": [f"cat_{i % 10}" for i in range(SIZE)], +}) + +for _ in range(WARMUP): + df.to_json(orient="records") + df.to_json(orient="split") + df.to_json(orient="columns") + df.to_json(orient="values") + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + df.to_json(orient="records") + df.to_json(orient="split") + df.to_json(orient="columns") + df.to_json(orient="values") + times.append((time.perf_counter() - t0) * 1000) + +total_ms = sum(times) +print(json.dumps({"function": "to_json_orient", "mean_ms": round(total_ms / ITERATIONS, 3), "iterations": ITERATIONS, "total_ms": round(total_ms, 3)})) diff --git a/benchmarks/tsb/bench_describe_opts.ts b/benchmarks/tsb/bench_describe_opts.ts new file mode 100644 index 00000000..e5c5b487 --- /dev/null +++ b/benchmarks/tsb/bench_describe_opts.ts @@ -0,0 +1,39 @@ +/** + * Benchmark: describe() with percentiles / include options on 100k-row DataFrame. 
+ * Outputs JSON: {"function": "describe_opts", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { DataFrame, describe } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 3; +const ITERATIONS = 20; + +const df = DataFrame.fromColumns({ + a: Array.from({ length: SIZE }, (_, i) => i * 1.5), + b: Array.from({ length: SIZE }, (_, i) => (i % 1000) * 0.7), + label: Array.from({ length: SIZE }, (_, i) => `cat_${i % 10}`), + flag: Array.from({ length: SIZE }, (_, i) => i % 2 === 0), +}); + +for (let i = 0; i < WARMUP; i++) { + describe(df, { percentiles: [0.1, 0.25, 0.5, 0.75, 0.9] }); + describe(df, { include: "all" }); + describe(df, { include: "object" }); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + describe(df, { percentiles: [0.1, 0.25, 0.5, 0.75, 0.9] }); + describe(df, { include: "all" }); + describe(df, { include: "object" }); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "describe_opts", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_dt_total_seconds.ts b/benchmarks/tsb/bench_dt_total_seconds.ts new file mode 100644 index 00000000..ee6e5376 --- /dev/null +++ b/benchmarks/tsb/bench_dt_total_seconds.ts @@ -0,0 +1,32 @@ +/** + * Benchmark: DatetimeAccessor.total_seconds — epoch-second conversion on 100k datetime Series. 
+ * Outputs JSON: {"function": "dt_total_seconds", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const base = new Date("2020-01-01T00:00:00Z").getTime(); +const dates = Array.from({ length: SIZE }, (_, i) => new Date(base + i * 60_000)); +const s = new Series({ data: dates }); + +for (let i = 0; i < WARMUP; i++) { + s.dt.total_seconds(); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + s.dt.total_seconds(); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "dt_total_seconds", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_interval_overlaps.ts b/benchmarks/tsb/bench_interval_overlaps.ts new file mode 100644 index 00000000..14b0188b --- /dev/null +++ b/benchmarks/tsb/bench_interval_overlaps.ts @@ -0,0 +1,40 @@ +/** + * Benchmark: Interval.overlaps / IntervalIndex.overlaps — interval overlap checks on 1k intervals. 
+ * Outputs JSON: {"function": "interval_overlaps", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Interval, IntervalIndex } from "../../src/index.ts"; + +const SIZE = 1_000; +const WARMUP = 5; +const ITERATIONS = 50; + +// Overlapping intervals: each spans 2 units, starting at every integer +const intervals = Array.from({ length: SIZE }, (_, i) => new Interval(i, i + 2)); +const breaks = Array.from({ length: SIZE + 1 }, (_, i) => i); +const idx = IntervalIndex.fromBreaks(breaks); +const query = new Interval(250, 750); + +for (let i = 0; i < WARMUP; i++) { + for (const iv of intervals.slice(0, 50)) { + iv.overlaps(query); + } + idx.overlaps(query); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + for (const iv of intervals) { + iv.overlaps(query); + } + idx.overlaps(query); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "interval_overlaps", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_merge_index_join.ts b/benchmarks/tsb/bench_merge_index_join.ts new file mode 100644 index 00000000..f29d745a --- /dev/null +++ b/benchmarks/tsb/bench_merge_index_join.ts @@ -0,0 +1,39 @@ +/** + * Benchmark: merge with left_index / right_index options on 10k-row DataFrames. 
+ * Outputs JSON: {"function": "merge_index_join", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { DataFrame, merge } from "../../src/index.ts"; + +const SIZE = 10_000; +const WARMUP = 5; +const ITERATIONS = 30; + +const left = DataFrame.fromColumns({ + val_a: Array.from({ length: SIZE }, (_, i) => i * 1.5), +}); +const right = DataFrame.fromColumns({ + val_b: Array.from({ length: SIZE }, (_, i) => i * 2.0), +}); + +for (let i = 0; i < WARMUP; i++) { + merge(left, right, { left_index: true, right_index: true, how: "inner" }); + merge(left, right, { left_index: true, right_index: true, how: "outer" }); + merge(left, right, { left_index: true, right_index: true, how: "left" }); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + merge(left, right, { left_index: true, right_index: true, how: "inner" }); + merge(left, right, { left_index: true, right_index: true, how: "outer" }); + merge(left, right, { left_index: true, right_index: true, how: "left" }); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "merge_index_join", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_period_arithmetic.ts b/benchmarks/tsb/bench_period_arithmetic.ts new file mode 100644 index 00000000..d9e68bb3 --- /dev/null +++ b/benchmarks/tsb/bench_period_arithmetic.ts @@ -0,0 +1,42 @@ +/** + * Benchmark: Period.add / diff / compareTo / contains — Period arithmetic on 1k periods. 
+ * Outputs JSON: {"function": "period_arithmetic", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Period } from "../../src/index.ts"; + +const SIZE = 1_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const base = Period.fromDate(new Date(Date.UTC(2020, 0, 1)), "D"); +const periods = Array.from({ length: SIZE }, (_, i) => base.add(i)); +const other = base.add(500); + +for (let i = 0; i < WARMUP; i++) { + for (const p of periods.slice(0, 50)) { + p.add(10); + p.diff(other); + p.compareTo(other); + p.contains(p.startTime); + } +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + for (const p of periods) { + p.add(10); + p.diff(other); + p.compareTo(other); + p.contains(p.startTime); + } +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "period_arithmetic", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_period_index_methods.ts b/benchmarks/tsb/bench_period_index_methods.ts new file mode 100644 index 00000000..6e7189c0 --- /dev/null +++ b/benchmarks/tsb/bench_period_index_methods.ts @@ -0,0 +1,41 @@ +/** + * Benchmark: PeriodIndex.shift / sort / unique / toDatetimeStart / toDatetimeEnd — PeriodIndex operations on 1k periods. 
+ * Outputs JSON: {"function": "period_index_methods", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Period, PeriodIndex } from "../../src/index.ts"; + +const SIZE = 1_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const base = Period.fromDate(new Date(Date.UTC(2020, 0, 1)), "D"); +// Build a shuffled index with some duplicates +const shuffled = Array.from({ length: SIZE }, (_, i) => base.add((i * 7) % SIZE)); +const idx = PeriodIndex.fromPeriods(shuffled); + +for (let i = 0; i < WARMUP; i++) { + idx.shift(30); + idx.sort(); + idx.unique(); + idx.toDatetimeStart(); + idx.toDatetimeEnd(); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + idx.shift(30); + idx.sort(); + idx.unique(); + idx.toDatetimeStart(); + idx.toDatetimeEnd(); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "period_index_methods", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_timedelta_index_ops.ts b/benchmarks/tsb/bench_timedelta_index_ops.ts new file mode 100644 index 00000000..1d1ce7a5 --- /dev/null +++ b/benchmarks/tsb/bench_timedelta_index_ops.ts @@ -0,0 +1,46 @@ +/** + * Benchmark: TimedeltaIndex.sort / unique / shift / filter / min / max — operations on 1k-element TimedeltaIndex. 
+ * Outputs JSON: {"function": "timedelta_index_ops", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Timedelta, TimedeltaIndex } from "../../src/index.ts"; + +const SIZE = 1_000; +const WARMUP = 5; +const ITERATIONS = 100; + +// Shuffled, with some duplicates +const deltas = Array.from({ length: SIZE }, (_, i) => + Timedelta.fromComponents({ days: (i * 13) % 365, hours: i % 24 }), +); +const idx = TimedeltaIndex.fromTimedeltas(deltas); +const shiftBy = Timedelta.fromComponents({ days: 1 }); +const threshold = Timedelta.fromComponents({ days: 100 }); + +for (let i = 0; i < WARMUP; i++) { + idx.sort(); + idx.unique(); + idx.shift(shiftBy); + idx.filter((td) => td.totalDays < threshold.totalDays); + idx.min(); + idx.max(); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + idx.sort(); + idx.unique(); + idx.shift(shiftBy); + idx.filter((td) => td.totalDays < threshold.totalDays); + idx.min(); + idx.max(); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "timedelta_index_ops", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_to_json_orient.ts b/benchmarks/tsb/bench_to_json_orient.ts new file mode 100644 index 00000000..1016cdff --- /dev/null +++ b/benchmarks/tsb/bench_to_json_orient.ts @@ -0,0 +1,40 @@ +/** + * Benchmark: toJson with different orient options on 10k-row DataFrame. 
+ * Outputs JSON: {"function": "to_json_orient", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { DataFrame, toJson } from "../../src/index.ts"; + +const SIZE = 10_000; +const WARMUP = 3; +const ITERATIONS = 20; + +const df = DataFrame.fromColumns({ + id: Array.from({ length: SIZE }, (_, i) => i), + value: Array.from({ length: SIZE }, (_, i) => i * 1.1), + label: Array.from({ length: SIZE }, (_, i) => `cat_${i % 10}`), +}); + +for (let i = 0; i < WARMUP; i++) { + toJson(df, { orient: "records" }); + toJson(df, { orient: "split" }); + toJson(df, { orient: "columns" }); + toJson(df, { orient: "values" }); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + toJson(df, { orient: "records" }); + toJson(df, { orient: "split" }); + toJson(df, { orient: "columns" }); + toJson(df, { orient: "values" }); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "to_json_orient", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); From 8568c4add0222e16433b2b78336cf17604ea0f28 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 17 Apr 2026 07:05:13 +0000 Subject: [PATCH 09/19] Iteration 145: Add 9 benchmark pairs (454 total, +9 vs best 445) Added benchmarks for standalone functional forms not yet covered: - mode_dataframe_fn: modeDataFrame standalone vs pandas df.mode() - where_mask_series_fn: whereSeries/maskSeries standalone vs pandas - where_mask_df_fn: whereDataFrame/maskDataFrame standalone vs pandas - idxmin_max_df: idxminDataFrame/idxmaxDataFrame vs pandas df.idxmin/idxmax - interpolate_fn: interpolateSeries/dataFrameInterpolate standalone vs pandas - explode_fn: explodeSeries/explodeDataFrame standalone vs pandas - fillna_fn: fillnaSeries/fillnaDataFrame standalone vs pandas - dropna_fn: dropnaSeries/dropnaDataFrame standalone vs pandas - diff_applymap_fn: diffSeries/applymap standalone vs 
pandas Run: https://github.com/githubnext/tsessebe/actions/runs/24551622461 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- benchmarks/pandas/bench_diff_applymap_fn.py | 37 ++++++++++++++ benchmarks/pandas/bench_dropna_fn.py | 39 +++++++++++++++ benchmarks/pandas/bench_explode_fn.py | 36 ++++++++++++++ benchmarks/pandas/bench_fillna_fn.py | 40 ++++++++++++++++ benchmarks/pandas/bench_idxmin_max_df.py | 34 +++++++++++++ benchmarks/pandas/bench_interpolate_fn.py | 37 ++++++++++++++ benchmarks/pandas/bench_mode_dataframe_fn.py | 33 +++++++++++++ benchmarks/pandas/bench_where_mask_df_fn.py | 35 ++++++++++++++ .../pandas/bench_where_mask_series_fn.py | 31 ++++++++++++ benchmarks/tsb/bench_diff_applymap_fn.ts | 43 +++++++++++++++++ benchmarks/tsb/bench_dropna_fn.ts | 46 ++++++++++++++++++ benchmarks/tsb/bench_explode_fn.ts | 48 +++++++++++++++++++ benchmarks/tsb/bench_fillna_fn.ts | 47 ++++++++++++++++++ benchmarks/tsb/bench_idxmin_max_df.ts | 40 ++++++++++++++++ benchmarks/tsb/bench_interpolate_fn.ts | 46 ++++++++++++++++++ benchmarks/tsb/bench_mode_dataframe_fn.ts | 40 ++++++++++++++++ benchmarks/tsb/bench_where_mask_df_fn.ts | 41 ++++++++++++++++ benchmarks/tsb/bench_where_mask_series_fn.ts | 37 ++++++++++++++ 18 files changed, 710 insertions(+) create mode 100644 benchmarks/pandas/bench_diff_applymap_fn.py create mode 100644 benchmarks/pandas/bench_dropna_fn.py create mode 100644 benchmarks/pandas/bench_explode_fn.py create mode 100644 benchmarks/pandas/bench_fillna_fn.py create mode 100644 benchmarks/pandas/bench_idxmin_max_df.py create mode 100644 benchmarks/pandas/bench_interpolate_fn.py create mode 100644 benchmarks/pandas/bench_mode_dataframe_fn.py create mode 100644 benchmarks/pandas/bench_where_mask_df_fn.py create mode 100644 benchmarks/pandas/bench_where_mask_series_fn.py create mode 100644 benchmarks/tsb/bench_diff_applymap_fn.ts create mode 100644 benchmarks/tsb/bench_dropna_fn.ts create mode 100644 
benchmarks/tsb/bench_explode_fn.ts create mode 100644 benchmarks/tsb/bench_fillna_fn.ts create mode 100644 benchmarks/tsb/bench_idxmin_max_df.ts create mode 100644 benchmarks/tsb/bench_interpolate_fn.ts create mode 100644 benchmarks/tsb/bench_mode_dataframe_fn.ts create mode 100644 benchmarks/tsb/bench_where_mask_df_fn.ts create mode 100644 benchmarks/tsb/bench_where_mask_series_fn.ts diff --git a/benchmarks/pandas/bench_diff_applymap_fn.py b/benchmarks/pandas/bench_diff_applymap_fn.py new file mode 100644 index 00000000..0939f219 --- /dev/null +++ b/benchmarks/pandas/bench_diff_applymap_fn.py @@ -0,0 +1,37 @@ +""" +Benchmark: pandas Series.diff() + DataFrame.applymap() — diff and element-wise map. +Outputs JSON: {"function": "diff_applymap_fn", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import pandas as pd +import numpy as np + +SIZE = 100_000 +WARMUP = 5 +ITERATIONS = 30 + +s = pd.Series([i * 1.0 + np.sin(i * 0.01) for i in range(SIZE)]) + +df = pd.DataFrame({ + "a": [i * 0.1 for i in range(SIZE)], + "b": [i * 0.2 + 1 for i in range(SIZE)], + "c": [i * -0.1 for i in range(SIZE)], +}) + +for _ in range(WARMUP): + s.diff() + s.diff(2) + df.map(lambda v: v ** 2) + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + s.diff() + s.diff(2) + df.map(lambda v: v ** 2) + times.append((time.perf_counter() - t0) * 1000) + +total_ms = sum(times) +mean_ms = total_ms / ITERATIONS +print(json.dumps({"function": "diff_applymap_fn", "mean_ms": round(mean_ms, 3), "iterations": ITERATIONS, "total_ms": round(total_ms, 3)})) diff --git a/benchmarks/pandas/bench_dropna_fn.py b/benchmarks/pandas/bench_dropna_fn.py new file mode 100644 index 00000000..721e4b2f --- /dev/null +++ b/benchmarks/pandas/bench_dropna_fn.py @@ -0,0 +1,39 @@ +""" +Benchmark: pandas Series.dropna() / DataFrame.dropna() — drop missing values. 
+Outputs JSON: {"function": "dropna_fn", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import pandas as pd +import numpy as np + +SIZE = 100_000 +WARMUP = 5 +ITERATIONS = 30 + +series_data = [float("nan") if i % 5 == 0 else i * 1.0 for i in range(SIZE)] +s = pd.Series(series_data) + +df = pd.DataFrame({ + "a": [float("nan") if i % 5 == 0 else i * 0.1 for i in range(SIZE)], + "b": [float("nan") if i % 7 == 0 else i * 2.0 for i in range(SIZE)], + "c": [float("nan") if i % 3 == 0 else i % 100 for i in range(SIZE)], +}) + +for _ in range(WARMUP): + s.dropna() + df.dropna() + df.dropna(how="any") + df.dropna(how="all") + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + s.dropna() + df.dropna() + df.dropna(how="any") + times.append((time.perf_counter() - t0) * 1000) + +total_ms = sum(times) +mean_ms = total_ms / ITERATIONS +print(json.dumps({"function": "dropna_fn", "mean_ms": round(mean_ms, 3), "iterations": ITERATIONS, "total_ms": round(total_ms, 3)})) diff --git a/benchmarks/pandas/bench_explode_fn.py b/benchmarks/pandas/bench_explode_fn.py new file mode 100644 index 00000000..dd4e0ec2 --- /dev/null +++ b/benchmarks/pandas/bench_explode_fn.py @@ -0,0 +1,36 @@ +""" +Benchmark: pandas Series.explode() / DataFrame.explode() — expand list-like elements. 
+Outputs JSON: {"function": "explode_fn", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import pandas as pd +import numpy as np + +ROWS = 10_000 +WARMUP = 5 +ITERATIONS = 20 + +series_data = [list(range(i * 10 + j for j in range((i % 4) + 2))) for i in range(ROWS)] +series_data = [[i * 10 + j for j in range((i % 4) + 2)] for i in range(ROWS)] +s = pd.Series(series_data) + +df = pd.DataFrame({ + "a": [[i + j for j in range((i % 3) + 1)] for i in range(ROWS)], + "b": [f"key_{i % 100}" for i in range(ROWS)], +}) + +for _ in range(WARMUP): + s.explode() + df.explode("a") + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + s.explode() + df.explode("a") + times.append((time.perf_counter() - t0) * 1000) + +total_ms = sum(times) +mean_ms = total_ms / ITERATIONS +print(json.dumps({"function": "explode_fn", "mean_ms": round(mean_ms, 3), "iterations": ITERATIONS, "total_ms": round(total_ms, 3)})) diff --git a/benchmarks/pandas/bench_fillna_fn.py b/benchmarks/pandas/bench_fillna_fn.py new file mode 100644 index 00000000..d631aad8 --- /dev/null +++ b/benchmarks/pandas/bench_fillna_fn.py @@ -0,0 +1,40 @@ +""" +Benchmark: pandas Series.fillna() / DataFrame.fillna() — fill missing values. 
+Outputs JSON: {"function": "fillna_fn", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import pandas as pd +import numpy as np + +SIZE = 100_000 +WARMUP = 5 +ITERATIONS = 30 + +series_data = [float("nan") if i % 5 == 0 else i * 1.0 for i in range(SIZE)] +s = pd.Series(series_data) + +df = pd.DataFrame({ + "a": [float("nan") if i % 5 == 0 else i * 0.1 for i in range(SIZE)], + "b": [float("nan") if i % 7 == 0 else i * 2.0 for i in range(SIZE)], + "c": [None if i % 3 == 0 else f"cat{i % 10}" for i in range(SIZE)], +}) + +for _ in range(WARMUP): + s.fillna(0) + s.fillna(method="ffill") + df.fillna(0) + df.fillna(method="bfill") + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + s.fillna(0) + s.fillna(method="ffill") + df.fillna(0) + df.fillna(method="bfill") + times.append((time.perf_counter() - t0) * 1000) + +total_ms = sum(times) +mean_ms = total_ms / ITERATIONS +print(json.dumps({"function": "fillna_fn", "mean_ms": round(mean_ms, 3), "iterations": ITERATIONS, "total_ms": round(total_ms, 3)})) diff --git a/benchmarks/pandas/bench_idxmin_max_df.py b/benchmarks/pandas/bench_idxmin_max_df.py new file mode 100644 index 00000000..d2dd350f --- /dev/null +++ b/benchmarks/pandas/bench_idxmin_max_df.py @@ -0,0 +1,34 @@ +""" +Benchmark: pandas DataFrame.idxmin() / DataFrame.idxmax() — index of min/max per column. 
+Outputs JSON: {"function": "idxmin_max_df", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import pandas as pd +import numpy as np + +ROWS = 100_000 +WARMUP = 5 +ITERATIONS = 30 + +df = pd.DataFrame({ + "a": [np.sin(i * 0.001) * 100 for i in range(ROWS)], + "b": [float("nan") if i % 100 == 0 else i * 0.1 for i in range(ROWS)], + "c": [i * 1.0 if i % 2 == 0 else -i * 1.0 for i in range(ROWS)], +}) + +for _ in range(WARMUP): + df.idxmin() + df.idxmax() + df.idxmin(skipna=False) + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + df.idxmin() + df.idxmax() + times.append((time.perf_counter() - t0) * 1000) + +total_ms = sum(times) +mean_ms = total_ms / ITERATIONS +print(json.dumps({"function": "idxmin_max_df", "mean_ms": round(mean_ms, 3), "iterations": ITERATIONS, "total_ms": round(total_ms, 3)})) diff --git a/benchmarks/pandas/bench_interpolate_fn.py b/benchmarks/pandas/bench_interpolate_fn.py new file mode 100644 index 00000000..94f7ccd6 --- /dev/null +++ b/benchmarks/pandas/bench_interpolate_fn.py @@ -0,0 +1,37 @@ +""" +Benchmark: pandas Series.interpolate() / DataFrame.interpolate() — fill NaN by interpolation. 
+Outputs JSON: {"function": "interpolate_fn", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import pandas as pd +import numpy as np + +SIZE = 50_000 +WARMUP = 5 +ITERATIONS = 20 + +series_data = [float("nan") if i % 10 == 0 else i * 1.0 for i in range(SIZE)] +s = pd.Series(series_data) + +df = pd.DataFrame({ + "a": [float("nan") if i % 7 == 0 else i * 0.5 for i in range(SIZE)], + "b": [float("nan") if i % 11 == 0 else np.sin(i * 0.01) * 100 for i in range(SIZE)], +}) + +for _ in range(WARMUP): + s.interpolate(method="linear") + s.interpolate(method="pad") + df.interpolate() + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + s.interpolate(method="linear") + s.interpolate(method="pad") + df.interpolate() + times.append((time.perf_counter() - t0) * 1000) + +total_ms = sum(times) +mean_ms = total_ms / ITERATIONS +print(json.dumps({"function": "interpolate_fn", "mean_ms": round(mean_ms, 3), "iterations": ITERATIONS, "total_ms": round(total_ms, 3)})) diff --git a/benchmarks/pandas/bench_mode_dataframe_fn.py b/benchmarks/pandas/bench_mode_dataframe_fn.py new file mode 100644 index 00000000..a93f0359 --- /dev/null +++ b/benchmarks/pandas/bench_mode_dataframe_fn.py @@ -0,0 +1,33 @@ +""" +Benchmark: pandas DataFrame.mode() — column-wise mode. 
+Outputs JSON: {"function": "mode_dataframe_fn", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import pandas as pd +import numpy as np + +ROWS = 10_000 +WARMUP = 5 +ITERATIONS = 20 + +df = pd.DataFrame({ + "a": [i % 10 for i in range(ROWS)], + "b": [float("nan") if i % 50 == 0 else i % 5 for i in range(ROWS)], + "c": [i % 3 for i in range(ROWS)], +}) + +for _ in range(WARMUP): + df.mode() + df.mode(dropna=False) + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + df.mode() + df.mode(dropna=False) + times.append((time.perf_counter() - t0) * 1000) + +total_ms = sum(times) +mean_ms = total_ms / ITERATIONS +print(json.dumps({"function": "mode_dataframe_fn", "mean_ms": round(mean_ms, 3), "iterations": ITERATIONS, "total_ms": round(total_ms, 3)})) diff --git a/benchmarks/pandas/bench_where_mask_df_fn.py b/benchmarks/pandas/bench_where_mask_df_fn.py new file mode 100644 index 00000000..87c73445 --- /dev/null +++ b/benchmarks/pandas/bench_where_mask_df_fn.py @@ -0,0 +1,35 @@ +""" +Benchmark: pandas DataFrame.where() / DataFrame.mask() — conditional replacement. 
+Outputs JSON: {"function": "where_mask_df_fn", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import pandas as pd +import numpy as np + +ROWS = 100_000 +WARMUP = 5 +ITERATIONS = 20 + +df = pd.DataFrame({ + "a": [i * 1.0 for i in range(ROWS)], + "b": [float("nan") if i % 2 == 0 else i * 0.5 for i in range(ROWS)], + "c": [i * -1.0 for i in range(ROWS)], +}) + +cond = df > 0 + +for _ in range(WARMUP): + df.where(cond, other=0) + df.mask(cond, other=-1) + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + df.where(cond, other=0) + df.mask(cond, other=-1) + times.append((time.perf_counter() - t0) * 1000) + +total_ms = sum(times) +mean_ms = total_ms / ITERATIONS +print(json.dumps({"function": "where_mask_df_fn", "mean_ms": round(mean_ms, 3), "iterations": ITERATIONS, "total_ms": round(total_ms, 3)})) diff --git a/benchmarks/pandas/bench_where_mask_series_fn.py b/benchmarks/pandas/bench_where_mask_series_fn.py new file mode 100644 index 00000000..68432031 --- /dev/null +++ b/benchmarks/pandas/bench_where_mask_series_fn.py @@ -0,0 +1,31 @@ +""" +Benchmark: pandas Series.where() / Series.mask() — conditional replacement. 
+Outputs JSON: {"function": "where_mask_series_fn", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import pandas as pd +import numpy as np + +SIZE = 100_000 +WARMUP = 5 +ITERATIONS = 30 + +s = pd.Series([i * 0.1 for i in range(SIZE)]) +cond = s > SIZE * 0.05 +cond_arr = pd.Series([i > SIZE * 0.5 for i in range(SIZE)]) + +for _ in range(WARMUP): + s.where(cond, 0) + s.mask(cond_arr, -1) + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + s.where(cond, 0) + s.mask(cond_arr, -1) + times.append((time.perf_counter() - t0) * 1000) + +total_ms = sum(times) +mean_ms = total_ms / ITERATIONS +print(json.dumps({"function": "where_mask_series_fn", "mean_ms": round(mean_ms, 3), "iterations": ITERATIONS, "total_ms": round(total_ms, 3)})) diff --git a/benchmarks/tsb/bench_diff_applymap_fn.ts b/benchmarks/tsb/bench_diff_applymap_fn.ts new file mode 100644 index 00000000..fc429816 --- /dev/null +++ b/benchmarks/tsb/bench_diff_applymap_fn.ts @@ -0,0 +1,43 @@ +/** + * Benchmark: diffSeries standalone + applymap — diff and element-wise map. 
+ * Outputs JSON: {"function": "diff_applymap_fn", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series, DataFrame, diffSeries, applymap } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 30; + +const s = new Series({ data: Array.from({ length: SIZE }, (_, i) => i * 1.0 + Math.sin(i * 0.01)) }); + +const df = DataFrame.fromColumns({ + a: Array.from({ length: SIZE }, (_, i) => i * 0.1), + b: Array.from({ length: SIZE }, (_, i) => i * 0.2 + 1), + c: Array.from({ length: SIZE }, (_, i) => i * -0.1), +}); + +for (let i = 0; i < WARMUP; i++) { + diffSeries(s); + diffSeries(s, 2); + applymap(df, (v) => (v as number) ** 2); +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + diffSeries(s); + diffSeries(s, 2); + applymap(df, (v) => (v as number) ** 2); + times.push(performance.now() - t0); +} + +const totalMs = times.reduce((a, b) => a + b, 0); +const meanMs = totalMs / ITERATIONS; +console.log( + JSON.stringify({ + function: "diff_applymap_fn", + mean_ms: Math.round(meanMs * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(totalMs * 1000) / 1000, + }), +); diff --git a/benchmarks/tsb/bench_dropna_fn.ts b/benchmarks/tsb/bench_dropna_fn.ts new file mode 100644 index 00000000..accb7e88 --- /dev/null +++ b/benchmarks/tsb/bench_dropna_fn.ts @@ -0,0 +1,46 @@ +/** + * Benchmark: dropnaSeries / dropnaDataFrame — standalone functional dropna. + * Outputs JSON: {"function": "dropna_fn", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series, DataFrame, dropnaSeries, dropnaDataFrame } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 30; + +// ~20% NaN values +const seriesData = Array.from({ length: SIZE }, (_, i) => (i % 5 === 0 ? null : i * 1.0)); +const s = new Series({ data: seriesData }); + +const df = DataFrame.fromColumns({ + a: Array.from({ length: SIZE }, (_, i) => (i % 5 === 0 ? 
null : i * 0.1)), + b: Array.from({ length: SIZE }, (_, i) => (i % 7 === 0 ? null : i * 2.0)), + c: Array.from({ length: SIZE }, (_, i) => (i % 3 === 0 ? null : i % 100)), +}); + +for (let i = 0; i < WARMUP; i++) { + dropnaSeries(s); + dropnaDataFrame(df); + dropnaDataFrame(df, { how: "any" }); + dropnaDataFrame(df, { how: "all" }); +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + dropnaSeries(s); + dropnaDataFrame(df); + dropnaDataFrame(df, { how: "any" }); + times.push(performance.now() - t0); +} + +const totalMs = times.reduce((a, b) => a + b, 0); +const meanMs = totalMs / ITERATIONS; +console.log( + JSON.stringify({ + function: "dropna_fn", + mean_ms: Math.round(meanMs * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(totalMs * 1000) / 1000, + }), +); diff --git a/benchmarks/tsb/bench_explode_fn.ts b/benchmarks/tsb/bench_explode_fn.ts new file mode 100644 index 00000000..2c4951a5 --- /dev/null +++ b/benchmarks/tsb/bench_explode_fn.ts @@ -0,0 +1,48 @@ +/** + * Benchmark: explodeSeries / explodeDataFrame — standalone functional explode. 
+ * Outputs JSON: {"function": "explode_fn", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series, DataFrame, explodeSeries, explodeDataFrame } from "../../src/index.ts"; + +const ROWS = 10_000; +const WARMUP = 5; +const ITERATIONS = 20; + +// Each row contains a list of 2-5 values +const seriesData = Array.from({ length: ROWS }, (_, i) => { + const len = (i % 4) + 2; + return Array.from({ length: len }, (_, j) => i * 10 + j); +}); +const s = new Series({ data: seriesData }); + +const df = DataFrame.fromColumns({ + a: Array.from({ length: ROWS }, (_, i) => { + const len = (i % 3) + 1; + return Array.from({ length: len }, (_, j) => i + j); + }), + b: Array.from({ length: ROWS }, (_, i) => `key_${i % 100}`), +}); + +for (let i = 0; i < WARMUP; i++) { + explodeSeries(s); + explodeDataFrame(df, "a"); +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + explodeSeries(s); + explodeDataFrame(df, "a"); + times.push(performance.now() - t0); +} + +const totalMs = times.reduce((a, b) => a + b, 0); +const meanMs = totalMs / ITERATIONS; +console.log( + JSON.stringify({ + function: "explode_fn", + mean_ms: Math.round(meanMs * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(totalMs * 1000) / 1000, + }), +); diff --git a/benchmarks/tsb/bench_fillna_fn.ts b/benchmarks/tsb/bench_fillna_fn.ts new file mode 100644 index 00000000..a0c61738 --- /dev/null +++ b/benchmarks/tsb/bench_fillna_fn.ts @@ -0,0 +1,47 @@ +/** + * Benchmark: fillnaSeries / fillnaDataFrame — standalone functional fillna. + * Outputs JSON: {"function": "fillna_fn", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series, DataFrame, fillnaSeries, fillnaDataFrame } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 30; + +// ~20% NaN values +const seriesData = Array.from({ length: SIZE }, (_, i) => (i % 5 === 0 ? 
null : i * 1.0)); +const s = new Series({ data: seriesData }); + +const df = DataFrame.fromColumns({ + a: Array.from({ length: SIZE }, (_, i) => (i % 5 === 0 ? null : i * 0.1)), + b: Array.from({ length: SIZE }, (_, i) => (i % 7 === 0 ? null : i * 2.0)), + c: Array.from({ length: SIZE }, (_, i) => (i % 3 === 0 ? null : "cat" + (i % 10))), +}); + +for (let i = 0; i < WARMUP; i++) { + fillnaSeries(s, { value: 0 }); + fillnaSeries(s, { method: "ffill" }); + fillnaDataFrame(df, { value: 0 }); + fillnaDataFrame(df, { method: "bfill" }); +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + fillnaSeries(s, { value: 0 }); + fillnaSeries(s, { method: "ffill" }); + fillnaDataFrame(df, { value: 0 }); + fillnaDataFrame(df, { method: "bfill" }); + times.push(performance.now() - t0); +} + +const totalMs = times.reduce((a, b) => a + b, 0); +const meanMs = totalMs / ITERATIONS; +console.log( + JSON.stringify({ + function: "fillna_fn", + mean_ms: Math.round(meanMs * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(totalMs * 1000) / 1000, + }), +); diff --git a/benchmarks/tsb/bench_idxmin_max_df.ts b/benchmarks/tsb/bench_idxmin_max_df.ts new file mode 100644 index 00000000..6a6e9bde --- /dev/null +++ b/benchmarks/tsb/bench_idxmin_max_df.ts @@ -0,0 +1,40 @@ +/** + * Benchmark: idxminDataFrame / idxmaxDataFrame — index of min/max per column. + * Outputs JSON: {"function": "idxmin_max_df", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { DataFrame, idxminDataFrame, idxmaxDataFrame } from "../../src/index.ts"; + +const ROWS = 100_000; +const WARMUP = 5; +const ITERATIONS = 30; + +const df = DataFrame.fromColumns({ + a: Array.from({ length: ROWS }, (_, i) => Math.sin(i * 0.001) * 100), + b: Array.from({ length: ROWS }, (_, i) => (i % 100 === 0 ? null : i * 0.1)), + c: Array.from({ length: ROWS }, (_, i) => (i % 2 === 0 ? 
i : -i) * 1.0), +}); + +for (let i = 0; i < WARMUP; i++) { + idxminDataFrame(df); + idxmaxDataFrame(df); + idxminDataFrame(df, { skipna: false }); +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + idxminDataFrame(df); + idxmaxDataFrame(df); + times.push(performance.now() - t0); +} + +const totalMs = times.reduce((a, b) => a + b, 0); +const meanMs = totalMs / ITERATIONS; +console.log( + JSON.stringify({ + function: "idxmin_max_df", + mean_ms: Math.round(meanMs * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(totalMs * 1000) / 1000, + }), +); diff --git a/benchmarks/tsb/bench_interpolate_fn.ts b/benchmarks/tsb/bench_interpolate_fn.ts new file mode 100644 index 00000000..7cc9163c --- /dev/null +++ b/benchmarks/tsb/bench_interpolate_fn.ts @@ -0,0 +1,46 @@ +/** + * Benchmark: interpolateSeries / dataFrameInterpolate — standalone functional interpolation. + * Outputs JSON: {"function": "interpolate_fn", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series, DataFrame, interpolateSeries, dataFrameInterpolate } from "../../src/index.ts"; + +const SIZE = 50_000; +const WARMUP = 5; +const ITERATIONS = 20; + +// ~10% NaN values scattered through the data +const seriesData = Array.from({ length: SIZE }, (_, i) => + i % 10 === 0 ? null : i * 1.0, +); +const s = new Series({ data: seriesData }); + +const df = DataFrame.fromColumns({ + a: Array.from({ length: SIZE }, (_, i) => (i % 7 === 0 ? null : i * 0.5)), + b: Array.from({ length: SIZE }, (_, i) => (i % 11 === 0 ? 
null : Math.sin(i * 0.01) * 100)), +}); + +for (let i = 0; i < WARMUP; i++) { + interpolateSeries(s, { method: "linear" }); + interpolateSeries(s, { method: "pad" }); + dataFrameInterpolate(df); +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + interpolateSeries(s, { method: "linear" }); + interpolateSeries(s, { method: "pad" }); + dataFrameInterpolate(df); + times.push(performance.now() - t0); +} + +const totalMs = times.reduce((a, b) => a + b, 0); +const meanMs = totalMs / ITERATIONS; +console.log( + JSON.stringify({ + function: "interpolate_fn", + mean_ms: Math.round(meanMs * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(totalMs * 1000) / 1000, + }), +); diff --git a/benchmarks/tsb/bench_mode_dataframe_fn.ts b/benchmarks/tsb/bench_mode_dataframe_fn.ts new file mode 100644 index 00000000..86140172 --- /dev/null +++ b/benchmarks/tsb/bench_mode_dataframe_fn.ts @@ -0,0 +1,40 @@ +/** + * Benchmark: modeDataFrame — standalone functional mode for DataFrame columns. + * Outputs JSON: {"function": "mode_dataframe_fn", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { DataFrame, modeDataFrame } from "../../src/index.ts"; + +const ROWS = 10_000; +const WARMUP = 5; +const ITERATIONS = 20; + +// Low-cardinality numeric data (many ties → large mode arrays) +const df = DataFrame.fromColumns({ + a: Array.from({ length: ROWS }, (_, i) => i % 10), + b: Array.from({ length: ROWS }, (_, i) => (i % 50 === 0 ? 
null : i % 5)), + c: Array.from({ length: ROWS }, (_, i) => (i % 3)), +}); + +for (let i = 0; i < WARMUP; i++) { + modeDataFrame(df); + modeDataFrame(df, { dropna: false }); +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + modeDataFrame(df); + modeDataFrame(df, { dropna: false }); + times.push(performance.now() - t0); +} + +const totalMs = times.reduce((a, b) => a + b, 0); +const meanMs = totalMs / ITERATIONS; +console.log( + JSON.stringify({ + function: "mode_dataframe_fn", + mean_ms: Math.round(meanMs * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(totalMs * 1000) / 1000, + }), +); diff --git a/benchmarks/tsb/bench_where_mask_df_fn.ts b/benchmarks/tsb/bench_where_mask_df_fn.ts new file mode 100644 index 00000000..c8c60b53 --- /dev/null +++ b/benchmarks/tsb/bench_where_mask_df_fn.ts @@ -0,0 +1,41 @@ +/** + * Benchmark: whereDataFrame / maskDataFrame — standalone functional where/mask for DataFrame. + * Outputs JSON: {"function": "where_mask_df_fn", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { DataFrame, whereDataFrame, maskDataFrame } from "../../src/index.ts"; + +const ROWS = 100_000; +const WARMUP = 5; +const ITERATIONS = 20; + +const df = DataFrame.fromColumns({ + a: Array.from({ length: ROWS }, (_, i) => i * 1.0), + b: Array.from({ length: ROWS }, (_, i) => (i % 2 === 0 ? 
null : i * 0.5)), + c: Array.from({ length: ROWS }, (_, i) => i * -1.0), +}); + +const condFn = (v: unknown) => (v as number) > 0; + +for (let i = 0; i < WARMUP; i++) { + whereDataFrame(df, condFn, { other: 0 }); + maskDataFrame(df, condFn, { other: -1 }); +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + whereDataFrame(df, condFn, { other: 0 }); + maskDataFrame(df, condFn, { other: -1 }); + times.push(performance.now() - t0); +} + +const totalMs = times.reduce((a, b) => a + b, 0); +const meanMs = totalMs / ITERATIONS; +console.log( + JSON.stringify({ + function: "where_mask_df_fn", + mean_ms: Math.round(meanMs * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(totalMs * 1000) / 1000, + }), +); diff --git a/benchmarks/tsb/bench_where_mask_series_fn.ts b/benchmarks/tsb/bench_where_mask_series_fn.ts new file mode 100644 index 00000000..d6aaad33 --- /dev/null +++ b/benchmarks/tsb/bench_where_mask_series_fn.ts @@ -0,0 +1,37 @@ +/** + * Benchmark: whereSeries / maskSeries — standalone functional where/mask for Series. 
+ * Outputs JSON: {"function": "where_mask_series_fn", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series, whereSeries, maskSeries } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 30; + +const s = new Series({ data: Array.from({ length: SIZE }, (_, i) => i * 0.1) }); +const cond = (v: unknown) => (v as number) > SIZE * 0.05; +const condArr = Array.from({ length: SIZE }, (_, i) => i > SIZE * 0.5); + +for (let i = 0; i < WARMUP; i++) { + whereSeries(s, cond, 0); + maskSeries(s, condArr, -1); +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + whereSeries(s, cond, 0); + maskSeries(s, condArr, -1); + times.push(performance.now() - t0); +} + +const totalMs = times.reduce((a, b) => a + b, 0); +const meanMs = totalMs / ITERATIONS; +console.log( + JSON.stringify({ + function: "where_mask_series_fn", + mean_ms: Math.round(meanMs * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(totalMs * 1000) / 1000, + }), +); From b854980226fa8f64fa249274f8bcda8cd497c650 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 17 Apr 2026 09:08:49 +0000 Subject: [PATCH 10/19] Iteration 147: Add 8 benchmark pairs (462 total, +2 vs best 460) Added 8 new benchmark pairs covering: - timestamp_arith: Timestamp.add/sub/eq/lt/gt/le/ge/ne operations - timestamp_str_format: strftime/isoformat/day_name/month_name - timestamp_round_normalize: ceil/floor/round/normalize - value_counts_opts: valueCounts with normalize/ascending/dropna options - series_sortvalues_opts: Series.sortValues with ascending=false/naPosition='first' - dataframe_sortvalues_mixed: DataFrame.sortValues with mixed ascending array - series_groupby_size: SeriesGroupBy.size() and getGroup() - series_log_natural: seriesLog (natural logarithm) Run: https://github.com/githubnext/tsessebe/actions/runs/24555921452 Co-authored-by: Copilot 
<223556219+Copilot@users.noreply.github.com> --- .../bench_dataframe_sortvalues_mixed.py | 32 +++++++++++ .../pandas/bench_series_groupby_size.py | 31 +++++++++++ benchmarks/pandas/bench_series_log_natural.py | 26 +++++++++ .../pandas/bench_series_sortvalues_opts.py | 31 +++++++++++ benchmarks/pandas/bench_timestamp_arith.py | 45 ++++++++++++++++ .../pandas/bench_timestamp_round_normalize.py | 37 +++++++++++++ .../pandas/bench_timestamp_str_format.py | 37 +++++++++++++ benchmarks/pandas/bench_value_counts_opts.py | 33 ++++++++++++ .../tsb/bench_dataframe_sortvalues_mixed.ts | 36 +++++++++++++ benchmarks/tsb/bench_series_groupby_size.ts | 39 ++++++++++++++ benchmarks/tsb/bench_series_log_natural.ts | 31 +++++++++++ .../tsb/bench_series_sortvalues_opts.ts | 38 +++++++++++++ benchmarks/tsb/bench_timestamp_arith.ts | 54 +++++++++++++++++++ .../tsb/bench_timestamp_round_normalize.ts | 44 +++++++++++++++ benchmarks/tsb/bench_timestamp_str_format.ts | 43 +++++++++++++++ benchmarks/tsb/bench_value_counts_opts.ts | 40 ++++++++++++++ 16 files changed, 597 insertions(+) create mode 100644 benchmarks/pandas/bench_dataframe_sortvalues_mixed.py create mode 100644 benchmarks/pandas/bench_series_groupby_size.py create mode 100644 benchmarks/pandas/bench_series_log_natural.py create mode 100644 benchmarks/pandas/bench_series_sortvalues_opts.py create mode 100644 benchmarks/pandas/bench_timestamp_arith.py create mode 100644 benchmarks/pandas/bench_timestamp_round_normalize.py create mode 100644 benchmarks/pandas/bench_timestamp_str_format.py create mode 100644 benchmarks/pandas/bench_value_counts_opts.py create mode 100644 benchmarks/tsb/bench_dataframe_sortvalues_mixed.ts create mode 100644 benchmarks/tsb/bench_series_groupby_size.ts create mode 100644 benchmarks/tsb/bench_series_log_natural.ts create mode 100644 benchmarks/tsb/bench_series_sortvalues_opts.ts create mode 100644 benchmarks/tsb/bench_timestamp_arith.ts create mode 100644 
benchmarks/tsb/bench_timestamp_round_normalize.ts create mode 100644 benchmarks/tsb/bench_timestamp_str_format.ts create mode 100644 benchmarks/tsb/bench_value_counts_opts.ts diff --git a/benchmarks/pandas/bench_dataframe_sortvalues_mixed.py b/benchmarks/pandas/bench_dataframe_sortvalues_mixed.py new file mode 100644 index 00000000..73e33369 --- /dev/null +++ b/benchmarks/pandas/bench_dataframe_sortvalues_mixed.py @@ -0,0 +1,32 @@ +"""Benchmark: DataFrame.sort_values with mixed ascending list [True, False, True].""" +import json +import time +import pandas as pd +import numpy as np + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 20 + +df = pd.DataFrame({ + "category": [f"group_{i % 10}" for i in range(ROWS)], + "priority": [i % 5 for i in range(ROWS)], + "value": np.random.random(ROWS) * 1000, +}) + +for _ in range(WARMUP): + df.sort_values(["category", "priority", "value"], ascending=[True, False, True]) + df.sort_values(["category", "value"], ascending=[False, True]) + +start = time.perf_counter() +for _ in range(ITERATIONS): + df.sort_values(["category", "priority", "value"], ascending=[True, False, True]) + df.sort_values(["category", "value"], ascending=[False, True]) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "dataframe_sortvalues_mixed", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_series_groupby_size.py b/benchmarks/pandas/bench_series_groupby_size.py new file mode 100644 index 00000000..4990657d --- /dev/null +++ b/benchmarks/pandas/bench_series_groupby_size.py @@ -0,0 +1,31 @@ +"""Benchmark: SeriesGroupBy.size() and get_group() operations.""" +import json +import time +import pandas as pd +import numpy as np + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 20 + +values = pd.Series(np.random.random(ROWS) * 1000) +groups = pd.Series([f"g{i % 20}" for i in range(ROWS)]) + +for _ in range(WARMUP): + values.groupby(groups).size() + 
values.groupby(groups).get_group("g0") + values.groupby(groups).get_group("g10") + +start = time.perf_counter() +for _ in range(ITERATIONS): + values.groupby(groups).size() + values.groupby(groups).get_group("g0") + values.groupby(groups).get_group("g10") +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "series_groupby_size", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_series_log_natural.py b/benchmarks/pandas/bench_series_log_natural.py new file mode 100644 index 00000000..95bb54f4 --- /dev/null +++ b/benchmarks/pandas/bench_series_log_natural.py @@ -0,0 +1,26 @@ +"""Benchmark: Series natural logarithm — np.log / Series.apply(np.log) on 100k-element Series.""" +import json +import time +import pandas as pd +import numpy as np + +ROWS = 100_000 +WARMUP = 5 +ITERATIONS = 50 + +s = pd.Series(np.arange(1, ROWS + 1, dtype=float)) + +for _ in range(WARMUP): + np.log(s) + +start = time.perf_counter() +for _ in range(ITERATIONS): + np.log(s) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "series_log_natural", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_series_sortvalues_opts.py b/benchmarks/pandas/bench_series_sortvalues_opts.py new file mode 100644 index 00000000..83f843fa --- /dev/null +++ b/benchmarks/pandas/bench_series_sortvalues_opts.py @@ -0,0 +1,31 @@ +"""Benchmark: Series.sort_values with options — ascending=False, na_position='first'.""" +import json +import time +import pandas as pd +import numpy as np + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 20 + +data = [None if i % 1000 == 0 else (np.random.random() * 10000 - 5000) for i in range(ROWS)] +s = pd.Series(data) + +for _ in range(WARMUP): + s.sort_values(ascending=False) + s.sort_values(ascending=True, na_position="first") + s.sort_values(ascending=False, na_position="first") + +start = 
time.perf_counter() +for _ in range(ITERATIONS): + s.sort_values(ascending=False) + s.sort_values(ascending=True, na_position="first") + s.sort_values(ascending=False, na_position="first") +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "series_sortvalues_opts", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_timestamp_arith.py b/benchmarks/pandas/bench_timestamp_arith.py new file mode 100644 index 00000000..3fee9b20 --- /dev/null +++ b/benchmarks/pandas/bench_timestamp_arith.py @@ -0,0 +1,45 @@ +"""Benchmark: Timestamp arithmetic — add timedelta, subtract, comparison operators.""" +import json +import time +import pandas as pd +from datetime import timedelta + +SIZE = 10_000 +WARMUP = 5 +ITERATIONS = 50 + +base = pd.Timestamp("2024-01-01") +timestamps = [pd.Timestamp("2020-01-01") + timedelta(days=i) for i in range(SIZE)] +delta = pd.Timedelta(days=30) +delta2 = pd.Timedelta(hours=12) + +for _ in range(WARMUP): + for ts in timestamps: + ts + delta + ts - delta2 + ts == base + ts < base + ts > base + ts <= base + ts >= base + ts != base + +start = time.perf_counter() +for _ in range(ITERATIONS): + for ts in timestamps: + ts + delta + ts - delta2 + ts == base + ts < base + ts > base + ts <= base + ts >= base + ts != base +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "timestamp_arith", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_timestamp_round_normalize.py b/benchmarks/pandas/bench_timestamp_round_normalize.py new file mode 100644 index 00000000..b2425f97 --- /dev/null +++ b/benchmarks/pandas/bench_timestamp_round_normalize.py @@ -0,0 +1,37 @@ +"""Benchmark: Timestamp rounding — floor, ceil, round, normalize.""" +import json +import time +import pandas as pd + +SIZE = 10_000 +WARMUP = 5 +ITERATIONS = 50 + +timestamps = [ + 
pd.Timestamp(year=2020, month=(i % 12) + 1, day=(i % 28) + 1, + hour=i % 24, minute=(i * 7) % 60, second=(i * 13) % 60) + for i in range(SIZE) +] + +for _ in range(WARMUP): + for ts in timestamps: + ts.floor("h") + ts.ceil("h") + ts.round("min") + ts.normalize() + +start = time.perf_counter() +for _ in range(ITERATIONS): + for ts in timestamps: + ts.floor("h") + ts.ceil("h") + ts.round("min") + ts.normalize() +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "timestamp_round_normalize", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_timestamp_str_format.py b/benchmarks/pandas/bench_timestamp_str_format.py new file mode 100644 index 00000000..a557bbcb --- /dev/null +++ b/benchmarks/pandas/bench_timestamp_str_format.py @@ -0,0 +1,37 @@ +"""Benchmark: Timestamp string formatting — strftime, isoformat, day_name, month_name.""" +import json +import time +import pandas as pd + +SIZE = 10_000 +WARMUP = 5 +ITERATIONS = 50 + +timestamps = [ + pd.Timestamp(year=2020, month=(i % 12) + 1, day=(i % 28) + 1, + hour=i % 24, minute=i % 60, second=i % 60) + for i in range(SIZE) +] + +for _ in range(WARMUP): + for ts in timestamps: + ts.strftime("%Y-%m-%d %H:%M:%S") + ts.isoformat() + ts.day_name() + ts.month_name() + +start = time.perf_counter() +for _ in range(ITERATIONS): + for ts in timestamps: + ts.strftime("%Y-%m-%d %H:%M:%S") + ts.isoformat() + ts.day_name() + ts.month_name() +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "timestamp_str_format", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_value_counts_opts.py b/benchmarks/pandas/bench_value_counts_opts.py new file mode 100644 index 00000000..b998372a --- /dev/null +++ b/benchmarks/pandas/bench_value_counts_opts.py @@ -0,0 +1,33 @@ +"""Benchmark: value_counts with options — normalize=True, ascending=True, 
dropna=False.""" +import json +import time +import pandas as pd +import numpy as np + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 20 + +data = [None if i % 500 == 0 else f"cat_{i % 50}" for i in range(ROWS)] +s = pd.Series(data) + +for _ in range(WARMUP): + s.value_counts(normalize=True) + s.value_counts(ascending=True) + s.value_counts(dropna=False) + s.value_counts(normalize=True, ascending=True, dropna=False) + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.value_counts(normalize=True) + s.value_counts(ascending=True) + s.value_counts(dropna=False) + s.value_counts(normalize=True, ascending=True, dropna=False) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "value_counts_opts", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/tsb/bench_dataframe_sortvalues_mixed.ts b/benchmarks/tsb/bench_dataframe_sortvalues_mixed.ts new file mode 100644 index 00000000..4b4dbc68 --- /dev/null +++ b/benchmarks/tsb/bench_dataframe_sortvalues_mixed.ts @@ -0,0 +1,36 @@ +/** + * Benchmark: DataFrame.sortValues with mixed ascending array [true, false, true]. 
+ * Outputs JSON: {"function": "dataframe_sortvalues_mixed", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { DataFrame } from "../../src/index.ts"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 20; + +const df = DataFrame.fromColumns({ + category: Array.from({ length: ROWS }, (_, i) => `group_${i % 10}`), + priority: Array.from({ length: ROWS }, (_, i) => i % 5), + value: Array.from({ length: ROWS }, () => Math.random() * 1000), +}); + +for (let i = 0; i < WARMUP; i++) { + df.sortValues(["category", "priority", "value"], [true, false, true]); + df.sortValues(["category", "value"], [false, true]); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + df.sortValues(["category", "priority", "value"], [true, false, true]); + df.sortValues(["category", "value"], [false, true]); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "dataframe_sortvalues_mixed", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_series_groupby_size.ts b/benchmarks/tsb/bench_series_groupby_size.ts new file mode 100644 index 00000000..9049f0e7 --- /dev/null +++ b/benchmarks/tsb/bench_series_groupby_size.ts @@ -0,0 +1,39 @@ +/** + * Benchmark: SeriesGroupBy.size() and SeriesGroupBy.getGroup() operations. 
+ * Outputs JSON: {"function": "series_groupby_size", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series } from "../../src/index.ts"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 20; + +const values = new Series({ + data: Array.from({ length: ROWS }, (_, i) => Math.random() * 1000), +}); +const groups = new Series({ + data: Array.from({ length: ROWS }, (_, i) => `g${i % 20}`), +}); + +for (let i = 0; i < WARMUP; i++) { + values.groupby(groups).size(); + values.groupby(groups).getGroup("g0"); + values.groupby(groups).getGroup("g10"); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + values.groupby(groups).size(); + values.groupby(groups).getGroup("g0"); + values.groupby(groups).getGroup("g10"); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "series_groupby_size", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_series_log_natural.ts b/benchmarks/tsb/bench_series_log_natural.ts new file mode 100644 index 00000000..c0d0164c --- /dev/null +++ b/benchmarks/tsb/bench_series_log_natural.ts @@ -0,0 +1,31 @@ +/** + * Benchmark: seriesLog — natural logarithm on a 100k-element Series. 
+ * Outputs JSON: {"function": "series_log_natural", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series, seriesLog } from "../../src/index.ts"; + +const ROWS = 100_000; +const WARMUP = 5; +const ITERATIONS = 50; + +// Positive values to avoid NaN in log +const s = new Series({ data: Array.from({ length: ROWS }, (_, i) => (i % 10000) + 1) }); + +for (let i = 0; i < WARMUP; i++) { + seriesLog(s); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + seriesLog(s); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "series_log_natural", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_series_sortvalues_opts.ts b/benchmarks/tsb/bench_series_sortvalues_opts.ts new file mode 100644 index 00000000..4c86d786 --- /dev/null +++ b/benchmarks/tsb/bench_series_sortvalues_opts.ts @@ -0,0 +1,38 @@ +/** + * Benchmark: Series.sortValues with options — ascending=false, naPosition='first'. 
+ * Outputs JSON: {"function": "series_sortvalues_opts", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series } from "../../src/index.ts"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 20; + +const data = Array.from({ length: ROWS }, (_, i) => { + if (i % 1000 === 0) return null; + return Math.random() * 10000 - 5000; +}); +const s = new Series({ data }); + +for (let i = 0; i < WARMUP; i++) { + s.sortValues(false); + s.sortValues(true, "first"); + s.sortValues(false, "first"); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + s.sortValues(false); + s.sortValues(true, "first"); + s.sortValues(false, "first"); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "series_sortvalues_opts", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_timestamp_arith.ts b/benchmarks/tsb/bench_timestamp_arith.ts new file mode 100644 index 00000000..58fa657e --- /dev/null +++ b/benchmarks/tsb/bench_timestamp_arith.ts @@ -0,0 +1,54 @@ +/** + * Benchmark: Timestamp arithmetic — add, sub, comparison operators (eq/lt/gt/le/ge/ne). 
+ * Outputs JSON: {"function": "timestamp_arith", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Timestamp, Timedelta } from "../../src/index.ts"; + +const SIZE = 10_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const base = new Timestamp(Date.UTC(2024, 0, 1)); +const timestamps = Array.from( + { length: SIZE }, + (_, i) => new Timestamp(Date.UTC(2020, 0, 1) + i * 86_400_000), +); +const delta = Timedelta.fromComponents({ days: 30 }); +const delta2 = Timedelta.fromComponents({ hours: 12 }); + +for (let i = 0; i < WARMUP; i++) { + for (const ts of timestamps) { + ts.add(delta); + ts.sub(delta2); + ts.eq(base); + ts.lt(base); + ts.gt(base); + ts.le(base); + ts.ge(base); + ts.ne(base); + } +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + for (const ts of timestamps) { + ts.add(delta); + ts.sub(delta2); + ts.eq(base); + ts.lt(base); + ts.gt(base); + ts.le(base); + ts.ge(base); + ts.ne(base); + } +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "timestamp_arith", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_timestamp_round_normalize.ts b/benchmarks/tsb/bench_timestamp_round_normalize.ts new file mode 100644 index 00000000..0a43b3bc --- /dev/null +++ b/benchmarks/tsb/bench_timestamp_round_normalize.ts @@ -0,0 +1,44 @@ +/** + * Benchmark: Timestamp rounding — ceil, floor, round, normalize. 
+ * Outputs JSON: {"function": "timestamp_round_normalize", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Timestamp } from "../../src/index.ts"; + +const SIZE = 10_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const timestamps = Array.from( + { length: SIZE }, + (_, i) => + new Timestamp(Date.UTC(2020, i % 12, (i % 28) + 1, i % 24, (i * 7) % 60, (i * 13) % 60)), +); + +for (let i = 0; i < WARMUP; i++) { + for (const ts of timestamps) { + ts.floor("H"); + ts.ceil("H"); + ts.round("T"); + ts.normalize(); + } +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + for (const ts of timestamps) { + ts.floor("H"); + ts.ceil("H"); + ts.round("T"); + ts.normalize(); + } +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "timestamp_round_normalize", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_timestamp_str_format.ts b/benchmarks/tsb/bench_timestamp_str_format.ts new file mode 100644 index 00000000..ba84162c --- /dev/null +++ b/benchmarks/tsb/bench_timestamp_str_format.ts @@ -0,0 +1,43 @@ +/** + * Benchmark: Timestamp string formatting — strftime, isoformat, day_name, month_name. 
+ * Outputs JSON: {"function": "timestamp_str_format", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Timestamp } from "../../src/index.ts"; + +const SIZE = 10_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const timestamps = Array.from( + { length: SIZE }, + (_, i) => new Timestamp(Date.UTC(2020, i % 12, (i % 28) + 1, i % 24, i % 60, i % 60)), +); + +for (let i = 0; i < WARMUP; i++) { + for (const ts of timestamps) { + ts.strftime("%Y-%m-%d %H:%M:%S"); + ts.isoformat(); + ts.day_name(); + ts.month_name(); + } +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + for (const ts of timestamps) { + ts.strftime("%Y-%m-%d %H:%M:%S"); + ts.isoformat(); + ts.day_name(); + ts.month_name(); + } +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "timestamp_str_format", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_value_counts_opts.ts b/benchmarks/tsb/bench_value_counts_opts.ts new file mode 100644 index 00000000..55d70193 --- /dev/null +++ b/benchmarks/tsb/bench_value_counts_opts.ts @@ -0,0 +1,40 @@ +/** + * Benchmark: valueCounts with options — normalize=true, ascending=true, dropna=false. 
+ * Outputs JSON: {"function": "value_counts_opts", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series, valueCounts } from "../../src/index.ts"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 20; + +const data = Array.from({ length: ROWS }, (_, i) => { + if (i % 500 === 0) return null; + return `cat_${i % 50}`; +}); +const s = new Series({ data }); + +for (let i = 0; i < WARMUP; i++) { + valueCounts(s, { normalize: true }); + valueCounts(s, { ascending: true }); + valueCounts(s, { dropna: false }); + valueCounts(s, { normalize: true, ascending: true, dropna: false }); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + valueCounts(s, { normalize: true }); + valueCounts(s, { ascending: true }); + valueCounts(s, { dropna: false }); + valueCounts(s, { normalize: true, ascending: true, dropna: false }); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "value_counts_opts", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); From e3c731cf614559641bf095bab432895dafbda734 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 17 Apr 2026 09:59:58 +0000 Subject: [PATCH 11/19] Iteration 148: Add 6 benchmark pairs (468 total, +6 vs best 462) Add benchmarks for standalone comparison, floordiv/mod/pow, drop-duplicates, nsmallest, and duplicated functions not yet covered as standalone imports: - series_standalone_compare: seriesEq/Ne/Lt/Gt/Le/Ge - dataframe_compare_lege: dataFrameLe/dataFrameGe - series_floordiv_standalone: seriesFloorDiv/seriesMod/seriesPow - drop_duplicates_fn: dropDuplicatesSeries/dropDuplicatesDataFrame - nsmallest_series_fn: nsmallestSeries - duplicated_fn: duplicatedSeries/duplicatedDataFrame Run: https://github.com/githubnext/tsessebe/actions/runs/24558253472 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- 
.../pandas/bench_dataframe_compare_lege.py | 36 ++++++++++++++++ benchmarks/pandas/bench_drop_duplicates_fn.py | 36 ++++++++++++++++ benchmarks/pandas/bench_duplicated_fn.py | 36 ++++++++++++++++ .../pandas/bench_nsmallest_series_fn.py | 31 ++++++++++++++ .../bench_series_floordiv_standalone.py | 35 ++++++++++++++++ .../pandas/bench_series_standalone_compare.py | 42 +++++++++++++++++++ .../tsb/bench_dataframe_compare_lege.ts | 36 ++++++++++++++++ benchmarks/tsb/bench_drop_duplicates_fn.ts | 36 ++++++++++++++++ benchmarks/tsb/bench_duplicated_fn.ts | 36 ++++++++++++++++ benchmarks/tsb/bench_nsmallest_series_fn.ts | 30 +++++++++++++ .../tsb/bench_series_floordiv_standalone.ts | 34 +++++++++++++++ .../tsb/bench_series_standalone_compare.ts | 41 ++++++++++++++++++ 12 files changed, 429 insertions(+) create mode 100644 benchmarks/pandas/bench_dataframe_compare_lege.py create mode 100644 benchmarks/pandas/bench_drop_duplicates_fn.py create mode 100644 benchmarks/pandas/bench_duplicated_fn.py create mode 100644 benchmarks/pandas/bench_nsmallest_series_fn.py create mode 100644 benchmarks/pandas/bench_series_floordiv_standalone.py create mode 100644 benchmarks/pandas/bench_series_standalone_compare.py create mode 100644 benchmarks/tsb/bench_dataframe_compare_lege.ts create mode 100644 benchmarks/tsb/bench_drop_duplicates_fn.ts create mode 100644 benchmarks/tsb/bench_duplicated_fn.ts create mode 100644 benchmarks/tsb/bench_nsmallest_series_fn.ts create mode 100644 benchmarks/tsb/bench_series_floordiv_standalone.ts create mode 100644 benchmarks/tsb/bench_series_standalone_compare.ts diff --git a/benchmarks/pandas/bench_dataframe_compare_lege.py b/benchmarks/pandas/bench_dataframe_compare_lege.py new file mode 100644 index 00000000..ddaa1b62 --- /dev/null +++ b/benchmarks/pandas/bench_dataframe_compare_lege.py @@ -0,0 +1,36 @@ +""" +Benchmark: DataFrame <= and >= element-wise comparisons on 100k-row DataFrame. +Mirrors dataFrameLe / dataFrameGe standalone functions. 
+Outputs JSON: {"function": "dataframe_compare_lege", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import numpy as np +import pandas as pd + +SIZE = 100_000 +WARMUP = 5 +ITERATIONS = 50 + +df = pd.DataFrame({ + "a": np.arange(SIZE), + "b": np.arange(SIZE) * 2, + "c": np.arange(SIZE) % 100, +}) + +for _ in range(WARMUP): + df.le(50) + df.ge(50) + +start = time.perf_counter() +for _ in range(ITERATIONS): + df.le(50) + df.ge(50) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "dataframe_compare_lege", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_drop_duplicates_fn.py b/benchmarks/pandas/bench_drop_duplicates_fn.py new file mode 100644 index 00000000..21b481ef --- /dev/null +++ b/benchmarks/pandas/bench_drop_duplicates_fn.py @@ -0,0 +1,36 @@ +""" +Benchmark: Series.drop_duplicates / DataFrame.drop_duplicates on 100k elements. +Mirrors dropDuplicatesSeries / dropDuplicatesDataFrame standalone functions. 
+Outputs JSON: {"function": "drop_duplicates_fn", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import numpy as np +import pandas as pd + +SIZE = 100_000 +WARMUP = 5 +ITERATIONS = 50 + +s = pd.Series(np.arange(SIZE) % 1000) +df = pd.DataFrame({ + "a": np.arange(SIZE) % 1000, + "b": np.arange(SIZE) % 500, +}) + +for _ in range(WARMUP): + s.drop_duplicates() + df.drop_duplicates() + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.drop_duplicates() + df.drop_duplicates() +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "drop_duplicates_fn", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_duplicated_fn.py b/benchmarks/pandas/bench_duplicated_fn.py new file mode 100644 index 00000000..7b37976b --- /dev/null +++ b/benchmarks/pandas/bench_duplicated_fn.py @@ -0,0 +1,36 @@ +""" +Benchmark: Series.duplicated / DataFrame.duplicated on 100k elements. +Mirrors duplicatedSeries / duplicatedDataFrame standalone functions. 
+Outputs JSON: {"function": "duplicated_fn", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import numpy as np +import pandas as pd + +SIZE = 100_000 +WARMUP = 5 +ITERATIONS = 50 + +s = pd.Series(np.arange(SIZE) % 1000) +df = pd.DataFrame({ + "a": np.arange(SIZE) % 1000, + "b": np.arange(SIZE) % 500, +}) + +for _ in range(WARMUP): + s.duplicated() + df.duplicated() + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.duplicated() + df.duplicated() +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "duplicated_fn", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_nsmallest_series_fn.py b/benchmarks/pandas/bench_nsmallest_series_fn.py new file mode 100644 index 00000000..67b72b0e --- /dev/null +++ b/benchmarks/pandas/bench_nsmallest_series_fn.py @@ -0,0 +1,31 @@ +""" +Benchmark: Series.nsmallest on 100k-element Series. +Mirrors nsmallestSeries standalone function. +Outputs JSON: {"function": "nsmallest_series_fn", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import numpy as np +import pandas as pd + +SIZE = 100_000 +WARMUP = 5 +ITERATIONS = 50 + +data = np.sin(np.arange(SIZE) * 0.01) * 1000 +s = pd.Series(data) + +for _ in range(WARMUP): + s.nsmallest(100) + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.nsmallest(100) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "nsmallest_series_fn", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_series_floordiv_standalone.py b/benchmarks/pandas/bench_series_floordiv_standalone.py new file mode 100644 index 00000000..733eab50 --- /dev/null +++ b/benchmarks/pandas/bench_series_floordiv_standalone.py @@ -0,0 +1,35 @@ +""" +Benchmark: Series floordiv / mod / pow standalone functions on 100k Series. 
+Mirrors seriesFloorDiv / seriesMod / seriesPow. +Outputs JSON: {"function": "series_floordiv_standalone", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import numpy as np +import pandas as pd + +SIZE = 100_000 +WARMUP = 5 +ITERATIONS = 50 + +data = (np.arange(SIZE) % 1000) + 1 +s = pd.Series(data.astype(float)) + +for _ in range(WARMUP): + s.floordiv(3) + s.mod(7) + s.pow(2) + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.floordiv(3) + s.mod(7) + s.pow(2) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "series_floordiv_standalone", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_series_standalone_compare.py b/benchmarks/pandas/bench_series_standalone_compare.py new file mode 100644 index 00000000..40b42900 --- /dev/null +++ b/benchmarks/pandas/bench_series_standalone_compare.py @@ -0,0 +1,42 @@ +""" +Benchmark: standalone Series comparison operators (eq, ne, lt, gt, le, ge) on 100k Series. +Mirrors seriesEq/Ne/Lt/Gt/Le/Ge standalone functions. 
+Outputs JSON: {"function": "series_standalone_compare", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import numpy as np +import pandas as pd + +SIZE = 100_000 +WARMUP = 5 +ITERATIONS = 50 + +data = np.arange(SIZE) * 0.1 +s = pd.Series(data) +threshold = SIZE * 0.05 + +for _ in range(WARMUP): + s.eq(threshold) + s.ne(threshold) + s.lt(threshold) + s.gt(threshold) + s.le(threshold) + s.ge(threshold) + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.eq(threshold) + s.ne(threshold) + s.lt(threshold) + s.gt(threshold) + s.le(threshold) + s.ge(threshold) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "series_standalone_compare", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/tsb/bench_dataframe_compare_lege.ts b/benchmarks/tsb/bench_dataframe_compare_lege.ts new file mode 100644 index 00000000..53490a33 --- /dev/null +++ b/benchmarks/tsb/bench_dataframe_compare_lege.ts @@ -0,0 +1,36 @@ +/** + * Benchmark: dataFrameLe / dataFrameGe — less-than-or-equal and greater-than-or-equal standalone functions on 100k-row DataFrame. 
+ * Outputs JSON: {"function": "dataframe_compare_lege", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { DataFrame, dataFrameLe, dataFrameGe } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const df = new DataFrame({ + a: Array.from({ length: SIZE }, (_, i) => i), + b: Array.from({ length: SIZE }, (_, i) => i * 2), + c: Array.from({ length: SIZE }, (_, i) => i % 100), +}); + +for (let i = 0; i < WARMUP; i++) { + dataFrameLe(df, 50); + dataFrameGe(df, 50); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + dataFrameLe(df, 50); + dataFrameGe(df, 50); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "dataframe_compare_lege", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_drop_duplicates_fn.ts b/benchmarks/tsb/bench_drop_duplicates_fn.ts new file mode 100644 index 00000000..8b5a3b18 --- /dev/null +++ b/benchmarks/tsb/bench_drop_duplicates_fn.ts @@ -0,0 +1,36 @@ +/** + * Benchmark: dropDuplicatesSeries / dropDuplicatesDataFrame — standalone drop-duplicates on 100k elements. 
+ * Outputs JSON: {"function": "drop_duplicates_fn", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series, DataFrame, dropDuplicatesSeries, dropDuplicatesDataFrame } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const s = new Series({ data: Array.from({ length: SIZE }, (_, i) => i % 1000) }); +const df = new DataFrame({ + a: Array.from({ length: SIZE }, (_, i) => i % 1000), + b: Array.from({ length: SIZE }, (_, i) => i % 500), +}); + +for (let i = 0; i < WARMUP; i++) { + dropDuplicatesSeries(s); + dropDuplicatesDataFrame(df); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + dropDuplicatesSeries(s); + dropDuplicatesDataFrame(df); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "drop_duplicates_fn", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_duplicated_fn.ts b/benchmarks/tsb/bench_duplicated_fn.ts new file mode 100644 index 00000000..0aab2812 --- /dev/null +++ b/benchmarks/tsb/bench_duplicated_fn.ts @@ -0,0 +1,36 @@ +/** + * Benchmark: duplicatedSeries / duplicatedDataFrame — standalone duplicated detection on 100k elements. 
+ * Outputs JSON: {"function": "duplicated_fn", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series, DataFrame, duplicatedSeries, duplicatedDataFrame } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const s = new Series({ data: Array.from({ length: SIZE }, (_, i) => i % 1000) }); +const df = new DataFrame({ + a: Array.from({ length: SIZE }, (_, i) => i % 1000), + b: Array.from({ length: SIZE }, (_, i) => i % 500), +}); + +for (let i = 0; i < WARMUP; i++) { + duplicatedSeries(s); + duplicatedDataFrame(df); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + duplicatedSeries(s); + duplicatedDataFrame(df); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "duplicated_fn", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_nsmallest_series_fn.ts b/benchmarks/tsb/bench_nsmallest_series_fn.ts new file mode 100644 index 00000000..443129a0 --- /dev/null +++ b/benchmarks/tsb/bench_nsmallest_series_fn.ts @@ -0,0 +1,30 @@ +/** + * Benchmark: nsmallestSeries — standalone nsmallest on 100k-element Series. 
+ * Outputs JSON: {"function": "nsmallest_series_fn", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series, nsmallestSeries } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const s = new Series({ data: Array.from({ length: SIZE }, (_, i) => Math.sin(i * 0.01) * 1000) }); + +for (let i = 0; i < WARMUP; i++) { + nsmallestSeries(s, 100); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + nsmallestSeries(s, 100); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "nsmallest_series_fn", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_series_floordiv_standalone.ts b/benchmarks/tsb/bench_series_floordiv_standalone.ts new file mode 100644 index 00000000..d55fb1d8 --- /dev/null +++ b/benchmarks/tsb/bench_series_floordiv_standalone.ts @@ -0,0 +1,34 @@ +/** + * Benchmark: seriesFloorDiv / seriesMod / seriesPow — standalone floor-division, modulo, and power on 100k Series. 
+ * Outputs JSON: {"function": "series_floordiv_standalone", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series, seriesFloorDiv, seriesMod, seriesPow } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const s = new Series({ data: Array.from({ length: SIZE }, (_, i) => (i % 1000) + 1) }); + +for (let i = 0; i < WARMUP; i++) { + seriesFloorDiv(s, 3); + seriesMod(s, 7); + seriesPow(s, 2); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + seriesFloorDiv(s, 3); + seriesMod(s, 7); + seriesPow(s, 2); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "series_floordiv_standalone", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_series_standalone_compare.ts b/benchmarks/tsb/bench_series_standalone_compare.ts new file mode 100644 index 00000000..7fd71cb6 --- /dev/null +++ b/benchmarks/tsb/bench_series_standalone_compare.ts @@ -0,0 +1,41 @@ +/** + * Benchmark: seriesEq / seriesNe / seriesLt / seriesGt / seriesLe / seriesGe — standalone comparison functions on 100k Series. 
+ * Outputs JSON: {"function": "series_standalone_compare", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series, seriesEq, seriesNe, seriesLt, seriesGt, seriesLe, seriesGe } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const s = new Series({ data: Array.from({ length: SIZE }, (_, i) => i * 0.1) }); +const threshold = SIZE * 0.05; + +for (let i = 0; i < WARMUP; i++) { + seriesEq(s, threshold); + seriesNe(s, threshold); + seriesLt(s, threshold); + seriesGt(s, threshold); + seriesLe(s, threshold); + seriesGe(s, threshold); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + seriesEq(s, threshold); + seriesNe(s, threshold); + seriesLt(s, threshold); + seriesGt(s, threshold); + seriesLe(s, threshold); + seriesGe(s, threshold); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "series_standalone_compare", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); From e4521f9c2a0d13d571e02019596dd290b55072c3 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 17 Apr 2026 11:05:01 +0000 Subject: [PATCH 12/19] Iteration 150: Add 5 benchmark pairs (473 total, +5 vs best 468) Added benchmarks for: replaceSeries, isnull/notnull aliases, toNumericScalar, dataFrameAssign (functional API), dataFrameIsin (functional API). 
Run: https://github.com/githubnext/tsessebe/actions/runs/24561344321 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../pandas/bench_dataframe_assign_fn.py | 30 +++++++++++ benchmarks/pandas/bench_dataframe_isin_fn.py | 30 +++++++++++ benchmarks/pandas/bench_isnull_notnull.py | 32 ++++++++++++ benchmarks/pandas/bench_replace_series.py | 22 ++++++++ benchmarks/pandas/bench_to_numeric_scalar.py | 37 ++++++++++++++ benchmarks/tsb/bench_dataframe_assign_fn.ts | 51 +++++++++++++++++++ benchmarks/tsb/bench_dataframe_isin_fn.ts | 44 ++++++++++++++++ benchmarks/tsb/bench_isnull_notnull.ts | 45 ++++++++++++++++ benchmarks/tsb/bench_replace_series.ts | 40 +++++++++++++++ benchmarks/tsb/bench_to_numeric_scalar.ts | 41 +++++++++++++++ 10 files changed, 372 insertions(+) create mode 100644 benchmarks/pandas/bench_dataframe_assign_fn.py create mode 100644 benchmarks/pandas/bench_dataframe_isin_fn.py create mode 100644 benchmarks/pandas/bench_isnull_notnull.py create mode 100644 benchmarks/pandas/bench_replace_series.py create mode 100644 benchmarks/pandas/bench_to_numeric_scalar.py create mode 100644 benchmarks/tsb/bench_dataframe_assign_fn.ts create mode 100644 benchmarks/tsb/bench_dataframe_isin_fn.ts create mode 100644 benchmarks/tsb/bench_isnull_notnull.ts create mode 100644 benchmarks/tsb/bench_replace_series.ts create mode 100644 benchmarks/tsb/bench_to_numeric_scalar.ts diff --git a/benchmarks/pandas/bench_dataframe_assign_fn.py b/benchmarks/pandas/bench_dataframe_assign_fn.py new file mode 100644 index 00000000..cd4d1c08 --- /dev/null +++ b/benchmarks/pandas/bench_dataframe_assign_fn.py @@ -0,0 +1,30 @@ +"""Benchmark: DataFrame.assign — add new columns using the pandas assign API.""" +import json, time +import pandas as pd + +SIZE = 100_000 +WARMUP = 5 +ITERATIONS = 50 + +df = pd.DataFrame({ + "a": [i * 1.0 for i in range(SIZE)], + "b": [i * 2.0 for i in range(SIZE)], +}) + +for _ in range(WARMUP): + df.assign( + c=[i * 3.0 for i in 
range(SIZE)], + d=lambda working: working["a"] + working["c"], + ) + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + df.assign( + c=[i * 3.0 for i in range(SIZE)], + d=lambda working: working["a"] + working["c"], + ) + times.append((time.perf_counter() - t0) * 1000) + +total_ms = sum(times) +print(json.dumps({"function": "dataframe_assign_fn", "mean_ms": round(total_ms / ITERATIONS, 3), "iterations": ITERATIONS, "total_ms": round(total_ms, 3)})) diff --git a/benchmarks/pandas/bench_dataframe_isin_fn.py b/benchmarks/pandas/bench_dataframe_isin_fn.py new file mode 100644 index 00000000..0b63a5d7 --- /dev/null +++ b/benchmarks/pandas/bench_dataframe_isin_fn.py @@ -0,0 +1,30 @@ +"""Benchmark: DataFrame.isin — test membership of each element against value sets.""" +import json, time +import pandas as pd + +SIZE = 100_000 +WARMUP = 5 +ITERATIONS = 50 + +df = pd.DataFrame({ + "a": [i % 20 for i in range(SIZE)], + "b": [["x", "y", "z", "w"][i % 4] for i in range(SIZE)], + "c": [i % 10 for i in range(SIZE)], +}) + +global_values = [0, 1, 2, "x", "y"] +col_values = {"a": [0, 1, 2, 3, 4], "b": ["x", "y"], "c": [0, 5]} + +for _ in range(WARMUP): + df.isin(global_values) + df.isin(col_values) + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + df.isin(global_values) + df.isin(col_values) + times.append((time.perf_counter() - t0) * 1000) + +total_ms = sum(times) +print(json.dumps({"function": "dataframe_isin_fn", "mean_ms": round(total_ms / ITERATIONS, 3), "iterations": ITERATIONS, "total_ms": round(total_ms, 3)})) diff --git a/benchmarks/pandas/bench_isnull_notnull.py b/benchmarks/pandas/bench_isnull_notnull.py new file mode 100644 index 00000000..a435e793 --- /dev/null +++ b/benchmarks/pandas/bench_isnull_notnull.py @@ -0,0 +1,32 @@ +"""Benchmark: isnull / notnull — aliases for isna / notna on Series and DataFrame.""" +import json, time +import numpy as np +import pandas as pd + +SIZE = 100_000 +WARMUP = 5 +ITERATIONS = 50 + +s = 
pd.Series([np.nan if i % 7 == 0 else i * 0.1 for i in range(SIZE)]) +df = pd.DataFrame({ + "a": [np.nan if i % 5 == 0 else float(i) for i in range(SIZE)], + "b": [np.nan if i % 3 == 0 else i * 2.5 for i in range(SIZE)], +}) + +for _ in range(WARMUP): + pd.isnull(s) + pd.notnull(s) + pd.isnull(df) + pd.notnull(df) + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + pd.isnull(s) + pd.notnull(s) + pd.isnull(df) + pd.notnull(df) + times.append((time.perf_counter() - t0) * 1000) + +total_ms = sum(times) +print(json.dumps({"function": "isnull_notnull", "mean_ms": round(total_ms / ITERATIONS, 3), "iterations": ITERATIONS, "total_ms": round(total_ms, 3)})) diff --git a/benchmarks/pandas/bench_replace_series.py b/benchmarks/pandas/bench_replace_series.py new file mode 100644 index 00000000..903f3917 --- /dev/null +++ b/benchmarks/pandas/bench_replace_series.py @@ -0,0 +1,22 @@ +"""Benchmark: Series.replace — replace values in a Series.""" +import json, time +import pandas as pd + +SIZE = 100_000 +WARMUP = 5 +ITERATIONS = 50 + +s = pd.Series([i % 10 for i in range(SIZE)]) +mapping = {0: 100, 1: 200, 2: 300, 3: 400, 4: 500} + +for _ in range(WARMUP): + s.replace(mapping) + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + s.replace(mapping) + times.append((time.perf_counter() - t0) * 1000) + +total_ms = sum(times) +print(json.dumps({"function": "replace_series", "mean_ms": round(total_ms / ITERATIONS, 3), "iterations": ITERATIONS, "total_ms": round(total_ms, 3)})) diff --git a/benchmarks/pandas/bench_to_numeric_scalar.py b/benchmarks/pandas/bench_to_numeric_scalar.py new file mode 100644 index 00000000..f8e526f0 --- /dev/null +++ b/benchmarks/pandas/bench_to_numeric_scalar.py @@ -0,0 +1,37 @@ +"""Benchmark: pd.to_numeric scalar coercion — convert individual scalar values to numeric.""" +import json, time +import pandas as pd + +WARMUP = 5 +ITERATIONS = 100 +BATCH = 10_000 + +inputs = [] +for i in range(BATCH): + r = i % 6 + if r == 0: + 
inputs.append(str(i * 1.5)) + elif r == 1: + inputs.append(i) + elif r == 2: + inputs.append(f" {i} ") + elif r == 3: + inputs.append(True) + elif r == 4: + inputs.append(None) + else: + inputs.append(str(i)) + +for _ in range(WARMUP): + for v in inputs: + pd.to_numeric(v, errors="coerce") + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + for v in inputs: + pd.to_numeric(v, errors="coerce") + times.append((time.perf_counter() - t0) * 1000) + +total_ms = sum(times) +print(json.dumps({"function": "to_numeric_scalar", "mean_ms": round(total_ms / ITERATIONS, 3), "iterations": ITERATIONS, "total_ms": round(total_ms, 3)})) diff --git a/benchmarks/tsb/bench_dataframe_assign_fn.ts b/benchmarks/tsb/bench_dataframe_assign_fn.ts new file mode 100644 index 00000000..4cb258c9 --- /dev/null +++ b/benchmarks/tsb/bench_dataframe_assign_fn.ts @@ -0,0 +1,51 @@ +/** + * Benchmark: dataFrameAssign — add new columns to a DataFrame using the functional API. + * Outputs JSON: {"function": "dataframe_assign_fn", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { dataFrameAssign, DataFrame, Series } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const df = new DataFrame({ + a: Array.from({ length: SIZE }, (_, i) => i * 1.0), + b: Array.from({ length: SIZE }, (_, i) => i * 2.0), +}); +const colC = new Series({ data: Array.from({ length: SIZE }, (_, i) => i * 3.0), name: "c" }); + +for (let i = 0; i < WARMUP; i++) { + dataFrameAssign(df, { + c: colC, + d: (working) => { + const aVals = working.col("a").values; + const cVals = working.col("c").values; + return new Series({ data: aVals.map((v, idx) => (v as number) + (cVals[idx] as number)) }); + }, + }); +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const start = performance.now(); + dataFrameAssign(df, { + c: colC, + d: (working) => { + const aVals = working.col("a").values; + const cVals = working.col("c").values; + return 
new Series({ data: aVals.map((v, idx) => (v as number) + (cVals[idx] as number)) }); + }, + }); + times.push(performance.now() - start); +} + +const totalMs = times.reduce((a, b) => a + b, 0); +const meanMs = totalMs / ITERATIONS; +console.log( + JSON.stringify({ + function: "dataframe_assign_fn", + mean_ms: Math.round(meanMs * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(totalMs * 1000) / 1000, + }), +); diff --git a/benchmarks/tsb/bench_dataframe_isin_fn.ts b/benchmarks/tsb/bench_dataframe_isin_fn.ts new file mode 100644 index 00000000..48b7684f --- /dev/null +++ b/benchmarks/tsb/bench_dataframe_isin_fn.ts @@ -0,0 +1,44 @@ +/** + * Benchmark: dataFrameIsin — test membership of each element in a DataFrame against value sets. + * Outputs JSON: {"function": "dataframe_isin_fn", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { dataFrameIsin, DataFrame } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const df = new DataFrame({ + a: Array.from({ length: SIZE }, (_, i) => i % 20), + b: Array.from({ length: SIZE }, (_, i) => ["x", "y", "z", "w"][i % 4]), + c: Array.from({ length: SIZE }, (_, i) => i % 10), +}); + +// Global isin — check all columns +const globalValues = [0, 1, 2, "x", "y"]; +// Per-column isin dict +const colValues = { a: [0, 1, 2, 3, 4], b: ["x", "y"], c: [0, 5] }; + +for (let i = 0; i < WARMUP; i++) { + dataFrameIsin(df, globalValues); + dataFrameIsin(df, colValues); +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const start = performance.now(); + dataFrameIsin(df, globalValues); + dataFrameIsin(df, colValues); + times.push(performance.now() - start); +} + +const totalMs = times.reduce((a, b) => a + b, 0); +const meanMs = totalMs / ITERATIONS; +console.log( + JSON.stringify({ + function: "dataframe_isin_fn", + mean_ms: Math.round(meanMs * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(totalMs * 1000) / 1000, + }), +); diff 
--git a/benchmarks/tsb/bench_isnull_notnull.ts b/benchmarks/tsb/bench_isnull_notnull.ts new file mode 100644 index 00000000..f27abcf0 --- /dev/null +++ b/benchmarks/tsb/bench_isnull_notnull.ts @@ -0,0 +1,45 @@ +/** + * Benchmark: isnull / notnull — aliases for isna / notna on Series and DataFrame. + * Outputs JSON: {"function": "isnull_notnull", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { isnull, notnull, Series, DataFrame } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const s = new Series({ + data: Array.from({ length: SIZE }, (_, i) => (i % 7 === 0 ? null : i * 0.1)), +}); +const df = new DataFrame({ + a: Array.from({ length: SIZE }, (_, i) => (i % 5 === 0 ? null : i)), + b: Array.from({ length: SIZE }, (_, i) => (i % 3 === 0 ? null : i * 2.5)), +}); + +for (let i = 0; i < WARMUP; i++) { + isnull(s); + notnull(s); + isnull(df); + notnull(df); +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const start = performance.now(); + isnull(s); + notnull(s); + isnull(df); + notnull(df); + times.push(performance.now() - start); +} + +const totalMs = times.reduce((a, b) => a + b, 0); +const meanMs = totalMs / ITERATIONS; +console.log( + JSON.stringify({ + function: "isnull_notnull", + mean_ms: Math.round(meanMs * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(totalMs * 1000) / 1000, + }), +); diff --git a/benchmarks/tsb/bench_replace_series.ts b/benchmarks/tsb/bench_replace_series.ts new file mode 100644 index 00000000..23fcf515 --- /dev/null +++ b/benchmarks/tsb/bench_replace_series.ts @@ -0,0 +1,40 @@ +/** + * Benchmark: replaceSeries — replace values in a Series. 
+ * Outputs JSON: {"function": "replace_series", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { replaceSeries, Series } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const s = new Series({ data: Array.from({ length: SIZE }, (_, i) => i % 10) }); +const mapping = new Map([ + [0, 100], + [1, 200], + [2, 300], + [3, 400], + [4, 500], +]); + +for (let i = 0; i < WARMUP; i++) { + replaceSeries(s, mapping); +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const start = performance.now(); + replaceSeries(s, mapping); + times.push(performance.now() - start); +} + +const totalMs = times.reduce((a, b) => a + b, 0); +const meanMs = totalMs / ITERATIONS; +console.log( + JSON.stringify({ + function: "replace_series", + mean_ms: Math.round(meanMs * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(totalMs * 1000) / 1000, + }), +); diff --git a/benchmarks/tsb/bench_to_numeric_scalar.ts b/benchmarks/tsb/bench_to_numeric_scalar.ts new file mode 100644 index 00000000..7dd7e2aa --- /dev/null +++ b/benchmarks/tsb/bench_to_numeric_scalar.ts @@ -0,0 +1,41 @@ +/** + * Benchmark: toNumericScalar — coerce individual scalars to numeric values. 
+ * Outputs JSON: {"function": "to_numeric_scalar", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { toNumericScalar } from "../../src/index.ts"; + +const WARMUP = 5; +const ITERATIONS = 100; +const BATCH = 10_000; + +const inputs: unknown[] = Array.from({ length: BATCH }, (_, i) => { + const r = i % 6; + if (r === 0) return String(i * 1.5); + if (r === 1) return i; + if (r === 2) return ` ${i} `; + if (r === 3) return true; + if (r === 4) return null; + return String(i); +}); + +for (let i = 0; i < WARMUP; i++) { + for (const v of inputs) toNumericScalar(v, { errors: "coerce" }); +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const start = performance.now(); + for (const v of inputs) toNumericScalar(v, { errors: "coerce" }); + times.push(performance.now() - start); +} + +const totalMs = times.reduce((a, b) => a + b, 0); +const meanMs = totalMs / ITERATIONS; +console.log( + JSON.stringify({ + function: "to_numeric_scalar", + mean_ms: Math.round(meanMs * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(totalMs * 1000) / 1000, + }), +); From 952d47943a233caff0e20a5b67fdebe6bff6ae9e Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 17 Apr 2026 11:46:25 +0000 Subject: [PATCH 13/19] Iteration 151: Add 5 benchmark pairs (478 total, +5 vs best 473) Added benchmarks: - df_any_all_axis1: anyDataFrame/allDataFrame row-wise (axis=1) - df_nunique_axis1: nuniqueDataFrame row-wise (axis=1) - cat_codes_accessor: CategoricalAccessor.codes/nCategories/ordered properties - ewm_adjust: EWM with adjust=false (IIR) vs adjust=true - interpolate_bfill_limit: interpolateSeries bfill method with limit option Run: https://github.com/githubnext/tsessebe/actions/runs/24562479978 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- benchmarks/pandas/bench_cat_codes_accessor.py | 42 +++++++++++++++++++ benchmarks/pandas/bench_df_any_all_axis1.py 
| 38 +++++++++++++++++ benchmarks/pandas/bench_df_nunique_axis1.py | 37 ++++++++++++++++ benchmarks/pandas/bench_ewm_adjust.py | 36 ++++++++++++++++ .../pandas/bench_interpolate_bfill_limit.py | 36 ++++++++++++++++ benchmarks/tsb/bench_cat_codes_accessor.ts | 40 ++++++++++++++++++ benchmarks/tsb/bench_df_any_all_axis1.ts | 39 +++++++++++++++++ benchmarks/tsb/bench_df_nunique_axis1.ts | 38 +++++++++++++++++ benchmarks/tsb/bench_ewm_adjust.ts | 35 ++++++++++++++++ .../tsb/bench_interpolate_bfill_limit.ts | 40 ++++++++++++++++++ 10 files changed, 381 insertions(+) create mode 100644 benchmarks/pandas/bench_cat_codes_accessor.py create mode 100644 benchmarks/pandas/bench_df_any_all_axis1.py create mode 100644 benchmarks/pandas/bench_df_nunique_axis1.py create mode 100644 benchmarks/pandas/bench_ewm_adjust.py create mode 100644 benchmarks/pandas/bench_interpolate_bfill_limit.py create mode 100644 benchmarks/tsb/bench_cat_codes_accessor.ts create mode 100644 benchmarks/tsb/bench_df_any_all_axis1.ts create mode 100644 benchmarks/tsb/bench_df_nunique_axis1.ts create mode 100644 benchmarks/tsb/bench_ewm_adjust.ts create mode 100644 benchmarks/tsb/bench_interpolate_bfill_limit.ts diff --git a/benchmarks/pandas/bench_cat_codes_accessor.py b/benchmarks/pandas/bench_cat_codes_accessor.py new file mode 100644 index 00000000..a2d1462e --- /dev/null +++ b/benchmarks/pandas/bench_cat_codes_accessor.py @@ -0,0 +1,42 @@ +""" +Benchmark: pd.Categorical.codes / categories / ordered — category accessor properties +on a 100k-element categorical Series. 
+Outputs JSON: {"function": "cat_codes_accessor", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import numpy as np +import pandas as pd + +SIZE = 100_000 +CATS = 50 +WARMUP = 5 +ITERATIONS = 30 + +categories = [f"cat_{i}" for i in range(CATS)] +data = [categories[i % CATS] for i in range(SIZE)] +s = pd.Categorical(data, categories=categories) +ps = pd.Series(s) + +for _ in range(WARMUP): + _ = ps.cat.codes + _ = ps.cat.categories + _ = ps.cat.ordered + _ = len(ps.cat.categories) + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + _ = ps.cat.codes + _ = ps.cat.categories + _ = ps.cat.ordered + _ = len(ps.cat.categories) + times.append((time.perf_counter() - t0) * 1000) + +total_ms = sum(times) +print(json.dumps({ + "function": "cat_codes_accessor", + "mean_ms": round(total_ms / ITERATIONS, 3), + "iterations": ITERATIONS, + "total_ms": round(total_ms, 3), +})) diff --git a/benchmarks/pandas/bench_df_any_all_axis1.py b/benchmarks/pandas/bench_df_any_all_axis1.py new file mode 100644 index 00000000..f6b193a3 --- /dev/null +++ b/benchmarks/pandas/bench_df_any_all_axis1.py @@ -0,0 +1,38 @@ +""" +Benchmark: DataFrame.any(axis=1) / all(axis=1) — row-wise boolean reductions on 100k-row DataFrame. 
+Outputs JSON: {"function": "df_any_all_axis1", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import numpy as np +import pandas as pd + +SIZE = 100_000 +WARMUP = 5 +ITERATIONS = 20 + +df = pd.DataFrame({ + "a": np.arange(SIZE) % 2 == 0, + "b": np.arange(SIZE) % 3 != 0, + "c": np.arange(SIZE) > 0, + "d": np.arange(SIZE) % 5 == 0, +}) + +for _ in range(WARMUP): + df.any(axis=1) + df.all(axis=1) + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + df.any(axis=1) + df.all(axis=1) + times.append((time.perf_counter() - t0) * 1000) + +total_ms = sum(times) +print(json.dumps({ + "function": "df_any_all_axis1", + "mean_ms": round(total_ms / ITERATIONS, 3), + "iterations": ITERATIONS, + "total_ms": round(total_ms, 3), +})) diff --git a/benchmarks/pandas/bench_df_nunique_axis1.py b/benchmarks/pandas/bench_df_nunique_axis1.py new file mode 100644 index 00000000..2c4e7bad --- /dev/null +++ b/benchmarks/pandas/bench_df_nunique_axis1.py @@ -0,0 +1,37 @@ +""" +Benchmark: DataFrame.nunique(axis=1) — count unique values per row on a 10k-row DataFrame. 
+Outputs JSON: {"function": "df_nunique_axis1", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import numpy as np +import pandas as pd + +SIZE = 10_000 +WARMUP = 5 +ITERATIONS = 30 + +df = pd.DataFrame({ + "a": np.arange(SIZE) % 5, + "b": np.arange(SIZE) % 10, + "c": np.arange(SIZE) % 3, + "d": np.arange(SIZE) % 7, + "e": np.arange(SIZE) % 4, +}) + +for _ in range(WARMUP): + df.nunique(axis=1) + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + df.nunique(axis=1) + times.append((time.perf_counter() - t0) * 1000) + +total_ms = sum(times) +print(json.dumps({ + "function": "df_nunique_axis1", + "mean_ms": round(total_ms / ITERATIONS, 3), + "iterations": ITERATIONS, + "total_ms": round(total_ms, 3), +})) diff --git a/benchmarks/pandas/bench_ewm_adjust.py b/benchmarks/pandas/bench_ewm_adjust.py new file mode 100644 index 00000000..4336db98 --- /dev/null +++ b/benchmarks/pandas/bench_ewm_adjust.py @@ -0,0 +1,36 @@ +""" +Benchmark: EWM with adjust=False — IIR-based exponential weighted mean vs default adjust=True on 100k Series. 
+Outputs JSON: {"function": "ewm_adjust", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import numpy as np +import pandas as pd + +SIZE = 100_000 +WARMUP = 5 +ITERATIONS = 20 + +data = np.sin(np.arange(SIZE) * 0.01) * 100 +s = pd.Series(data) + +for _ in range(WARMUP): + s.ewm(alpha=0.3, adjust=False).mean() + s.ewm(alpha=0.3, adjust=True).mean() + s.ewm(span=20, adjust=False).mean() + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + s.ewm(alpha=0.3, adjust=False).mean() + s.ewm(alpha=0.3, adjust=True).mean() + s.ewm(span=20, adjust=False).mean() + times.append((time.perf_counter() - t0) * 1000) + +total_ms = sum(times) +print(json.dumps({ + "function": "ewm_adjust", + "mean_ms": round(total_ms / ITERATIONS, 3), + "iterations": ITERATIONS, + "total_ms": round(total_ms, 3), +})) diff --git a/benchmarks/pandas/bench_interpolate_bfill_limit.py b/benchmarks/pandas/bench_interpolate_bfill_limit.py new file mode 100644 index 00000000..4c94d6d9 --- /dev/null +++ b/benchmarks/pandas/bench_interpolate_bfill_limit.py @@ -0,0 +1,36 @@ +""" +Benchmark: Series.interpolate with bfill method and limit option — backward fill with gap limit on 50k Series. 
+Outputs JSON: {"function": "interpolate_bfill_limit", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import numpy as np +import pandas as pd + +SIZE = 50_000 +WARMUP = 5 +ITERATIONS = 30 + +data = np.where(np.arange(SIZE) % 7 < 2, np.nan, np.sin(np.arange(SIZE) * 0.01) * 100) +s = pd.Series(data) + +for _ in range(WARMUP): + s.interpolate(method="bfill") + s.ffill(limit=2) + s.bfill(limit=1) + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + s.interpolate(method="bfill") + s.ffill(limit=2) + s.bfill(limit=1) + times.append((time.perf_counter() - t0) * 1000) + +total_ms = sum(times) +print(json.dumps({ + "function": "interpolate_bfill_limit", + "mean_ms": round(total_ms / ITERATIONS, 3), + "iterations": ITERATIONS, + "total_ms": round(total_ms, 3), +})) diff --git a/benchmarks/tsb/bench_cat_codes_accessor.ts b/benchmarks/tsb/bench_cat_codes_accessor.ts new file mode 100644 index 00000000..c646e7d2 --- /dev/null +++ b/benchmarks/tsb/bench_cat_codes_accessor.ts @@ -0,0 +1,40 @@ +/** + * Benchmark: CategoricalAccessor.codes / nCategories / ordered — category accessor + * properties on a 100k-element categorical Series. 
+ * Outputs JSON: {"function": "cat_codes_accessor", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series } from "../../src/index.ts"; + +const SIZE = 100_000; +const CATS = 50; +const WARMUP = 5; +const ITERATIONS = 30; + +const categories = Array.from({ length: CATS }, (_, i) => `cat_${i}`); +const data = Array.from({ length: SIZE }, (_, i) => categories[i % CATS]); +const s = new Series({ data }); + +for (let i = 0; i < WARMUP; i++) { + void s.cat.codes; + void s.cat.nCategories; + void s.cat.ordered; + void s.cat.categories; +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + void s.cat.codes; + void s.cat.nCategories; + void s.cat.ordered; + void s.cat.categories; +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "cat_codes_accessor", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_df_any_all_axis1.ts b/benchmarks/tsb/bench_df_any_all_axis1.ts new file mode 100644 index 00000000..53285ad1 --- /dev/null +++ b/benchmarks/tsb/bench_df_any_all_axis1.ts @@ -0,0 +1,39 @@ +/** + * Benchmark: anyDataFrame / allDataFrame with axis=1 — row-wise boolean reductions on 100k-row DataFrame. 
+ * Outputs JSON: {"function": "df_any_all_axis1", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { DataFrame, Series, anyDataFrame, allDataFrame } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 20; + +const df = new DataFrame({ + columns: new Map([ + ["a", new Series({ data: Array.from({ length: SIZE }, (_, i) => i % 2 === 0) })], + ["b", new Series({ data: Array.from({ length: SIZE }, (_, i) => i % 3 !== 0) })], + ["c", new Series({ data: Array.from({ length: SIZE }, (_, i) => i > 0) })], + ["d", new Series({ data: Array.from({ length: SIZE }, (_, i) => i % 5 === 0) })], + ]), +}); + +for (let i = 0; i < WARMUP; i++) { + anyDataFrame(df, { axis: 1 }); + allDataFrame(df, { axis: 1 }); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + anyDataFrame(df, { axis: 1 }); + allDataFrame(df, { axis: 1 }); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "df_any_all_axis1", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_df_nunique_axis1.ts b/benchmarks/tsb/bench_df_nunique_axis1.ts new file mode 100644 index 00000000..c1379da8 --- /dev/null +++ b/benchmarks/tsb/bench_df_nunique_axis1.ts @@ -0,0 +1,38 @@ +/** + * Benchmark: nuniqueDataFrame with axis=1 — count unique values per row on a 10k-row DataFrame. 
+ * Outputs JSON: {"function": "df_nunique_axis1", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { DataFrame, Series, nuniqueDataFrame } from "../../src/index.ts"; + +const SIZE = 10_000; +const WARMUP = 5; +const ITERATIONS = 30; + +const df = new DataFrame({ + columns: new Map([ + ["a", new Series({ data: Array.from({ length: SIZE }, (_, i) => i % 5) })], + ["b", new Series({ data: Array.from({ length: SIZE }, (_, i) => i % 10) })], + ["c", new Series({ data: Array.from({ length: SIZE }, (_, i) => i % 3) })], + ["d", new Series({ data: Array.from({ length: SIZE }, (_, i) => i % 7) })], + ["e", new Series({ data: Array.from({ length: SIZE }, (_, i) => i % 4) })], + ]), +}); + +for (let i = 0; i < WARMUP; i++) { + nuniqueDataFrame(df, { axis: 1 }); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + nuniqueDataFrame(df, { axis: 1 }); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "df_nunique_axis1", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_ewm_adjust.ts b/benchmarks/tsb/bench_ewm_adjust.ts new file mode 100644 index 00000000..6ccc52f8 --- /dev/null +++ b/benchmarks/tsb/bench_ewm_adjust.ts @@ -0,0 +1,35 @@ +/** + * Benchmark: EWM with adjust=false — IIR-based exponential weighted mean vs default adjust=true on 100k Series. 
+ * Outputs JSON: {"function": "ewm_adjust", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 20; + +const data = Array.from({ length: SIZE }, (_, i) => Math.sin(i * 0.01) * 100); +const s = new Series({ data }); + +for (let i = 0; i < WARMUP; i++) { + s.ewm({ alpha: 0.3, adjust: false }).mean(); + s.ewm({ alpha: 0.3, adjust: true }).mean(); + s.ewm({ span: 20, adjust: false }).mean(); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + s.ewm({ alpha: 0.3, adjust: false }).mean(); + s.ewm({ alpha: 0.3, adjust: true }).mean(); + s.ewm({ span: 20, adjust: false }).mean(); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "ewm_adjust", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_interpolate_bfill_limit.ts b/benchmarks/tsb/bench_interpolate_bfill_limit.ts new file mode 100644 index 00000000..83eb5d29 --- /dev/null +++ b/benchmarks/tsb/bench_interpolate_bfill_limit.ts @@ -0,0 +1,40 @@ +/** + * Benchmark: interpolateSeries with bfill method and limit option — backward fill with gap limit on 50k Series. 
+ * Outputs JSON: {"function": "interpolate_bfill_limit", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series, interpolateSeries } from "../../src/index.ts"; + +const SIZE = 50_000; +const WARMUP = 5; +const ITERATIONS = 30; + +// ~29% null values: two consecutive nulls at the start of every 7-element run +const data = Array.from({ length: SIZE }, (_, i) => { + const gap = i % 7; + if (gap === 0 || gap === 1) return null; + return Math.sin(i * 0.01) * 100; +}); +const s = new Series({ data }); + +for (let i = 0; i < WARMUP; i++) { + interpolateSeries(s, { method: "bfill" }); + interpolateSeries(s, { method: "ffill", limit: 2 }); + interpolateSeries(s, { method: "bfill", limit: 1 }); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + interpolateSeries(s, { method: "bfill" }); + interpolateSeries(s, { method: "ffill", limit: 2 }); + interpolateSeries(s, { method: "bfill", limit: 1 }); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "interpolate_bfill_limit", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); From 3e8972583e95ed030d2f398fbecbf5c3ba7930cc Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 17 Apr 2026 12:35:56 +0000 Subject: [PATCH 14/19] Iteration 153: Add 5 benchmark pairs (483 total, +5 vs best 478) Added 5 new benchmark pairs: - datetime_index_ops: DatetimeIndex sort/unique/toStrings/slice/contains/concat - datetime_index_snap: DatetimeIndex.snap(freq) to month-start and week boundaries - period_index_query: PeriodIndex.getLoc/contains querying operations - series_groupby_agg_all: SeriesGroupBy all aggregations (sum/mean/std/min/max/count/first/last) - dataframe_rolling_median: DataFrameRolling.median and DataFrameExpanding.median Run: https://github.com/githubnext/tsessebe/actions/runs/24564770860 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> ---
.../pandas/bench_dataframe_rolling_median.py | 36 ++++++++++++++ benchmarks/pandas/bench_datetime_index_ops.py | 43 ++++++++++++++++ .../pandas/bench_datetime_index_snap.py | 34 +++++++++++++ benchmarks/pandas/bench_period_index_query.py | 42 ++++++++++++++++ .../pandas/bench_series_groupby_agg_all.py | 48 ++++++++++++++++++ .../tsb/bench_dataframe_rolling_median.ts | 38 ++++++++++++++ benchmarks/tsb/bench_datetime_index_ops.ts | 45 +++++++++++++++++ benchmarks/tsb/bench_datetime_index_snap.ts | 36 ++++++++++++++ benchmarks/tsb/bench_period_index_query.ts | 44 +++++++++++++++++ .../tsb/bench_series_groupby_agg_all.ts | 49 +++++++++++++++++++ 10 files changed, 415 insertions(+) create mode 100644 benchmarks/pandas/bench_dataframe_rolling_median.py create mode 100644 benchmarks/pandas/bench_datetime_index_ops.py create mode 100644 benchmarks/pandas/bench_datetime_index_snap.py create mode 100644 benchmarks/pandas/bench_period_index_query.py create mode 100644 benchmarks/pandas/bench_series_groupby_agg_all.py create mode 100644 benchmarks/tsb/bench_dataframe_rolling_median.ts create mode 100644 benchmarks/tsb/bench_datetime_index_ops.ts create mode 100644 benchmarks/tsb/bench_datetime_index_snap.ts create mode 100644 benchmarks/tsb/bench_period_index_query.ts create mode 100644 benchmarks/tsb/bench_series_groupby_agg_all.ts diff --git a/benchmarks/pandas/bench_dataframe_rolling_median.py b/benchmarks/pandas/bench_dataframe_rolling_median.py new file mode 100644 index 00000000..4f2a7832 --- /dev/null +++ b/benchmarks/pandas/bench_dataframe_rolling_median.py @@ -0,0 +1,36 @@ +""" +Benchmark: pandas DataFrame.rolling(10).median() / DataFrame.expanding(1).median() — rolling and expanding median on DataFrame. 
+Outputs JSON: {"function": "dataframe_rolling_median", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import pandas as pd + +ROWS = 10_000 +WARMUP = 3 +ITERATIONS = 20 + +df = pd.DataFrame({ + "a": [i * 0.1 for i in range(ROWS)], + "b": [(i * 0.3) % 500 for i in range(ROWS)], +}) + +for _ in range(WARMUP): + df.rolling(10).median() + df.expanding(1).median() + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + df.rolling(10).median() + df.expanding(1).median() + times.append((time.perf_counter() - t0) * 1000) + +total_ms = sum(times) +mean_ms = total_ms / ITERATIONS +print(json.dumps({ + "function": "dataframe_rolling_median", + "mean_ms": round(mean_ms, 3), + "iterations": ITERATIONS, + "total_ms": round(total_ms, 3), +})) diff --git a/benchmarks/pandas/bench_datetime_index_ops.py b/benchmarks/pandas/bench_datetime_index_ops.py new file mode 100644 index 00000000..ebbb1f54 --- /dev/null +++ b/benchmarks/pandas/bench_datetime_index_ops.py @@ -0,0 +1,43 @@ +""" +Benchmark: pandas DatetimeIndex sort_values / unique / strftime / slice / isin / append — DatetimeIndex operations. 
+Outputs JSON: {"function": "datetime_index_ops", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import pandas as pd + +SIZE = 10_000 +WARMUP = 5 +ITERATIONS = 50 + +idx = pd.date_range(start="2020-01-01", periods=SIZE, freq="h") +idx2 = pd.date_range(start="2021-01-01", periods=SIZE, freq="h") +ref_date = pd.Timestamp("2020-06-15T00:00:00Z") + +for _ in range(WARMUP): + idx.sort_values() + idx.unique() + idx.strftime("%Y-%m-%dT%H:%M:%SZ") + idx[:100] + ref_date in idx + idx.append(idx2) + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + idx.sort_values() + idx.unique() + idx.strftime("%Y-%m-%dT%H:%M:%SZ") + idx[:100] + ref_date in idx + idx.append(idx2) + times.append((time.perf_counter() - t0) * 1000) + +total_ms = sum(times) +mean_ms = total_ms / ITERATIONS +print(json.dumps({ + "function": "datetime_index_ops", + "mean_ms": round(mean_ms, 3), + "iterations": ITERATIONS, + "total_ms": round(total_ms, 3), +})) diff --git a/benchmarks/pandas/bench_datetime_index_snap.py b/benchmarks/pandas/bench_datetime_index_snap.py new file mode 100644 index 00000000..b62a4490 --- /dev/null +++ b/benchmarks/pandas/bench_datetime_index_snap.py @@ -0,0 +1,34 @@ +""" +Benchmark: pandas DatetimeIndex.snap(freq) — snap index to frequency boundaries (round to nearest). 
+Outputs JSON: {"function": "datetime_index_snap", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import pandas as pd + +SIZE = 5_000 +WARMUP = 5 +ITERATIONS = 50 + +# Dates that are not on month/week boundaries +idx = pd.date_range(start="2020-01-15", periods=SIZE, freq="D") + +for _ in range(WARMUP): + idx.snap("MS") # snap to month start + idx.snap("W") # snap to week + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + idx.snap("MS") + idx.snap("W") + times.append((time.perf_counter() - t0) * 1000) + +total_ms = sum(times) +mean_ms = total_ms / ITERATIONS +print(json.dumps({ + "function": "datetime_index_snap", + "mean_ms": round(mean_ms, 3), + "iterations": ITERATIONS, + "total_ms": round(total_ms, 3), +})) diff --git a/benchmarks/pandas/bench_period_index_query.py b/benchmarks/pandas/bench_period_index_query.py new file mode 100644 index 00000000..0e6d3e01 --- /dev/null +++ b/benchmarks/pandas/bench_period_index_query.py @@ -0,0 +1,42 @@ +""" +Benchmark: pandas PeriodIndex.get_loc / isin — querying a PeriodIndex. 
+Outputs JSON: {"function": "period_index_query", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import pandas as pd + +SIZE = 1_000 +WARMUP = 5 +ITERATIONS = 100 + +base = pd.Period("2020-01", freq="M") +periods = [base + i for i in range(SIZE)] +idx = pd.PeriodIndex(periods) + +query_period = base + 500 +mid_period = base + 250 + +for _ in range(WARMUP): + idx.get_loc(query_period) + query_period in idx + idx.get_loc(mid_period) + mid_period in idx + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + idx.get_loc(query_period) + query_period in idx + idx.get_loc(mid_period) + mid_period in idx + times.append((time.perf_counter() - t0) * 1000) + +total_ms = sum(times) +mean_ms = total_ms / ITERATIONS +print(json.dumps({ + "function": "period_index_query", + "mean_ms": round(mean_ms, 3), + "iterations": ITERATIONS, + "total_ms": round(total_ms, 3), +})) diff --git a/benchmarks/pandas/bench_series_groupby_agg_all.py b/benchmarks/pandas/bench_series_groupby_agg_all.py new file mode 100644 index 00000000..a99588ff --- /dev/null +++ b/benchmarks/pandas/bench_series_groupby_agg_all.py @@ -0,0 +1,48 @@ +""" +Benchmark: pandas SeriesGroupBy — all aggregation operations (sum/mean/std/min/max/count/first/last) on 100k Series. 
+Outputs JSON: {"function": "series_groupby_agg_all", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import numpy as np +import pandas as pd + +SIZE = 100_000 +WARMUP = 3 +ITERATIONS = 10 + +s = pd.Series((np.arange(SIZE) * 1.5) % 9999) +by = pd.Series(np.arange(SIZE) % 100) +gb = s.groupby(by) + +for _ in range(WARMUP): + gb.sum() + gb.mean() + gb.std() + gb.min() + gb.max() + gb.count() + gb.first() + gb.last() + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + gb.sum() + gb.mean() + gb.std() + gb.min() + gb.max() + gb.count() + gb.first() + gb.last() + times.append((time.perf_counter() - t0) * 1000) + +total_ms = sum(times) +mean_ms = total_ms / ITERATIONS +print(json.dumps({ + "function": "series_groupby_agg_all", + "mean_ms": round(mean_ms, 3), + "iterations": ITERATIONS, + "total_ms": round(total_ms, 3), +})) diff --git a/benchmarks/tsb/bench_dataframe_rolling_median.ts b/benchmarks/tsb/bench_dataframe_rolling_median.ts new file mode 100644 index 00000000..7f11ec4c --- /dev/null +++ b/benchmarks/tsb/bench_dataframe_rolling_median.ts @@ -0,0 +1,38 @@ +/** + * Benchmark: DataFrameRolling.median / DataFrameExpanding.median — rolling and expanding median on DataFrame. 
+ * Outputs JSON: {"function": "dataframe_rolling_median", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { DataFrame } from "../../src/index.ts"; + +const ROWS = 10_000; +const WARMUP = 3; +const ITERATIONS = 20; + +const df = DataFrame.fromColumns({ + a: Array.from({ length: ROWS }, (_, i) => i * 0.1), + b: Array.from({ length: ROWS }, (_, i) => (i * 0.3) % 500), +}); + +for (let i = 0; i < WARMUP; i++) { + df.rolling(10).median(); + df.expanding(1).median(); +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + df.rolling(10).median(); + df.expanding(1).median(); + times.push(performance.now() - t0); +} + +const totalMs = times.reduce((a, b) => a + b, 0); +const meanMs = totalMs / ITERATIONS; +console.log( + JSON.stringify({ + function: "dataframe_rolling_median", + mean_ms: Math.round(meanMs * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(totalMs * 1000) / 1000, + }), +); diff --git a/benchmarks/tsb/bench_datetime_index_ops.ts b/benchmarks/tsb/bench_datetime_index_ops.ts new file mode 100644 index 00000000..f85902da --- /dev/null +++ b/benchmarks/tsb/bench_datetime_index_ops.ts @@ -0,0 +1,45 @@ +/** + * Benchmark: DatetimeIndex sort / unique / toStrings / slice / contains / concat — DatetimeIndex operations. 
+ * Outputs JSON: {"function": "datetime_index_ops", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { date_range } from "../../src/index.ts"; + +const SIZE = 10_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const idx = date_range({ start: "2020-01-01", periods: SIZE, freq: "h" }); +const idx2 = date_range({ start: "2021-01-01", periods: SIZE, freq: "h" }); +const refDate = new Date("2020-06-15T00:00:00Z"); + +for (let i = 0; i < WARMUP; i++) { + idx.sort(); + idx.unique(); + idx.toStrings(); + idx.slice(0, 100); + idx.contains(refDate); + idx.concat(idx2); +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + idx.sort(); + idx.unique(); + idx.toStrings(); + idx.slice(0, 100); + idx.contains(refDate); + idx.concat(idx2); + times.push(performance.now() - t0); +} + +const totalMs = times.reduce((a, b) => a + b, 0); +const meanMs = totalMs / ITERATIONS; +console.log( + JSON.stringify({ + function: "datetime_index_ops", + mean_ms: Math.round(meanMs * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(totalMs * 1000) / 1000, + }), +); diff --git a/benchmarks/tsb/bench_datetime_index_snap.ts b/benchmarks/tsb/bench_datetime_index_snap.ts new file mode 100644 index 00000000..2f98aa4c --- /dev/null +++ b/benchmarks/tsb/bench_datetime_index_snap.ts @@ -0,0 +1,36 @@ +/** + * Benchmark: DatetimeIndex.snap(freq) — snap index dates to frequency boundaries. 
+ * Outputs JSON: {"function": "datetime_index_snap", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { date_range } from "../../src/index.ts"; + +const SIZE = 5_000; +const WARMUP = 5; +const ITERATIONS = 50; + +// Dates that are not on month/week boundaries +const idx = date_range({ start: "2020-01-15", periods: SIZE, freq: "D" }); + +for (let i = 0; i < WARMUP; i++) { + idx.snap("MS"); // snap to month start + idx.snap("W"); // snap to week +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + idx.snap("MS"); + idx.snap("W"); + times.push(performance.now() - t0); +} + +const totalMs = times.reduce((a, b) => a + b, 0); +const meanMs = totalMs / ITERATIONS; +console.log( + JSON.stringify({ + function: "datetime_index_snap", + mean_ms: Math.round(meanMs * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(totalMs * 1000) / 1000, + }), +); diff --git a/benchmarks/tsb/bench_period_index_query.ts b/benchmarks/tsb/bench_period_index_query.ts new file mode 100644 index 00000000..792412d1 --- /dev/null +++ b/benchmarks/tsb/bench_period_index_query.ts @@ -0,0 +1,44 @@ +/** + * Benchmark: PeriodIndex.getLoc / contains — querying a PeriodIndex. 
+ * Outputs JSON: {"function": "period_index_query", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Period, PeriodIndex } from "../../src/index.ts"; + +const SIZE = 1_000; +const WARMUP = 5; +const ITERATIONS = 100; + +const base = Period.fromDate(new Date(Date.UTC(2020, 0, 1)), "M"); +const periods = Array.from({ length: SIZE }, (_, i) => base.add(i)); +const idx = PeriodIndex.fromPeriods(periods); + +const queryPeriod = base.add(500); +const midPeriod = base.add(250); + +for (let i = 0; i < WARMUP; i++) { + idx.getLoc(queryPeriod); + idx.contains(queryPeriod); + idx.getLoc(midPeriod); + idx.contains(midPeriod); +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + idx.getLoc(queryPeriod); + idx.contains(queryPeriod); + idx.getLoc(midPeriod); + idx.contains(midPeriod); + times.push(performance.now() - t0); +} + +const totalMs = times.reduce((a, b) => a + b, 0); +const meanMs = totalMs / ITERATIONS; +console.log( + JSON.stringify({ + function: "period_index_query", + mean_ms: Math.round(meanMs * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(totalMs * 1000) / 1000, + }), +); diff --git a/benchmarks/tsb/bench_series_groupby_agg_all.ts b/benchmarks/tsb/bench_series_groupby_agg_all.ts new file mode 100644 index 00000000..f3e7eb38 --- /dev/null +++ b/benchmarks/tsb/bench_series_groupby_agg_all.ts @@ -0,0 +1,49 @@ +/** + * Benchmark: SeriesGroupBy — all aggregation operations (sum/mean/std/min/max/count/first/last) on 100k Series. 
+ * Outputs JSON: {"function": "series_groupby_agg_all", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const s = new Series({ data: Array.from({ length: SIZE }, (_, i) => (i * 1.5) % 9999) }); +const by = new Series({ data: Array.from({ length: SIZE }, (_, i) => i % 100) }); +const gb = s.groupby(by); + +for (let i = 0; i < WARMUP; i++) { + gb.sum(); + gb.mean(); + gb.std(); + gb.min(); + gb.max(); + gb.count(); + gb.first(); + gb.last(); +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + gb.sum(); + gb.mean(); + gb.std(); + gb.min(); + gb.max(); + gb.count(); + gb.first(); + gb.last(); + times.push(performance.now() - t0); +} + +const totalMs = times.reduce((a, b) => a + b, 0); +const meanMs = totalMs / ITERATIONS; +console.log( + JSON.stringify({ + function: "series_groupby_agg_all", + mean_ms: Math.round(meanMs * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(totalMs * 1000) / 1000, + }), +); From 86d054ba0616c9067e7ccff4e39e3e3fe382f05f Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 17 Apr 2026 13:13:45 +0000 Subject: [PATCH 15/19] Iteration 154: Add 5 benchmark pairs (488 total, +5 vs best 483) Added 5 new pairs: - datetime_index_normalize_filter_shift (DatetimeIndex.normalize/filter/shift) - index_map (Index.map transform function) - multi_index_fromtuples (MultiIndex.fromTuples construction) - timedelta_advanced_ops (Timedelta.parse/toISOString/divBy/negate/mul/compareTo) - dataframe_rolling_var_std_sum_count (DataFrameRolling.var/std/sum/count) Run: https://github.com/githubnext/tsessebe/actions/runs/24565880287 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- ...nch_dataframe_rolling_var_std_sum_count.py | 43 +++++++++++++ 
...h_datetime_index_normalize_filter_shift.py | 36 +++++++++++ benchmarks/pandas/bench_index_map.py | 34 +++++++++++ .../pandas/bench_multi_index_fromtuples.py | 34 +++++++++++ .../pandas/bench_timedelta_advanced_ops.py | 56 +++++++++++++++++ ...nch_dataframe_rolling_var_std_sum_count.ts | 44 ++++++++++++++ ...h_datetime_index_normalize_filter_shift.ts | 39 ++++++++++++ benchmarks/tsb/bench_index_map.ts | 36 +++++++++++ .../tsb/bench_multi_index_fromtuples.ts | 46 ++++++++++++++ .../tsb/bench_timedelta_advanced_ops.ts | 60 +++++++++++++++++++ 10 files changed, 428 insertions(+) create mode 100644 benchmarks/pandas/bench_dataframe_rolling_var_std_sum_count.py create mode 100644 benchmarks/pandas/bench_datetime_index_normalize_filter_shift.py create mode 100644 benchmarks/pandas/bench_index_map.py create mode 100644 benchmarks/pandas/bench_multi_index_fromtuples.py create mode 100644 benchmarks/pandas/bench_timedelta_advanced_ops.py create mode 100644 benchmarks/tsb/bench_dataframe_rolling_var_std_sum_count.ts create mode 100644 benchmarks/tsb/bench_datetime_index_normalize_filter_shift.ts create mode 100644 benchmarks/tsb/bench_index_map.ts create mode 100644 benchmarks/tsb/bench_multi_index_fromtuples.ts create mode 100644 benchmarks/tsb/bench_timedelta_advanced_ops.ts diff --git a/benchmarks/pandas/bench_dataframe_rolling_var_std_sum_count.py b/benchmarks/pandas/bench_dataframe_rolling_var_std_sum_count.py new file mode 100644 index 00000000..dc1c020d --- /dev/null +++ b/benchmarks/pandas/bench_dataframe_rolling_var_std_sum_count.py @@ -0,0 +1,43 @@ +""" +Benchmark: pandas DataFrame.rolling().var() / std() / sum() / count() — rolling aggregations. 
+Outputs JSON: {"function": "dataframe_rolling_var_std_sum_count", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import pandas as pd +import numpy as np + +SIZE = 50_000 +WINDOW = 20 +WARMUP = 5 +ITERATIONS = 30 + +df = pd.DataFrame({ + "a": np.sin(np.arange(SIZE) * 0.01) * 100, + "b": np.cos(np.arange(SIZE) * 0.01) * 50, + "c": (np.arange(SIZE) % 100) * 1.5, +}) + +for _ in range(WARMUP): + df.rolling(WINDOW).var() + df.rolling(WINDOW).std() + df.rolling(WINDOW).sum() + df.rolling(WINDOW).count() + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + df.rolling(WINDOW).var() + df.rolling(WINDOW).std() + df.rolling(WINDOW).sum() + df.rolling(WINDOW).count() + times.append((time.perf_counter() - t0) * 1000) + +total_ms = sum(times) +mean_ms = total_ms / ITERATIONS +print(json.dumps({ + "function": "dataframe_rolling_var_std_sum_count", + "mean_ms": round(mean_ms, 3), + "iterations": ITERATIONS, + "total_ms": round(total_ms, 3), +})) diff --git a/benchmarks/pandas/bench_datetime_index_normalize_filter_shift.py b/benchmarks/pandas/bench_datetime_index_normalize_filter_shift.py new file mode 100644 index 00000000..f091f47a --- /dev/null +++ b/benchmarks/pandas/bench_datetime_index_normalize_filter_shift.py @@ -0,0 +1,36 @@ +""" +Benchmark: pandas DatetimeIndex.normalize() / date filtering / shift — DatetimeIndex transforms. 
+Outputs JSON: {"function": "datetime_index_normalize_filter_shift", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import pandas as pd + +SIZE = 5_000 +WARMUP = 5 +ITERATIONS = 50 + +idx = pd.date_range(start="2020-01-01 12:30:00", periods=SIZE, freq="h") +cutoff = pd.Timestamp("2021-01-01") + +for _ in range(WARMUP): + idx.normalize() + idx[idx < cutoff] + idx.shift(7, freq="D") + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + idx.normalize() + idx[idx < cutoff] + idx.shift(7, freq="D") + times.append((time.perf_counter() - t0) * 1000) + +total_ms = sum(times) +mean_ms = total_ms / ITERATIONS +print(json.dumps({ + "function": "datetime_index_normalize_filter_shift", + "mean_ms": round(mean_ms, 3), + "iterations": ITERATIONS, + "total_ms": round(total_ms, 3), +})) diff --git a/benchmarks/pandas/bench_index_map.py b/benchmarks/pandas/bench_index_map.py new file mode 100644 index 00000000..ddc89d5d --- /dev/null +++ b/benchmarks/pandas/bench_index_map.py @@ -0,0 +1,34 @@ +""" +Benchmark: pandas Index.map(fn) — transform Index values with a mapping function. 
+Outputs JSON: {"function": "index_map", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import pandas as pd + +SIZE = 50_000 +WARMUP = 5 +ITERATIONS = 50 + +num_idx = pd.Index(range(SIZE)) +str_idx = pd.Index([f"key_{i % 1000}" for i in range(SIZE)]) + +for _ in range(WARMUP): + num_idx.map(lambda v: v * 2) + str_idx.map(lambda v: v.upper()) + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + num_idx.map(lambda v: v * 2) + str_idx.map(lambda v: v.upper()) + times.append((time.perf_counter() - t0) * 1000) + +total_ms = sum(times) +mean_ms = total_ms / ITERATIONS +print(json.dumps({ + "function": "index_map", + "mean_ms": round(mean_ms, 3), + "iterations": ITERATIONS, + "total_ms": round(total_ms, 3), +})) diff --git a/benchmarks/pandas/bench_multi_index_fromtuples.py b/benchmarks/pandas/bench_multi_index_fromtuples.py new file mode 100644 index 00000000..8437d4a9 --- /dev/null +++ b/benchmarks/pandas/bench_multi_index_fromtuples.py @@ -0,0 +1,34 @@ +""" +Benchmark: pandas MultiIndex.from_tuples — construct MultiIndex from array of tuples. 
+Outputs JSON: {"function": "multi_index_fromtuples", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import pandas as pd + +SIZE = 5_000 +WARMUP = 3 +ITERATIONS = 20 + +tuples2 = [(f"dept_{i % 20}", i % 100) for i in range(SIZE)] +tuples3 = [(f"region_{i % 5}", f"dept_{i % 20}", i % 50) for i in range(SIZE)] + +for _ in range(WARMUP): + pd.MultiIndex.from_tuples(tuples2) + pd.MultiIndex.from_tuples(tuples3) + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + pd.MultiIndex.from_tuples(tuples2) + pd.MultiIndex.from_tuples(tuples3) + times.append((time.perf_counter() - t0) * 1000) + +total_ms = sum(times) +mean_ms = total_ms / ITERATIONS +print(json.dumps({ + "function": "multi_index_fromtuples", + "mean_ms": round(mean_ms, 3), + "iterations": ITERATIONS, + "total_ms": round(total_ms, 3), +})) diff --git a/benchmarks/pandas/bench_timedelta_advanced_ops.py b/benchmarks/pandas/bench_timedelta_advanced_ops.py new file mode 100644 index 00000000..3f1bf24e --- /dev/null +++ b/benchmarks/pandas/bench_timedelta_advanced_ops.py @@ -0,0 +1,56 @@ +""" +Benchmark: pandas Timedelta advanced operations — parse, isoformat, division, negation, multiplication, comparison. 
+Outputs JSON: {"function": "timedelta_advanced_ops", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import pandas as pd + +SIZE = 1_000 +WARMUP = 5 +ITERATIONS = 100 + +iso_strings = [ + "1 days 02:30:00", + "0 days 00:45:00", + "7 days 00:00:00", + "-1 days +22:30:00", + "10 days 05:20:15", +] + +td1 = pd.Timedelta(days=2, hours=3) +td2 = pd.Timedelta(hours=5, minutes=30) +deltas = [pd.Timedelta(days=i % 365, hours=i % 24) for i in range(SIZE)] + +for _ in range(WARMUP): + for s in iso_strings: + pd.Timedelta(s) + for td in deltas[:50]: + td.isoformat() + td / td1 if td1.total_seconds() != 0 else None + -td + td * 2 + td < td2 + td == td1 + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + for s in iso_strings: + pd.Timedelta(s) + for td in deltas: + td.isoformat() + -td + td * 3 + td < td2 + td == td1 + times.append((time.perf_counter() - t0) * 1000) + +total_ms = sum(times) +mean_ms = total_ms / ITERATIONS +print(json.dumps({ + "function": "timedelta_advanced_ops", + "mean_ms": round(mean_ms, 3), + "iterations": ITERATIONS, + "total_ms": round(total_ms, 3), +})) diff --git a/benchmarks/tsb/bench_dataframe_rolling_var_std_sum_count.ts b/benchmarks/tsb/bench_dataframe_rolling_var_std_sum_count.ts new file mode 100644 index 00000000..391763da --- /dev/null +++ b/benchmarks/tsb/bench_dataframe_rolling_var_std_sum_count.ts @@ -0,0 +1,44 @@ +/** + * Benchmark: DataFrameRolling.var / std / sum / count — rolling aggregations on a 50k-row DataFrame. 
+ * Outputs JSON: {"function": "dataframe_rolling_var_std_sum_count", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { DataFrame } from "../../src/index.ts"; + +const SIZE = 50_000; +const WINDOW = 20; +const WARMUP = 5; +const ITERATIONS = 30; + +const df = DataFrame.fromColumns({ + a: Array.from({ length: SIZE }, (_, i) => Math.sin(i * 0.01) * 100), + b: Array.from({ length: SIZE }, (_, i) => Math.cos(i * 0.01) * 50), + c: Array.from({ length: SIZE }, (_, i) => (i % 100) * 1.5), +}); + +for (let i = 0; i < WARMUP; i++) { + df.rolling(WINDOW).var(); + df.rolling(WINDOW).std(); + df.rolling(WINDOW).sum(); + df.rolling(WINDOW).count(); +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + df.rolling(WINDOW).var(); + df.rolling(WINDOW).std(); + df.rolling(WINDOW).sum(); + df.rolling(WINDOW).count(); + times.push(performance.now() - t0); +} + +const totalMs = times.reduce((a, b) => a + b, 0); +const meanMs = totalMs / ITERATIONS; +console.log( + JSON.stringify({ + function: "dataframe_rolling_var_std_sum_count", + mean_ms: Math.round(meanMs * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(totalMs * 1000) / 1000, + }), +); diff --git a/benchmarks/tsb/bench_datetime_index_normalize_filter_shift.ts b/benchmarks/tsb/bench_datetime_index_normalize_filter_shift.ts new file mode 100644 index 00000000..e6f07fcd --- /dev/null +++ b/benchmarks/tsb/bench_datetime_index_normalize_filter_shift.ts @@ -0,0 +1,39 @@ +/** + * Benchmark: DatetimeIndex.normalize() / filter() / shift(n, freq) — DatetimeIndex transforms. 
+ * Outputs JSON: {"function": "datetime_index_normalize_filter_shift", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { date_range } from "../../src/index.ts"; + +const SIZE = 5_000; +const WARMUP = 5; +const ITERATIONS = 50; + +// Index with non-midnight times (so normalize actually changes something) +const idx = date_range({ start: "2020-01-01T12:30:00", periods: SIZE, freq: "h" }); +const cutoff = new Date("2021-01-01T00:00:00Z"); + +for (let i = 0; i < WARMUP; i++) { + idx.normalize(); + idx.filter((d) => d < cutoff); + idx.shift(7, "D"); +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + idx.normalize(); + idx.filter((d) => d < cutoff); + idx.shift(7, "D"); + times.push(performance.now() - t0); +} + +const totalMs = times.reduce((a, b) => a + b, 0); +const meanMs = totalMs / ITERATIONS; +console.log( + JSON.stringify({ + function: "datetime_index_normalize_filter_shift", + mean_ms: Math.round(meanMs * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(totalMs * 1000) / 1000, + }), +); diff --git a/benchmarks/tsb/bench_index_map.ts b/benchmarks/tsb/bench_index_map.ts new file mode 100644 index 00000000..2700c839 --- /dev/null +++ b/benchmarks/tsb/bench_index_map.ts @@ -0,0 +1,36 @@ +/** + * Benchmark: Index.map(fn) — transform Index values to a new Index using a mapping function. 
+ * Outputs JSON: {"function": "index_map", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Index } from "../../src/index.ts"; + +const SIZE = 50_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const numIdx = new Index(Array.from({ length: SIZE }, (_, i) => i)); +const strIdx = new Index(Array.from({ length: SIZE }, (_, i) => `key_${i % 1000}`)); + +for (let i = 0; i < WARMUP; i++) { + numIdx.map((v) => (v as number) * 2); + strIdx.map((v) => (v as string).toUpperCase()); +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + numIdx.map((v) => (v as number) * 2); + strIdx.map((v) => (v as string).toUpperCase()); + times.push(performance.now() - t0); +} + +const totalMs = times.reduce((a, b) => a + b, 0); +const meanMs = totalMs / ITERATIONS; +console.log( + JSON.stringify({ + function: "index_map", + mean_ms: Math.round(meanMs * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(totalMs * 1000) / 1000, + }), +); diff --git a/benchmarks/tsb/bench_multi_index_fromtuples.ts b/benchmarks/tsb/bench_multi_index_fromtuples.ts new file mode 100644 index 00000000..3aca0304 --- /dev/null +++ b/benchmarks/tsb/bench_multi_index_fromtuples.ts @@ -0,0 +1,46 @@ +/** + * Benchmark: MultiIndex.fromTuples — construct a MultiIndex from an array of tuples. 
+ * Outputs JSON: {"function": "multi_index_fromtuples", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { MultiIndex } from "../../src/index.ts"; + +const SIZE = 5_000; +const WARMUP = 3; +const ITERATIONS = 20; + +// Build an array of 2-level tuples [string, number] +const tuples: (readonly (string | number)[])[] = Array.from({ length: SIZE }, (_, i) => [ + `dept_${i % 20}`, + i % 100, +]); + +// Also build 3-level tuples to test deeper nesting +const tuples3: (readonly (string | number)[])[] = Array.from({ length: SIZE }, (_, i) => [ + `region_${i % 5}`, + `dept_${i % 20}`, + i % 50, +]); + +for (let i = 0; i < WARMUP; i++) { + MultiIndex.fromTuples(tuples); + MultiIndex.fromTuples(tuples3); +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + MultiIndex.fromTuples(tuples); + MultiIndex.fromTuples(tuples3); + times.push(performance.now() - t0); +} + +const totalMs = times.reduce((a, b) => a + b, 0); +const meanMs = totalMs / ITERATIONS; +console.log( + JSON.stringify({ + function: "multi_index_fromtuples", + mean_ms: Math.round(meanMs * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(totalMs * 1000) / 1000, + }), +); diff --git a/benchmarks/tsb/bench_timedelta_advanced_ops.ts b/benchmarks/tsb/bench_timedelta_advanced_ops.ts new file mode 100644 index 00000000..53c6dbde --- /dev/null +++ b/benchmarks/tsb/bench_timedelta_advanced_ops.ts @@ -0,0 +1,60 @@ +/** + * Benchmark: Timedelta advanced operations — parse, toISOString, divBy, negate, mul, compareTo, equals. 
+ * Outputs JSON: {"function": "timedelta_advanced_ops", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Timedelta } from "../../src/index.ts"; + +const SIZE = 1_000; +const WARMUP = 5; +const ITERATIONS = 100; + +const isoStrings = [ + "P1DT2H30M", + "PT45M", + "P7D", + "-PT1H30M", + "P10DT5H20M15S", +]; + +const td1 = Timedelta.fromComponents({ days: 2, hours: 3 }); +const td2 = Timedelta.fromComponents({ hours: 5, minutes: 30 }); +const deltas = Array.from({ length: SIZE }, (_, i) => + Timedelta.fromComponents({ days: i % 365, hours: i % 24 }), +); + +for (let w = 0; w < WARMUP; w++) { + for (const s of isoStrings) Timedelta.parse(s); + for (const td of deltas.slice(0, 50)) { + td.toISOString(); + td.divBy(td1); + td.negate(); + td.mul(2); + td.compareTo(td2); + td.equals(td1); + } +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + for (const s of isoStrings) Timedelta.parse(s); + for (const td of deltas) { + td.toISOString(); + td.negate(); + td.mul(3); + td.compareTo(td2); + td.equals(td1); + } + times.push(performance.now() - t0); +} + +const totalMs = times.reduce((a, b) => a + b, 0); +const meanMs = totalMs / ITERATIONS; +console.log( + JSON.stringify({ + function: "timedelta_advanced_ops", + mean_ms: Math.round(meanMs * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(totalMs * 1000) / 1000, + }), +); From 313b4f3f9fdd14173847204a16ba66e54cd23aad Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 17 Apr 2026 13:50:55 +0000 Subject: [PATCH 16/19] Iteration 155: DataFrameExpanding std/var/sum/count/median/apply + TZDatetimeIndex extra + TimedeltaIndex toStrings Run: https://github.com/githubnext/tsessebe/actions/runs/24567781388 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../bench_dataframe_expanding_median_apply.py | 32 +++++++++++ .../bench_dataframe_expanding_std_var.py 
| 30 +++++++++++ .../bench_dataframe_expanding_sum_count.py | 30 +++++++++++ .../pandas/bench_timedelta_index_tostrings.py | 39 ++++++++++++++ .../pandas/bench_tz_datetime_index_extra.py | 45 ++++++++++++++++ .../bench_dataframe_expanding_median_apply.ts | 36 +++++++++++++ .../tsb/bench_dataframe_expanding_std_var.ts | 34 ++++++++++++ .../bench_dataframe_expanding_sum_count.ts | 34 ++++++++++++ .../tsb/bench_timedelta_index_tostrings.ts | 41 ++++++++++++++ .../tsb/bench_tz_datetime_index_extra.ts | 53 +++++++++++++++++++ 10 files changed, 374 insertions(+) create mode 100644 benchmarks/pandas/bench_dataframe_expanding_median_apply.py create mode 100644 benchmarks/pandas/bench_dataframe_expanding_std_var.py create mode 100644 benchmarks/pandas/bench_dataframe_expanding_sum_count.py create mode 100644 benchmarks/pandas/bench_timedelta_index_tostrings.py create mode 100644 benchmarks/pandas/bench_tz_datetime_index_extra.py create mode 100644 benchmarks/tsb/bench_dataframe_expanding_median_apply.ts create mode 100644 benchmarks/tsb/bench_dataframe_expanding_std_var.ts create mode 100644 benchmarks/tsb/bench_dataframe_expanding_sum_count.ts create mode 100644 benchmarks/tsb/bench_timedelta_index_tostrings.ts create mode 100644 benchmarks/tsb/bench_tz_datetime_index_extra.ts diff --git a/benchmarks/pandas/bench_dataframe_expanding_median_apply.py b/benchmarks/pandas/bench_dataframe_expanding_median_apply.py new file mode 100644 index 00000000..7d28d657 --- /dev/null +++ b/benchmarks/pandas/bench_dataframe_expanding_median_apply.py @@ -0,0 +1,32 @@ +"""Benchmark: DataFrame.expanding().median() and .apply(fn) on 10k-row DataFrame.""" +import pandas as pd +import numpy as np +import json +import time + +ROWS = 10_000 +WARMUP = 2 +ITERATIONS = 5 + +a = np.sin(np.arange(ROWS) * 0.05) * 100 +b = np.cos(np.arange(ROWS) * 0.05) * 80 +df = pd.DataFrame({"a": a, "b": b}) + +sum_fn = lambda x: x.sum() + +for _ in range(WARMUP): + df.expanding().median() + 
df.expanding().apply(sum_fn, raw=True) + +start = time.perf_counter() +for _ in range(ITERATIONS): + df.expanding().median() + df.expanding().apply(sum_fn, raw=True) +total = time.perf_counter() - start + +print(json.dumps({ + "function": "dataframe_expanding_median_apply", + "mean_ms": total / ITERATIONS * 1000, + "iterations": ITERATIONS, + "total_ms": total * 1000, +})) diff --git a/benchmarks/pandas/bench_dataframe_expanding_std_var.py b/benchmarks/pandas/bench_dataframe_expanding_std_var.py new file mode 100644 index 00000000..ceaec2a8 --- /dev/null +++ b/benchmarks/pandas/bench_dataframe_expanding_std_var.py @@ -0,0 +1,30 @@ +"""Benchmark: DataFrame.expanding().std() and .var() on 10k-row DataFrame.""" +import pandas as pd +import numpy as np +import json +import time + +ROWS = 10_000 +WARMUP = 3 +ITERATIONS = 10 + +a = np.sin(np.arange(ROWS) * 0.01) * 100 +b = np.cos(np.arange(ROWS) * 0.01) * 50 +df = pd.DataFrame({"a": a, "b": b}) + +for _ in range(WARMUP): + df.expanding().std() + df.expanding().var() + +start = time.perf_counter() +for _ in range(ITERATIONS): + df.expanding().std() + df.expanding().var() +total = time.perf_counter() - start + +print(json.dumps({ + "function": "dataframe_expanding_std_var", + "mean_ms": total / ITERATIONS * 1000, + "iterations": ITERATIONS, + "total_ms": total * 1000, +})) diff --git a/benchmarks/pandas/bench_dataframe_expanding_sum_count.py b/benchmarks/pandas/bench_dataframe_expanding_sum_count.py new file mode 100644 index 00000000..4a10ec7c --- /dev/null +++ b/benchmarks/pandas/bench_dataframe_expanding_sum_count.py @@ -0,0 +1,30 @@ +"""Benchmark: DataFrame.expanding().sum() and .count() on 10k-row DataFrame.""" +import pandas as pd +import numpy as np +import json +import time + +ROWS = 10_000 +WARMUP = 3 +ITERATIONS = 10 + +a = (np.arange(ROWS) % 100) * 1.5 +b = (np.arange(ROWS) % 50) * 2.0 +df = pd.DataFrame({"a": a, "b": b}) + +for _ in range(WARMUP): + df.expanding().sum() + df.expanding().count() + +start = 
time.perf_counter() +for _ in range(ITERATIONS): + df.expanding().sum() + df.expanding().count() +total = time.perf_counter() - start + +print(json.dumps({ + "function": "dataframe_expanding_sum_count", + "mean_ms": total / ITERATIONS * 1000, + "iterations": ITERATIONS, + "total_ms": total * 1000, +})) diff --git a/benchmarks/pandas/bench_timedelta_index_tostrings.py b/benchmarks/pandas/bench_timedelta_index_tostrings.py new file mode 100644 index 00000000..2fc9fd45 --- /dev/null +++ b/benchmarks/pandas/bench_timedelta_index_tostrings.py @@ -0,0 +1,39 @@ +"""Benchmark: TimedeltaIndex.astype(str), .to_numpy(), element access, rename +on 10k-element TimedeltaIndex.""" +import pandas as pd +import numpy as np +import json +import time + +SIZE = 10_000 +WARMUP = 5 +ITERATIONS = 50 + +deltas = pd.to_timedelta( + [(i % 365) * 24 * 3600 + (i % 24) * 3600 + (i % 60) * 60 for i in range(SIZE)], + unit="s", +) +idx = pd.TimedeltaIndex(deltas, name="duration") + +for _ in range(WARMUP): + idx.astype(str) + idx.to_numpy() + idx[0] + idx[-1] + idx.rename("elapsed") + +start = time.perf_counter() +for _ in range(ITERATIONS): + idx.astype(str) + idx.to_numpy() + idx[0] + idx[-1] + idx.rename("elapsed") +total = time.perf_counter() - start + +print(json.dumps({ + "function": "timedelta_index_tostrings", + "mean_ms": total / ITERATIONS * 1000, + "iterations": ITERATIONS, + "total_ms": total * 1000, +})) diff --git a/benchmarks/pandas/bench_tz_datetime_index_extra.py b/benchmarks/pandas/bench_tz_datetime_index_extra.py new file mode 100644 index 00000000..b9fb781a --- /dev/null +++ b/benchmarks/pandas/bench_tz_datetime_index_extra.py @@ -0,0 +1,45 @@ +"""Benchmark: tz-aware DatetimeIndex — slice, concat, min/max, tz_convert, +tz_localize(None), and array conversions on 10k-element index.""" +import pandas as pd +import numpy as np +import json +import time + +SIZE = 10_000 +WARMUP = 3 +ITERATIONS = 20 + +naive = pd.date_range("2024-01-01", periods=SIZE, freq="h") +tz_idx = 
naive.tz_localize("America/New_York") +half = SIZE // 2 + +for _ in range(WARMUP): + tz_idx[:half] + tz_idx[:half].append(tz_idx[half:]) + tz_idx[0] + tz_idx.to_list() + tz_idx.asi8 + tz_idx.min() + tz_idx.max() + tz_idx.tz_convert("UTC") + tz_idx.tz_localize(None) + +start = time.perf_counter() +for _ in range(ITERATIONS): + tz_idx[:half] + tz_idx[:half].append(tz_idx[half:]) + tz_idx[0] + tz_idx.to_list() + tz_idx.asi8 + tz_idx.min() + tz_idx.max() + tz_idx.tz_convert("UTC") + tz_idx.tz_localize(None) +total = time.perf_counter() - start + +print(json.dumps({ + "function": "tz_datetime_index_extra", + "mean_ms": total / ITERATIONS * 1000, + "iterations": ITERATIONS, + "total_ms": total * 1000, +})) diff --git a/benchmarks/tsb/bench_dataframe_expanding_median_apply.ts b/benchmarks/tsb/bench_dataframe_expanding_median_apply.ts new file mode 100644 index 00000000..c1155dc0 --- /dev/null +++ b/benchmarks/tsb/bench_dataframe_expanding_median_apply.ts @@ -0,0 +1,36 @@ +/** + * Benchmark: DataFrameExpanding.median() and .apply(fn) on 10k-row DataFrame. 
+ * Outputs JSON: {"function": "dataframe_expanding_median_apply", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { DataFrame } from "../../src/index.js"; + +const ROWS = 10_000; +const WARMUP = 2; +const ITERATIONS = 5; + +const a = Array.from({ length: ROWS }, (_, i) => Math.sin(i * 0.05) * 100); +const b = Array.from({ length: ROWS }, (_, i) => Math.cos(i * 0.05) * 80); +const df = DataFrame.fromColumns({ a, b }); + +const sumFn = (vals: readonly number[]) => vals.reduce((acc, v) => acc + v, 0); + +for (let i = 0; i < WARMUP; i++) { + df.expanding().median(); + df.expanding().apply(sumFn); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + df.expanding().median(); + df.expanding().apply(sumFn); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "dataframe_expanding_median_apply", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_dataframe_expanding_std_var.ts b/benchmarks/tsb/bench_dataframe_expanding_std_var.ts new file mode 100644 index 00000000..ca1cbd19 --- /dev/null +++ b/benchmarks/tsb/bench_dataframe_expanding_std_var.ts @@ -0,0 +1,34 @@ +/** + * Benchmark: DataFrameExpanding.std() and .var() on 10k-row DataFrame. 
+ * Outputs JSON: {"function": "dataframe_expanding_std_var", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { DataFrame } from "../../src/index.js"; + +const ROWS = 10_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const a = Array.from({ length: ROWS }, (_, i) => Math.sin(i * 0.01) * 100); +const b = Array.from({ length: ROWS }, (_, i) => Math.cos(i * 0.01) * 50); +const df = DataFrame.fromColumns({ a, b }); + +for (let i = 0; i < WARMUP; i++) { + df.expanding().std(); + df.expanding().var(); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + df.expanding().std(); + df.expanding().var(); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "dataframe_expanding_std_var", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_dataframe_expanding_sum_count.ts b/benchmarks/tsb/bench_dataframe_expanding_sum_count.ts new file mode 100644 index 00000000..237e7adc --- /dev/null +++ b/benchmarks/tsb/bench_dataframe_expanding_sum_count.ts @@ -0,0 +1,34 @@ +/** + * Benchmark: DataFrameExpanding.sum() and .count() on 10k-row DataFrame. 
+ * Outputs JSON: {"function": "dataframe_expanding_sum_count", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { DataFrame } from "../../src/index.js"; + +const ROWS = 10_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const a = Array.from({ length: ROWS }, (_, i) => (i % 100) * 1.5); +const b = Array.from({ length: ROWS }, (_, i) => (i % 50) * 2.0); +const df = DataFrame.fromColumns({ a, b }); + +for (let i = 0; i < WARMUP; i++) { + df.expanding().sum(); + df.expanding().count(); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + df.expanding().sum(); + df.expanding().count(); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "dataframe_expanding_sum_count", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_timedelta_index_tostrings.ts b/benchmarks/tsb/bench_timedelta_index_tostrings.ts new file mode 100644 index 00000000..3dbe9535 --- /dev/null +++ b/benchmarks/tsb/bench_timedelta_index_tostrings.ts @@ -0,0 +1,41 @@ +/** + * Benchmark: TimedeltaIndex.toStrings(), .toArray(), .at(), .rename() on 10k-element index. 
+ * Outputs JSON: {"function": "timedelta_index_tostrings", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Timedelta, TimedeltaIndex } from "../../src/index.js"; + +const SIZE = 10_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const deltas = Array.from({ length: SIZE }, (_, i) => + Timedelta.fromComponents({ days: i % 365, hours: i % 24, minutes: i % 60 }), +); +const idx = TimedeltaIndex.fromTimedeltas(deltas, "duration"); + +for (let i = 0; i < WARMUP; i++) { + idx.toStrings(); + idx.toArray(); + idx.at(0); + idx.at(SIZE - 1); + idx.rename("elapsed"); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + idx.toStrings(); + idx.toArray(); + idx.at(0); + idx.at(SIZE - 1); + idx.rename("elapsed"); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "timedelta_index_tostrings", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_tz_datetime_index_extra.ts b/benchmarks/tsb/bench_tz_datetime_index_extra.ts new file mode 100644 index 00000000..050f0851 --- /dev/null +++ b/benchmarks/tsb/bench_tz_datetime_index_extra.ts @@ -0,0 +1,53 @@ +/** + * Benchmark: TZDatetimeIndex — slice, concat, at, toArray, toTimestamps, min, max, + * tz_convert (instance method), tz_localize_none on 10k-element TZDatetimeIndex. 
+ * Outputs JSON: {"function": "tz_datetime_index_extra", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { date_range, tz_localize } from "../../src/index.js"; + +const SIZE = 10_000; +const WARMUP = 3; +const ITERATIONS = 20; + +const naive = date_range({ start: "2024-01-01", periods: SIZE, freq: "h" }); +const tzIdx = tz_localize(naive, "America/New_York"); +const halfSize = Math.floor(SIZE / 2); + +for (let i = 0; i < WARMUP; i++) { + tzIdx.slice(0, halfSize); + const half1 = tzIdx.slice(0, halfSize); + const half2 = tzIdx.slice(halfSize); + half1.concat(half2); + tzIdx.at(0); + tzIdx.toArray(); + tzIdx.toTimestamps(); + tzIdx.min(); + tzIdx.max(); + tzIdx.tz_convert("UTC"); + tzIdx.tz_localize_none(); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + tzIdx.slice(0, halfSize); + const half1 = tzIdx.slice(0, halfSize); + const half2 = tzIdx.slice(halfSize); + half1.concat(half2); + tzIdx.at(0); + tzIdx.toArray(); + tzIdx.toTimestamps(); + tzIdx.min(); + tzIdx.max(); + tzIdx.tz_convert("UTC"); + tzIdx.tz_localize_none(); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "tz_datetime_index_extra", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); From df6dab9998fca02b014636ad989324156bd5e80d Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 17 Apr 2026 15:02:39 +0000 Subject: [PATCH 17/19] Iteration 156: 5 new benchmark pairs Added benchmarks for DateOffset rollforward/rollback/onOffset, more DateOffset types (MonthBegin/YearEnd/Week/Minute/Milli), date_range with various frequency options, combineFirstDataFrame standalone function, and SeriesGroupBy.agg with custom aggregate functions. 
Run: https://github.com/githubnext/tsessebe/actions/runs/24570329650 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../pandas/bench_combine_first_dataframe.py | 36 +++++++++++ .../pandas/bench_date_offset_more_types.py | 45 +++++++++++++ .../pandas/bench_date_offset_rollforward.py | 58 +++++++++++++++++ benchmarks/pandas/bench_date_range_options.py | 37 +++++++++++ .../pandas/bench_series_groupby_custom_agg.py | 38 +++++++++++ .../tsb/bench_combine_first_dataframe.ts | 44 +++++++++++++ .../tsb/bench_date_offset_more_types.ts | 50 +++++++++++++++ .../tsb/bench_date_offset_rollforward.ts | 63 +++++++++++++++++++ benchmarks/tsb/bench_date_range_options.ts | 52 +++++++++++++++ .../tsb/bench_series_groupby_custom_agg.ts | 52 +++++++++++++++ 10 files changed, 475 insertions(+) create mode 100644 benchmarks/pandas/bench_combine_first_dataframe.py create mode 100644 benchmarks/pandas/bench_date_offset_more_types.py create mode 100644 benchmarks/pandas/bench_date_offset_rollforward.py create mode 100644 benchmarks/pandas/bench_date_range_options.py create mode 100644 benchmarks/pandas/bench_series_groupby_custom_agg.py create mode 100644 benchmarks/tsb/bench_combine_first_dataframe.ts create mode 100644 benchmarks/tsb/bench_date_offset_more_types.ts create mode 100644 benchmarks/tsb/bench_date_offset_rollforward.ts create mode 100644 benchmarks/tsb/bench_date_range_options.ts create mode 100644 benchmarks/tsb/bench_series_groupby_custom_agg.ts diff --git a/benchmarks/pandas/bench_combine_first_dataframe.py b/benchmarks/pandas/bench_combine_first_dataframe.py new file mode 100644 index 00000000..2609a647 --- /dev/null +++ b/benchmarks/pandas/bench_combine_first_dataframe.py @@ -0,0 +1,36 @@ +"""Benchmark: DataFrame.combine_first — fill NaN values from another DataFrame (union of indexes). +Mirrors tsb bench_combine_first_dataframe.ts. 
+""" +import json, time +import numpy as np +import pandas as pd + +SIZE = 5_000 +WARMUP = 5 +ITERATIONS = 30 + +rows1 = list(range(SIZE)) +data1a = [None if i % 3 == 0 else i * 1.5 for i in range(SIZE)] +data1b = [None if i % 5 == 0 else i * 0.5 for i in range(SIZE)] +df1 = pd.DataFrame({"a": data1a, "b": data1b}, index=rows1) + +rows2 = list(range(SIZE + 500)) +data2a = [i * 2.0 for i in range(SIZE + 500)] +data2b = [i * 1.0 for i in range(SIZE + 500)] +data2c = [i * 0.1 for i in range(SIZE + 500)] +df2 = pd.DataFrame({"a": data2a, "b": data2b, "c": data2c}, index=rows2) + +for _ in range(WARMUP): + df1.combine_first(df2) + +start = time.perf_counter() +for _ in range(ITERATIONS): + df1.combine_first(df2) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "combine_first_dataframe", + "mean_ms": round(total / ITERATIONS, 3), + "iterations": ITERATIONS, + "total_ms": round(total, 3), +})) diff --git a/benchmarks/pandas/bench_date_offset_more_types.py b/benchmarks/pandas/bench_date_offset_more_types.py new file mode 100644 index 00000000..3910f041 --- /dev/null +++ b/benchmarks/pandas/bench_date_offset_more_types.py @@ -0,0 +1,45 @@ +"""Benchmark: DateOffset more types — MonthBegin, YearEnd, Week, Minute, Milli apply. +Mirrors tsb bench_date_offset_more_types.ts for pandas.tseries.offsets. 
+""" +import json, time +from datetime import timedelta +import pandas as pd +from pandas.tseries.offsets import MonthBegin, YearEnd, Week, Minute, Milli + +SIZE = 5_000 +WARMUP = 5 +ITERATIONS = 50 + +month_begin = MonthBegin(1) +year_end = YearEnd(1) +week = Week(2) +minute = Minute(60) +milli = Milli(1000) + +base = pd.Timestamp("2020-01-15 10:30:00", tz="UTC") +dates = [base + timedelta(minutes=i) for i in range(SIZE)] + +for _ in range(WARMUP): + for d in dates[:100]: + d + month_begin + d + year_end + d + week + d + minute + d + milli + +start = time.perf_counter() +for _ in range(ITERATIONS): + for d in dates: + d + month_begin + d + year_end + d + week + d + minute + d + milli +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "date_offset_more_types", + "mean_ms": round(total / ITERATIONS, 3), + "iterations": ITERATIONS, + "total_ms": round(total, 3), +})) diff --git a/benchmarks/pandas/bench_date_offset_rollforward.py b/benchmarks/pandas/bench_date_offset_rollforward.py new file mode 100644 index 00000000..3c5de6d8 --- /dev/null +++ b/benchmarks/pandas/bench_date_offset_rollforward.py @@ -0,0 +1,58 @@ +"""Benchmark: DateOffset.rollforward / rollback / is_on_offset — snap dates to anchors. +Mirrors tsb bench_date_offset_rollforward.ts for pandas.tseries.offsets. 
+""" +import json, time +from datetime import datetime, timezone, timedelta +from pandas.tseries.offsets import MonthEnd, BusinessDay, YearBegin, MonthBegin, YearEnd + +SIZE = 5_000 +WARMUP = 5 +ITERATIONS = 50 + +month_end = MonthEnd(1) +biz_day = BusinessDay(1) +year_begin = YearBegin(1) +month_begin = MonthBegin(1) +year_end = YearEnd(1) + +import pandas as pd +base = pd.Timestamp("2020-01-15", tz="UTC") +dates = [base + timedelta(days=i) for i in range(SIZE)] + +for _ in range(WARMUP): + for d in dates[:100]: + month_end.rollforward(d) + month_end.rollback(d) + month_end.is_on_offset(d) + biz_day.rollforward(d) + biz_day.rollback(d) + biz_day.is_on_offset(d) + year_begin.rollforward(d) + year_begin.rollback(d) + month_begin.rollforward(d) + month_begin.rollback(d) + year_end.rollforward(d) + year_end.rollback(d) + +start = time.perf_counter() +for _ in range(ITERATIONS): + for d in dates: + month_end.rollforward(d) + month_end.rollback(d) + month_end.is_on_offset(d) + biz_day.rollforward(d) + biz_day.rollback(d) + year_begin.rollforward(d) + year_begin.rollback(d) + month_begin.rollforward(d) + month_begin.rollback(d) + year_end.rollforward(d) + year_end.rollback(d) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "date_offset_rollforward", + "mean_ms": round(total / ITERATIONS, 3), + "iterations": ITERATIONS, + "total_ms": round(total, 3), +})) diff --git a/benchmarks/pandas/bench_date_range_options.py b/benchmarks/pandas/bench_date_range_options.py new file mode 100644 index 00000000..a5fc1516 --- /dev/null +++ b/benchmarks/pandas/bench_date_range_options.py @@ -0,0 +1,37 @@ +"""Benchmark: date_range — generate DatetimeIndex with various frequency options. +Mirrors tsb bench_date_range_options.ts using pandas.date_range. 
+""" +import json, time +import pandas as pd + +WARMUP = 5 +ITERATIONS = 100 + +for _ in range(WARMUP): + pd.date_range(start="2020-01-01", periods=1_000, freq="D") + pd.date_range(start="2020-01-01", periods=1_000, freq="h") + pd.date_range(start="2020-01-01", periods=500, freq="ME") + pd.date_range(start="2020-01-01", periods=200, freq="QE") + pd.date_range(start="2020-01-01", periods=100, freq="YE") + pd.date_range(start="2020-01-01", periods=500, freq="MS") + pd.date_range(start="2020-01-01", end="2025-01-01", freq="W") + pd.date_range(start="2020-01-01", periods=2_000, freq="min") + +start = time.perf_counter() +for _ in range(ITERATIONS): + pd.date_range(start="2020-01-01", periods=1_000, freq="D") + pd.date_range(start="2020-01-01", periods=1_000, freq="h") + pd.date_range(start="2020-01-01", periods=500, freq="ME") + pd.date_range(start="2020-01-01", periods=200, freq="QE") + pd.date_range(start="2020-01-01", periods=100, freq="YE") + pd.date_range(start="2020-01-01", periods=500, freq="MS") + pd.date_range(start="2020-01-01", end="2025-01-01", freq="W") + pd.date_range(start="2020-01-01", periods=2_000, freq="min") +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "date_range_options", + "mean_ms": round(total / ITERATIONS, 3), + "iterations": ITERATIONS, + "total_ms": round(total, 3), +})) diff --git a/benchmarks/pandas/bench_series_groupby_custom_agg.py b/benchmarks/pandas/bench_series_groupby_custom_agg.py new file mode 100644 index 00000000..a7c73509 --- /dev/null +++ b/benchmarks/pandas/bench_series_groupby_custom_agg.py @@ -0,0 +1,38 @@ +"""Benchmark: SeriesGroupBy.agg with custom aggregate functions — median, range. +Mirrors tsb bench_series_groupby_custom_agg.ts. 
+""" +import json, time +import numpy as np +import pandas as pd + +SIZE = 100_000 +WARMUP = 3 +ITERATIONS = 20 + +data = [(i * 1.5) % 9999 for i in range(SIZE)] +by = [i % 100 for i in range(SIZE)] +s = pd.Series(data) +gb = s.groupby(by) + +def median_fn(x): + return float(np.median(x)) + +def range_fn(x): + return float(np.max(x) - np.min(x)) + +for _ in range(WARMUP): + gb.agg(median_fn) + gb.agg(range_fn) + +start = time.perf_counter() +for _ in range(ITERATIONS): + gb.agg(median_fn) + gb.agg(range_fn) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "series_groupby_custom_agg", + "mean_ms": round(total / ITERATIONS, 3), + "iterations": ITERATIONS, + "total_ms": round(total, 3), +})) diff --git a/benchmarks/tsb/bench_combine_first_dataframe.ts b/benchmarks/tsb/bench_combine_first_dataframe.ts new file mode 100644 index 00000000..1e0528eb --- /dev/null +++ b/benchmarks/tsb/bench_combine_first_dataframe.ts @@ -0,0 +1,44 @@ +/** + * Benchmark: combineFirstDataFrame — fill NaN values from another DataFrame (union of indexes). + * Mirrors pandas DataFrame.combine_first. + * Outputs JSON: {"function": "combine_first_dataframe", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { DataFrame, Index, combineFirstDataFrame } from "../../src/index.ts"; + +const SIZE = 5_000; +const WARMUP = 5; +const ITERATIONS = 30; + +// df1: rows 0..SIZE-1, ~30% nulls +const rows1 = Array.from({ length: SIZE }, (_, i) => i); +const data1a = Array.from({ length: SIZE }, (_, i) => (i % 3 === 0 ? null : i * 1.5)); +const data1b = Array.from({ length: SIZE }, (_, i) => (i % 5 === 0 ? 
null : i * 0.5)); +const idx1 = new Index(rows1); +const df1 = new DataFrame({ a: data1a, b: data1b }, idx1); + +// df2: rows 0..SIZE+500-1 (overlapping + extra), fills missing in df1 +const rows2 = Array.from({ length: SIZE + 500 }, (_, i) => i); +const data2a = Array.from({ length: SIZE + 500 }, (_, i) => i * 2.0); +const data2b = Array.from({ length: SIZE + 500 }, (_, i) => i * 1.0); +const data2c = Array.from({ length: SIZE + 500 }, (_, i) => i * 0.1); +const idx2 = new Index(rows2); +const df2 = new DataFrame({ a: data2a, b: data2b, c: data2c }, idx2); + +for (let i = 0; i < WARMUP; i++) { + combineFirstDataFrame(df1, df2); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + combineFirstDataFrame(df1, df2); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "combine_first_dataframe", + mean_ms: Math.round((total / ITERATIONS) * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(total * 1000) / 1000, + }), +); diff --git a/benchmarks/tsb/bench_date_offset_more_types.ts b/benchmarks/tsb/bench_date_offset_more_types.ts new file mode 100644 index 00000000..51159887 --- /dev/null +++ b/benchmarks/tsb/bench_date_offset_more_types.ts @@ -0,0 +1,50 @@ +/** + * Benchmark: DateOffset more types — apply operations for MonthBegin, YearEnd, Week, Minute, Milli. + * These DateOffset classes haven't been covered in existing benchmarks. 
+ * Outputs JSON: {"function": "date_offset_more_types", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { MonthBegin, YearEnd, Week, Minute, Milli } from "../../src/index.ts"; + +const SIZE = 5_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const monthBegin = new MonthBegin(1); +const yearEnd = new YearEnd(1); +const week = new Week(2); +const minute = new Minute(60); +const milli = new Milli(1000); + +const base = new Date(Date.UTC(2020, 0, 15, 10, 30, 0)); +const dates = Array.from({ length: SIZE }, (_, i) => new Date(base.getTime() + i * 60_000)); + +for (let i = 0; i < WARMUP; i++) { + for (const d of dates.slice(0, 100)) { + monthBegin.apply(d); + yearEnd.apply(d); + week.apply(d); + minute.apply(d); + milli.apply(d); + } +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + for (const d of dates) { + monthBegin.apply(d); + yearEnd.apply(d); + week.apply(d); + minute.apply(d); + milli.apply(d); + } +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "date_offset_more_types", + mean_ms: Math.round((total / ITERATIONS) * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(total * 1000) / 1000, + }), +); diff --git a/benchmarks/tsb/bench_date_offset_rollforward.ts b/benchmarks/tsb/bench_date_offset_rollforward.ts new file mode 100644 index 00000000..df2ee0ca --- /dev/null +++ b/benchmarks/tsb/bench_date_offset_rollforward.ts @@ -0,0 +1,63 @@ +/** + * Benchmark: DateOffset.rollforward / rollback / onOffset — snap dates to offset anchors. + * Tests MonthEnd, BusinessDay, YearBegin, MonthBegin, YearEnd. 
+ * Outputs JSON: {"function": "date_offset_rollforward", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { MonthEnd, BusinessDay, YearBegin, MonthBegin, YearEnd } from "../../src/index.ts"; + +const SIZE = 5_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const monthEnd = new MonthEnd(1); +const bizDay = new BusinessDay(1); +const yearBegin = new YearBegin(1); +const monthBegin = new MonthBegin(1); +const yearEnd = new YearEnd(1); + +const base = new Date(Date.UTC(2020, 0, 15)); +const dates = Array.from({ length: SIZE }, (_, i) => new Date(base.getTime() + i * 86_400_000)); + +for (let i = 0; i < WARMUP; i++) { + for (const d of dates.slice(0, 100)) { + monthEnd.rollforward(d); + monthEnd.rollback(d); + monthEnd.onOffset(d); + bizDay.rollforward(d); + bizDay.rollback(d); + bizDay.onOffset(d); + yearBegin.rollforward(d); + yearBegin.rollback(d); + monthBegin.rollforward(d); + monthBegin.rollback(d); + yearEnd.rollforward(d); + yearEnd.rollback(d); + } +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + for (const d of dates) { + monthEnd.rollforward(d); + monthEnd.rollback(d); + monthEnd.onOffset(d); + bizDay.rollforward(d); + bizDay.rollback(d); + yearBegin.rollforward(d); + yearBegin.rollback(d); + monthBegin.rollforward(d); + monthBegin.rollback(d); + yearEnd.rollforward(d); + yearEnd.rollback(d); + } +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "date_offset_rollforward", + mean_ms: Math.round((total / ITERATIONS) * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(total * 1000) / 1000, + }), +); diff --git a/benchmarks/tsb/bench_date_range_options.ts b/benchmarks/tsb/bench_date_range_options.ts new file mode 100644 index 00000000..a6d30cc0 --- /dev/null +++ b/benchmarks/tsb/bench_date_range_options.ts @@ -0,0 +1,52 @@ +/** + * Benchmark: date_range — generate DatetimeIndex with various frequency options. 
+ * Tests date_range with calendar, business, month-start/end, quarter, year freqs. + * Outputs JSON: {"function": "date_range_options", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { date_range } from "../../src/index.ts"; + +const WARMUP = 5; +const ITERATIONS = 100; + +for (let i = 0; i < WARMUP; i++) { + date_range({ start: "2020-01-01", periods: 1_000, freq: "D" }); + date_range({ start: "2020-01-01", periods: 1_000, freq: "H" }); + date_range({ start: "2020-01-01", periods: 500, freq: "ME" }); + date_range({ start: "2020-01-01", periods: 200, freq: "QE" }); + date_range({ start: "2020-01-01", periods: 100, freq: "YE" }); + date_range({ start: "2020-01-01", periods: 500, freq: "MS" }); + date_range({ start: "2020-01-01", end: "2025-01-01", freq: "W" }); + date_range({ start: "2020-01-01", periods: 2_000, freq: "min" }); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + date_range({ start: "2020-01-01", periods: 1_000, freq: "D" }); + date_range({ start: "2020-01-01", periods: 1_000, freq: "H" }); + date_range({ start: "2020-01-01", periods: 500, freq: "ME" }); + date_range({ start: "2020-01-01", periods: 200, freq: "QE" }); + date_range({ start: "2020-01-01", periods: 100, freq: "YE" }); + date_range({ start: "2020-01-01", periods: 500, freq: "MS" }); + date_range({ start: "2020-01-01", end: "2025-01-01", freq: "W" }); + date_range({ start: "2020-01-01", periods: 2_000, freq: "min" }); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "date_range_options", + mean_ms: Math.round((total / ITERATIONS) * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(total * 1000) / 1000, + }), +); + + +console.log( + JSON.stringify({ + function: "date_range_options", + mean_ms: Math.round((total / ITERATIONS) * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(total * 1000) / 1000, + }), +); diff --git a/benchmarks/tsb/bench_series_groupby_custom_agg.ts 
b/benchmarks/tsb/bench_series_groupby_custom_agg.ts new file mode 100644 index 00000000..7aa7cf34 --- /dev/null +++ b/benchmarks/tsb/bench_series_groupby_custom_agg.ts @@ -0,0 +1,52 @@ +/** + * Benchmark: SeriesGroupBy.agg with custom aggregate function — median, geometric mean, range. + * Mirrors pandas SeriesGroupBy.agg(custom_fn) for custom reductions. + * Outputs JSON: {"function": "series_groupby_custom_agg", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 3; +const ITERATIONS = 20; + +const data = Array.from({ length: SIZE }, (_, i) => (i * 1.5) % 9999); +const by = Array.from({ length: SIZE }, (_, i) => i % 100); +const s = new Series({ data }); +const byS = new Series({ data: by }); +const gb = s.groupby(byS); + +// Custom aggregation functions +function medianFn(vals: readonly (string | number | boolean | null | undefined)[]): number { + const nums = vals.filter((v): v is number => typeof v === "number" && !Number.isNaN(v)); + if (nums.length === 0) return Number.NaN; + const sorted = [...nums].sort((a, b) => a - b); + const mid = Math.floor(sorted.length / 2); + return sorted.length % 2 === 0 ? ((sorted[mid - 1] ?? 0) + (sorted[mid] ?? 0)) / 2 : (sorted[mid] ?? 
Number.NaN); +} + +function rangeFn(vals: readonly (string | number | boolean | null | undefined)[]): number { + const nums = vals.filter((v): v is number => typeof v === "number" && !Number.isNaN(v)); + if (nums.length === 0) return Number.NaN; + return Math.max(...nums) - Math.min(...nums); +} + +for (let i = 0; i < WARMUP; i++) { + gb.agg(medianFn); + gb.agg(rangeFn); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + gb.agg(medianFn); + gb.agg(rangeFn); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "series_groupby_custom_agg", + mean_ms: Math.round((total / ITERATIONS) * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(total * 1000) / 1000, + }), +); From ba235be914326e9a491eff4a235959b735927e3e Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 17 Apr 2026 15:45:50 +0000 Subject: [PATCH 18/19] Iteration 157: 5 new benchmark pairs (503 total, +5 vs best 498) Added benchmarks for: - nan_agg_extended: nancount/nanprod/nanmedian (extended nan aggregates) - rank_methods: rankSeries with min/max/first/dense tie-breaking methods - dropna_advanced: dropnaDataFrame with thresh/subset/axis=1 options - get_dummies_opts: getDummies/dataFrameGetDummies with prefix/dropFirst/dummyNa - factorize_sort: factorize/seriesFactorize with sort=true/useNaSentinel options Run: https://github.com/githubnext/tsessebe/actions/runs/24572885192 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- benchmarks/pandas/bench_dropna_advanced.py | 35 +++++++++++++++++++ benchmarks/pandas/bench_factorize_sort.py | 30 ++++++++++++++++ benchmarks/pandas/bench_get_dummies_opts.py | 34 ++++++++++++++++++ benchmarks/pandas/bench_nan_agg_extended.py | 29 ++++++++++++++++ benchmarks/pandas/bench_rank_methods.py | 31 +++++++++++++++++ benchmarks/tsb/bench_dropna_advanced.ts | 33 ++++++++++++++++++ 
benchmarks/tsb/bench_factorize_sort.ts | 31 +++++++++++++++++ benchmarks/tsb/bench_get_dummies_opts.ts | 38 +++++++++++++++++++++ benchmarks/tsb/bench_nan_agg_extended.ts | 30 ++++++++++++++++ benchmarks/tsb/bench_rank_methods.ts | 31 +++++++++++++++++ 10 files changed, 322 insertions(+) create mode 100644 benchmarks/pandas/bench_dropna_advanced.py create mode 100644 benchmarks/pandas/bench_factorize_sort.py create mode 100644 benchmarks/pandas/bench_get_dummies_opts.py create mode 100644 benchmarks/pandas/bench_nan_agg_extended.py create mode 100644 benchmarks/pandas/bench_rank_methods.py create mode 100644 benchmarks/tsb/bench_dropna_advanced.ts create mode 100644 benchmarks/tsb/bench_factorize_sort.ts create mode 100644 benchmarks/tsb/bench_get_dummies_opts.ts create mode 100644 benchmarks/tsb/bench_nan_agg_extended.ts create mode 100644 benchmarks/tsb/bench_rank_methods.ts diff --git a/benchmarks/pandas/bench_dropna_advanced.py b/benchmarks/pandas/bench_dropna_advanced.py new file mode 100644 index 00000000..e97d509a --- /dev/null +++ b/benchmarks/pandas/bench_dropna_advanced.py @@ -0,0 +1,35 @@ +""" +Benchmark: DataFrame.dropna with advanced options (thresh, subset, axis=1). 
+Outputs JSON: {"function": "dropna_advanced", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import numpy as np +import pandas as pd + +SIZE = 10_000 +WARMUP = 5 +ITERATIONS = 30 + +# DataFrame with scattered null values +rng = np.random.default_rng(42) +df = pd.DataFrame({ + "a": [None if i % 4 == 0 else i * 0.1 for i in range(SIZE)], + "b": [None if i % 6 == 0 else i * 2.0 for i in range(SIZE)], + "c": [None if i % 8 == 0 else i % 100 for i in range(SIZE)], + "d": [None if i % 3 == 0 else f"val_{i % 20}" for i in range(SIZE)], +}) + +for _ in range(WARMUP): + df.dropna(thresh=3) + df.dropna(subset=["a", "b"]) + df.dropna(axis=1) + +start = time.perf_counter() +for _ in range(ITERATIONS): + df.dropna(thresh=3) + df.dropna(subset=["a", "b"]) + df.dropna(axis=1) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({"function": "dropna_advanced", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_factorize_sort.py b/benchmarks/pandas/bench_factorize_sort.py new file mode 100644 index 00000000..2feffe95 --- /dev/null +++ b/benchmarks/pandas/bench_factorize_sort.py @@ -0,0 +1,30 @@ +""" +Benchmark: pandas.factorize / Series.factorize with sort=True and use_na_sentinel options. 
+Outputs JSON: {"function": "factorize_sort", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import numpy as np +import pandas as pd + +SIZE = 100_000 +WARMUP = 5 +ITERATIONS = 30 + +categories = ["zebra", "apple", "mango", "banana", "coconut", "date"] +data = [None if i % 15 == 0 else categories[i % len(categories)] for i in range(SIZE)] +s = pd.Series(data, dtype="object") + +for _ in range(WARMUP): + pd.factorize(data, sort=True) + pd.factorize(data, sort=True, use_na_sentinel=True) + s.factorize(sort=True) + +start = time.perf_counter() +for _ in range(ITERATIONS): + pd.factorize(data, sort=True) + pd.factorize(data, sort=True, use_na_sentinel=True) + s.factorize(sort=True) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({"function": "factorize_sort", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_get_dummies_opts.py b/benchmarks/pandas/bench_get_dummies_opts.py new file mode 100644 index 00000000..d18664b4 --- /dev/null +++ b/benchmarks/pandas/bench_get_dummies_opts.py @@ -0,0 +1,34 @@ +""" +Benchmark: pandas.get_dummies with prefix, drop_first, dummy_na options. 
+Outputs JSON: {"function": "get_dummies_opts", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import pandas as pd + +SIZE = 10_000 +WARMUP = 5 +ITERATIONS = 30 + +categories = ["apple", "banana", "cherry", "date", "elderberry"] +data = [None if i % 20 == 0 else categories[i % len(categories)] for i in range(SIZE)] +s = pd.Series(data, dtype="object") + +df = pd.DataFrame({ + "fruit": [None if i % 20 == 0 else categories[i % len(categories)] for i in range(SIZE)], + "color": [["red", "green", "blue"][i % 3] for i in range(SIZE)], +}) + +for _ in range(WARMUP): + pd.get_dummies(s, prefix="cat", dummy_na=True) + pd.get_dummies(s, drop_first=True) + pd.get_dummies(df, columns=["fruit", "color"], prefix="col", drop_first=True) + +start = time.perf_counter() +for _ in range(ITERATIONS): + pd.get_dummies(s, prefix="cat", dummy_na=True) + pd.get_dummies(s, drop_first=True) + pd.get_dummies(df, columns=["fruit", "color"], prefix="col", drop_first=True) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({"function": "get_dummies_opts", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_nan_agg_extended.py b/benchmarks/pandas/bench_nan_agg_extended.py new file mode 100644 index 00000000..5e1fb1bc --- /dev/null +++ b/benchmarks/pandas/bench_nan_agg_extended.py @@ -0,0 +1,29 @@ +""" +Benchmark: np.count_nonzero / np.nanprod / np.nanmedian — extended nan-ignoring aggregates. 
+Outputs JSON: {"function": "nan_agg_extended", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import math +import time +import numpy as np + +SIZE = 100_000 +WARMUP = 5 +ITERATIONS = 50 + +# Array with ~15% NaN values +data = np.array([float("nan") if i % 7 == 0 else math.cos(i * 0.02) * 50 + 1 for i in range(SIZE)]) + +for _ in range(WARMUP): + np.sum(~np.isnan(data)) + np.nanprod(data[:1000]) + np.nanmedian(data) + +start = time.perf_counter() +for _ in range(ITERATIONS): + np.sum(~np.isnan(data)) + np.nanprod(data[:1000]) + np.nanmedian(data) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({"function": "nan_agg_extended", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total})) diff --git a/benchmarks/pandas/bench_rank_methods.py b/benchmarks/pandas/bench_rank_methods.py new file mode 100644 index 00000000..614f1480 --- /dev/null +++ b/benchmarks/pandas/bench_rank_methods.py @@ -0,0 +1,31 @@ +""" +Benchmark: Series.rank with different tie-breaking methods (min/max/first/dense). 
+Outputs JSON: {"function": "rank_methods", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import pandas as pd + +SIZE = 100_000 +WARMUP = 5 +ITERATIONS = 30 + +# Data with many ties to stress different tie-breaking methods +data = [float((i // 5) * 1.0) for i in range(SIZE)] +s = pd.Series(data) + +for _ in range(WARMUP): + s.rank(method="min") + s.rank(method="max") + s.rank(method="first") + s.rank(method="dense") + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.rank(method="min") + s.rank(method="max") + s.rank(method="first") + s.rank(method="dense") +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({"function": "rank_methods", "mean_ms": total / ITERATIONS, "iterations": ITERATIONS, "total_ms": total})) diff --git a/benchmarks/tsb/bench_dropna_advanced.ts b/benchmarks/tsb/bench_dropna_advanced.ts new file mode 100644 index 00000000..7f1dc92d --- /dev/null +++ b/benchmarks/tsb/bench_dropna_advanced.ts @@ -0,0 +1,33 @@ +/** + * Benchmark: dropnaDataFrame with advanced options (thresh, subset, axis=1). + * Outputs JSON: {"function": "dropna_advanced", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { DataFrame, dropnaDataFrame } from "../../src/index.ts"; + +const SIZE = 10_000; +const WARMUP = 5; +const ITERATIONS = 30; + +// DataFrame with scattered null values +const df = DataFrame.fromColumns({ + a: Array.from({ length: SIZE }, (_, i) => (i % 4 === 0 ? null : i * 0.1)), + b: Array.from({ length: SIZE }, (_, i) => (i % 6 === 0 ? null : i * 2.0)), + c: Array.from({ length: SIZE }, (_, i) => (i % 8 === 0 ? null : i % 100)), + d: Array.from({ length: SIZE }, (_, i) => (i % 3 === 0 ? 
null : `val_${i % 20}`)), +}); + +for (let i = 0; i < WARMUP; i++) { + dropnaDataFrame(df, { thresh: 3 }); + dropnaDataFrame(df, { subset: ["a", "b"] }); + dropnaDataFrame(df, { axis: 1 }); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + dropnaDataFrame(df, { thresh: 3 }); + dropnaDataFrame(df, { subset: ["a", "b"] }); + dropnaDataFrame(df, { axis: 1 }); +} +const total = performance.now() - start; + +console.log(JSON.stringify({ function: "dropna_advanced", mean_ms: total / ITERATIONS, iterations: ITERATIONS, total_ms: total })); diff --git a/benchmarks/tsb/bench_factorize_sort.ts b/benchmarks/tsb/bench_factorize_sort.ts new file mode 100644 index 00000000..3ebab2d7 --- /dev/null +++ b/benchmarks/tsb/bench_factorize_sort.ts @@ -0,0 +1,31 @@ +/** + * Benchmark: factorize / seriesFactorize with sort=true and useNaSentinel options. + * Outputs JSON: {"function": "factorize_sort", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { factorize, seriesFactorize, Series } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 30; + +const categories = ["zebra", "apple", "mango", "banana", "coconut", "date"]; +const data = Array.from({ length: SIZE }, (_, i) => + i % 15 === 0 ? 
null : categories[i % categories.length], +); +const s = new Series({ data }); + +for (let i = 0; i < WARMUP; i++) { + factorize(data, { sort: true }); + factorize(data, { sort: true, useNaSentinel: true }); + seriesFactorize(s, { sort: true }); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + factorize(data, { sort: true }); + factorize(data, { sort: true, useNaSentinel: true }); + seriesFactorize(s, { sort: true }); +} +const total = performance.now() - start; + +console.log(JSON.stringify({ function: "factorize_sort", mean_ms: total / ITERATIONS, iterations: ITERATIONS, total_ms: total })); diff --git a/benchmarks/tsb/bench_get_dummies_opts.ts b/benchmarks/tsb/bench_get_dummies_opts.ts new file mode 100644 index 00000000..2966e029 --- /dev/null +++ b/benchmarks/tsb/bench_get_dummies_opts.ts @@ -0,0 +1,38 @@ +/** + * Benchmark: getDummies / dataFrameGetDummies with prefix, dropFirst, dummyNa options. + * Outputs JSON: {"function": "get_dummies_opts", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { getDummies, dataFrameGetDummies, Series, DataFrame } from "../../src/index.ts"; + +const SIZE = 10_000; +const WARMUP = 5; +const ITERATIONS = 30; + +const categories = ["apple", "banana", "cherry", "date", "elderberry"]; +const data = Array.from({ length: SIZE }, (_, i) => + i % 20 === 0 ? null : categories[i % categories.length], +); +const s = new Series({ data }); + +const df = DataFrame.fromColumns({ + fruit: Array.from({ length: SIZE }, (_, i) => + i % 20 === 0 ? 
null : categories[i % categories.length], + ), + color: Array.from({ length: SIZE }, (_, i) => ["red", "green", "blue"][i % 3]), +}); + +for (let i = 0; i < WARMUP; i++) { + getDummies(s, { prefix: "cat", dummyNa: true }); + getDummies(s, { dropFirst: true }); + dataFrameGetDummies(df, { columns: ["fruit", "color"], prefix: "col", dropFirst: true }); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + getDummies(s, { prefix: "cat", dummyNa: true }); + getDummies(s, { dropFirst: true }); + dataFrameGetDummies(df, { columns: ["fruit", "color"], prefix: "col", dropFirst: true }); +} +const total = performance.now() - start; + +console.log(JSON.stringify({ function: "get_dummies_opts", mean_ms: total / ITERATIONS, iterations: ITERATIONS, total_ms: total })); diff --git a/benchmarks/tsb/bench_nan_agg_extended.ts b/benchmarks/tsb/bench_nan_agg_extended.ts new file mode 100644 index 00000000..27cd89eb --- /dev/null +++ b/benchmarks/tsb/bench_nan_agg_extended.ts @@ -0,0 +1,30 @@ +/** + * Benchmark: nancount / nanprod / nanmedian — extended nan-ignoring aggregates. + * Outputs JSON: {"function": "nan_agg_extended", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { nancount, nanprod, nanmedian } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 50; + +// Array with ~15% NaN values +const data: (number | null)[] = Array.from({ length: SIZE }, (_, i) => + i % 7 === 0 ? 
null : Math.cos(i * 0.02) * 50 + 1, +); + +for (let i = 0; i < WARMUP; i++) { + nancount(data); + nanprod(data.slice(0, 1000)); // nanprod on small slice to avoid overflow + nanmedian(data); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + nancount(data); + nanprod(data.slice(0, 1000)); + nanmedian(data); +} +const total = performance.now() - start; + +console.log(JSON.stringify({ function: "nan_agg_extended", mean_ms: total / ITERATIONS, iterations: ITERATIONS, total_ms: total })); diff --git a/benchmarks/tsb/bench_rank_methods.ts b/benchmarks/tsb/bench_rank_methods.ts new file mode 100644 index 00000000..004acb47 --- /dev/null +++ b/benchmarks/tsb/bench_rank_methods.ts @@ -0,0 +1,31 @@ +/** + * Benchmark: rankSeries with different tie-breaking methods (min/max/first/dense). + * Outputs JSON: {"function": "rank_methods", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series, rankSeries } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 30; + +// Data with many ties to stress different tie-breaking methods +const data = Array.from({ length: SIZE }, (_, i) => Math.floor(i / 5) * 1.0); +const s = new Series({ data }); + +for (let i = 0; i < WARMUP; i++) { + rankSeries(s, { method: "min" }); + rankSeries(s, { method: "max" }); + rankSeries(s, { method: "first" }); + rankSeries(s, { method: "dense" }); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + rankSeries(s, { method: "min" }); + rankSeries(s, { method: "max" }); + rankSeries(s, { method: "first" }); + rankSeries(s, { method: "dense" }); +} +const total = performance.now() - start; + +console.log(JSON.stringify({ function: "rank_methods", mean_ms: total / ITERATIONS, iterations: ITERATIONS, total_ms: total })); From 5b7ea6de8b5b6fd633b1bea4db4ac04b986a59f2 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 17 Apr 2026 
16:14:41 +0000 Subject: [PATCH 19/19] Iteration 158: 5 new benchmark pairs (508 total, +5 vs best 503) Run: https://github.com/githubnext/tsessebe/actions/runs/24573945763 --- benchmarks/pandas/bench_dataframe_median.py | 36 ++++++++++++ .../pandas/bench_dataframe_rolling_min_max.py | 39 +++++++++++++ .../bench_interval_index_construction.py | 41 ++++++++++++++ benchmarks/pandas/bench_read_csv_options.py | 48 ++++++++++++++++ benchmarks/pandas/bench_to_csv_options.py | 42 ++++++++++++++ benchmarks/tsb/bench_dataframe_median.ts | 37 ++++++++++++ .../tsb/bench_dataframe_rolling_min_max.ts | 40 +++++++++++++ .../tsb/bench_interval_index_construction.ts | 42 ++++++++++++++ benchmarks/tsb/bench_read_csv_options.ts | 56 +++++++++++++++++++ benchmarks/tsb/bench_to_csv_options.ts | 43 ++++++++++++++ 10 files changed, 424 insertions(+) create mode 100644 benchmarks/pandas/bench_dataframe_median.py create mode 100644 benchmarks/pandas/bench_dataframe_rolling_min_max.py create mode 100644 benchmarks/pandas/bench_interval_index_construction.py create mode 100644 benchmarks/pandas/bench_read_csv_options.py create mode 100644 benchmarks/pandas/bench_to_csv_options.py create mode 100644 benchmarks/tsb/bench_dataframe_median.ts create mode 100644 benchmarks/tsb/bench_dataframe_rolling_min_max.ts create mode 100644 benchmarks/tsb/bench_interval_index_construction.ts create mode 100644 benchmarks/tsb/bench_read_csv_options.ts create mode 100644 benchmarks/tsb/bench_to_csv_options.ts diff --git a/benchmarks/pandas/bench_dataframe_median.py b/benchmarks/pandas/bench_dataframe_median.py new file mode 100644 index 00000000..b68615f4 --- /dev/null +++ b/benchmarks/pandas/bench_dataframe_median.py @@ -0,0 +1,36 @@ +""" +Benchmark: pandas DataFrame.median() — column-wise median on a 100k-row DataFrame. 
+Outputs JSON: {"function": "dataframe_median", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import pandas as pd +import numpy as np + +SIZE = 100_000 +WARMUP = 5 +ITERATIONS = 30 + +df = pd.DataFrame({ + "a": (np.arange(SIZE) * 1.23) % 9000, + "b": (np.arange(SIZE) * 4.56) % 7000, + "c": (np.arange(SIZE) * 7.89) % 5000, +}) + +for _ in range(WARMUP): + df.median() + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + df.median() + times.append((time.perf_counter() - t0) * 1000) + +total_ms = sum(times) +mean_ms = total_ms / ITERATIONS +print(json.dumps({ + "function": "dataframe_median", + "mean_ms": mean_ms, + "iterations": ITERATIONS, + "total_ms": total_ms, +})) diff --git a/benchmarks/pandas/bench_dataframe_rolling_min_max.py b/benchmarks/pandas/bench_dataframe_rolling_min_max.py new file mode 100644 index 00000000..42435d31 --- /dev/null +++ b/benchmarks/pandas/bench_dataframe_rolling_min_max.py @@ -0,0 +1,39 @@ +""" +Benchmark: pandas DataFrame.rolling().min() / .max() — rolling min/max aggregations. 
+Outputs JSON: {"function": "dataframe_rolling_min_max", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import pandas as pd +import numpy as np + +SIZE = 50_000 +WINDOW = 20 +WARMUP = 5 +ITERATIONS = 30 + +df = pd.DataFrame({ + "a": np.sin(np.arange(SIZE) * 0.01) * 100, + "b": np.cos(np.arange(SIZE) * 0.01) * 50, + "c": (np.arange(SIZE) % 100) * 1.5, +}) + +for _ in range(WARMUP): + df.rolling(WINDOW).min() + df.rolling(WINDOW).max() + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + df.rolling(WINDOW).min() + df.rolling(WINDOW).max() + times.append((time.perf_counter() - t0) * 1000) + +total_ms = sum(times) +mean_ms = total_ms / ITERATIONS +print(json.dumps({ + "function": "dataframe_rolling_min_max", + "mean_ms": mean_ms, + "iterations": ITERATIONS, + "total_ms": total_ms, +})) diff --git a/benchmarks/pandas/bench_interval_index_construction.py b/benchmarks/pandas/bench_interval_index_construction.py new file mode 100644 index 00000000..4be29661 --- /dev/null +++ b/benchmarks/pandas/bench_interval_index_construction.py @@ -0,0 +1,41 @@ +""" +Benchmark: pandas IntervalIndex.from_arrays() and IntervalIndex.from_tuples() — alternative constructors. 
+Outputs JSON: {"function": "interval_index_construction", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import pandas as pd +import numpy as np + +SIZE = 10_000 +WARMUP = 5 +ITERATIONS = 50 + +# Prepare data +left_arr = np.arange(SIZE) * 0.1 +right_arr = left_arr + 0.1 + +# Prepare tuples for from_tuples +tuples = [(left_arr[i], right_arr[i]) for i in range(SIZE)] + +for _ in range(WARMUP): + pd.IntervalIndex.from_arrays(left_arr, right_arr) + pd.IntervalIndex.from_arrays(left_arr, right_arr, closed="left") + pd.IntervalIndex.from_tuples(tuples) + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + pd.IntervalIndex.from_arrays(left_arr, right_arr) + pd.IntervalIndex.from_arrays(left_arr, right_arr, closed="left") + pd.IntervalIndex.from_tuples(tuples) + times.append((time.perf_counter() - t0) * 1000) + +total_ms = sum(times) +mean_ms = total_ms / ITERATIONS +print(json.dumps({ + "function": "interval_index_construction", + "mean_ms": mean_ms, + "iterations": ITERATIONS, + "total_ms": total_ms, +})) diff --git a/benchmarks/pandas/bench_read_csv_options.py b/benchmarks/pandas/bench_read_csv_options.py new file mode 100644 index 00000000..2ff634b6 --- /dev/null +++ b/benchmarks/pandas/bench_read_csv_options.py @@ -0,0 +1,48 @@ +""" +Benchmark: pandas read_csv with options — sep, header, skiprows, dtype casting. 
+Outputs JSON: {"function": "read_csv_options", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import io +import time +import pandas as pd +import numpy as np + +ROWS = 10_000 +WARMUP = 3 +ITERATIONS = 20 + +# Build pipe-separated CSV (no header) +pipe_lines = [f"{i}|{i * 1.1:.4f}|cat_{i % 50}" for i in range(ROWS)] +pipe_csv = "\n".join(pipe_lines) + +# Build comma-separated CSV (skip first 2 rows) +skip_lines = ["# comment row 1", "# comment row 2", "id,value,label"] + \ + [f"{i},{i * 2.2:.4f},grp_{i % 20}" for i in range(ROWS)] +skip_csv = "\n".join(skip_lines) + +# Build CSV for dtype override +dtype_lines = ["id,value,flag"] + [f"{i},{i * 1.5},{i % 2}" for i in range(ROWS)] +dtype_csv = "\n".join(dtype_lines) + +for _ in range(WARMUP): + pd.read_csv(io.StringIO(pipe_csv), sep="|", header=None) + pd.read_csv(io.StringIO(skip_csv), skiprows=2) + pd.read_csv(io.StringIO(dtype_csv), dtype={"id": "int32", "value": "float32"}) + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + pd.read_csv(io.StringIO(pipe_csv), sep="|", header=None) + pd.read_csv(io.StringIO(skip_csv), skiprows=2) + pd.read_csv(io.StringIO(dtype_csv), dtype={"id": "int32", "value": "float32"}) + times.append((time.perf_counter() - t0) * 1000) + +total_ms = sum(times) +mean_ms = total_ms / ITERATIONS +print(json.dumps({ + "function": "read_csv_options", + "mean_ms": mean_ms, + "iterations": ITERATIONS, + "total_ms": total_ms, +})) diff --git a/benchmarks/pandas/bench_to_csv_options.py b/benchmarks/pandas/bench_to_csv_options.py new file mode 100644 index 00000000..015a9790 --- /dev/null +++ b/benchmarks/pandas/bench_to_csv_options.py @@ -0,0 +1,42 @@ +""" +Benchmark: pandas DataFrame.to_csv() with options — sep, header, index settings. 
+Outputs JSON: {"function": "to_csv_options", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" +import json +import time +import pandas as pd +import numpy as np + +ROWS = 10_000 +WARMUP = 3 +ITERATIONS = 20 + +df = pd.DataFrame({ + "id": np.arange(ROWS), + "value": np.arange(ROWS) * 1.1, + "label": [f"cat_{i % 50}" for i in range(ROWS)], +}) + +for _ in range(WARMUP): + df.to_csv(sep="\t") + df.to_csv(header=False) + df.to_csv(index=False) + df.to_csv(sep="|", header=False, index=False) + +times = [] +for _ in range(ITERATIONS): + t0 = time.perf_counter() + df.to_csv(sep="\t") + df.to_csv(header=False) + df.to_csv(index=False) + df.to_csv(sep="|", header=False, index=False) + times.append((time.perf_counter() - t0) * 1000) + +total_ms = sum(times) +mean_ms = total_ms / ITERATIONS +print(json.dumps({ + "function": "to_csv_options", + "mean_ms": mean_ms, + "iterations": ITERATIONS, + "total_ms": total_ms, +})) diff --git a/benchmarks/tsb/bench_dataframe_median.ts b/benchmarks/tsb/bench_dataframe_median.ts new file mode 100644 index 00000000..911bc3bc --- /dev/null +++ b/benchmarks/tsb/bench_dataframe_median.ts @@ -0,0 +1,37 @@ +/** + * Benchmark: DataFrame.median() — column-wise median on a 100k-row DataFrame. 
+ * Outputs JSON: {"function": "dataframe_median", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { DataFrame } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 30; + +const df = DataFrame.fromColumns({ + a: Array.from({ length: SIZE }, (_, i) => (i * 1.23) % 9000), + b: Array.from({ length: SIZE }, (_, i) => (i * 4.56) % 7000), + c: Array.from({ length: SIZE }, (_, i) => (i * 7.89) % 5000), +}); + +for (let i = 0; i < WARMUP; i++) { + df.median(); +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + df.median(); + times.push(performance.now() - t0); +} + +const totalMs = times.reduce((a, b) => a + b, 0); +const meanMs = totalMs / ITERATIONS; +console.log( + JSON.stringify({ + function: "dataframe_median", + mean_ms: meanMs, + iterations: ITERATIONS, + total_ms: totalMs, + }), +); diff --git a/benchmarks/tsb/bench_dataframe_rolling_min_max.ts b/benchmarks/tsb/bench_dataframe_rolling_min_max.ts new file mode 100644 index 00000000..fcb714c1 --- /dev/null +++ b/benchmarks/tsb/bench_dataframe_rolling_min_max.ts @@ -0,0 +1,40 @@ +/** + * Benchmark: DataFrameRolling.min() and DataFrameRolling.max() on a 50k-row DataFrame. 
+ * Outputs JSON: {"function": "dataframe_rolling_min_max", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { DataFrame } from "../../src/index.ts"; + +const SIZE = 50_000; +const WINDOW = 20; +const WARMUP = 5; +const ITERATIONS = 30; + +const df = DataFrame.fromColumns({ + a: Array.from({ length: SIZE }, (_, i) => Math.sin(i * 0.01) * 100), + b: Array.from({ length: SIZE }, (_, i) => Math.cos(i * 0.01) * 50), + c: Array.from({ length: SIZE }, (_, i) => (i % 100) * 1.5), +}); + +for (let i = 0; i < WARMUP; i++) { + df.rolling(WINDOW).min(); + df.rolling(WINDOW).max(); +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + df.rolling(WINDOW).min(); + df.rolling(WINDOW).max(); + times.push(performance.now() - t0); +} + +const totalMs = times.reduce((a, b) => a + b, 0); +const meanMs = totalMs / ITERATIONS; +console.log( + JSON.stringify({ + function: "dataframe_rolling_min_max", + mean_ms: meanMs, + iterations: ITERATIONS, + total_ms: totalMs, + }), +); diff --git a/benchmarks/tsb/bench_interval_index_construction.ts b/benchmarks/tsb/bench_interval_index_construction.ts new file mode 100644 index 00000000..1bfc2d1a --- /dev/null +++ b/benchmarks/tsb/bench_interval_index_construction.ts @@ -0,0 +1,42 @@ +/** + * Benchmark: IntervalIndex.fromArrays() and IntervalIndex.fromIntervals() — alternative constructors. 
+ * Outputs JSON: {"function": "interval_index_construction", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Interval, IntervalIndex } from "../../src/index.ts"; + +const SIZE = 10_000; +const WARMUP = 5; +const ITERATIONS = 50; + +// Prepare data for fromArrays +const leftArr = Array.from({ length: SIZE }, (_, i) => i * 0.1); +const rightArr = Array.from({ length: SIZE }, (_, i) => i * 0.1 + 0.1); + +// Prepare interval objects for fromIntervals +const intervals = Array.from({ length: SIZE }, (_, i) => new Interval(i * 0.1, i * 0.1 + 0.1)); + +for (let i = 0; i < WARMUP; i++) { + IntervalIndex.fromArrays(leftArr, rightArr); + IntervalIndex.fromArrays(leftArr, rightArr, "left"); + IntervalIndex.fromIntervals(intervals); +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + IntervalIndex.fromArrays(leftArr, rightArr); + IntervalIndex.fromArrays(leftArr, rightArr, "left"); + IntervalIndex.fromIntervals(intervals); + times.push(performance.now() - t0); +} + +const totalMs = times.reduce((a, b) => a + b, 0); +const meanMs = totalMs / ITERATIONS; +console.log( + JSON.stringify({ + function: "interval_index_construction", + mean_ms: meanMs, + iterations: ITERATIONS, + total_ms: totalMs, + }), +); diff --git a/benchmarks/tsb/bench_read_csv_options.ts b/benchmarks/tsb/bench_read_csv_options.ts new file mode 100644 index 00000000..a629da8b --- /dev/null +++ b/benchmarks/tsb/bench_read_csv_options.ts @@ -0,0 +1,56 @@ +/** + * Benchmark: readCsv with options — sep, header, skipRows, dtype casting. 
+ * Outputs JSON: {"function": "read_csv_options", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { readCsv } from "../../src/index.ts"; + +const ROWS = 10_000; +const WARMUP = 3; +const ITERATIONS = 20; + +// Build pipe-separated CSV (no header) +const pipeLines: string[] = []; +for (let i = 0; i < ROWS; i++) { + pipeLines.push(`${i}|${(i * 1.1).toFixed(4)}|cat_${i % 50}`); +} +const pipeCsv = pipeLines.join("\n"); + +// Build comma-separated CSV (skip first 2 rows) +const skipLines: string[] = ["# comment row 1", "# comment row 2", "id,value,label"]; +for (let i = 0; i < ROWS; i++) { + skipLines.push(`${i},${(i * 2.2).toFixed(4)},grp_${i % 20}`); +} +const skipCsv = skipLines.join("\n"); + +// Build CSV for dtype override +const dtypeLines: string[] = ["id,value,flag"]; +for (let i = 0; i < ROWS; i++) { + dtypeLines.push(`${i},${i * 1.5},${i % 2}`); +} +const dtypeCsv = dtypeLines.join("\n"); + +for (let i = 0; i < WARMUP; i++) { + readCsv(pipeCsv, { sep: "|", header: null }); + readCsv(skipCsv, { skipRows: 2 }); + readCsv(dtypeCsv, { dtype: { id: "int32", value: "float32" } }); +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + readCsv(pipeCsv, { sep: "|", header: null }); + readCsv(skipCsv, { skipRows: 2 }); + readCsv(dtypeCsv, { dtype: { id: "int32", value: "float32" } }); + times.push(performance.now() - t0); +} + +const totalMs = times.reduce((a, b) => a + b, 0); +const meanMs = totalMs / ITERATIONS; +console.log( + JSON.stringify({ + function: "read_csv_options", + mean_ms: meanMs, + iterations: ITERATIONS, + total_ms: totalMs, + }), +); diff --git a/benchmarks/tsb/bench_to_csv_options.ts b/benchmarks/tsb/bench_to_csv_options.ts new file mode 100644 index 00000000..43106b74 --- /dev/null +++ b/benchmarks/tsb/bench_to_csv_options.ts @@ -0,0 +1,43 @@ +/** + * Benchmark: toCsv with options — sep, header, index settings. 
+ * Outputs JSON: {"function": "to_csv_options", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { DataFrame, toCsv } from "../../src/index.ts"; + +const ROWS = 10_000; +const WARMUP = 3; +const ITERATIONS = 20; + +const df = DataFrame.fromColumns({ + id: Array.from({ length: ROWS }, (_, i) => i), + value: Array.from({ length: ROWS }, (_, i) => i * 1.1), + label: Array.from({ length: ROWS }, (_, i) => `cat_${i % 50}`), +}); + +for (let i = 0; i < WARMUP; i++) { + toCsv(df, { sep: "\t" }); + toCsv(df, { header: false }); + toCsv(df, { index: false }); + toCsv(df, { sep: "|", header: false, index: false }); +} + +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const t0 = performance.now(); + toCsv(df, { sep: "\t" }); + toCsv(df, { header: false }); + toCsv(df, { index: false }); + toCsv(df, { sep: "|", header: false, index: false }); + times.push(performance.now() - t0); +} + +const totalMs = times.reduce((a, b) => a + b, 0); +const meanMs = totalMs / ITERATIONS; +console.log( + JSON.stringify({ + function: "to_csv_options", + mean_ms: meanMs, + iterations: ITERATIONS, + total_ms: totalMs, + }), +);