From 2c1b44f4294a3adf5480da810e41d0fffe4c7e8b Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Sun, 5 Apr 2026 17:10:32 +0000 Subject: [PATCH 001/104] Iteration 6: Implement GroupBy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Port pandas.core.groupby — DataFrameGroupBy and SeriesGroupBy with full split-apply-combine engine: - DataFrameGroupBy: sum, mean, min, max, count, std, first, last, size, agg (named/fn/per-column spec), transform, apply, filter, getGroup, ngroups, groupKeys, groups — single-key and multi-key support - SeriesGroupBy: same aggregation API, transform, apply, filter, getGroup - DataFrame.groupby(by) and Series.groupby(by) convenience methods - 40+ unit tests + property-based tests (fast-check) - Interactive playground page (playground/groupby.html) Run: https://github.com/githubnext/tsessebe/actions/runs/23972003902 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- playground/groupby.html | 401 +++++++++++++++++++ playground/index.html | 59 ++- src/core/frame.ts | 372 +++++++++++++++++ src/core/series.ts | 244 ++++++++++++ src/groupby/groupby.ts | 730 ++++++++++++++++++++++++++++++++++ src/groupby/index.ts | 8 + src/index.ts | 26 ++ tests/groupby/groupby.test.ts | 427 ++++++++++++++++++++ 8 files changed, 2265 insertions(+), 2 deletions(-) create mode 100644 playground/groupby.html create mode 100644 src/groupby/groupby.ts create mode 100644 src/groupby/index.ts create mode 100644 tests/groupby/groupby.test.ts diff --git a/playground/groupby.html b/playground/groupby.html new file mode 100644 index 00000000..9edf578e --- /dev/null +++ b/playground/groupby.html @@ -0,0 +1,401 @@ + + + + + + tsb — GroupBy Tutorial + + + + +
+

tsb

+ ← All features + + GroupBy +
+ +
+ +

GroupBy: Split–Apply–Combine

+

+ The GroupBy engine lets you split a DataFrame (or Series) into groups, apply an aggregation + or transformation to each group, and combine the results — mirroring + pandas.DataFrame.groupby(). +

+ + +

1 · Basic groupby + sum

+

Group by a single column and aggregate with a built-in function:

+
+
TypeScript
+
import { DataFrame } from "tsb";
+
+const df = DataFrame.fromColumns({
+  dept:  ["A", "A", "B", "B", "C"],
+  sales: [10, 20, 30, 40, 50],
+  bonus: [1,  2,  3,  4,  5 ],
+});
+
+const result = df.groupby("dept").sum();
+console.log(result.toString());
+
+
Output
+ + + + + + + +
(index)salesbonus
A303
B707
C505
+
+
+ + +

2 · Per-column aggregation specs

+

Apply different aggregation functions to different columns using an object spec:

+
+
TypeScript
+
df.groupby("dept").agg({
+  sales: "sum",
+  bonus: "mean",
+});
+
+
Output
+ + + + + + + +
(index)salesbonus
A301.5
B703.5
C505
+
+
+ + +

3 · Custom aggregation function

+

Pass any function (vals: readonly Scalar[]) => Scalar:

+
+
TypeScript
+
// Range = max − min per group
+df.groupby("dept").agg((vals) => {
+  const nums = vals.filter((v): v is number => typeof v === "number");
+  if (nums.length === 0) return 0;
+  return Math.max(...nums) - Math.min(...nums);
+});
+
+
Output (range of each column per group)
+ + + + + + + +
(index)salesbonus
A101
B101
C00
+
+
+ + +

4 · transform()

+

+ Unlike agg(), transform() returns a same-shape DataFrame — + useful for broadcasting group statistics back to the original rows. +

+
+
TypeScript
+
// Subtract group mean (demeaning)
+const demeaned = df.groupby("dept").transform((vals, col) => {
+  if (col === "dept") return vals;
+  const nums = vals.filter((v): v is number => typeof v === "number");
+  const mean = nums.reduce((a, b) => a + b, 0) / nums.length;
+  return vals.map((v) => (typeof v === "number" ? v - mean : v));
+});
+
+
Output (same shape, values demeaned)
+ + + + + + + + + +
(index)deptsalesbonus
0A-5-0.5
1A50.5
2B-5-0.5
3B50.5
4C00
+
+
+ + +

5 · apply()

+

Run arbitrary logic on each sub-DataFrame and concatenate the results:

+
+
TypeScript
+
// Keep only the top-sales row from each dept
+const topRows = df.groupby("dept").apply((sub) =>
+  sub.sortValues("sales", false).head(1),
+);
+
+
Output
+ + + + + + + +
(index)deptsalesbonus
1A202
3B404
4C505
+
+
+ + +

6 · filter()

+

Keep only the rows belonging to groups that pass a predicate:

+
+
TypeScript
+
// Keep only groups with more than 1 row
+const big = df.groupby("dept").filter((sub) => sub.shape[0] > 1);
+
+
Output (C dropped — only 1 row)
+ + + + + + + + +
(index)deptsalesbonus
0A101
1A202
2B303
3B404
+
+
+ + +

7 · size() and ngroups

+

Inspect the structure of the groups:

+
+
TypeScript
+
const gb = df.groupby("dept");
+
+gb.ngroups;        // 3
+gb.groupKeys();   // ["A", "B", "C"]
+gb.size();        // Series { A: 2, B: 2, C: 1 }
+
+
gb.size()
+ + + + + + + +
(index)size
A2
B2
C1
+
+
+ + +

8 · Multi-key groupby

+

Group by an array of column names:

+
+
TypeScript
+
const df2 = DataFrame.fromColumns({
+  dept:   ["A", "A", "A", "B"],
+  region: ["E", "E", "W", "E"],
+  sales:  [10, 20, 30, 40],
+});
+
+df2.groupby(["dept", "region"]).sum();
+
+
Output (3 composite groups; key parts are joined with __SEP__)
+ + + + + + + +
(index)sales
A__SEP__E30
A__SEP__W30
B__SEP__E40
+
+
+ + +

9 · SeriesGroupBy

+

Series also supports groupby(), accepting an array of key values — one key per element of the Series:

+
+
TypeScript
+
import { Series } from "tsb";
+
+const s = new Series({ data: [1, 2, 3, 4] });
+s.groupby(["A", "A", "B", "B"]).sum();
+// Series { A: 3, B: 7 }
+
+
Output
+ + + + + + +
(index)value
A3
B7
+
+
+ + +

API Reference

+ + + + + + + + + + + + + + + + + + + + + + + + +
MethodDescription
groupby(by)Group by column name(s) → DataFrameGroupBy
.sum()Sum of each group (numeric columns)
.mean()Mean of each group
.min()Minimum per group
.max()Maximum per group
.count()Count non-null values per group
.std()Sample standard deviation per group
.first()First non-null value per group
.last()Last non-null value per group
.size()Number of rows per group (returns a Series)
.agg(spec)Aggregate with a built-in name (e.g. "sum"), a custom function, or a per-column spec object
.transform(fn)Same-shape result; broadcast group stats back
.apply(fn)Arbitrary per-group function; results concatenated
.filter(pred)Keep rows from groups that pass predicate
.getGroup(key)Extract sub-DataFrame for a single key
.ngroupsNumber of groups
.groupKeysArray of group key labels
.groupsMap from key → row labels
+ +
+ + + + + diff --git a/playground/index.html b/playground/index.html index 306f67ac..568ccef6 100644 --- a/playground/index.html +++ b/playground/index.html @@ -154,10 +154,65 @@

🔢 Dtypes

Rich dtype system: int/float/bool/string/datetime/category.

✅ Complete
+
+

🔀 GroupBy

+

Split-apply-combine — interactive tutorial. groupby, agg, transform, apply, filter.

+
✅ Complete
+
+
+

🔗 concat

+

Combine Series and DataFrames — interactive tutorial. axis=0/1, outer/inner join, ignoreIndex.

+
✅ Complete
+
+
+

🔀 merge

+

SQL-style DataFrame joins — interactive tutorial. inner/left/right/outer, on/left_on/right_on, suffixes.

+
✅ Complete
+
+
+

🔡 str accessor

+

Vectorised string operations — interactive tutorial. lower/upper/strip/pad/contains/replace/split/extract & predicates.

+
✅ Complete
+
+
+

📅 dt accessor

+

Vectorised datetime operations — interactive tutorial. Calendar components, boolean boundaries, strftime, floor/ceil/round.

+
✅ Complete
+
+
+

📊 describe

+

Summary statistics — interactive tutorial. count/mean/std/min/percentiles/max for numeric; count/unique/top/freq for categorical. Series.quantile().

+
✅ Complete
+

📥 I/O

-

read_csv, read_json, read_parquet, to_csv, to_json.

-
⏳ Planned
+

CSV I/O — interactive tutorial. readCsv / toCsv with dtype inference, NA handling, quoted fields, custom separators.

+
✅ Complete
+
+
+

📥 JSON I/O

+

JSON I/O — interactive tutorial. readJson / toJson with five orient formats: records, split, index, columns, values.

+
✅ Complete
+
+
+

📈 corr & cov

+

Pearson correlation & covariance — interactive tutorial. Series.corr(), DataFrame.corr(), DataFrame.cov(), dataFrameCorr(), dataFrameCov() with index alignment, null handling, and configurable ddof/minPeriods.

+
✅ Complete
+
+
+

🪟 rolling

+

Sliding-window aggregations — interactive tutorial. Series.rolling() and DataFrame.rolling() with mean, sum, std, var, min, max, count, median, apply. Supports minPeriods and centered windows.

+
✅ Complete
+
+
+

📈 expanding

+

Growing-window aggregations — Series.expanding() and DataFrame.expanding() with mean, sum, std, var, min, max, count, median, apply. Window grows from start to current position.

+
✅ Complete
+
+
+

🏷️ cat accessor

+

Categorical operations — interactive tutorial. Series.cat with categories, codes, ordered, addCategories, removeCategories, renameCategories, setCategories, reorderCategories, valueCounts.

+
✅ Complete
diff --git a/src/core/frame.ts b/src/core/frame.ts index 4e127d48..f19d7597 100644 --- a/src/core/frame.ts +++ b/src/core/frame.ts @@ -17,7 +17,12 @@ * ``` */ +import { DataFrameGroupBy } from "../groupby/index.ts"; import type { Label, Scalar } from "../types.ts"; +import { Expanding } from "../window/expanding.ts"; +import type { ExpandingOptions } from "../window/expanding.ts"; +import { Rolling } from "../window/rolling.ts"; +import type { RollingOptions } from "../window/rolling.ts"; import { Index } from "./base-index.ts"; import { RangeIndex } from "./range-index.ts"; import { Series } from "./series.ts"; @@ -443,6 +448,43 @@ export class DataFrame { return new DataFrame(transposedMap, statIndex); } + /** + * Pairwise Pearson correlation matrix for all numeric columns. + * + * Returns a symmetric DataFrame whose row-index and column labels are the + * numeric column names. Diagonal entries are `1.0`. + * + * @param minPeriods - Minimum valid observation pairs (default 1). + * + * @example + * ```ts + * const df = new DataFrame({ a: [1, 2, 3], b: [4, 5, 6] }); + * df.corr().col("a").at(0); // 1.0 + * ``` + */ + corr(minPeriods = 1): DataFrame { + return buildPairwiseDf(this, (a, b) => a.corr(b, minPeriods)); + } + + /** + * Pairwise sample covariance matrix for all numeric columns. + * + * Returns a symmetric DataFrame whose row-index and column labels are the + * numeric column names. Diagonal entries are the variance of each column. + * + * @param ddof - Delta degrees of freedom (default 1). + * @param minPeriods - Minimum valid observation pairs (default 1). 
+ * + * @example + * ```ts + * const df = new DataFrame({ a: [1, 2, 3], b: [2, 4, 6] }); + * df.cov().col("a").at(0); // 1.0 + * ``` + */ + cov(ddof = 1, minPeriods = 1): DataFrame { + return buildPairwiseDf(this, (a, b) => seriesCov(a, b, ddof, minPeriods)); + } + // ─── sorting ────────────────────────────────────────────────────────────── /** @@ -600,6 +642,63 @@ export class DataFrame { return formatDataFrame(this._columns, this.index, this.columns); } + // ─── rolling window ─────────────────────────────────────────────────────── + + /** + * Provide a rolling (sliding-window) view of the DataFrame. + * + * Aggregations are applied independently to each column. + * + * @param window - Size of the moving window (positive integer). + * @param options - Optional {@link RollingOptions} (`minPeriods`, `center`). + * + * @example + * ```ts + * const df = DataFrame.fromColumns({ a: [1, 2, 3, 4], b: [10, 20, 30, 40] }); + * df.rolling(2).mean(); + * // DataFrame: { a: [null, 1.5, 2.5, 3.5], b: [null, 15, 25, 35] } + * ``` + */ + rolling(window: number, options?: RollingOptions): DataFrameRolling { + return new DataFrameRolling(this, window, options); + } + + // ─── expanding window ───────────────────────────────────────────────────── + + /** + * Provide an expanding (growing-window) view of the DataFrame. + * + * @param options - Optional {@link ExpandingOptions} (`minPeriods`). + * + * @example + * ```ts + * const df = DataFrame.fromColumns({ a: [1, 2, 3] }); + * df.expanding().mean(); + * ``` + */ + expanding(options?: ExpandingOptions): DataFrameExpanding { + return new DataFrameExpanding(this, options); + } + + // ─── groupby ────────────────────────────────────────────────────────────── + + /** + * Group the DataFrame by one or more columns. + * + * Returns a `DataFrameGroupBy` object that can be used to apply + * aggregation, transformation, or filtering operations on each group. 
+ * + * @example + * ```ts + * df.groupby("dept").sum(); + * df.groupby(["dept", "region"]).mean(); + * ``` + */ + groupby(by: string | readonly string[]): DataFrameGroupBy { + const cols = typeof by === "string" ? [by] : [...by]; + return new DataFrameGroupBy(this, cols); + } + // ─── private helpers ────────────────────────────────────────────────────── private _sliceRows(start: number, end: number): DataFrame { @@ -768,6 +867,98 @@ function compareRows( return 0; } +// ─── pairwise corr/cov helpers ──────────────────────────────────────────────── + +/** + * Align two Series on their shared index labels and return paired numeric + * values, dropping pairs where either value is missing. + */ +function alignedNumericPairs(a: Series, b: Series): [number[], number[]] { + const bMap = new Map(); + for (let j = 0; j < b.index.size; j++) { + bMap.set(String(b.index.at(j)), j); + } + const xs: number[] = []; + const ys: number[] = []; + for (let i = 0; i < a.index.size; i++) { + const label = String(a.index.at(i)); + const j = bMap.get(label); + if (j === undefined) { + continue; + } + const av = a.values[i]; + const bv = b.values[j]; + if ( + av === null || + av === undefined || + (typeof av === "number" && Number.isNaN(av)) || + bv === null || + bv === undefined || + (typeof bv === "number" && Number.isNaN(bv)) || + typeof av !== "number" || + typeof bv !== "number" + ) { + continue; + } + xs.push(av); + ys.push(bv); + } + return [xs, ys]; +} + +/** Sample covariance of two aligned numeric arrays. 
*/ +function seriesCov(a: Series, b: Series, ddof: number, minPeriods: number): number { + const [xs, ys] = alignedNumericPairs(a, b); + const n = xs.length; + if (n < minPeriods || n - ddof <= 0) { + return Number.NaN; + } + let mx = 0; + let my = 0; + for (let i = 0; i < n; i++) { + mx += xs[i] as number; + my += ys[i] as number; + } + mx /= n; + my /= n; + let s = 0; + for (let i = 0; i < n; i++) { + s += ((xs[i] as number) - mx) * ((ys[i] as number) - my); + } + return s / (n - ddof); +} + +/** True when a column's dtype is numeric. */ +function isNumericCol(s: Series): boolean { + const k = s.dtype.kind; + return k === "int" || k === "uint" || k === "float"; +} + +/** + * Build a symmetric N×N DataFrame from a pairwise-value function applied to + * all numeric columns of `df`. + */ +function buildPairwiseDf( + df: DataFrame, + pairFn: (a: Series, b: Series) => number, +): DataFrame { + const cols = df.columns.values.filter((c) => isNumericCol(df.col(c))); + const n = cols.length; + const idx = new Index