From 392e44e085c2434aab1f9c97290db1889ba2f06f Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 10 Apr 2026 21:22:04 +0000 Subject: [PATCH 1/2] =?UTF-8?q?Iteration=20172:=20Add=20na=5Fops=20?= =?UTF-8?q?=E2=80=94=20isna/notna/ffill/bfill?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implements pandas missing-value utilities as standalone exported functions: - `isna` / `notna` / `isnull` / `notnull` — detect missing values in scalars, Series, and DataFrames (mirrors pd.isna / pd.notna) - `ffillSeries` / `bfillSeries` — forward/backward fill for Series with optional `limit` parameter - `dataFrameFfill` / `dataFrameBfill` — column-wise or row-wise fill for DataFrames with optional `limit` and `axis` parameters Metric: 28 → 29 pandas_features_ported Run: https://github.com/githubnext/tsessebe/actions/runs/24263385922 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- playground/index.html | 5 + playground/na_ops.html | 480 +++++++++++++++++++++++++++++++++++++ src/index.ts | 11 + src/stats/index.ts | 11 + src/stats/na_ops.ts | 336 ++++++++++++++++++++++++++ tests/stats/na_ops.test.ts | 280 ++++++++++++++++++++++ 6 files changed, 1123 insertions(+) create mode 100644 playground/na_ops.html create mode 100644 src/stats/na_ops.ts create mode 100644 tests/stats/na_ops.test.ts diff --git a/playground/index.html b/playground/index.html index 48bfbcb9..2b619f97 100644 --- a/playground/index.html +++ b/playground/index.html @@ -254,6 +254,11 @@

Element-wise transformations. clip(), seriesAbs(), seriesRound() for Series and DataFrame with min/max bounds, decimal precision, and axis support.

✅ Complete
+
+

🔍 missing-value ops

+

Detect and fill missing values. isna(), notna(), isnull(), notnull() for scalars/Series/DataFrame. ffillSeries(), bfillSeries(), dataFrameFfill(), dataFrameBfill() with optional limit and axis support.

+
✅ Complete
+

🔢 value_counts

Count unique values. valueCounts() for Series and dataFrameValueCounts() for DataFrame with normalize, sort, ascending, and dropna options.

diff --git a/playground/na_ops.html b/playground/na_ops.html new file mode 100644 index 00000000..c321438f --- /dev/null +++ b/playground/na_ops.html @@ -0,0 +1,480 @@ + + + + + + tsb — missing-value operations (isna, ffill, bfill) + + + +
+
+
Loading tsb runtime…
+
+ + ← Back to playground index + +

Missing-value operations

+

+ isna / notna — detect missing values in scalars, + Series, and DataFrames.
+ ffill / bfill — propagate the last (or next) valid + value to fill gaps.
+ Mirrors pd.isna(), Series.ffill(), and + DataFrame.bfill() from pandas. +

+ + +
+

1 · isna / notna on scalars

+

+ Returns true / false for individual values. + null, undefined, and NaN are all + considered "missing". +

+
+
+
+ + +
+
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

2 · isna on a Series

+

+ When passed a Series, isna returns a boolean Series of the + same length — true where values are missing. +

+
+
+
+ + +
+
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

3 · isna on a DataFrame

+

+ Returns a DataFrame of booleans with the same shape — one column per + original column, true where missing. +

+
+
+
+ + +
+
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

4 · Forward-fill (ffillSeries)

+

+ Propagates the last valid value forward to fill gaps. Leading + nulls that have no preceding value remain null. + Use the optional limit to cap consecutive fills. +

+
+
+
+ + +
+
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

5 · Backward-fill (bfillSeries)

+

+ Propagates the next valid value backward to fill gaps. Trailing + nulls that have no following value remain null. +

+
+
+
+ + +
+
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

6 · DataFrame forward-fill & backward-fill

+

+ dataFrameFfill and dataFrameBfill apply fill + column-wise by default (axis=0). Pass axis: 1 to fill + row-wise across columns. +

+
+
+
+ + +
+
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

API Reference

+
// Module-level missing-value detection
+isna(value: Scalar): boolean
+isna(value: Series): Series<boolean>
+isna(value: DataFrame): DataFrame
+
+notna(value: Scalar): boolean
+notna(value: Series): Series<boolean>
+notna(value: DataFrame): DataFrame
+
+// Aliases
+isnull(...)  // same as isna
+notnull(...) // same as notna
+
+// Series forward / backward fill
+ffillSeries(series, options?: { limit?: number | null }): Series
+bfillSeries(series, options?: { limit?: number | null }): Series
+
+// DataFrame forward / backward fill
+dataFrameFfill(df, options?: {
+  limit?: number | null,   // max consecutive fills (default: no limit)
+  axis?: 0 | 1 | "index" | "columns",  // default 0 (column-wise)
+}): DataFrame
+
+dataFrameBfill(df, options?: {
+  limit?: number | null,
+  axis?: 0 | 1 | "index" | "columns",
+}): DataFrame
+
+ + + + + diff --git a/src/index.ts b/src/index.ts index 1dd0aa57..ec702a7e 100644 --- a/src/index.ts +++ b/src/index.ts @@ -107,3 +107,14 @@ export { export type { ClipOptions, RoundOptions, DataFrameElemOptions } from "./stats/index.ts"; export { valueCounts, dataFrameValueCounts } from "./stats/index.ts"; export type { ValueCountsOptions, DataFrameValueCountsOptions } from "./stats/index.ts"; +export { + isna, + notna, + isnull, + notnull, + ffillSeries, + bfillSeries, + dataFrameFfill, + dataFrameBfill, +} from "./stats/index.ts"; +export type { FillDirectionOptions, DataFrameFillOptions } from "./stats/index.ts"; diff --git a/src/stats/index.ts b/src/stats/index.ts index b1de48eb..84202fde 100644 --- a/src/stats/index.ts +++ b/src/stats/index.ts @@ -39,3 +39,14 @@ export { nsmallestDataFrame, } from "./nlargest.ts"; export type { NKeep, NTopOptions, NTopDataFrameOptions } from "./nlargest.ts"; +export { + isna, + notna, + isnull, + notnull, + ffillSeries, + bfillSeries, + dataFrameFfill, + dataFrameBfill, +} from "./na_ops.ts"; +export type { FillDirectionOptions, DataFrameFillOptions } from "./na_ops.ts"; diff --git a/src/stats/na_ops.ts b/src/stats/na_ops.ts new file mode 100644 index 00000000..c776bb1f --- /dev/null +++ b/src/stats/na_ops.ts @@ -0,0 +1,336 @@ +/** + * na_ops — missing-value utilities for Series and DataFrame. + * + * Mirrors the following pandas module-level functions and methods: + * - `pd.isna(obj)` / `pd.isnull(obj)` — detect missing values + * - `pd.notna(obj)` / `pd.notnull(obj)` — detect non-missing values + * - `Series.ffill()` / `DataFrame.ffill()` — forward-fill missing values + * - `Series.bfill()` / `DataFrame.bfill()` — backward-fill missing values + * + * All functions are **pure** (return new objects; inputs are unchanged). 
+ * + * @module + */ + +import { DataFrame } from "../core/index.ts"; +import { Series } from "../core/index.ts"; +import type { Scalar } from "../types.ts"; + +// ─── public types ───────────────────────────────────────────────────────────── + +/** Options for {@link ffillSeries} and {@link bfillSeries}. */ +export interface FillDirectionOptions { + /** + * Maximum number of consecutive NaN/null values to fill. + * `null` means no limit (default). + */ + readonly limit?: number | null; +} + +/** Options for {@link dataFrameFfill} and {@link dataFrameBfill}. */ +export interface DataFrameFillOptions extends FillDirectionOptions { + /** + * - `0` or `"index"` (default): fill missing values down each **column**. + * - `1` or `"columns"`: fill missing values across each **row**. + */ + readonly axis?: 0 | 1 | "index" | "columns"; +} + +// ─── helpers ────────────────────────────────────────────────────────────────── + +/** True when `v` should be treated as missing. */ +function isMissing(v: Scalar): boolean { + return v === null || v === undefined || (typeof v === "number" && Number.isNaN(v)); +} + +/** Forward-fill an array of scalars in-place (returns a new array). */ +function ffillArray(vals: readonly Scalar[], limit: number | null): Scalar[] { + const out: Scalar[] = Array.from(vals); + let lastValid: Scalar = null; + let streak = 0; + for (let i = 0; i < out.length; i++) { + if (isMissing(out[i])) { + if (!isMissing(lastValid) && (limit === null || streak < limit)) { + out[i] = lastValid; + streak++; + } + } else { + lastValid = out[i] as Scalar; + streak = 0; + } + } + return out; +} + +/** Backward-fill an array of scalars (returns a new array). 
*/ +function bfillArray(vals: readonly Scalar[], limit: number | null): Scalar[] { + const out: Scalar[] = Array.from(vals); + let nextValid: Scalar = null; + let streak = 0; + for (let i = out.length - 1; i >= 0; i--) { + if (isMissing(out[i])) { + if (!isMissing(nextValid) && (limit === null || streak < limit)) { + out[i] = nextValid; + streak++; + } + } else { + nextValid = out[i] as Scalar; + streak = 0; + } + } + return out; +} + +// ─── isna / notna ───────────────────────────────────────────────────────────── + +/** + * Detect missing values in a scalar, Series, or DataFrame. + * + * - For a **scalar**: returns `true` if the value is `null`, `undefined`, or `NaN`. + * - For a **Series**: returns a `Series` of the same index. + * - For a **DataFrame**: returns a `DataFrame` of boolean columns. + * + * Mirrors `pandas.isna()` / `pandas.isnull()`. + * + * @example + * ```ts + * import { isna } from "tsb"; + * isna(null); // true + * isna(42); // false + * isna(NaN); // true + * + * const s = new Series({ data: [1, null, NaN, 4] }); + * isna(s); // Series([false, true, true, false]) + * ``` + */ +export function isna(value: Scalar): boolean; +export function isna(value: Series): Series; +export function isna(value: DataFrame): DataFrame; +export function isna( + value: Scalar | Series | DataFrame, +): boolean | Series | DataFrame { + if (value instanceof DataFrame) { + return value.isna(); + } + if (value instanceof Series) { + return value.isna(); + } + return isMissing(value as Scalar); +} + +/** + * Detect non-missing values in a scalar, Series, or DataFrame. + * + * Mirrors `pandas.notna()` / `pandas.notnull()`. 
+ * + * @example + * ```ts + * import { notna } from "tsb"; + * notna(null); // false + * notna(42); // true + * ``` + */ +export function notna(value: Scalar): boolean; +export function notna(value: Series): Series; +export function notna(value: DataFrame): DataFrame; +export function notna( + value: Scalar | Series | DataFrame, +): boolean | Series | DataFrame { + if (value instanceof DataFrame) { + return value.notna(); + } + if (value instanceof Series) { + return value.notna(); + } + return !isMissing(value as Scalar); +} + +/** Alias for {@link isna}. Mirrors `pandas.isnull()`. */ +export const isnull = isna; + +/** Alias for {@link notna}. Mirrors `pandas.notnull()`. */ +export const notnull = notna; + +// ─── ffill ──────────────────────────────────────────────────────────────────── + +/** + * Forward-fill missing values in a Series. + * + * Each `null`/`NaN` value is replaced with the last non-missing value + * that precedes it (if any). Values before the first non-missing value + * remain missing. + * + * Mirrors `pandas.Series.ffill()`. + * + * @param series - Input Series (unchanged). + * @param options - Optional `{ limit }` — max consecutive fills. + * @returns New Series with forward-filled values. + * + * @example + * ```ts + * import { ffillSeries } from "tsb"; + * const s = new Series({ data: [1, null, null, 4] }); + * ffillSeries(s); // Series([1, 1, 1, 4]) + * ``` + */ +export function ffillSeries( + series: Series, + options?: FillDirectionOptions, +): Series { + const limit = options?.limit ?? null; + const filled = ffillArray(series.values as readonly Scalar[], limit) as T[]; + return new Series({ + data: filled, + index: series.index, + dtype: series.dtype, + name: series.name ?? undefined, + }); +} + +/** + * Backward-fill missing values in a Series. + * + * Each `null`/`NaN` value is replaced with the next non-missing value + * that follows it (if any). Values after the last non-missing value + * remain missing. 
+ * + * Mirrors `pandas.Series.bfill()`. + * + * @example + * ```ts + * import { bfillSeries } from "tsb"; + * const s = new Series({ data: [1, null, null, 4] }); + * bfillSeries(s); // Series([1, 4, 4, 4]) + * ``` + */ +export function bfillSeries( + series: Series, + options?: FillDirectionOptions, +): Series { + const limit = options?.limit ?? null; + const filled = bfillArray(series.values as readonly Scalar[], limit) as T[]; + return new Series({ + data: filled, + index: series.index, + dtype: series.dtype, + name: series.name ?? undefined, + }); +} + +// ─── DataFrame ffill / bfill ────────────────────────────────────────────────── + +/** + * Forward-fill missing values in a DataFrame. + * + * By default operates **column-wise** (axis=0): each column is independently + * forward-filled. With `axis=1` each row is forward-filled across columns. + * + * Mirrors `pandas.DataFrame.ffill()`. + * + * @example + * ```ts + * import { dataFrameFfill } from "tsb"; + * const df = new DataFrame({ data: { a: [1, null, 3], b: [null, 2, null] } }); + * dataFrameFfill(df); + * // a: [1, 1, 3] + * // b: [null, 2, 2] + * ``` + */ +export function dataFrameFfill(df: DataFrame, options?: DataFrameFillOptions): DataFrame { + const limit = options?.limit ?? null; + const axis = options?.axis ?? 
0; + const byRow = axis === 1 || axis === "columns"; + + if (!byRow) { + // column-wise: fill each column independently + const colMap = new Map>(); + for (const name of df.columns.values) { + const col = df.col(name); + const filled = ffillArray(col.values, limit) as Scalar[]; + colMap.set(name, new Series({ data: filled, index: col.index, dtype: col.dtype })); + } + return new DataFrame(colMap, df.index); + } + + // row-wise: fill across columns for each row + const nRows = df.shape[0]; + const cols = df.columns.values; + const columns = cols.map((name) => df.col(name)); + const rowsFilled: Scalar[][] = columns.map((c) => Array.from(c.values)); + for (let r = 0; r < nRows; r++) { + const rowVals: Scalar[] = columns.map((_, ci) => rowsFilled[ci]?.[r] ?? null); + const filled = ffillArray(rowVals, limit); + for (let ci = 0; ci < cols.length; ci++) { + const rowsFilledCI = rowsFilled[ci]; + if (rowsFilledCI !== undefined) { + rowsFilledCI[r] = filled[ci] ?? null; + } + } + } + const colMap = new Map>(); + for (let ci = 0; ci < cols.length; ci++) { + const name = cols[ci] as string; + const col = columns[ci] as Series; + colMap.set( + name, + new Series({ + data: rowsFilled[ci] ?? [], + index: col.index, + dtype: col.dtype, + }), + ); + } + return new DataFrame(colMap, df.index); +} + +/** + * Backward-fill missing values in a DataFrame. + * + * By default operates **column-wise** (axis=0). With `axis=1` fills across rows. + * + * Mirrors `pandas.DataFrame.bfill()`. + */ +export function dataFrameBfill(df: DataFrame, options?: DataFrameFillOptions): DataFrame { + const limit = options?.limit ?? null; + const axis = options?.axis ?? 
0; + const byRow = axis === 1 || axis === "columns"; + + if (!byRow) { + const colMap = new Map>(); + for (const name of df.columns.values) { + const col = df.col(name); + const filled = bfillArray(col.values, limit) as Scalar[]; + colMap.set(name, new Series({ data: filled, index: col.index, dtype: col.dtype })); + } + return new DataFrame(colMap, df.index); + } + + const nRows = df.shape[0]; + const cols = df.columns.values; + const columns = cols.map((name) => df.col(name)); + const rowsFilled: Scalar[][] = columns.map((c) => Array.from(c.values)); + for (let r = 0; r < nRows; r++) { + const rowVals: Scalar[] = columns.map((_, ci) => rowsFilled[ci]?.[r] ?? null); + const filled = bfillArray(rowVals, limit); + for (let ci = 0; ci < cols.length; ci++) { + const rowsFilledCI = rowsFilled[ci]; + if (rowsFilledCI !== undefined) { + rowsFilledCI[r] = filled[ci] ?? null; + } + } + } + const colMap = new Map>(); + for (let ci = 0; ci < cols.length; ci++) { + const name = cols[ci] as string; + const col = columns[ci] as Series; + colMap.set( + name, + new Series({ + data: rowsFilled[ci] ?? [], + index: col.index, + dtype: col.dtype, + }), + ); + } + return new DataFrame(colMap, df.index); +} diff --git a/tests/stats/na_ops.test.ts b/tests/stats/na_ops.test.ts new file mode 100644 index 00000000..340406ac --- /dev/null +++ b/tests/stats/na_ops.test.ts @@ -0,0 +1,280 @@ +/** + * Tests for na_ops — missing-value utilities (isna, notna, ffill, bfill). 
+ */ + +import { describe, expect, it } from "bun:test"; +import fc from "fast-check"; +import { + DataFrame, + Series, + bfillSeries, + dataFrameBfill, + dataFrameFfill, + ffillSeries, + isna, + isnull, + notna, + notnull, +} from "../../src/index.ts"; + +// ─── isna / notna ───────────────────────────────────────────────────────────── + +describe("isna (scalar)", () => { + it("returns true for null", () => expect(isna(null)).toBe(true)); + it("returns true for undefined", () => expect(isna(undefined)).toBe(true)); + it("returns true for NaN", () => expect(isna(Number.NaN)).toBe(true)); + it("returns false for 0", () => expect(isna(0)).toBe(false)); + it("returns false for empty string", () => expect(isna("")).toBe(false)); + it("returns false for false", () => expect(isna(false)).toBe(false)); + it("returns false for a number", () => expect(isna(42)).toBe(false)); +}); + +describe("notna (scalar)", () => { + it("returns false for null", () => expect(notna(null)).toBe(false)); + it("returns false for NaN", () => expect(notna(Number.NaN)).toBe(false)); + it("returns true for 42", () => expect(notna(42)).toBe(true)); + it("returns true for a string", () => expect(notna("hello")).toBe(true)); +}); + +describe("isnull / notnull aliases", () => { + it("isnull equals isna for scalar", () => { + expect(isnull(null)).toBe(isna(null)); + expect(isnull(42)).toBe(isna(42)); + }); + it("notnull equals notna for scalar", () => { + expect(notnull(null)).toBe(notna(null)); + expect(notnull(42)).toBe(notna(42)); + }); +}); + +describe("isna (Series)", () => { + it("returns boolean Series of correct length", () => { + const s = new Series({ data: [1, null, Number.NaN, 4] }); + const result = isna(s); + expect(result).toBeInstanceOf(Series); + expect([...result.values]).toEqual([false, true, true, false]); + }); + + it("all present", () => { + const s = new Series({ data: [1, 2, 3] }); + expect([...isna(s).values]).toEqual([false, false, false]); + }); + + it("all missing", () => { 
+ const s = new Series({ data: [null, null, Number.NaN] }); + expect([...isna(s).values]).toEqual([true, true, true]); + }); +}); + +describe("notna (Series)", () => { + it("is the inverse of isna", () => { + const s = new Series({ data: [1, null, Number.NaN, 4] }); + const na = isna(s).values; + const nna = notna(s).values; + for (let i = 0; i < na.length; i++) { + expect(nna[i]).toBe(!na[i]); + } + }); +}); + +describe("isna (DataFrame)", () => { + it("returns DataFrame of booleans", () => { + const df = DataFrame.fromColumns({ a: [1, null], b: [Number.NaN, 2] }); + const result = isna(df); + expect(result).toBeInstanceOf(DataFrame); + expect([...result.col("a").values]).toEqual([false, true]); + expect([...result.col("b").values]).toEqual([true, false]); + }); +}); + +describe("notna (DataFrame)", () => { + it("returns inverse of isna DataFrame", () => { + const df = DataFrame.fromColumns({ a: [1, null], b: [Number.NaN, 2] }); + expect([...notna(df).col("a").values]).toEqual([true, false]); + expect([...notna(df).col("b").values]).toEqual([false, true]); + }); +}); + +// ─── ffillSeries ────────────────────────────────────────────────────────────── + +describe("ffillSeries", () => { + it("fills nulls with preceding value", () => { + const s = new Series({ data: [1, null, null, 4] }); + expect([...ffillSeries(s).values]).toEqual([1, 1, 1, 4]); + }); + + it("leaves leading nulls untouched", () => { + const s = new Series({ data: [null, null, 3, null] }); + expect([...ffillSeries(s).values]).toEqual([null, null, 3, 3]); + }); + + it("NaN is treated as missing", () => { + const s = new Series({ data: [2, Number.NaN, 5] }); + const result = ffillSeries(s).values; + expect(result[0]).toBe(2); + expect(result[1]).toBe(2); + expect(result[2]).toBe(5); + }); + + it("respects limit option", () => { + const s = new Series({ data: [1, null, null, null, 5] }); + expect([...ffillSeries(s, { limit: 1 }).values]).toEqual([1, 1, null, null, 5]); + }); + + it("preserves original 
Series", () => { + const s = new Series({ data: [1, null, 3] }); + ffillSeries(s); + expect([...s.values]).toEqual([1, null, 3]); + }); + + it("empty Series returns empty", () => { + const s = new Series({ data: [] }); + expect([...ffillSeries(s).values]).toEqual([]); + }); + + it("preserves name and index", () => { + const s = new Series({ data: [1, null], name: "x" }); + const filled = ffillSeries(s); + expect(filled.name).toBe("x"); + expect(filled.index.size).toBe(2); + }); +}); + +// ─── bfillSeries ────────────────────────────────────────────────────────────── + +describe("bfillSeries", () => { + it("fills nulls with following value", () => { + const s = new Series({ data: [1, null, null, 4] }); + expect([...bfillSeries(s).values]).toEqual([1, 4, 4, 4]); + }); + + it("leaves trailing nulls untouched", () => { + const s = new Series({ data: [null, 3, null, null] }); + expect([...bfillSeries(s).values]).toEqual([3, 3, null, null]); + }); + + it("respects limit option", () => { + const s = new Series({ data: [1, null, null, null, 5] }); + expect([...bfillSeries(s, { limit: 2 }).values]).toEqual([1, null, 5, 5, 5]); + }); + + it("empty Series returns empty", () => { + const s = new Series({ data: [] }); + expect([...bfillSeries(s).values]).toEqual([]); + }); +}); + +// ─── dataFrameFfill ─────────────────────────────────────────────────────────── + +describe("dataFrameFfill (column-wise)", () => { + it("fills each column independently", () => { + const df = DataFrame.fromColumns({ a: [1, null, 3], b: [null, 2, null] }); + const result = dataFrameFfill(df); + expect([...result.col("a").values]).toEqual([1, 1, 3]); + expect([...result.col("b").values]).toEqual([null, 2, 2]); + }); + + it("preserves index", () => { + const df = DataFrame.fromColumns({ x: [1, null] }); + expect(dataFrameFfill(df).index.size).toBe(2); + }); +}); + +describe("dataFrameFfill (row-wise)", () => { + it("fills across columns per row", () => { + const df = DataFrame.fromColumns({ a: [1, 
null], b: [null, null], c: [3, 4] }); + const result = dataFrameFfill(df, { axis: 1 }); + expect([...result.col("a").values]).toEqual([1, null]); + expect([...result.col("b").values]).toEqual([1, null]); + expect([...result.col("c").values]).toEqual([3, 4]); + }); +}); + +// ─── dataFrameBfill ─────────────────────────────────────────────────────────── + +describe("dataFrameBfill (column-wise)", () => { + it("fills each column backward", () => { + const df = DataFrame.fromColumns({ a: [null, null, 3], b: [1, null, null] }); + const result = dataFrameBfill(df); + expect([...result.col("a").values]).toEqual([3, 3, 3]); + expect([...result.col("b").values]).toEqual([1, null, null]); + }); +}); + +describe("dataFrameBfill (row-wise)", () => { + it("fills backward across columns per row", () => { + const df = DataFrame.fromColumns({ a: [null, 1], b: [null, null], c: [3, null] }); + const result = dataFrameBfill(df, { axis: 1 }); + expect([...result.col("a").values]).toEqual([3, 1]); + expect([...result.col("b").values]).toEqual([3, null]); + expect([...result.col("c").values]).toEqual([3, null]); + }); +}); + +// ─── property-based tests ───────────────────────────────────────────────────── + +describe("property: ffill followed by bfill fills all if any non-null", () => { + it("all values filled when at least one is present", () => { + fc.assert( + fc.property( + fc.array(fc.option(fc.integer({ min: 0, max: 100 }), { nil: null }), { + minLength: 1, + maxLength: 20, + }), + (raw) => { + const hasNonNull = raw.some((v) => v !== null); + if (!hasNonNull) { + return true; + } + const s = new Series({ data: raw }); + const result = bfillSeries(ffillSeries(s)); + return result.values.every((v) => v !== null); + }, + ), + ); + }); +}); + +describe("property: ffill never introduces new non-null values beyond last valid", () => { + it("ffilled series has no nulls after first valid value", () => { + fc.assert( + fc.property( + fc.array(fc.option(fc.integer({ min: -50, max: 50 }), 
{ nil: null }), { + minLength: 0, + maxLength: 30, + }), + (raw) => { + const s = new Series({ data: raw }); + const filled = ffillSeries(s).values; + let sawValid = false; + for (const v of filled) { + if (v !== null) { + sawValid = true; + } + if (sawValid && v === null) { + return false; + } + } + return true; + }, + ), + ); + }); +}); + +describe("property: isna is inverse of notna for scalars", () => { + it("isna(v) === !notna(v)", () => { + fc.assert( + fc.property( + fc.oneof( + fc.integer(), + fc.float({ noNaN: false }), + fc.constant(null), + fc.string(), + fc.boolean(), + ), + (v) => isna(v as Parameters[0]) === !notna(v as Parameters[0]), + ), + ); + }); +}); From d6df47348e0ea49480e28bf091182af90b08720b Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Mon, 13 Apr 2026 08:01:00 +0000 Subject: [PATCH 2/2] Merge main into PR branch, resolving conflicts in src/index.ts and src/stats/index.ts - Keep na_ops additions (ffillSeries/bfillSeries/dataFrameFfill/dataFrameBfill) - Keep main's additions (cut/qcut, where_mask, notna_isna, string_ops, etc.) 
- Resolve duplicate isna/notna by using notna_isna.ts (main) for those exports Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .autoloop/programs/perf-comparison/program.md | 74 +++ .github/workflows/autoloop.lock.yml | 36 +- .github/workflows/autoloop.md | 69 +- .github/workflows/evergreen.lock.yml | 36 +- .github/workflows/evergreen.md | 22 +- .github/workflows/pages.yml | 7 + .github/workflows/sync-branches.lock.yml | 20 +- .github/workflows/sync-branches.md | 18 +- benchmarks/pandas/bench_concat.py | 28 + benchmarks/pandas/bench_dataframe_apply.py | 27 + benchmarks/pandas/bench_dataframe_creation.py | 27 + benchmarks/pandas/bench_dataframe_dropna.py | 27 + benchmarks/pandas/bench_dataframe_filter.py | 26 + benchmarks/pandas/bench_dataframe_rename.py | 27 + benchmarks/pandas/bench_dataframe_sort.py | 28 + benchmarks/pandas/bench_describe.py | 27 + benchmarks/pandas/bench_ewm_mean.py | 26 + benchmarks/pandas/bench_groupby_mean.py | 27 + benchmarks/pandas/bench_merge.py | 29 + benchmarks/pandas/bench_pivot_table.py | 28 + benchmarks/pandas/bench_read_csv.py | 30 + benchmarks/pandas/bench_rolling_mean.py | 26 + benchmarks/pandas/bench_series_arithmetic.py | 26 + benchmarks/pandas/bench_series_creation.py | 47 ++ benchmarks/pandas/bench_series_cumsum.py | 26 + benchmarks/pandas/bench_series_fillna.py | 26 + benchmarks/pandas/bench_series_shift.py | 26 + benchmarks/pandas/bench_series_sort.py | 27 + benchmarks/pandas/bench_series_string_ops.py | 27 + .../pandas/bench_series_value_counts.py | 25 + benchmarks/results.json | 247 +++++++ benchmarks/run_benchmarks.sh | 129 ++++ benchmarks/tsb/bench_concat.ts | 32 + benchmarks/tsb/bench_dataframe_apply.ts | 32 + benchmarks/tsb/bench_dataframe_creation.ts | 33 + benchmarks/tsb/bench_dataframe_dropna.ts | 31 + benchmarks/tsb/bench_dataframe_filter.ts | 30 + benchmarks/tsb/bench_dataframe_rename.ts | 31 + benchmarks/tsb/bench_dataframe_sort.ts | 31 + benchmarks/tsb/bench_describe.ts | 31 + 
benchmarks/tsb/bench_ewm_mean.ts | 30 + benchmarks/tsb/bench_groupby_mean.ts | 31 + benchmarks/tsb/bench_merge.ts | 33 + benchmarks/tsb/bench_pivot_table.ts | 32 + benchmarks/tsb/bench_read_csv.ts | 39 ++ benchmarks/tsb/bench_rolling_mean.ts | 30 + benchmarks/tsb/bench_series_arithmetic.ts | 30 + benchmarks/tsb/bench_series_creation.ts | 49 ++ benchmarks/tsb/bench_series_cumsum.ts | 30 + benchmarks/tsb/bench_series_fillna.ts | 31 + benchmarks/tsb/bench_series_shift.ts | 30 + benchmarks/tsb/bench_series_sort.ts | 30 + benchmarks/tsb/bench_series_string_ops.ts | 32 + benchmarks/tsb/bench_series_value_counts.ts | 30 + docs/playground.md | 3 - playground/api_types.html | 222 +++++++ playground/attrs.html | 183 +++++ playground/benchmarks.html | 360 ++++++++++ playground/categorical_ops.html | 338 ++++++++++ playground/cut_qcut.html | 163 +++++ playground/format_ops.html | 262 ++++++++ playground/index.html | 80 +++ playground/insert_pop.html | 172 +++++ playground/notna_isna.html | 242 +++++++ playground/numeric_extended.html | 353 ++++++++++ playground/pipe_apply.html | 276 ++++++++ playground/playground-runtime.js | 179 ++++- playground/rolling_apply.html | 225 +++++++ playground/string_ops.html | 282 ++++++++ playground/string_ops_extended.html | 413 ++++++++++++ playground/to_from_dict.html | 122 ++++ playground/where_mask.html | 220 ++++++ playground/wide_to_long.html | 113 ++++ playground/window_extended.html | 304 +++++++++ src/core/api_types.ts | 629 ++++++++++++++++++ src/core/attrs.ts | 291 ++++++++ src/core/index.ts | 68 ++ src/core/insert_pop.ts | 214 ++++++ src/core/pipe_apply.ts | 303 +++++++++ src/core/to_from_dict.ts | 283 ++++++++ src/index.ts | 173 ++++- src/reshape/index.ts | 2 + src/reshape/wide_to_long.ts | 217 ++++++ src/stats/categorical_ops.ts | 483 ++++++++++++++ src/stats/cut_qcut.ts | 383 +++++++++++ src/stats/format_ops.ts | 442 ++++++++++++ src/stats/index.ts | 111 +++- src/stats/notna_isna.ts | 369 ++++++++++ src/stats/numeric_extended.ts 
| 586 ++++++++++++++++ src/stats/string_ops.ts | 468 +++++++++++++ src/stats/string_ops_extended.ts | 429 ++++++++++++ src/stats/where_mask.ts | 289 ++++++++ src/stats/window_extended.ts | 321 +++++++++ src/window/index.ts | 7 + src/window/rolling_apply.ts | 323 +++++++++ tests/core/api_types.test.ts | 621 +++++++++++++++++ tests/core/attrs.test.ts | 542 +++++++++++++++ tests/core/insert_pop.test.ts | 286 ++++++++ tests/core/pipe_apply.test.ts | 449 +++++++++++++ tests/core/to_from_dict.test.ts | 278 ++++++++ tests/reshape/wide_to_long.test.ts | 211 ++++++ tests/stats/categorical_ops.test.ts | 476 +++++++++++++ tests/stats/cut_qcut.test.ts | 277 ++++++++ tests/stats/format_ops.test.ts | 568 ++++++++++++++++ tests/stats/notna_isna.test.ts | 536 +++++++++++++++ tests/stats/numeric_extended.test.ts | 509 ++++++++++++++ tests/stats/rank.test.ts | 2 +- tests/stats/string_ops.test.ts | 459 +++++++++++++ tests/stats/string_ops_extended.test.ts | 437 ++++++++++++ tests/stats/where_mask.test.ts | 338 ++++++++++ tests/stats/window_extended.test.ts | 365 ++++++++++ tests/window/rolling_apply.test.ts | 354 ++++++++++ 112 files changed, 19493 insertions(+), 97 deletions(-) create mode 100644 .autoloop/programs/perf-comparison/program.md create mode 100644 benchmarks/pandas/bench_concat.py create mode 100644 benchmarks/pandas/bench_dataframe_apply.py create mode 100644 benchmarks/pandas/bench_dataframe_creation.py create mode 100644 benchmarks/pandas/bench_dataframe_dropna.py create mode 100644 benchmarks/pandas/bench_dataframe_filter.py create mode 100644 benchmarks/pandas/bench_dataframe_rename.py create mode 100644 benchmarks/pandas/bench_dataframe_sort.py create mode 100644 benchmarks/pandas/bench_describe.py create mode 100644 benchmarks/pandas/bench_ewm_mean.py create mode 100644 benchmarks/pandas/bench_groupby_mean.py create mode 100644 benchmarks/pandas/bench_merge.py create mode 100644 benchmarks/pandas/bench_pivot_table.py create mode 100644 
benchmarks/pandas/bench_read_csv.py create mode 100644 benchmarks/pandas/bench_rolling_mean.py create mode 100644 benchmarks/pandas/bench_series_arithmetic.py create mode 100644 benchmarks/pandas/bench_series_creation.py create mode 100644 benchmarks/pandas/bench_series_cumsum.py create mode 100644 benchmarks/pandas/bench_series_fillna.py create mode 100644 benchmarks/pandas/bench_series_shift.py create mode 100644 benchmarks/pandas/bench_series_sort.py create mode 100644 benchmarks/pandas/bench_series_string_ops.py create mode 100644 benchmarks/pandas/bench_series_value_counts.py create mode 100644 benchmarks/results.json create mode 100644 benchmarks/run_benchmarks.sh create mode 100644 benchmarks/tsb/bench_concat.ts create mode 100644 benchmarks/tsb/bench_dataframe_apply.ts create mode 100644 benchmarks/tsb/bench_dataframe_creation.ts create mode 100644 benchmarks/tsb/bench_dataframe_dropna.ts create mode 100644 benchmarks/tsb/bench_dataframe_filter.ts create mode 100644 benchmarks/tsb/bench_dataframe_rename.ts create mode 100644 benchmarks/tsb/bench_dataframe_sort.ts create mode 100644 benchmarks/tsb/bench_describe.ts create mode 100644 benchmarks/tsb/bench_ewm_mean.ts create mode 100644 benchmarks/tsb/bench_groupby_mean.ts create mode 100644 benchmarks/tsb/bench_merge.ts create mode 100644 benchmarks/tsb/bench_pivot_table.ts create mode 100644 benchmarks/tsb/bench_read_csv.ts create mode 100644 benchmarks/tsb/bench_rolling_mean.ts create mode 100644 benchmarks/tsb/bench_series_arithmetic.ts create mode 100644 benchmarks/tsb/bench_series_creation.ts create mode 100644 benchmarks/tsb/bench_series_cumsum.ts create mode 100644 benchmarks/tsb/bench_series_fillna.ts create mode 100644 benchmarks/tsb/bench_series_shift.ts create mode 100644 benchmarks/tsb/bench_series_sort.ts create mode 100644 benchmarks/tsb/bench_series_string_ops.ts create mode 100644 benchmarks/tsb/bench_series_value_counts.ts create mode 100644 playground/api_types.html create mode 100644 
playground/attrs.html create mode 100644 playground/benchmarks.html create mode 100644 playground/categorical_ops.html create mode 100644 playground/cut_qcut.html create mode 100644 playground/format_ops.html create mode 100644 playground/insert_pop.html create mode 100644 playground/notna_isna.html create mode 100644 playground/numeric_extended.html create mode 100644 playground/pipe_apply.html create mode 100644 playground/rolling_apply.html create mode 100644 playground/string_ops.html create mode 100644 playground/string_ops_extended.html create mode 100644 playground/to_from_dict.html create mode 100644 playground/where_mask.html create mode 100644 playground/wide_to_long.html create mode 100644 playground/window_extended.html create mode 100644 src/core/api_types.ts create mode 100644 src/core/attrs.ts create mode 100644 src/core/insert_pop.ts create mode 100644 src/core/pipe_apply.ts create mode 100644 src/core/to_from_dict.ts create mode 100644 src/reshape/wide_to_long.ts create mode 100644 src/stats/categorical_ops.ts create mode 100644 src/stats/cut_qcut.ts create mode 100644 src/stats/format_ops.ts create mode 100644 src/stats/notna_isna.ts create mode 100644 src/stats/numeric_extended.ts create mode 100644 src/stats/string_ops.ts create mode 100644 src/stats/string_ops_extended.ts create mode 100644 src/stats/where_mask.ts create mode 100644 src/stats/window_extended.ts create mode 100644 src/window/rolling_apply.ts create mode 100644 tests/core/api_types.test.ts create mode 100644 tests/core/attrs.test.ts create mode 100644 tests/core/insert_pop.test.ts create mode 100644 tests/core/pipe_apply.test.ts create mode 100644 tests/core/to_from_dict.test.ts create mode 100644 tests/reshape/wide_to_long.test.ts create mode 100644 tests/stats/categorical_ops.test.ts create mode 100644 tests/stats/cut_qcut.test.ts create mode 100644 tests/stats/format_ops.test.ts create mode 100644 tests/stats/notna_isna.test.ts create mode 100644 
tests/stats/numeric_extended.test.ts create mode 100644 tests/stats/string_ops.test.ts create mode 100644 tests/stats/string_ops_extended.test.ts create mode 100644 tests/stats/where_mask.test.ts create mode 100644 tests/stats/window_extended.test.ts create mode 100644 tests/window/rolling_apply.test.ts diff --git a/.autoloop/programs/perf-comparison/program.md b/.autoloop/programs/perf-comparison/program.md new file mode 100644 index 00000000..c1aec206 --- /dev/null +++ b/.autoloop/programs/perf-comparison/program.md @@ -0,0 +1,74 @@ +--- +schedule: every 6h +--- + +# Performance Comparison: tsb (TypeScript) vs pandas (Python) + +## Goal + +Systematically benchmark every tsb function against its pandas equivalent, one function per iteration. Each iteration picks a function that has not yet been benchmarked, writes a matching performance test for both tsb (TypeScript/Bun) and pandas (Python), runs both, and records the timing results. The benchmark results are displayed on the playground pages doc site. + +This is an open-ended program — it runs continuously, always adding the next benchmark comparison. + +### How each iteration works + +1. **Read existing benchmarks** — check `benchmarks/tsb/` and `benchmarks/pandas/` to see which functions are already benchmarked. +2. **Pick ONE function** from `src/` that has no benchmark yet. Prioritize core operations (Series, DataFrame, GroupBy, etc.). +3. **Write a TypeScript benchmark** in `benchmarks/tsb/bench_{function}.ts` that: + - Creates a realistic dataset (e.g. 100,000 rows) + - Runs the operation in a tight loop (warm-up + measured iterations) + - Outputs JSON: `{"function": "...", "mean_ms": ..., "iterations": ..., "total_ms": ...}` +4. **Write a matching Python benchmark** in `benchmarks/pandas/bench_{function}.py` that: + - Creates the same dataset as the TypeScript version + - Runs the same operation with the same loop structure + - Outputs the same JSON format +5. 
**Run both benchmarks** via `benchmarks/run_benchmarks.sh` and capture results. +6. **Update `benchmarks/results.json`** with the new timing data. +7. **Update `playground/benchmarks.html`** to display the new function's comparison metrics. + +### Key constraints + +- **Matching datasets** — both benchmarks must use identical data (same size, same values where possible). +- **Fair comparison** — same number of warm-up and measured iterations for both. +- **JSON output** — every benchmark script must output a single JSON line to stdout. +- **No modifications to `src/`** — benchmark code is separate from library code. +- **Python environment** — install pandas via pip if not present. + +## Target + +Only modify these files: +- `benchmarks/**` — benchmark scripts and results +- `playground/benchmarks.html` — performance comparison playground page +- `playground/index.html` — add/update link to benchmarks page + +Do NOT modify: +- `src/**` — library source code +- `tests/**` — test files +- `README.md` — read-only +- `.autoloop/programs/**` — program definitions (except this file's code/ dir) +- `.github/workflows/autoloop*` — autoloop workflow files + +## Evaluation + +```bash +# Set up Python environment if needed +if ! command -v python3 &>/dev/null; then + echo "Python3 not found, skipping" +fi +pip3 install pandas --quiet 2>/dev/null || true + +# Count the number of benchmark pairs (functions with both TS and Python benchmarks) +ts_benchmarks=$(ls benchmarks/tsb/bench_*.ts 2>/dev/null | wc -l | tr -d ' ') +py_benchmarks=$(ls benchmarks/pandas/bench_*.py 2>/dev/null | wc -l | tr -d ' ') + +# The metric is the minimum of the two (both must exist for a complete benchmark) +if [ "$ts_benchmarks" -lt "$py_benchmarks" ]; then + count=$ts_benchmarks +else + count=$py_benchmarks +fi + +echo "{\"benchmarked_functions\": ${count:-0}}" +``` + +The metric is `benchmarked_functions`. 
**Higher is better.** diff --git a/.github/workflows/autoloop.lock.yml b/.github/workflows/autoloop.lock.yml index ce21ec84..489ea218 100644 --- a/.github/workflows/autoloop.lock.yml +++ b/.github/workflows/autoloop.lock.yml @@ -37,7 +37,7 @@ # Imports: # - shared/reporting.md # -# gh-aw-metadata: {"schema_version":"v3","frontmatter_hash":"ae0854a9693094d32638babc16d353dc5de46c218ae3d893a9306b0b2a916042","compiler_version":"v0.65.6","strict":true,"agent_id":"copilot"} +# gh-aw-metadata: {"schema_version":"v3","frontmatter_hash":"4a373c351f80c4a3192abb04ad384f012a37e1fa4edfab3d08dc852deac2cf4f","compiler_version":"v0.65.6","strict":true,"agent_id":"copilot"} name: "Autoloop" "on": @@ -222,21 +222,21 @@ jobs: run: | bash ${RUNNER_TEMP}/gh-aw/actions/create_prompt_first.sh { - cat << 'GH_AW_PROMPT_6cb617c1e46803c0_EOF' + cat << 'GH_AW_PROMPT_8719c7b9dd3572a2_EOF' - GH_AW_PROMPT_6cb617c1e46803c0_EOF + GH_AW_PROMPT_8719c7b9dd3572a2_EOF cat "${RUNNER_TEMP}/gh-aw/prompts/xpia.md" cat "${RUNNER_TEMP}/gh-aw/prompts/temp_folder_prompt.md" cat "${RUNNER_TEMP}/gh-aw/prompts/markdown.md" cat "${RUNNER_TEMP}/gh-aw/prompts/repo_memory_prompt.md" cat "${RUNNER_TEMP}/gh-aw/prompts/safe_outputs_prompt.md" - cat << 'GH_AW_PROMPT_6cb617c1e46803c0_EOF' + cat << 'GH_AW_PROMPT_8719c7b9dd3572a2_EOF' Tools: add_comment(max:7), create_issue(max:2), update_issue(max:3), create_pull_request, add_labels(max:2), remove_labels(max:2), push_to_pull_request_branch, missing_tool, missing_data, noop - GH_AW_PROMPT_6cb617c1e46803c0_EOF + GH_AW_PROMPT_8719c7b9dd3572a2_EOF cat "${RUNNER_TEMP}/gh-aw/prompts/safe_outputs_create_pull_request.md" cat "${RUNNER_TEMP}/gh-aw/prompts/safe_outputs_push_to_pr_branch.md" - cat << 'GH_AW_PROMPT_6cb617c1e46803c0_EOF' + cat << 'GH_AW_PROMPT_8719c7b9dd3572a2_EOF' The following GitHub context information is available for this workflow: @@ -269,7 +269,7 @@ jobs: - **Note**: If a branch you need is not in the list above and is not listed as an additional fetched ref, it 
has NOT been checked out. For private repositories you cannot fetch it without proper authentication. If the branch is required and not available, exit with an error and ask the user to add it to the `fetch:` option of the `checkout:` configuration (e.g., `fetch: ["refs/pulls/open/*"]` for all open PR refs, or `fetch: ["main", "feature/my-branch"]` for specific branches). - GH_AW_PROMPT_6cb617c1e46803c0_EOF + GH_AW_PROMPT_8719c7b9dd3572a2_EOF cat "${RUNNER_TEMP}/gh-aw/prompts/github_mcp_tools_with_safeoutputs_prompt.md" if [ "$GITHUB_EVENT_NAME" = "issue_comment" ] && [ -n "$GH_AW_IS_PR_COMMENT" ] || [ "$GITHUB_EVENT_NAME" = "pull_request_review_comment" ] || [ "$GITHUB_EVENT_NAME" = "pull_request_review" ]; then cat "${RUNNER_TEMP}/gh-aw/prompts/pr_context_prompt.md" @@ -277,11 +277,11 @@ jobs: if [ "$GITHUB_EVENT_NAME" = "issue_comment" ] && [ -n "$GH_AW_IS_PR_COMMENT" ] || [ "$GITHUB_EVENT_NAME" = "pull_request_review_comment" ] || [ "$GITHUB_EVENT_NAME" = "pull_request_review" ]; then cat "${RUNNER_TEMP}/gh-aw/prompts/pr_context_push_to_pr_branch_guidance.md" fi - cat << 'GH_AW_PROMPT_6cb617c1e46803c0_EOF' + cat << 'GH_AW_PROMPT_8719c7b9dd3572a2_EOF' {{#runtime-import .github/workflows/shared/reporting.md}} {{#runtime-import .github/workflows/autoloop.md}} - GH_AW_PROMPT_6cb617c1e46803c0_EOF + GH_AW_PROMPT_8719c7b9dd3572a2_EOF } > "$GH_AW_PROMPT" - name: Interpolate variables and render templates uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8 @@ -439,7 +439,7 @@ jobs: GITHUB_REPOSITORY: ${{ github.repository }} GITHUB_TOKEN: ${{ github.token }} name: Check which programs are due - run: "python3 - << 'PYEOF'\nimport os, json, re, glob, sys\nimport urllib.request, urllib.error\nfrom datetime import datetime, timezone, timedelta\n\nprograms_dir = \".autoloop/programs\"\nautoloop_dir = \".autoloop/programs\"\ntemplate_file = os.path.join(autoloop_dir, \"example.md\")\n\n# Read program state from repo-memory (persistent git-backed 
storage)\ngithub_token = os.environ.get(\"GITHUB_TOKEN\", \"\")\nrepo = os.environ.get(\"GITHUB_REPOSITORY\", \"\")\nforced_program = os.environ.get(\"AUTOLOOP_PROGRAM\", \"\").strip()\n\n# Repo-memory files are cloned to /tmp/gh-aw/repo-memory/{id}/ where {id}\n# is derived from the branch-name configured in the tools section (memory/autoloop → autoloop)\nrepo_memory_dir = \"/tmp/gh-aw/repo-memory/autoloop\"\n\ndef parse_machine_state(content):\n \"\"\"Parse the ⚙️ Machine State table from a state file. Returns a dict.\"\"\"\n state = {}\n m = re.search(r'## ⚙️ Machine State.*?\\n(.*?)(?=\\n## |\\Z)', content, re.DOTALL)\n if not m:\n return state\n section = m.group(0)\n for row in re.finditer(r'\\|\\s*(.+?)\\s*\\|\\s*(.+?)\\s*\\|', section):\n raw_key = row.group(1).strip()\n raw_val = row.group(2).strip()\n if raw_key.lower() in (\"field\", \"---\", \":---\", \":---:\", \"---:\"):\n continue\n key = raw_key.lower().replace(\" \", \"_\")\n val = None if raw_val in (\"—\", \"-\", \"\") else raw_val\n state[key] = val\n # Coerce types\n for int_field in (\"iteration_count\", \"consecutive_errors\"):\n if int_field in state:\n try:\n state[int_field] = int(state[int_field])\n except (ValueError, TypeError):\n state[int_field] = 0\n if \"paused\" in state:\n state[\"paused\"] = str(state.get(\"paused\", \"\")).lower() == \"true\"\n if \"completed\" in state:\n state[\"completed\"] = str(state.get(\"completed\", \"\")).lower() == \"true\"\n # recent_statuses: stored as comma-separated words (e.g. 
\"accepted, rejected, error\")\n rs_raw = state.get(\"recent_statuses\") or \"\"\n if rs_raw:\n state[\"recent_statuses\"] = [s.strip().lower() for s in rs_raw.split(\",\") if s.strip()]\n else:\n state[\"recent_statuses\"] = []\n return state\n\ndef read_program_state(program_name):\n \"\"\"Read scheduling state from the repo-memory state file.\"\"\"\n state_file = os.path.join(repo_memory_dir, f\"{program_name}.md\")\n if not os.path.isfile(state_file):\n print(f\" {program_name}: no state file found (first run)\")\n return {}\n with open(state_file, encoding=\"utf-8\") as f:\n content = f.read()\n return parse_machine_state(content)\n\n# Bootstrap: create autoloop programs directory and template if missing\nif not os.path.isdir(autoloop_dir):\n os.makedirs(autoloop_dir, exist_ok=True)\n bt = chr(96) # backtick — avoid literal backticks that break gh-aw compiler\n template = \"\\n\".join([\n \"\",\n \"\",\n \"\",\n \"\",\n \"# Autoloop Program\",\n \"\",\n \"\",\n \"\",\n \"## Goal\",\n \"\",\n \"\",\n \"\",\n \"REPLACE THIS with your optimization goal.\",\n \"\",\n \"## Target\",\n \"\",\n \"\",\n \"\",\n \"Only modify these files:\",\n f\"- {bt}REPLACE_WITH_FILE{bt} -- (describe what this file does)\",\n \"\",\n \"Do NOT modify:\",\n \"- (list files that must not be touched)\",\n \"\",\n \"## Evaluation\",\n \"\",\n \"\",\n \"\",\n f\"{bt}{bt}{bt}bash\",\n \"REPLACE_WITH_YOUR_EVALUATION_COMMAND\",\n f\"{bt}{bt}{bt}\",\n \"\",\n f\"The metric is {bt}REPLACE_WITH_METRIC_NAME{bt}. **Lower/Higher is better.** (pick one)\",\n \"\",\n ])\n with open(template_file, \"w\") as f:\n f.write(template)\n # Leave the template unstaged — the agent will create a draft PR with it\n print(f\"BOOTSTRAPPED: created {template_file} locally (agent will create a draft PR)\")\n\n# Find all program files from all locations:\n# 1. Directory-based programs: .autoloop/programs//program.md (preferred)\n# 2. Bare markdown programs: .autoloop/programs/.md (simple)\n# 3. 
Issue-based programs: GitHub issues with the 'autoloop-program' label\nprogram_files = []\nissue_programs = {} # name -> {issue_number, file}\n\n# Scan .autoloop/programs/ for directory-based programs\nif os.path.isdir(programs_dir):\n for entry in sorted(os.listdir(programs_dir)):\n prog_dir = os.path.join(programs_dir, entry)\n if os.path.isdir(prog_dir):\n # Look for program.md inside the directory\n prog_file = os.path.join(prog_dir, \"program.md\")\n if os.path.isfile(prog_file):\n program_files.append(prog_file)\n\n# Scan .autoloop/programs/ for bare markdown programs\nbare_programs = sorted(glob.glob(os.path.join(autoloop_dir, \"*.md\")))\nfor pf in bare_programs:\n program_files.append(pf)\n\n# Scan GitHub issues with the 'autoloop-program' label\nissue_programs_dir = \"/tmp/gh-aw/issue-programs\"\nos.makedirs(issue_programs_dir, exist_ok=True)\ntry:\n api_url = f\"https://api.github.com/repos/{repo}/issues?labels=autoloop-program&state=open&per_page=100\"\n req = urllib.request.Request(api_url, headers={\n \"Authorization\": f\"token {github_token}\",\n \"Accept\": \"application/vnd.github.v3+json\",\n })\n with urllib.request.urlopen(req, timeout=30) as resp:\n issues = json.loads(resp.read().decode())\n for issue in issues:\n if issue.get(\"pull_request\"):\n continue # skip PRs\n body = issue.get(\"body\") or \"\"\n title = issue.get(\"title\") or \"\"\n number = issue[\"number\"]\n # Derive program name from issue title: slugify to lowercase with hyphens\n slug = re.sub(r'[^a-z0-9]+', '-', title.lower()).strip('-')\n slug = re.sub(r'-+', '-', slug) # collapse consecutive hyphens\n if not slug:\n slug = f\"issue-{number}\"\n # Avoid slug collisions: if another issue already claimed this slug, append issue number\n if slug in issue_programs:\n print(f\" Warning: slug '{slug}' (issue #{number}) collides with issue #{issue_programs[slug]['issue_number']}, appending issue number\")\n slug = f\"{slug}-{number}\"\n # Write issue body to a temp file so the 
scheduling loop can process it\n issue_file = os.path.join(issue_programs_dir, f\"{slug}.md\")\n with open(issue_file, \"w\") as f:\n f.write(body)\n program_files.append(issue_file)\n issue_programs[slug] = {\"issue_number\": number, \"file\": issue_file, \"title\": title}\n print(f\" Found issue-based program: '{slug}' (issue #{number})\")\nexcept Exception as e:\n print(f\" Warning: could not fetch issue-based programs: {e}\")\n\nif not program_files:\n # Fallback to single-file locations\n for path in [\".autoloop/program.md\", \"program.md\"]:\n if os.path.isfile(path):\n program_files = [path]\n break\n\nif not program_files:\n print(\"NO_PROGRAMS_FOUND\")\n os.makedirs(\"/tmp/gh-aw\", exist_ok=True)\n with open(\"/tmp/gh-aw/autoloop.json\", \"w\") as f:\n json.dump({\"due\": [], \"skipped\": [], \"unconfigured\": [], \"no_programs\": True}, f)\n sys.exit(0)\n\nos.makedirs(\"/tmp/gh-aw\", exist_ok=True)\nnow = datetime.now(timezone.utc)\ndue = []\nskipped = []\nunconfigured = []\nall_programs = {} # name -> file path (populated during scanning)\n\n# Schedule string to timedelta\ndef parse_schedule(s):\n s = s.strip().lower()\n m = re.match(r\"every\\s+(\\d+)\\s*h\", s)\n if m:\n return timedelta(hours=int(m.group(1)))\n m = re.match(r\"every\\s+(\\d+)\\s*m\", s)\n if m:\n return timedelta(minutes=int(m.group(1)))\n if s == \"daily\":\n return timedelta(hours=24)\n if s == \"weekly\":\n return timedelta(days=7)\n return None # No per-program schedule — always due\n\ndef get_program_name(pf):\n \"\"\"Extract program name from file path.\n Directory-based: .autoloop/programs//program.md -> \n Bare markdown: .autoloop/programs/.md -> \n Issue-based: /tmp/gh-aw/issue-programs/.md -> \n \"\"\"\n if pf.endswith(\"/program.md\"):\n # Directory-based program: name is the parent directory\n return os.path.basename(os.path.dirname(pf))\n else:\n # Bare markdown or issue-based program: name is the filename without .md\n return 
os.path.splitext(os.path.basename(pf))[0]\n\nfor pf in program_files:\n name = get_program_name(pf)\n all_programs[name] = pf\n with open(pf) as f:\n content = f.read()\n\n # Check sentinel (skip for issue-based programs which use AUTOLOOP:ISSUE-PROGRAM)\n if \"\" in content:\n unconfigured.append(name)\n continue\n\n # Check for TODO/REPLACE placeholders\n if re.search(r'\\bTODO\\b|\\bREPLACE', content):\n unconfigured.append(name)\n continue\n\n # Parse optional YAML frontmatter for schedule and target-metric\n # Strip leading HTML comments before checking (issue-based programs may have them)\n content_stripped = re.sub(r'^(\\s*\\s*\\n)*', '', content, flags=re.DOTALL)\n schedule_delta = None\n target_metric = None\n fm_match = re.match(r\"^---\\s*\\n(.*?)\\n---\\s*\\n\", content_stripped, re.DOTALL)\n if fm_match:\n for line in fm_match.group(1).split(\"\\n\"):\n if line.strip().startswith(\"schedule:\"):\n schedule_str = line.split(\":\", 1)[1].strip()\n schedule_delta = parse_schedule(schedule_str)\n if line.strip().startswith(\"target-metric:\"):\n try:\n target_metric = float(line.split(\":\", 1)[1].strip())\n except (ValueError, TypeError):\n print(f\" Warning: {name} has invalid target-metric value: {line.split(':', 1)[1].strip()}\")\n\n # Read state from repo-memory\n state = read_program_state(name)\n if state:\n print(f\" {name}: last_run={state.get('last_run')}, iteration_count={state.get('iteration_count')}\")\n else:\n print(f\" {name}: no state found (first run)\")\n\n last_run = None\n lr = state.get(\"last_run\")\n if lr:\n try:\n last_run = datetime.fromisoformat(lr.replace(\"Z\", \"+00:00\"))\n except ValueError:\n pass\n\n # Check if completed (target metric was reached)\n if str(state.get(\"completed\", \"\")).lower() == \"true\":\n skipped.append({\"name\": name, \"reason\": f\"completed: target metric reached\"})\n continue\n\n # Check if paused (e.g., plateau or recurring errors)\n if state.get(\"paused\"):\n skipped.append({\"name\": name, 
\"reason\": f\"paused: {state.get('pause_reason', 'unknown')}\"})\n continue\n\n # Auto-pause on plateau: 5+ consecutive rejections\n recent = state.get(\"recent_statuses\", [])[-5:]\n if len(recent) >= 5 and all(s == \"rejected\" for s in recent):\n skipped.append({\"name\": name, \"reason\": \"plateau: 5 consecutive rejections\"})\n continue\n\n # Check if due based on per-program schedule\n if schedule_delta and last_run:\n if now - last_run < schedule_delta:\n skipped.append({\"name\": name, \"reason\": \"not due yet\",\n \"next_due\": (last_run + schedule_delta).isoformat()})\n continue\n\n due.append({\"name\": name, \"last_run\": lr, \"file\": pf, \"target_metric\": target_metric})\n\n# Pick the program to run\nselected = None\nselected_file = None\nselected_issue = None\nselected_target_metric = None\ndeferred = []\n\nif forced_program:\n # Manual dispatch requested a specific program — bypass scheduling\n # (paused, not-due, and plateau programs can still be forced)\n if forced_program not in all_programs:\n print(f\"ERROR: requested program '{forced_program}' not found.\")\n print(f\" Available programs: {list(all_programs.keys())}\")\n sys.exit(1)\n if forced_program in unconfigured:\n print(f\"ERROR: requested program '{forced_program}' is unconfigured (has placeholders).\")\n sys.exit(1)\n selected = forced_program\n selected_file = all_programs[forced_program]\n deferred = [p[\"name\"] for p in due if p[\"name\"] != forced_program]\n if selected in issue_programs:\n selected_issue = issue_programs[selected][\"issue_number\"]\n # Find target_metric: check the due list first, then parse from the program file\n for p in due:\n if p[\"name\"] == forced_program:\n selected_target_metric = p.get(\"target_metric\")\n break\n if selected_target_metric is None:\n # Program may have been skipped (completed/paused/plateau) — parse directly\n try:\n with open(selected_file) as _f:\n _content = _f.read()\n _content_stripped = re.sub(r'^(\\s*\\s*\\n)*', '', 
_content, flags=re.DOTALL)\n _fm = re.match(r\"^---\\s*\\n(.*?)\\n---\\s*\\n\", _content_stripped, re.DOTALL)\n if _fm:\n for _line in _fm.group(1).split(\"\\n\"):\n if _line.strip().startswith(\"target-metric:\"):\n selected_target_metric = float(_line.split(\":\", 1)[1].strip())\n break\n except (OSError, ValueError, TypeError):\n pass\n print(f\"FORCED: running program '{forced_program}' (manual dispatch)\")\nelif due:\n # Normal scheduling: pick the single most-overdue program\n due.sort(key=lambda p: p[\"last_run\"] or \"\") # None/empty sorts first (never run)\n selected = due[0][\"name\"]\n selected_file = due[0][\"file\"]\n selected_target_metric = due[0].get(\"target_metric\")\n deferred = [p[\"name\"] for p in due[1:]]\n # Check if the selected program is issue-based\n if selected in issue_programs:\n selected_issue = issue_programs[selected][\"issue_number\"]\n\nresult = {\n \"selected\": selected,\n \"selected_file\": selected_file,\n \"selected_issue\": selected_issue,\n \"selected_target_metric\": selected_target_metric,\n \"issue_programs\": {name: info[\"issue_number\"] for name, info in issue_programs.items()},\n \"deferred\": deferred,\n \"skipped\": skipped,\n \"unconfigured\": unconfigured,\n \"no_programs\": False,\n}\n\nos.makedirs(\"/tmp/gh-aw\", exist_ok=True)\nwith open(\"/tmp/gh-aw/autoloop.json\", \"w\") as f:\n json.dump(result, f, indent=2)\n\nprint(\"=== Autoloop Program Check ===\")\nprint(f\"Selected program: {selected or '(none)'} ({selected_file or 'n/a'})\")\nprint(f\"Deferred (next run): {deferred or '(none)'}\")\nprint(f\"Programs skipped: {[s['name'] for s in skipped] or '(none)'}\")\nprint(f\"Programs unconfigured: {unconfigured or '(none)'}\")\n\nif not selected and not unconfigured:\n print(\"\\nNo programs due this run. 
Exiting early.\")\n sys.exit(1) # Non-zero exit skips the agent step\nPYEOF\n" + run: "python3 - << 'PYEOF'\nimport os, json, re, glob, sys\nimport urllib.request, urllib.error\nfrom datetime import datetime, timezone, timedelta\n\nprograms_dir = \".autoloop/programs\"\nautoloop_dir = \".autoloop/programs\"\ntemplate_file = os.path.join(autoloop_dir, \"example.md\")\n\n# Read program state from repo-memory (persistent git-backed storage)\ngithub_token = os.environ.get(\"GITHUB_TOKEN\", \"\")\nrepo = os.environ.get(\"GITHUB_REPOSITORY\", \"\")\nforced_program = os.environ.get(\"AUTOLOOP_PROGRAM\", \"\").strip()\n\n# Repo-memory files are cloned to /tmp/gh-aw/repo-memory/{id}/ where {id}\n# is derived from the branch-name configured in the tools section (memory/autoloop → autoloop)\nrepo_memory_dir = \"/tmp/gh-aw/repo-memory/autoloop\"\n\ndef parse_machine_state(content):\n \"\"\"Parse the ⚙️ Machine State table from a state file. Returns a dict.\"\"\"\n state = {}\n m = re.search(r'## ⚙️ Machine State.*?\\n(.*?)(?=\\n## |\\Z)', content, re.DOTALL)\n if not m:\n return state\n section = m.group(0)\n for row in re.finditer(r'\\|\\s*(.+?)\\s*\\|\\s*(.+?)\\s*\\|', section):\n raw_key = row.group(1).strip()\n raw_val = row.group(2).strip()\n if raw_key.lower() in (\"field\", \"---\", \":---\", \":---:\", \"---:\"):\n continue\n key = raw_key.lower().replace(\" \", \"_\")\n val = None if raw_val in (\"—\", \"-\", \"\") else raw_val\n state[key] = val\n # Coerce types\n for int_field in (\"iteration_count\", \"consecutive_errors\"):\n if int_field in state:\n try:\n state[int_field] = int(state[int_field])\n except (ValueError, TypeError):\n state[int_field] = 0\n if \"paused\" in state:\n state[\"paused\"] = str(state.get(\"paused\", \"\")).lower() == \"true\"\n if \"completed\" in state:\n state[\"completed\"] = str(state.get(\"completed\", \"\")).lower() == \"true\"\n # recent_statuses: stored as comma-separated words (e.g. 
\"accepted, rejected, error\")\n rs_raw = state.get(\"recent_statuses\") or \"\"\n if rs_raw:\n state[\"recent_statuses\"] = [s.strip().lower() for s in rs_raw.split(\",\") if s.strip()]\n else:\n state[\"recent_statuses\"] = []\n return state\n\ndef read_program_state(program_name):\n \"\"\"Read scheduling state from the repo-memory state file.\"\"\"\n state_file = os.path.join(repo_memory_dir, f\"{program_name}.md\")\n if not os.path.isfile(state_file):\n print(f\" {program_name}: no state file found (first run)\")\n return {}\n with open(state_file, encoding=\"utf-8\") as f:\n content = f.read()\n return parse_machine_state(content)\n\n# Bootstrap: create autoloop programs directory and template if missing\nif not os.path.isdir(autoloop_dir):\n os.makedirs(autoloop_dir, exist_ok=True)\n bt = chr(96) # backtick — avoid literal backticks that break gh-aw compiler\n template = \"\\n\".join([\n \"\",\n \"\",\n \"\",\n \"\",\n \"# Autoloop Program\",\n \"\",\n \"\",\n \"\",\n \"## Goal\",\n \"\",\n \"\",\n \"\",\n \"REPLACE THIS with your optimization goal.\",\n \"\",\n \"## Target\",\n \"\",\n \"\",\n \"\",\n \"Only modify these files:\",\n f\"- {bt}REPLACE_WITH_FILE{bt} -- (describe what this file does)\",\n \"\",\n \"Do NOT modify:\",\n \"- (list files that must not be touched)\",\n \"\",\n \"## Evaluation\",\n \"\",\n \"\",\n \"\",\n f\"{bt}{bt}{bt}bash\",\n \"REPLACE_WITH_YOUR_EVALUATION_COMMAND\",\n f\"{bt}{bt}{bt}\",\n \"\",\n f\"The metric is {bt}REPLACE_WITH_METRIC_NAME{bt}. **Lower/Higher is better.** (pick one)\",\n \"\",\n ])\n with open(template_file, \"w\") as f:\n f.write(template)\n # Leave the template unstaged — the agent will create a draft PR with it\n print(f\"BOOTSTRAPPED: created {template_file} locally (agent will create a draft PR)\")\n\n# Find all program files from all locations:\n# 1. Directory-based programs: .autoloop/programs//program.md (preferred)\n# 2. Bare markdown programs: .autoloop/programs/.md (simple)\n# 3. 
Issue-based programs: GitHub issues with the 'autoloop-program' label\nprogram_files = []\nissue_programs = {} # name -> {issue_number, file}\n\n# Scan .autoloop/programs/ for directory-based programs\nif os.path.isdir(programs_dir):\n for entry in sorted(os.listdir(programs_dir)):\n prog_dir = os.path.join(programs_dir, entry)\n if os.path.isdir(prog_dir):\n # Look for program.md inside the directory\n prog_file = os.path.join(prog_dir, \"program.md\")\n if os.path.isfile(prog_file):\n program_files.append(prog_file)\n\n# Scan .autoloop/programs/ for bare markdown programs\nbare_programs = sorted(glob.glob(os.path.join(autoloop_dir, \"*.md\")))\nfor pf in bare_programs:\n program_files.append(pf)\n\n# Scan GitHub issues with the 'autoloop-program' label\nissue_programs_dir = \"/tmp/gh-aw/issue-programs\"\nos.makedirs(issue_programs_dir, exist_ok=True)\ntry:\n api_url = f\"https://api.github.com/repos/{repo}/issues?labels=autoloop-program&state=open&per_page=100\"\n req = urllib.request.Request(api_url, headers={\n \"Authorization\": f\"token {github_token}\",\n \"Accept\": \"application/vnd.github.v3+json\",\n })\n with urllib.request.urlopen(req, timeout=30) as resp:\n issues = json.loads(resp.read().decode())\n for issue in issues:\n if issue.get(\"pull_request\"):\n continue # skip PRs\n body = issue.get(\"body\") or \"\"\n title = issue.get(\"title\") or \"\"\n number = issue[\"number\"]\n # Derive program name from issue title: slugify to lowercase with hyphens\n slug = re.sub(r'[^a-z0-9]+', '-', title.lower()).strip('-')\n slug = re.sub(r'-+', '-', slug) # collapse consecutive hyphens\n if not slug:\n slug = f\"issue-{number}\"\n # Avoid slug collisions: if another issue already claimed this slug, append issue number\n if slug in issue_programs:\n print(f\" Warning: slug '{slug}' (issue #{number}) collides with issue #{issue_programs[slug]['issue_number']}, appending issue number\")\n slug = f\"{slug}-{number}\"\n # Write issue body to a temp file so the 
scheduling loop can process it\n issue_file = os.path.join(issue_programs_dir, f\"{slug}.md\")\n with open(issue_file, \"w\") as f:\n f.write(body)\n program_files.append(issue_file)\n issue_programs[slug] = {\"issue_number\": number, \"file\": issue_file, \"title\": title}\n print(f\" Found issue-based program: '{slug}' (issue #{number})\")\nexcept Exception as e:\n print(f\" Warning: could not fetch issue-based programs: {e}\")\n\nif not program_files:\n # Fallback to single-file locations\n for path in [\".autoloop/program.md\", \"program.md\"]:\n if os.path.isfile(path):\n program_files = [path]\n break\n\nif not program_files:\n print(\"NO_PROGRAMS_FOUND\")\n os.makedirs(\"/tmp/gh-aw\", exist_ok=True)\n with open(\"/tmp/gh-aw/autoloop.json\", \"w\") as f:\n json.dump({\"due\": [], \"skipped\": [], \"unconfigured\": [], \"no_programs\": True}, f)\n sys.exit(0)\n\nos.makedirs(\"/tmp/gh-aw\", exist_ok=True)\nnow = datetime.now(timezone.utc)\ndue = []\nskipped = []\nunconfigured = []\nall_programs = {} # name -> file path (populated during scanning)\n\n# Schedule string to timedelta\ndef parse_schedule(s):\n s = s.strip().lower()\n m = re.match(r\"every\\s+(\\d+)\\s*h\", s)\n if m:\n return timedelta(hours=int(m.group(1)))\n m = re.match(r\"every\\s+(\\d+)\\s*m\", s)\n if m:\n return timedelta(minutes=int(m.group(1)))\n if s == \"daily\":\n return timedelta(hours=24)\n if s == \"weekly\":\n return timedelta(days=7)\n return None # No per-program schedule — always due\n\ndef get_program_name(pf):\n \"\"\"Extract program name from file path.\n Directory-based: .autoloop/programs//program.md -> \n Bare markdown: .autoloop/programs/.md -> \n Issue-based: /tmp/gh-aw/issue-programs/.md -> \n \"\"\"\n if pf.endswith(\"/program.md\"):\n # Directory-based program: name is the parent directory\n return os.path.basename(os.path.dirname(pf))\n else:\n # Bare markdown or issue-based program: name is the filename without .md\n return 
os.path.splitext(os.path.basename(pf))[0]\n\nfor pf in program_files:\n name = get_program_name(pf)\n all_programs[name] = pf\n with open(pf) as f:\n content = f.read()\n\n # Check sentinel (skip for issue-based programs which use AUTOLOOP:ISSUE-PROGRAM)\n if \"\" in content:\n unconfigured.append(name)\n continue\n\n # Check for TODO/REPLACE placeholders\n if re.search(r'\\bTODO\\b|\\bREPLACE', content):\n unconfigured.append(name)\n continue\n\n # Parse optional YAML frontmatter for schedule and target-metric\n # Strip leading HTML comments before checking (issue-based programs may have them)\n content_stripped = re.sub(r'^(\\s*\\s*\\n)*', '', content, flags=re.DOTALL)\n schedule_delta = None\n target_metric = None\n fm_match = re.match(r\"^---\\s*\\n(.*?)\\n---\\s*\\n\", content_stripped, re.DOTALL)\n if fm_match:\n for line in fm_match.group(1).split(\"\\n\"):\n if line.strip().startswith(\"schedule:\"):\n schedule_str = line.split(\":\", 1)[1].strip()\n schedule_delta = parse_schedule(schedule_str)\n if line.strip().startswith(\"target-metric:\"):\n try:\n target_metric = float(line.split(\":\", 1)[1].strip())\n except (ValueError, TypeError):\n print(f\" Warning: {name} has invalid target-metric value: {line.split(':', 1)[1].strip()}\")\n\n # Read state from repo-memory\n state = read_program_state(name)\n if state:\n print(f\" {name}: last_run={state.get('last_run')}, iteration_count={state.get('iteration_count')}\")\n else:\n print(f\" {name}: no state found (first run)\")\n\n last_run = None\n lr = state.get(\"last_run\")\n if lr:\n try:\n last_run = datetime.fromisoformat(lr.replace(\"Z\", \"+00:00\"))\n except ValueError:\n pass\n\n # Check if completed (target metric was reached)\n if str(state.get(\"completed\", \"\")).lower() == \"true\":\n skipped.append({\"name\": name, \"reason\": f\"completed: target metric reached\"})\n continue\n\n # Check if paused (e.g., plateau or recurring errors)\n if state.get(\"paused\"):\n skipped.append({\"name\": name, 
\"reason\": f\"paused: {state.get('pause_reason', 'unknown')}\"})\n continue\n\n # Auto-pause on plateau: 5+ consecutive rejections\n recent = state.get(\"recent_statuses\", [])[-5:]\n if len(recent) >= 5 and all(s == \"rejected\" for s in recent):\n skipped.append({\"name\": name, \"reason\": \"plateau: 5 consecutive rejections\"})\n continue\n\n # Check if due based on per-program schedule\n if schedule_delta and last_run:\n if now - last_run < schedule_delta:\n skipped.append({\"name\": name, \"reason\": \"not due yet\",\n \"next_due\": (last_run + schedule_delta).isoformat()})\n continue\n\n due.append({\"name\": name, \"last_run\": lr, \"file\": pf, \"target_metric\": target_metric})\n\n# Pick the program to run\nselected = None\nselected_file = None\nselected_issue = None\nselected_target_metric = None\ndeferred = []\n\nif forced_program:\n # Manual dispatch requested a specific program — bypass scheduling\n # (paused, not-due, and plateau programs can still be forced)\n if forced_program not in all_programs:\n print(f\"ERROR: requested program '{forced_program}' not found.\")\n print(f\" Available programs: {list(all_programs.keys())}\")\n sys.exit(1)\n if forced_program in unconfigured:\n print(f\"ERROR: requested program '{forced_program}' is unconfigured (has placeholders).\")\n sys.exit(1)\n selected = forced_program\n selected_file = all_programs[forced_program]\n deferred = [p[\"name\"] for p in due if p[\"name\"] != forced_program]\n if selected in issue_programs:\n selected_issue = issue_programs[selected][\"issue_number\"]\n # Find target_metric: check the due list first, then parse from the program file\n for p in due:\n if p[\"name\"] == forced_program:\n selected_target_metric = p.get(\"target_metric\")\n break\n if selected_target_metric is None:\n # Program may have been skipped (completed/paused/plateau) — parse directly\n try:\n with open(selected_file) as _f:\n _content = _f.read()\n _content_stripped = re.sub(r'^(\\s*\\s*\\n)*', '', 
_content, flags=re.DOTALL)\n _fm = re.match(r\"^---\\s*\\n(.*?)\\n---\\s*\\n\", _content_stripped, re.DOTALL)\n if _fm:\n for _line in _fm.group(1).split(\"\\n\"):\n if _line.strip().startswith(\"target-metric:\"):\n selected_target_metric = float(_line.split(\":\", 1)[1].strip())\n break\n except (OSError, ValueError, TypeError):\n pass\n print(f\"FORCED: running program '{forced_program}' (manual dispatch)\")\nelif due:\n # Normal scheduling: pick the single most-overdue program\n due.sort(key=lambda p: p[\"last_run\"] or \"\") # None/empty sorts first (never run)\n selected = due[0][\"name\"]\n selected_file = due[0][\"file\"]\n selected_target_metric = due[0].get(\"target_metric\")\n deferred = [p[\"name\"] for p in due[1:]]\n # Check if the selected program is issue-based\n if selected in issue_programs:\n selected_issue = issue_programs[selected][\"issue_number\"]\n\n# Look up existing PR for the selected program's canonical branch\nexisting_pr = None\nhead_branch = None\nif selected:\n head_branch = f\"autoloop/{selected}\"\n owner = repo.split(\"/\")[0] if \"/\" in repo else \"\"\n if owner:\n try:\n pr_api_url = (\n f\"https://api.github.com/repos/{repo}/pulls\"\n f\"?state=open&head={owner}:{head_branch}&per_page=5\"\n )\n pr_req = urllib.request.Request(pr_api_url, headers={\n \"Authorization\": f\"token {github_token}\",\n \"Accept\": \"application/vnd.github.v3+json\",\n })\n with urllib.request.urlopen(pr_req, timeout=30) as pr_resp:\n open_prs = json.loads(pr_resp.read().decode())\n if open_prs:\n existing_pr = open_prs[0][\"number\"]\n print(f\" Found existing PR #{existing_pr} for branch {head_branch}\")\n else:\n print(f\" No existing PR found for branch {head_branch}\")\n except Exception as e:\n print(f\" Warning: could not check for existing PRs: {e}\")\n else:\n print(f\" Warning: could not parse owner from GITHUB_REPOSITORY='{repo}'\")\n\n # Also check the state file for a recorded PR number as fallback\n if existing_pr is None:\n state = 
read_program_state(selected)\n pr_field = state.get(\"pr\") or \"\"\n pr_match = re.match(r'^#?(\\d+)$', pr_field.strip())\n if pr_match:\n existing_pr = int(pr_match.group(1))\n print(f\" Found PR #{existing_pr} from state file for {selected}\")\n\nresult = {\n \"selected\": selected,\n \"selected_file\": selected_file,\n \"selected_issue\": selected_issue,\n \"selected_target_metric\": selected_target_metric,\n \"existing_pr\": existing_pr,\n \"head_branch\": head_branch,\n \"issue_programs\": {name: info[\"issue_number\"] for name, info in issue_programs.items()},\n \"deferred\": deferred,\n \"skipped\": skipped,\n \"unconfigured\": unconfigured,\n \"no_programs\": False,\n}\n\nos.makedirs(\"/tmp/gh-aw\", exist_ok=True)\nwith open(\"/tmp/gh-aw/autoloop.json\", \"w\") as f:\n json.dump(result, f, indent=2)\n\nprint(\"=== Autoloop Program Check ===\")\nprint(f\"Selected program: {selected or '(none)'} ({selected_file or 'n/a'})\")\nif existing_pr:\n print(f\"Existing PR: #{existing_pr} (branch: {head_branch})\")\nelse:\n print(f\"Existing PR: (none — will create on first accepted iteration)\")\nprint(f\"Deferred (next run): {deferred or '(none)'}\")\nprint(f\"Programs skipped: {[s['name'] for s in skipped] or '(none)'}\")\nprint(f\"Programs unconfigured: {unconfigured or '(none)'}\")\n\nif not selected and not unconfigured:\n print(\"\\nNo programs due this run. 
Exiting early.\")\n sys.exit(1) # Non-zero exit skips the agent step\nPYEOF\n" # Repo memory git-based storage configuration from frontmatter processed below - name: Clone repo-memory branch (default) @@ -498,12 +498,12 @@ jobs: mkdir -p ${RUNNER_TEMP}/gh-aw/safeoutputs mkdir -p /tmp/gh-aw/safeoutputs mkdir -p /tmp/gh-aw/mcp-logs/safeoutputs - cat > ${RUNNER_TEMP}/gh-aw/safeoutputs/config.json << 'GH_AW_SAFE_OUTPUTS_CONFIG_05418b5b293ba2f8_EOF' + cat > ${RUNNER_TEMP}/gh-aw/safeoutputs/config.json << 'GH_AW_SAFE_OUTPUTS_CONFIG_02af14f48bc5ba75_EOF' {"add_comment":{"hide_older_comments":false,"max":7,"target":"*"},"add_labels":{"max":2,"target":"*"},"create_issue":{"labels":["automation","autoloop"],"max":2,"title_prefix":"[Autoloop] "},"create_pull_request":{"draft":true,"labels":["automation","autoloop"],"max":1,"max_patch_size":1024,"protected_files":["package.json","bun.lockb","bunfig.toml","deno.json","deno.jsonc","deno.lock","global.json","NuGet.Config","Directory.Packages.props","mix.exs","mix.lock","go.mod","go.sum","stack.yaml","stack.yaml.lock","pom.xml","build.gradle","build.gradle.kts","settings.gradle","settings.gradle.kts","gradle.properties","package-lock.json","yarn.lock","pnpm-lock.yaml","npm-shrinkwrap.json","requirements.txt","Pipfile","Pipfile.lock","pyproject.toml","setup.py","setup.cfg","Gemfile","Gemfile.lock","uv.lock","CODEOWNERS"],"protected_files_policy":"fallback-to-issue","protected_path_prefixes":[".github/",".agents/"],"title_prefix":"[Autoloop] 
"},"missing_data":{},"missing_tool":{},"noop":{"max":1,"report-as-issue":"true"},"push_repo_memory":{"memories":[{"dir":"/tmp/gh-aw/repo-memory/default","id":"default","max_file_count":100,"max_file_size":30720,"max_patch_size":10240}]},"push_to_pull_request_branch":{"if_no_changes":"warn","max":1,"max_patch_size":1024,"protected_files":["package.json","bun.lockb","bunfig.toml","deno.json","deno.jsonc","deno.lock","global.json","NuGet.Config","Directory.Packages.props","mix.exs","mix.lock","go.mod","go.sum","stack.yaml","stack.yaml.lock","pom.xml","build.gradle","build.gradle.kts","settings.gradle","settings.gradle.kts","gradle.properties","package-lock.json","yarn.lock","pnpm-lock.yaml","npm-shrinkwrap.json","requirements.txt","Pipfile","Pipfile.lock","pyproject.toml","setup.py","setup.cfg","Gemfile","Gemfile.lock","uv.lock","CODEOWNERS"],"protected_path_prefixes":[".github/",".agents/"],"target":"*","title_prefix":"[Autoloop] "},"remove_labels":{"max":2,"target":"*"},"update_issue":{"allow_body":true,"max":3,"target":"*","title_prefix":"[Autoloop] "}} - GH_AW_SAFE_OUTPUTS_CONFIG_05418b5b293ba2f8_EOF + GH_AW_SAFE_OUTPUTS_CONFIG_02af14f48bc5ba75_EOF - name: Write Safe Outputs Tools run: | - cat > ${RUNNER_TEMP}/gh-aw/safeoutputs/tools_meta.json << 'GH_AW_SAFE_OUTPUTS_TOOLS_META_e9f0c97ff8e4b848_EOF' + cat > ${RUNNER_TEMP}/gh-aw/safeoutputs/tools_meta.json << 'GH_AW_SAFE_OUTPUTS_TOOLS_META_e09cbb8e5f09e2a3_EOF' { "description_suffixes": { "add_comment": " CONSTRAINTS: Maximum 7 comment(s) can be added. 
Target: *.", @@ -517,8 +517,8 @@ jobs: "repo_params": {}, "dynamic_tools": [] } - GH_AW_SAFE_OUTPUTS_TOOLS_META_e9f0c97ff8e4b848_EOF - cat > ${RUNNER_TEMP}/gh-aw/safeoutputs/validation.json << 'GH_AW_SAFE_OUTPUTS_VALIDATION_c2504b2536e4b3d6_EOF' + GH_AW_SAFE_OUTPUTS_TOOLS_META_e09cbb8e5f09e2a3_EOF + cat > ${RUNNER_TEMP}/gh-aw/safeoutputs/validation.json << 'GH_AW_SAFE_OUTPUTS_VALIDATION_ecaf271fbfb920d8_EOF' { "add_comment": { "defaultMax": 1, @@ -777,7 +777,7 @@ jobs: "customValidation": "requiresOneOf:status,title,body" } } - GH_AW_SAFE_OUTPUTS_VALIDATION_c2504b2536e4b3d6_EOF + GH_AW_SAFE_OUTPUTS_VALIDATION_ecaf271fbfb920d8_EOF node ${RUNNER_TEMP}/gh-aw/actions/generate_safe_outputs_tools.cjs - name: Generate Safe Outputs MCP Server Config id: safe-outputs-config @@ -847,7 +847,7 @@ jobs: export MCP_GATEWAY_DOCKER_COMMAND='docker run -i --rm --network host -v /var/run/docker.sock:/var/run/docker.sock -e MCP_GATEWAY_PORT -e MCP_GATEWAY_DOMAIN -e MCP_GATEWAY_API_KEY -e MCP_GATEWAY_PAYLOAD_DIR -e MCP_GATEWAY_PAYLOAD_SIZE_THRESHOLD -e DEBUG -e MCP_GATEWAY_LOG_DIR -e GH_AW_MCP_LOG_DIR -e GH_AW_SAFE_OUTPUTS -e GH_AW_SAFE_OUTPUTS_CONFIG_PATH -e GH_AW_SAFE_OUTPUTS_TOOLS_PATH -e GH_AW_ASSETS_BRANCH -e GH_AW_ASSETS_MAX_SIZE_KB -e GH_AW_ASSETS_ALLOWED_EXTS -e DEFAULT_BRANCH -e GITHUB_MCP_SERVER_TOKEN -e GITHUB_MCP_GUARD_MIN_INTEGRITY -e GITHUB_MCP_GUARD_REPOS -e GITHUB_REPOSITORY -e GITHUB_SERVER_URL -e GITHUB_SHA -e GITHUB_WORKSPACE -e GITHUB_TOKEN -e GITHUB_RUN_ID -e GITHUB_RUN_NUMBER -e GITHUB_RUN_ATTEMPT -e GITHUB_JOB -e GITHUB_ACTION -e GITHUB_EVENT_NAME -e GITHUB_EVENT_PATH -e GITHUB_ACTOR -e GITHUB_ACTOR_ID -e GITHUB_TRIGGERING_ACTOR -e GITHUB_WORKFLOW -e GITHUB_WORKFLOW_REF -e GITHUB_WORKFLOW_SHA -e GITHUB_REF -e GITHUB_REF_NAME -e GITHUB_REF_TYPE -e GITHUB_HEAD_REF -e GITHUB_BASE_REF -e GH_AW_SAFE_OUTPUTS_PORT -e GH_AW_SAFE_OUTPUTS_API_KEY -v /tmp/gh-aw/mcp-payloads:/tmp/gh-aw/mcp-payloads:rw -v /opt:/opt:ro -v /tmp:/tmp:rw -v 
'"${GITHUB_WORKSPACE}"':'"${GITHUB_WORKSPACE}"':rw ghcr.io/github/gh-aw-mcpg:v0.2.11' mkdir -p /home/runner/.copilot - cat << GH_AW_MCP_CONFIG_be8a945ee3e28a8b_EOF | bash ${RUNNER_TEMP}/gh-aw/actions/start_mcp_gateway.sh + cat << GH_AW_MCP_CONFIG_757354268663f6b1_EOF | bash ${RUNNER_TEMP}/gh-aw/actions/start_mcp_gateway.sh { "mcpServers": { "github": { @@ -888,7 +888,7 @@ jobs: "payloadDir": "${MCP_GATEWAY_PAYLOAD_DIR}" } } - GH_AW_MCP_CONFIG_be8a945ee3e28a8b_EOF + GH_AW_MCP_CONFIG_757354268663f6b1_EOF - name: Download activation artifact uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1 with: diff --git a/.github/workflows/autoloop.md b/.github/workflows/autoloop.md index f3e796cc..ddc456a7 100644 --- a/.github/workflows/autoloop.md +++ b/.github/workflows/autoloop.md @@ -431,11 +431,50 @@ steps: if selected in issue_programs: selected_issue = issue_programs[selected]["issue_number"] + # Look up existing PR for the selected program's canonical branch + existing_pr = None + head_branch = None + if selected: + head_branch = f"autoloop/{selected}" + owner = repo.split("/")[0] if "/" in repo else "" + if owner: + try: + pr_api_url = ( + f"https://api.github.com/repos/{repo}/pulls" + f"?state=open&head={owner}:{head_branch}&per_page=5" + ) + pr_req = urllib.request.Request(pr_api_url, headers={ + "Authorization": f"token {github_token}", + "Accept": "application/vnd.github.v3+json", + }) + with urllib.request.urlopen(pr_req, timeout=30) as pr_resp: + open_prs = json.loads(pr_resp.read().decode()) + if open_prs: + existing_pr = open_prs[0]["number"] + print(f" Found existing PR #{existing_pr} for branch {head_branch}") + else: + print(f" No existing PR found for branch {head_branch}") + except Exception as e: + print(f" Warning: could not check for existing PRs: {e}") + else: + print(f" Warning: could not parse owner from GITHUB_REPOSITORY='{repo}'") + + # Also check the state file for a recorded PR number as fallback + if existing_pr is 
None: + state = read_program_state(selected) + pr_field = state.get("pr") or "" + pr_match = re.match(r'^#?(\d+)$', pr_field.strip()) + if pr_match: + existing_pr = int(pr_match.group(1)) + print(f" Found PR #{existing_pr} from state file for {selected}") + result = { "selected": selected, "selected_file": selected_file, "selected_issue": selected_issue, "selected_target_metric": selected_target_metric, + "existing_pr": existing_pr, + "head_branch": head_branch, "issue_programs": {name: info["issue_number"] for name, info in issue_programs.items()}, "deferred": deferred, "skipped": skipped, @@ -449,6 +488,10 @@ steps: print("=== Autoloop Program Check ===") print(f"Selected program: {selected or '(none)'} ({selected_file or 'n/a'})") + if existing_pr: + print(f"Existing PR: #{existing_pr} (branch: {head_branch})") + else: + print(f"Existing PR: (none — will create on first accepted iteration)") print(f"Deferred (next run): {deferred or '(none)'}") print(f"Programs skipped: {[s['name'] for s in skipped] or '(none)'}") print(f"Programs unconfigured: {unconfigured or '(none)'}") @@ -538,6 +581,8 @@ The pre-step has already determined which program to run. Read `/tmp/gh-aw/autol - **`selected_file`**: The full path to the program's markdown file (either `.autoloop/programs//program.md`, `.autoloop/programs/.md`, or `/tmp/gh-aw/issue-programs/.md` for issue-based programs). - **`selected_issue`**: The GitHub issue number if the selected program came from an issue, or `null` if it came from a file. - **`selected_target_metric`**: The `target-metric` value from the program's frontmatter (a number), or `null` if the program is open-ended. Used to check the [halting condition](#halting-condition) after each accepted iteration. +- **`existing_pr`**: The PR number (e.g., `42`) of an already-open PR for this program's branch, or `null` if no open PR exists. 
**If this is not null, you MUST use `push-to-pull-request-branch` to push to this PR — do NOT call `create-pull-request`.** +- **`head_branch`**: The canonical branch name for this program (e.g., `autoloop/coverage`). Always use this exact branch name — never append suffixes. - **`issue_programs`**: A mapping of program name → issue number for all discovered issue-based programs. - **`deferred`**: Other programs that were due but will be handled in future runs. - **`unconfigured`**: Programs that still have the sentinel or placeholder content. @@ -550,6 +595,7 @@ If `selected` is not null: 3. Read the current state of all target files. 4. Read the state file `{selected}.md` from the repo-memory folder for all state: the ⚙️ Machine State table (scheduling fields) plus the research sections (priorities, lessons, foreclosed avenues, iteration history). 5. If `selected_issue` is not null, this is an issue-based program — also read the issue comments for any human steering input. +6. **Check `existing_pr`**: if it is not null, a PR already exists — use `push-to-pull-request-branch` to push commits to it. Only call `create-pull-request` when `existing_pr` is null. ## Multiple Programs @@ -694,7 +740,7 @@ Each run executes **one iteration for the single selected program**: If the state file does not yet exist, create it in the repo-memory folder using the template defined in the [Repo Memory](#repo-memory) section. -3. Note the `PR` field from the Machine State table. If it contains a PR number (e.g., `#42`), that is the **existing draft PR** for this program — you must update it, not create a new one. +3. Note the `existing_pr` field from `/tmp/gh-aw/autoloop.json`. If it is not null, that is the **existing draft PR** for this program — you must push to it using `push-to-pull-request-branch`, not create a new one. Also check the `PR` field from the Machine State table as a fallback. 
### Step 2: Analyze and Propose @@ -743,15 +789,15 @@ Each run executes **one iteration for the single selected program**: - Commit message body (after a blank line): `Run: {run_url}` referencing the GitHub Actions run URL. 2. Push the commit to the long-running branch `autoloop/{program-name}`. 3. **Find the existing PR or create one** — follow these steps in order: - a. Check the `PR` field in the state file's **⚙️ Machine State** table. If it contains a PR number (e.g., `#42`), that is the existing draft PR. - b. If the state file has no PR number, search GitHub for open PRs with head branch `autoloop/{program-name}`. Use the GitHub API: `GET /repos/{owner}/{repo}/pulls?state=open&head={owner}:autoloop/{program-name}`. - c. **If an existing PR is found** (from either step a or b): use `push-to-pull-request-branch` to push additional commits to the existing PR. Update the PR body with the latest metric and a summary of the most recent accepted iteration. Add a comment to the PR summarizing the iteration: what changed, old metric, new metric, improvement delta, and a link to the actions run. **Do NOT call `create-pull-request`.** - d. **If NO PR exists** for `autoloop/{program-name}`: create one using `create-pull-request`: + a. **First, check `existing_pr` from `/tmp/gh-aw/autoloop.json`.** The pre-step has already looked up the open PR for this program. If `existing_pr` is not null, that is the existing draft PR — skip to step (c). + b. If `existing_pr` is null, also check the `PR` field in the state file's **⚙️ Machine State** table as a fallback. If it contains a PR number (e.g., `#42`), verify it is still open via the GitHub API. + c. **If an existing PR is found** (from step a or b): use `push-to-pull-request-branch` to push additional commits to the existing PR. Update the PR body with the latest metric and a summary of the most recent accepted iteration. 
Add a comment to the PR summarizing the iteration: what changed, old metric, new metric, improvement delta, and a link to the actions run. **Do NOT call `create-pull-request`.** + d. **If NO PR exists** for `autoloop/{program-name}` (both `existing_pr` is null AND the state file has no PR): create one using `create-pull-request`: - Branch: `autoloop/{program-name}` (the branch you already created in Step 3 — do NOT let the framework auto-generate a branch name) - Title: `[Autoloop: {program-name}]` - Body includes: a summary of the program goal, link to the steering issue, the current best metric, and AI disclosure: `🤖 *This PR is maintained by Autoloop. Each accepted iteration adds a commit to this branch.*` - > ⚠️ **Never create a new PR if one already exists for `autoloop/{program-name}`.** Each program must have exactly one draft PR at any time. If you are unsure whether a PR exists, check the GitHub API before calling `create-pull-request`. + > ⚠️ **Never create a new PR if one already exists for `autoloop/{program-name}`.** Each program must have exactly one draft PR at any time. The pre-step provides `existing_pr` in autoloop.json — always check it first. Only call `create-pull-request` when `existing_pr` is null AND the state file has no PR number. 4. Ensure the steering issue exists (see [Steering Issue](#steering-issue) below). Add a comment to the steering issue linking to the commit and actions run. 5. Add an entry to the experiment log issue. 6. 
Update the state file `{program-name}.md` in the repo-memory folder: @@ -790,6 +836,13 @@ Maintain a single open issue **per program** titled `[Autoloop: {program-name}] ```markdown 🤖 *Autoloop — an iterative optimization agent for this repository.* +| | | +|---|---| +| **Branch** | [`autoloop/{program-name}`](https://github.com/{owner}/{repo}/tree/autoloop/{program-name}) | +| **Pull Request** | #{pr_number} | +| **Steering Issue** | #{steering_issue_number} | +| **State File** | [`{program-name}.md`](https://github.com/{owner}/{repo}/blob/memory/autoloop/{program-name}.md) | + ## Program **Goal**: {one-line summary from program.md} @@ -817,6 +870,7 @@ Maintain a single open issue **per program** titled `[Autoloop: {program-name}] - Iterations in **reverse chronological order** (newest first). - Each iteration heading links to its GitHub Actions run. - Use `${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}` for the current run URL. +- The **links table at the top** must always show the current branch, PR, steering issue, and state file. Update the PR number when a new PR is created. When creating a continuation issue for a new month, copy the links table from the previous issue. - Close the previous month's issue and create a new one at month boundaries. - Maximum 50 iterations per issue; create a continuation issue if exceeded. @@ -1148,9 +1202,10 @@ After each iteration, prepend an entry to the **📊 Iteration History** section > ❌ **Do NOT create a new branch with a suffix for each iteration.** > Correct: `autoloop/coverage` > Wrong: `autoloop/coverage-abc123`, `autoloop/coverage-iter42`, `autoloop/coverage-deadbeef1234` +> Use the `head_branch` field from `autoloop.json` — it is always the canonical name. > ❌ **Do NOT create a new PR if one already exists for `autoloop/{program-name}`.** -> Always check the state file's `PR` field and the GitHub API before calling `create-pull-request`. 
If a PR exists, use `push-to-pull-request-branch` instead. +> The pre-step provides `existing_pr` in `autoloop.json`. If it is not null, **always** use `push-to-pull-request-branch` — never call `create-pull-request`. Only create a PR when `existing_pr` is null AND the state file has no PR number. > ❌ **Do NOT let the gh-aw framework auto-generate a branch name when creating a PR.** > Always specify the branch explicitly as `autoloop/{program-name}` when calling `create-pull-request`. diff --git a/.github/workflows/evergreen.lock.yml b/.github/workflows/evergreen.lock.yml index 703bf2c5..fbdca9b7 100644 --- a/.github/workflows/evergreen.lock.yml +++ b/.github/workflows/evergreen.lock.yml @@ -28,7 +28,7 @@ # Imports: # - shared/reporting.md # -# gh-aw-metadata: {"schema_version":"v3","frontmatter_hash":"fdc470cbbb93445cdc31d03533b3983d30603af2cbc207a06bc506a93d186f95","compiler_version":"v0.65.6","strict":true,"agent_id":"copilot"} +# gh-aw-metadata: {"schema_version":"v3","frontmatter_hash":"6c52512ee1dd9f0c424a7b5af5207b2d89e239e673df6f5ad79911a4820b75ab","compiler_version":"v0.65.6","strict":true,"agent_id":"copilot"} name: "Evergreen — PR Health Keeper" "on": @@ -141,20 +141,20 @@ jobs: run: | bash ${RUNNER_TEMP}/gh-aw/actions/create_prompt_first.sh { - cat << 'GH_AW_PROMPT_c90257464e463e6f_EOF' + cat << 'GH_AW_PROMPT_1c58cbcd2bf82635_EOF' - GH_AW_PROMPT_c90257464e463e6f_EOF + GH_AW_PROMPT_1c58cbcd2bf82635_EOF cat "${RUNNER_TEMP}/gh-aw/prompts/xpia.md" cat "${RUNNER_TEMP}/gh-aw/prompts/temp_folder_prompt.md" cat "${RUNNER_TEMP}/gh-aw/prompts/markdown.md" cat "${RUNNER_TEMP}/gh-aw/prompts/repo_memory_prompt.md" cat "${RUNNER_TEMP}/gh-aw/prompts/safe_outputs_prompt.md" - cat << 'GH_AW_PROMPT_c90257464e463e6f_EOF' + cat << 'GH_AW_PROMPT_1c58cbcd2bf82635_EOF' Tools: add_comment(max:3), push_to_pull_request_branch(max:3), missing_tool, missing_data, noop - GH_AW_PROMPT_c90257464e463e6f_EOF + GH_AW_PROMPT_1c58cbcd2bf82635_EOF cat 
"${RUNNER_TEMP}/gh-aw/prompts/safe_outputs_push_to_pr_branch.md" - cat << 'GH_AW_PROMPT_c90257464e463e6f_EOF' + cat << 'GH_AW_PROMPT_1c58cbcd2bf82635_EOF' The following GitHub context information is available for this workflow: @@ -187,13 +187,13 @@ jobs: - **Note**: If a branch you need is not in the list above and is not listed as an additional fetched ref, it has NOT been checked out. For private repositories you cannot fetch it without proper authentication. If the branch is required and not available, exit with an error and ask the user to add it to the `fetch:` option of the `checkout:` configuration (e.g., `fetch: ["refs/pulls/open/*"]` for all open PR refs, or `fetch: ["main", "feature/my-branch"]` for specific branches). - GH_AW_PROMPT_c90257464e463e6f_EOF + GH_AW_PROMPT_1c58cbcd2bf82635_EOF cat "${RUNNER_TEMP}/gh-aw/prompts/github_mcp_tools_with_safeoutputs_prompt.md" - cat << 'GH_AW_PROMPT_c90257464e463e6f_EOF' + cat << 'GH_AW_PROMPT_1c58cbcd2bf82635_EOF' {{#runtime-import .github/workflows/shared/reporting.md}} {{#runtime-import .github/workflows/evergreen.md}} - GH_AW_PROMPT_c90257464e463e6f_EOF + GH_AW_PROMPT_1c58cbcd2bf82635_EOF } > "$GH_AW_PROMPT" - name: Interpolate variables and render templates uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8 @@ -339,7 +339,7 @@ jobs: GITHUB_REPOSITORY: ${{ github.repository }} GITHUB_TOKEN: ${{ github.token }} name: Find a PR that needs attention - run: "python3 - << 'PYEOF'\nimport os, json, re, sys\nimport urllib.request, urllib.error\n\ntoken = os.environ.get(\"GITHUB_TOKEN\", \"\")\nrepo = os.environ.get(\"GITHUB_REPOSITORY\", \"\")\nforced_pr = os.environ.get(\"FORCED_PR\", \"\").strip()\n\nrepo_memory_dir = \"/tmp/gh-aw/repo-memory/evergreen\"\noutput_file = \"/tmp/gh-aw/evergreen.json\"\nos.makedirs(\"/tmp/gh-aw\", exist_ok=True)\n\nMAX_ATTEMPTS = 5\n\ndef api_get(url):\n \"\"\"Make an authenticated GET request to the GitHub API.\"\"\"\n req = urllib.request.Request(url, 
headers={\n \"Authorization\": f\"token {token}\",\n \"Accept\": \"application/vnd.github.v3+json\",\n })\n with urllib.request.urlopen(req, timeout=30) as resp:\n return json.loads(resp.read().decode())\n\ndef get_all_open_prs():\n \"\"\"Fetch all open PRs, paginated.\"\"\"\n prs = []\n page = 1\n while True:\n url = f\"https://api.github.com/repos/{repo}/pulls?state=open&per_page=100&page={page}&sort=number&direction=asc\"\n batch = api_get(url)\n if not batch:\n break\n prs.extend(batch)\n if len(batch) < 100:\n break\n page += 1\n return prs\n\ndef get_check_status(pr):\n \"\"\"Get combined CI check status for a PR's head commit.\"\"\"\n head_sha = pr[\"head\"][\"sha\"]\n url = f\"https://api.github.com/repos/{repo}/commits/{head_sha}/status\"\n try:\n status = api_get(url)\n return status.get(\"state\", \"unknown\")\n except Exception as e:\n print(f\" Warning: could not fetch status for PR #{pr['number']}: {e}\")\n return \"unknown\"\n\ndef get_check_runs(pr):\n \"\"\"Get check runs for a PR's head commit.\"\"\"\n head_sha = pr[\"head\"][\"sha\"]\n url = f\"https://api.github.com/repos/{repo}/commits/{head_sha}/check-runs\"\n try:\n data = api_get(url)\n return data.get(\"check_runs\", [])\n except Exception as e:\n print(f\" Warning: could not fetch check runs for PR #{pr['number']}: {e}\")\n return []\n\ndef read_attempt_state(pr_number):\n \"\"\"Read attempt tracking state from repo-memory.\"\"\"\n state_file = os.path.join(repo_memory_dir, f\"pr-{pr_number}.md\")\n if not os.path.isfile(state_file):\n return {\"attempts\": 0, \"head_sha\": None}\n with open(state_file, encoding=\"utf-8\") as f:\n content = f.read()\n state = {\"attempts\": 0, \"head_sha\": None}\n m = re.search(r'\\|\\s*head_sha\\s*\\|\\s*(\\S+)\\s*\\|', content)\n if m:\n state[\"head_sha\"] = m.group(1)\n m = re.search(r'\\|\\s*attempts\\s*\\|\\s*(\\d+)\\s*\\|', content)\n if m:\n state[\"attempts\"] = int(m.group(1))\n return state\n\ndef pr_needs_attention(pr):\n \"\"\"Check if a PR 
has merge conflicts or failing CI. Returns a list of issues.\"\"\"\n issues = []\n\n # Check mergeable state\n # Need to fetch full PR details for mergeable info\n pr_url = f\"https://api.github.com/repos/{repo}/pulls/{pr['number']}\"\n try:\n full_pr = api_get(pr_url)\n mergeable = full_pr.get(\"mergeable\")\n mergeable_state = full_pr.get(\"mergeable_state\", \"unknown\")\n if mergeable is False:\n issues.append(\"merge_conflict\")\n elif mergeable_state == \"dirty\":\n issues.append(\"merge_conflict\")\n except Exception as e:\n print(f\" Warning: could not fetch mergeable state for PR #{pr['number']}: {e}\")\n\n # Check CI status via check runs\n check_runs = get_check_runs(pr)\n failed_checks = []\n for cr in check_runs:\n conclusion = cr.get(\"conclusion\")\n status = cr.get(\"status\")\n name = cr.get(\"name\", \"unknown\")\n if conclusion in (\"failure\", \"timed_out\", \"action_required\"):\n failed_checks.append(name)\n elif status == \"completed\" and conclusion not in (\"success\", \"neutral\", \"skipped\"):\n if conclusion is not None:\n failed_checks.append(name)\n if failed_checks:\n issues.append(f\"failing_checks: {', '.join(failed_checks)}\")\n\n # Also check commit status API (some checks use the older status API)\n combined_status = get_check_status(pr)\n if combined_status == \"failure\":\n if not failed_checks:\n issues.append(\"failing_status\")\n\n return issues\n\n# --- Main logic ---\n\nprint(\"=== Evergreen PR Health Check ===\")\nprint(f\"Repository: {repo}\")\n\nprs = get_all_open_prs()\nprint(f\"Found {len(prs)} open PR(s)\")\n\nif not prs:\n print(\"No open PRs. 
Exiting.\")\n with open(output_file, \"w\") as f:\n json.dump({\"selected\": None, \"reason\": \"no_open_prs\"}, f)\n sys.exit(1)\n\n# Evaluate each PR deterministically (sorted by PR number ascending)\ncandidates = []\nskipped = []\n\n# If a specific PR is forced, only check that one\nif forced_pr:\n prs = [pr for pr in prs if str(pr[\"number\"]) == forced_pr]\n if not prs:\n print(f\"ERROR: PR #{forced_pr} not found among open PRs.\")\n sys.exit(1)\n print(f\"FORCED: checking only PR #{forced_pr}\")\n\nfor pr in sorted(prs, key=lambda p: p[\"number\"]):\n pr_num = pr[\"number\"]\n head_sha = pr[\"head\"][\"sha\"]\n print(f\"\\nChecking PR #{pr_num}: {pr['title'][:60]}...\")\n print(f\" Head SHA: {head_sha[:12]}\")\n\n issues = pr_needs_attention(pr)\n if not issues:\n print(f\" Status: healthy (no issues)\")\n continue\n\n print(f\" Issues: {issues}\")\n\n # Check attempt tracking\n attempt_state = read_attempt_state(pr_num)\n if attempt_state[\"head_sha\"] == head_sha:\n attempts = attempt_state[\"attempts\"]\n print(f\" Attempts on this SHA: {attempts}/{MAX_ATTEMPTS}\")\n if attempts >= MAX_ATTEMPTS:\n skipped.append({\n \"pr\": pr_num,\n \"reason\": f\"max attempts ({MAX_ATTEMPTS}) reached on SHA {head_sha[:12]}\",\n })\n print(f\" SKIPPED: max attempts reached\")\n continue\n else:\n attempts = 0\n print(f\" New SHA detected — resetting attempt counter\")\n\n candidates.append({\n \"pr_number\": pr_num,\n \"title\": pr[\"title\"],\n \"head_sha\": head_sha,\n \"base_branch\": pr[\"base\"][\"ref\"],\n \"head_branch\": pr[\"head\"][\"ref\"],\n \"issues\": issues,\n \"attempts\": attempts,\n })\n\n# Select the first candidate (lowest PR number — deterministic)\nselected = candidates[0] if candidates else None\n\nresult = {\n \"selected\": selected,\n \"skipped\": skipped,\n \"total_open_prs\": len(prs),\n \"candidates_found\": len(candidates),\n}\n\nwith open(output_file, \"w\") as f:\n json.dump(result, f, indent=2)\n\nif selected:\n print(f\"\\n>>> Selected PR 
#{selected['pr_number']}: {selected['title']}\")\n print(f\" Issues: {selected['issues']}\")\n print(f\" Attempt: {selected['attempts'] + 1}/{MAX_ATTEMPTS}\")\nelse:\n print(\"\\nNo PRs need attention. Exiting.\")\n sys.exit(1)\nPYEOF\n" + run: "python3 - << 'PYEOF'\nimport os, json, re, subprocess, sys\nimport urllib.request, urllib.error\n\ntoken = os.environ.get(\"GITHUB_TOKEN\", \"\")\nrepo = os.environ.get(\"GITHUB_REPOSITORY\", \"\")\nforced_pr = os.environ.get(\"FORCED_PR\", \"\").strip()\n\nrepo_memory_dir = \"/tmp/gh-aw/repo-memory/evergreen\"\noutput_file = \"/tmp/gh-aw/evergreen.json\"\nos.makedirs(\"/tmp/gh-aw\", exist_ok=True)\n\nMAX_ATTEMPTS = 5\n\ndef api_get(url):\n \"\"\"Make an authenticated GET request to the GitHub API.\"\"\"\n req = urllib.request.Request(url, headers={\n \"Authorization\": f\"token {token}\",\n \"Accept\": \"application/vnd.github.v3+json\",\n })\n with urllib.request.urlopen(req, timeout=30) as resp:\n return json.loads(resp.read().decode())\n\ndef get_all_open_prs():\n \"\"\"Fetch all open PRs, paginated.\"\"\"\n prs = []\n page = 1\n while True:\n url = f\"https://api.github.com/repos/{repo}/pulls?state=open&per_page=100&page={page}&sort=number&direction=asc\"\n batch = api_get(url)\n if not batch:\n break\n prs.extend(batch)\n if len(batch) < 100:\n break\n page += 1\n return prs\n\ndef get_check_status(pr):\n \"\"\"Get combined CI check status for a PR's head commit.\"\"\"\n head_sha = pr[\"head\"][\"sha\"]\n url = f\"https://api.github.com/repos/{repo}/commits/{head_sha}/status\"\n try:\n status = api_get(url)\n return status.get(\"state\", \"unknown\")\n except Exception as e:\n print(f\" Warning: could not fetch status for PR #{pr['number']}: {e}\")\n return \"unknown\"\n\ndef get_check_runs(pr):\n \"\"\"Get check runs for a PR's head commit.\"\"\"\n head_sha = pr[\"head\"][\"sha\"]\n url = f\"https://api.github.com/repos/{repo}/commits/{head_sha}/check-runs\"\n try:\n data = api_get(url)\n return 
data.get(\"check_runs\", [])\n except Exception as e:\n print(f\" Warning: could not fetch check runs for PR #{pr['number']}: {e}\")\n return []\n\ndef read_attempt_state(pr_number):\n \"\"\"Read attempt tracking state from repo-memory.\"\"\"\n state_file = os.path.join(repo_memory_dir, f\"pr-{pr_number}.md\")\n if not os.path.isfile(state_file):\n return {\"attempts\": 0, \"head_sha\": None}\n with open(state_file, encoding=\"utf-8\") as f:\n content = f.read()\n state = {\"attempts\": 0, \"head_sha\": None}\n m = re.search(r'\\|\\s*head_sha\\s*\\|\\s*(\\S+)\\s*\\|', content)\n if m:\n state[\"head_sha\"] = m.group(1)\n m = re.search(r'\\|\\s*attempts\\s*\\|\\s*(\\d+)\\s*\\|', content)\n if m:\n state[\"attempts\"] = int(m.group(1))\n return state\n\ndef pr_needs_attention(pr):\n \"\"\"Check if a PR has merge conflicts or failing CI. Returns a list of issues.\"\"\"\n issues = []\n\n # Check mergeable state\n # Need to fetch full PR details for mergeable info\n pr_url = f\"https://api.github.com/repos/{repo}/pulls/{pr['number']}\"\n try:\n full_pr = api_get(pr_url)\n mergeable = full_pr.get(\"mergeable\")\n mergeable_state = full_pr.get(\"mergeable_state\", \"unknown\")\n if mergeable is False:\n issues.append(\"merge_conflict\")\n elif mergeable_state == \"dirty\":\n issues.append(\"merge_conflict\")\n except Exception as e:\n print(f\" Warning: could not fetch mergeable state for PR #{pr['number']}: {e}\")\n\n # Check CI status via check runs\n check_runs = get_check_runs(pr)\n failed_checks = []\n for cr in check_runs:\n conclusion = cr.get(\"conclusion\")\n status = cr.get(\"status\")\n name = cr.get(\"name\", \"unknown\")\n if conclusion in (\"failure\", \"timed_out\", \"action_required\"):\n failed_checks.append(name)\n elif status == \"completed\" and conclusion not in (\"success\", \"neutral\", \"skipped\"):\n if conclusion is not None:\n failed_checks.append(name)\n if failed_checks:\n issues.append(f\"failing_checks: {', '.join(failed_checks)}\")\n\n # 
Also check commit status API (some checks use the older status API)\n combined_status = get_check_status(pr)\n if combined_status == \"failure\":\n if not failed_checks:\n issues.append(\"failing_status\")\n\n return issues\n\n# --- Main logic ---\n\nprint(\"=== Evergreen PR Health Check ===\")\nprint(f\"Repository: {repo}\")\n\nprs = get_all_open_prs()\nprint(f\"Found {len(prs)} open PR(s)\")\n\nif not prs:\n print(\"No open PRs. Nothing to do.\")\n with open(output_file, \"w\") as f:\n json.dump({\"selected\": None, \"reason\": \"no_open_prs\"}, f)\n sys.exit(0)\n\n# Evaluate each PR deterministically (sorted by PR number ascending)\ncandidates = []\nskipped = []\n\n# If a specific PR is forced, only check that one\nif forced_pr:\n prs = [pr for pr in prs if str(pr[\"number\"]) == forced_pr]\n if not prs:\n print(f\"ERROR: PR #{forced_pr} not found among open PRs.\")\n sys.exit(1)\n print(f\"FORCED: checking only PR #{forced_pr}\")\n\nfor pr in sorted(prs, key=lambda p: p[\"number\"]):\n pr_num = pr[\"number\"]\n head_sha = pr[\"head\"][\"sha\"]\n print(f\"\\nChecking PR #{pr_num}: {pr['title'][:60]}...\")\n print(f\" Head SHA: {head_sha[:12]}\")\n\n issues = pr_needs_attention(pr)\n if not issues:\n print(f\" Status: healthy (no issues)\")\n continue\n\n print(f\" Issues: {issues}\")\n\n # Check attempt tracking\n attempt_state = read_attempt_state(pr_num)\n if attempt_state[\"head_sha\"] == head_sha:\n attempts = attempt_state[\"attempts\"]\n print(f\" Attempts on this SHA: {attempts}/{MAX_ATTEMPTS}\")\n if attempts >= MAX_ATTEMPTS:\n skipped.append({\n \"pr\": pr_num,\n \"reason\": f\"max attempts ({MAX_ATTEMPTS}) reached on SHA {head_sha[:12]}\",\n })\n print(f\" SKIPPED: max attempts reached\")\n continue\n else:\n attempts = 0\n print(f\" New SHA detected — resetting attempt counter\")\n\n candidates.append({\n \"pr_number\": pr_num,\n \"title\": pr[\"title\"],\n \"head_sha\": head_sha,\n \"base_branch\": pr[\"base\"][\"ref\"],\n \"head_branch\": 
pr[\"head\"][\"ref\"],\n \"issues\": issues,\n \"attempts\": attempts,\n })\n\n# Select the first candidate (lowest PR number — deterministic)\nselected = candidates[0] if candidates else None\n\nresult = {\n \"selected\": selected,\n \"skipped\": skipped,\n \"total_open_prs\": len(prs),\n \"candidates_found\": len(candidates),\n}\n\nwith open(output_file, \"w\") as f:\n json.dump(result, f, indent=2)\n\nif selected:\n branch = selected[\"head_branch\"]\n print(f\"Checking out PR branch before agent run: {branch}\")\n subprocess.check_call([\"git\", \"checkout\", \"-B\", branch, f\"origin/{branch}\"])\n subprocess.check_call([\"git\", \"branch\", \"--set-upstream-to\", f\"origin/{branch}\", branch])\n print(f\"\\n>>> Selected PR #{selected['pr_number']}: {selected['title']}\")\n print(f\" Issues: {selected['issues']}\")\n print(f\" Attempt: {selected['attempts'] + 1}/{MAX_ATTEMPTS}\")\nelse:\n print(\"\\nNo PRs need attention. Nothing to do.\")\n sys.exit(0)\nPYEOF\n" # Repo memory git-based storage configuration from frontmatter processed below - name: Clone repo-memory branch (default) @@ -398,12 +398,12 @@ jobs: mkdir -p ${RUNNER_TEMP}/gh-aw/safeoutputs mkdir -p /tmp/gh-aw/safeoutputs mkdir -p /tmp/gh-aw/mcp-logs/safeoutputs - cat > ${RUNNER_TEMP}/gh-aw/safeoutputs/config.json << 'GH_AW_SAFE_OUTPUTS_CONFIG_ec5103758147a5b8_EOF' + cat > ${RUNNER_TEMP}/gh-aw/safeoutputs/config.json << 'GH_AW_SAFE_OUTPUTS_CONFIG_5d352d3a7dc8ac3d_EOF' 
{"add_comment":{"max":3,"target":"*"},"missing_data":{},"missing_tool":{},"noop":{"max":1,"report-as-issue":"true"},"push_repo_memory":{"memories":[{"dir":"/tmp/gh-aw/repo-memory/default","id":"default","max_file_count":100,"max_file_size":10240,"max_patch_size":10240}]},"push_to_pull_request_branch":{"if_no_changes":"warn","max":3,"max_patch_size":1024,"protected_files":["package.json","bun.lockb","bunfig.toml","deno.json","deno.jsonc","deno.lock","global.json","NuGet.Config","Directory.Packages.props","mix.exs","mix.lock","go.mod","go.sum","stack.yaml","stack.yaml.lock","pom.xml","build.gradle","build.gradle.kts","settings.gradle","settings.gradle.kts","gradle.properties","package-lock.json","yarn.lock","pnpm-lock.yaml","npm-shrinkwrap.json","requirements.txt","Pipfile","Pipfile.lock","pyproject.toml","setup.py","setup.cfg","Gemfile","Gemfile.lock","uv.lock","CODEOWNERS"],"protected_files_policy":"allowed","protected_path_prefixes":[".github/",".agents/"],"target":"*"}} - GH_AW_SAFE_OUTPUTS_CONFIG_ec5103758147a5b8_EOF + GH_AW_SAFE_OUTPUTS_CONFIG_5d352d3a7dc8ac3d_EOF - name: Write Safe Outputs Tools run: | - cat > ${RUNNER_TEMP}/gh-aw/safeoutputs/tools_meta.json << 'GH_AW_SAFE_OUTPUTS_TOOLS_META_0b11521b2b188ecd_EOF' + cat > ${RUNNER_TEMP}/gh-aw/safeoutputs/tools_meta.json << 'GH_AW_SAFE_OUTPUTS_TOOLS_META_e94b9b0d12aa4571_EOF' { "description_suffixes": { "add_comment": " CONSTRAINTS: Maximum 3 comment(s) can be added. 
Target: *.", @@ -412,8 +412,8 @@ jobs: "repo_params": {}, "dynamic_tools": [] } - GH_AW_SAFE_OUTPUTS_TOOLS_META_0b11521b2b188ecd_EOF - cat > ${RUNNER_TEMP}/gh-aw/safeoutputs/validation.json << 'GH_AW_SAFE_OUTPUTS_VALIDATION_4f2d89a889ce19de_EOF' + GH_AW_SAFE_OUTPUTS_TOOLS_META_e94b9b0d12aa4571_EOF + cat > ${RUNNER_TEMP}/gh-aw/safeoutputs/validation.json << 'GH_AW_SAFE_OUTPUTS_VALIDATION_7cc97c0128fe54d3_EOF' { "add_comment": { "defaultMax": 1, @@ -511,7 +511,7 @@ jobs: } } } - GH_AW_SAFE_OUTPUTS_VALIDATION_4f2d89a889ce19de_EOF + GH_AW_SAFE_OUTPUTS_VALIDATION_7cc97c0128fe54d3_EOF node ${RUNNER_TEMP}/gh-aw/actions/generate_safe_outputs_tools.cjs - name: Generate Safe Outputs MCP Server Config id: safe-outputs-config @@ -581,7 +581,7 @@ jobs: export MCP_GATEWAY_DOCKER_COMMAND='docker run -i --rm --network host -v /var/run/docker.sock:/var/run/docker.sock -e MCP_GATEWAY_PORT -e MCP_GATEWAY_DOMAIN -e MCP_GATEWAY_API_KEY -e MCP_GATEWAY_PAYLOAD_DIR -e MCP_GATEWAY_PAYLOAD_SIZE_THRESHOLD -e DEBUG -e MCP_GATEWAY_LOG_DIR -e GH_AW_MCP_LOG_DIR -e GH_AW_SAFE_OUTPUTS -e GH_AW_SAFE_OUTPUTS_CONFIG_PATH -e GH_AW_SAFE_OUTPUTS_TOOLS_PATH -e GH_AW_ASSETS_BRANCH -e GH_AW_ASSETS_MAX_SIZE_KB -e GH_AW_ASSETS_ALLOWED_EXTS -e DEFAULT_BRANCH -e GITHUB_MCP_SERVER_TOKEN -e GITHUB_MCP_GUARD_MIN_INTEGRITY -e GITHUB_MCP_GUARD_REPOS -e GITHUB_REPOSITORY -e GITHUB_SERVER_URL -e GITHUB_SHA -e GITHUB_WORKSPACE -e GITHUB_TOKEN -e GITHUB_RUN_ID -e GITHUB_RUN_NUMBER -e GITHUB_RUN_ATTEMPT -e GITHUB_JOB -e GITHUB_ACTION -e GITHUB_EVENT_NAME -e GITHUB_EVENT_PATH -e GITHUB_ACTOR -e GITHUB_ACTOR_ID -e GITHUB_TRIGGERING_ACTOR -e GITHUB_WORKFLOW -e GITHUB_WORKFLOW_REF -e GITHUB_WORKFLOW_SHA -e GITHUB_REF -e GITHUB_REF_NAME -e GITHUB_REF_TYPE -e GITHUB_HEAD_REF -e GITHUB_BASE_REF -e GH_AW_SAFE_OUTPUTS_PORT -e GH_AW_SAFE_OUTPUTS_API_KEY -v /tmp/gh-aw/mcp-payloads:/tmp/gh-aw/mcp-payloads:rw -v /opt:/opt:ro -v /tmp:/tmp:rw -v '"${GITHUB_WORKSPACE}"':'"${GITHUB_WORKSPACE}"':rw ghcr.io/github/gh-aw-mcpg:v0.2.11' 
mkdir -p /home/runner/.copilot - cat << GH_AW_MCP_CONFIG_e92a5aad7336713f_EOF | bash ${RUNNER_TEMP}/gh-aw/actions/start_mcp_gateway.sh + cat << GH_AW_MCP_CONFIG_df1a40d4ce900f98_EOF | bash ${RUNNER_TEMP}/gh-aw/actions/start_mcp_gateway.sh { "mcpServers": { "github": { @@ -622,7 +622,7 @@ jobs: "payloadDir": "${MCP_GATEWAY_PAYLOAD_DIR}" } } - GH_AW_MCP_CONFIG_e92a5aad7336713f_EOF + GH_AW_MCP_CONFIG_df1a40d4ce900f98_EOF - name: Download activation artifact uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1 with: diff --git a/.github/workflows/evergreen.md b/.github/workflows/evergreen.md index 481eae1c..147b912b 100644 --- a/.github/workflows/evergreen.md +++ b/.github/workflows/evergreen.md @@ -54,7 +54,7 @@ steps: FORCED_PR: ${{ github.event.inputs.pr_number }} run: | python3 - << 'PYEOF' - import os, json, re, sys + import os, json, re, subprocess, sys import urllib.request, urllib.error token = os.environ.get("GITHUB_TOKEN", "") @@ -179,10 +179,10 @@ steps: print(f"Found {len(prs)} open PR(s)") if not prs: - print("No open PRs. Exiting.") + print("No open PRs. Nothing to do.") with open(output_file, "w") as f: json.dump({"selected": None, "reason": "no_open_prs"}, f) - sys.exit(1) + sys.exit(0) # Evaluate each PR deterministically (sorted by PR number ascending) candidates = [] @@ -249,12 +249,16 @@ steps: json.dump(result, f, indent=2) if selected: + branch = selected["head_branch"] + print(f"Checking out PR branch before agent run: {branch}") + subprocess.check_call(["git", "checkout", "-B", branch, f"origin/{branch}"]) + subprocess.check_call(["git", "branch", "--set-upstream-to", f"origin/{branch}", branch]) print(f"\n>>> Selected PR #{selected['pr_number']}: {selected['title']}") print(f" Issues: {selected['issues']}") print(f" Attempt: {selected['attempts'] + 1}/{MAX_ATTEMPTS}") else: - print("\nNo PRs need attention. Exiting.") - sys.exit(1) + print("\nNo PRs need attention. 
Nothing to do.") + sys.exit(0) PYEOF features: @@ -279,11 +283,9 @@ A pre-flight step has already identified a PR that needs attention. Read the sel - `selected.base_branch` — the target branch (usually `main`) - `selected.attempts` — how many times we've already tried on this SHA -2. **Check out the PR branch** as a local tracking branch so the push tool can find it: - ```bash - git checkout -b origin/ - ``` - where `` is `selected.head_branch` from the selection file. **Do not** use a detached HEAD checkout — the `push-to-pull-request-branch` tool requires a named local branch. + > If `selected` is `null`, no PRs need attention right now. Call the **noop** tool with a message like "All PRs are healthy — nothing to fix." and stop. + +2. The pre-flight step already checks out `selected.head_branch` as a named local tracking branch before you start. Keep working on that branch (do not switch back to `main` or use detached HEAD). 3. **Fix the issues**: diff --git a/.github/workflows/pages.yml b/.github/workflows/pages.yml index a7ede9cd..127a90d6 100644 --- a/.github/workflows/pages.yml +++ b/.github/workflows/pages.yml @@ -36,6 +36,13 @@ jobs: - name: Bundle TypeScript compiler for offline playground run: cp node_modules/typescript/lib/typescript.js ./playground/dist/typescript.js + - name: Copy benchmark results to playground + run: | + mkdir -p ./playground/benchmarks + if [ -f benchmarks/results.json ]; then + cp benchmarks/results.json ./playground/benchmarks/results.json + fi + - name: Setup Python uses: actions/setup-python@v5 with: diff --git a/.github/workflows/sync-branches.lock.yml b/.github/workflows/sync-branches.lock.yml index 0183de8c..78f6887e 100644 --- a/.github/workflows/sync-branches.lock.yml +++ b/.github/workflows/sync-branches.lock.yml @@ -24,7 +24,7 @@ # Runs whenever the default branch changes and merges it into all active # autoloop/* branches so that program iterations always build on the latest code. 
# -# gh-aw-metadata: {"schema_version":"v3","frontmatter_hash":"719939a8013705db572524495d231d61b5652aa8fa86506426ccbe84aade70e1","compiler_version":"v0.65.6","strict":true,"agent_id":"copilot"} +# gh-aw-metadata: {"schema_version":"v3","frontmatter_hash":"42baaebd1818fa54f67dfaadbc42d425fcd44388126d27496222c26a7fcdd745","compiler_version":"v0.65.6","strict":true,"agent_id":"copilot"} name: "Sync Branches" "on": @@ -133,13 +133,13 @@ jobs: run: | bash ${RUNNER_TEMP}/gh-aw/actions/create_prompt_first.sh { - cat << 'GH_AW_PROMPT_c6c6ca09724d546c_EOF' + cat << 'GH_AW_PROMPT_6ce21e657f0d715b_EOF' - GH_AW_PROMPT_c6c6ca09724d546c_EOF + GH_AW_PROMPT_6ce21e657f0d715b_EOF cat "${RUNNER_TEMP}/gh-aw/prompts/xpia.md" cat "${RUNNER_TEMP}/gh-aw/prompts/temp_folder_prompt.md" cat "${RUNNER_TEMP}/gh-aw/prompts/markdown.md" - cat << 'GH_AW_PROMPT_c6c6ca09724d546c_EOF' + cat << 'GH_AW_PROMPT_6ce21e657f0d715b_EOF' The following GitHub context information is available for this workflow: {{#if __GH_AW_GITHUB_ACTOR__ }} @@ -168,12 +168,12 @@ jobs: {{/if}} - GH_AW_PROMPT_c6c6ca09724d546c_EOF + GH_AW_PROMPT_6ce21e657f0d715b_EOF cat "${RUNNER_TEMP}/gh-aw/prompts/github_mcp_tools_prompt.md" - cat << 'GH_AW_PROMPT_c6c6ca09724d546c_EOF' + cat << 'GH_AW_PROMPT_6ce21e657f0d715b_EOF' {{#runtime-import .github/workflows/sync-branches.md}} - GH_AW_PROMPT_c6c6ca09724d546c_EOF + GH_AW_PROMPT_6ce21e657f0d715b_EOF } > "$GH_AW_PROMPT" - name: Interpolate variables and render templates uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8 @@ -285,7 +285,7 @@ jobs: DEFAULT_BRANCH: ${{ github.event.repository.default_branch }} GITHUB_REPOSITORY: ${{ github.repository }} name: Merge default branch into all autoloop program branches - run: "python3 - << 'PYEOF'\nimport os, subprocess, sys\n\ntoken = os.environ.get(\"GITHUB_TOKEN\", \"\")\nrepo = os.environ.get(\"GITHUB_REPOSITORY\", \"\")\ndefault_branch = os.environ.get(\"DEFAULT_BRANCH\", \"main\")\n\n# List all remote branches 
matching the autoloop/* pattern\nresult = subprocess.run(\n [\"git\", \"branch\", \"-r\", \"--list\", \"origin/autoloop/*\"],\n capture_output=True, text=True\n)\nif result.returncode != 0:\n print(f\"Failed to list remote branches: {result.stderr}\")\n sys.exit(0)\n\nbranches = [b.strip().replace(\"origin/\", \"\") for b in result.stdout.strip().split(\"\\n\") if b.strip()]\n\nif not branches:\n print(\"No autoloop/* branches found. Nothing to sync.\")\n sys.exit(0)\n\nprint(f\"Found {len(branches)} autoloop branch(es) to sync: {branches}\")\n\nfailed = []\nfor branch in branches:\n print(f\"\\n--- Syncing {branch} with {default_branch} ---\")\n\n # Fetch both branches\n subprocess.run([\"git\", \"fetch\", \"origin\", branch], capture_output=True)\n subprocess.run([\"git\", \"fetch\", \"origin\", default_branch], capture_output=True)\n\n # Check out the program branch\n checkout = subprocess.run(\n [\"git\", \"checkout\", branch],\n capture_output=True, text=True\n )\n if checkout.returncode != 0:\n # Try creating a local tracking branch\n checkout = subprocess.run(\n [\"git\", \"checkout\", \"-b\", branch, f\"origin/{branch}\"],\n capture_output=True, text=True\n )\n if checkout.returncode != 0:\n print(f\" Failed to checkout {branch}: {checkout.stderr}\")\n failed.append(branch)\n continue\n\n # Merge the default branch into the program branch\n merge = subprocess.run(\n [\"git\", \"merge\", f\"origin/{default_branch}\", \"--no-edit\",\n \"-m\", f\"Merge {default_branch} into {branch}\"],\n capture_output=True, text=True\n )\n if merge.returncode != 0:\n print(f\" Merge conflict or failure for {branch}: {merge.stderr}\")\n # Abort the merge to leave a clean state\n subprocess.run([\"git\", \"merge\", \"--abort\"], capture_output=True)\n failed.append(branch)\n continue\n\n # Push the updated branch\n push = subprocess.run(\n [\"git\", \"push\", \"origin\", branch],\n capture_output=True, text=True\n )\n if push.returncode != 0:\n print(f\" Failed to push 
{branch}: {push.stderr}\")\n failed.append(branch)\n continue\n\n print(f\" Successfully synced {branch}\")\n\n# Return to default branch\nsubprocess.run([\"git\", \"checkout\", default_branch], capture_output=True)\n\nif failed:\n print(f\"\\n⚠️ Failed to sync {len(failed)} branch(es): {failed}\")\n print(\"These branches may need manual conflict resolution.\")\n # Don't fail the workflow — log the issue but continue\nelse:\n print(f\"\\n✅ All {len(branches)} branch(es) synced successfully.\")\nPYEOF\n" + run: "python3 - << 'PYEOF'\nimport os, re, subprocess, sys\n\ntoken = os.environ.get(\"GITHUB_TOKEN\", \"\")\nrepo = os.environ.get(\"GITHUB_REPOSITORY\", \"\")\ndefault_branch = os.environ.get(\"DEFAULT_BRANCH\", \"main\")\n\n# List all remote branches matching the autoloop/* pattern\nresult = subprocess.run(\n [\"git\", \"branch\", \"-r\", \"--list\", \"origin/autoloop/*\"],\n capture_output=True, text=True\n)\nif result.returncode != 0:\n print(f\"Failed to list remote branches: {result.stderr}\")\n sys.exit(0)\n\nall_branches = [b.strip().replace(\"origin/\", \"\") for b in result.stdout.strip().split(\"\\n\") if b.strip()]\n\n# Filter to canonical branches only: autoloop/{name} without hash suffixes.\n# Stale branches created by the framework (e.g. autoloop/name-a1b2c3d4e5f6g7h8)\n# are skipped — they are not the long-running program branches.\n_hash_suffix = re.compile(r'-[0-9a-f]{16}$')\nbranches = [b for b in all_branches if not _hash_suffix.search(b)]\nskipped_branches = [b for b in all_branches if _hash_suffix.search(b)]\n\nif skipped_branches:\n print(f\"Skipping {len(skipped_branches)} stale branch(es) with hash suffixes: {skipped_branches}\")\n\nif not branches:\n print(\"No canonical autoloop/* branches found. 
Nothing to sync.\")\n sys.exit(0)\n\nprint(f\"Found {len(branches)} canonical autoloop branch(es) to sync: {branches}\")\n\nfailed = []\nfor branch in branches:\n print(f\"\\n--- Syncing {branch} with {default_branch} ---\")\n\n # Fetch both branches\n subprocess.run([\"git\", \"fetch\", \"origin\", branch], capture_output=True)\n subprocess.run([\"git\", \"fetch\", \"origin\", default_branch], capture_output=True)\n\n # Check out the program branch\n checkout = subprocess.run(\n [\"git\", \"checkout\", branch],\n capture_output=True, text=True\n )\n if checkout.returncode != 0:\n # Try creating a local tracking branch\n checkout = subprocess.run(\n [\"git\", \"checkout\", \"-b\", branch, f\"origin/{branch}\"],\n capture_output=True, text=True\n )\n if checkout.returncode != 0:\n print(f\" Failed to checkout {branch}: {checkout.stderr}\")\n failed.append(branch)\n continue\n\n # Merge the default branch into the program branch\n merge = subprocess.run(\n [\"git\", \"merge\", f\"origin/{default_branch}\", \"--no-edit\",\n \"-m\", f\"Merge {default_branch} into {branch}\"],\n capture_output=True, text=True\n )\n if merge.returncode != 0:\n print(f\" Merge conflict or failure for {branch}: {merge.stderr}\")\n # Abort the merge to leave a clean state\n subprocess.run([\"git\", \"merge\", \"--abort\"], capture_output=True)\n failed.append(branch)\n continue\n\n # Push the updated branch\n push = subprocess.run(\n [\"git\", \"push\", \"origin\", branch],\n capture_output=True, text=True\n )\n if push.returncode != 0:\n print(f\" Failed to push {branch}: {push.stderr}\")\n failed.append(branch)\n continue\n\n print(f\" Successfully synced {branch}\")\n\n# Return to default branch\nsubprocess.run([\"git\", \"checkout\", default_branch], capture_output=True)\n\nif failed:\n print(f\"\\n⚠️ Failed to sync {len(failed)} branch(es): {failed}\")\n print(\"These branches may need manual conflict resolution.\")\n # Don't fail the workflow — log the issue but continue\nelse:\n 
print(f\"\\n✅ All {len(branches)} branch(es) synced successfully.\")\nPYEOF\n" - name: Configure Git credentials env: @@ -354,7 +354,7 @@ jobs: export MCP_GATEWAY_DOCKER_COMMAND='docker run -i --rm --network host -v /var/run/docker.sock:/var/run/docker.sock -e MCP_GATEWAY_PORT -e MCP_GATEWAY_DOMAIN -e MCP_GATEWAY_API_KEY -e MCP_GATEWAY_PAYLOAD_DIR -e MCP_GATEWAY_PAYLOAD_SIZE_THRESHOLD -e DEBUG -e MCP_GATEWAY_LOG_DIR -e GH_AW_MCP_LOG_DIR -e GH_AW_SAFE_OUTPUTS -e GH_AW_SAFE_OUTPUTS_CONFIG_PATH -e GH_AW_SAFE_OUTPUTS_TOOLS_PATH -e GH_AW_ASSETS_BRANCH -e GH_AW_ASSETS_MAX_SIZE_KB -e GH_AW_ASSETS_ALLOWED_EXTS -e DEFAULT_BRANCH -e GITHUB_MCP_SERVER_TOKEN -e GITHUB_MCP_GUARD_MIN_INTEGRITY -e GITHUB_MCP_GUARD_REPOS -e GITHUB_REPOSITORY -e GITHUB_SERVER_URL -e GITHUB_SHA -e GITHUB_WORKSPACE -e GITHUB_TOKEN -e GITHUB_RUN_ID -e GITHUB_RUN_NUMBER -e GITHUB_RUN_ATTEMPT -e GITHUB_JOB -e GITHUB_ACTION -e GITHUB_EVENT_NAME -e GITHUB_EVENT_PATH -e GITHUB_ACTOR -e GITHUB_ACTOR_ID -e GITHUB_TRIGGERING_ACTOR -e GITHUB_WORKFLOW -e GITHUB_WORKFLOW_REF -e GITHUB_WORKFLOW_SHA -e GITHUB_REF -e GITHUB_REF_NAME -e GITHUB_REF_TYPE -e GITHUB_HEAD_REF -e GITHUB_BASE_REF -v /tmp/gh-aw/mcp-payloads:/tmp/gh-aw/mcp-payloads:rw -v /opt:/opt:ro -v /tmp:/tmp:rw -v '"${GITHUB_WORKSPACE}"':'"${GITHUB_WORKSPACE}"':rw ghcr.io/github/gh-aw-mcpg:v0.2.11' mkdir -p /home/runner/.copilot - cat << GH_AW_MCP_CONFIG_f2267ff9994f362a_EOF | bash ${RUNNER_TEMP}/gh-aw/actions/start_mcp_gateway.sh + cat << GH_AW_MCP_CONFIG_6e54b48a11cd24bb_EOF | bash ${RUNNER_TEMP}/gh-aw/actions/start_mcp_gateway.sh { "mcpServers": { "github": { @@ -381,7 +381,7 @@ jobs: "payloadDir": "${MCP_GATEWAY_PAYLOAD_DIR}" } } - GH_AW_MCP_CONFIG_f2267ff9994f362a_EOF + GH_AW_MCP_CONFIG_6e54b48a11cd24bb_EOF - name: Download activation artifact uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1 with: diff --git a/.github/workflows/sync-branches.md b/.github/workflows/sync-branches.md index d6775100..772e2438 100644 --- 
a/.github/workflows/sync-branches.md +++ b/.github/workflows/sync-branches.md @@ -25,7 +25,7 @@ steps: DEFAULT_BRANCH: ${{ github.event.repository.default_branch }} run: | python3 - << 'PYEOF' - import os, subprocess, sys + import os, re, subprocess, sys token = os.environ.get("GITHUB_TOKEN", "") repo = os.environ.get("GITHUB_REPOSITORY", "") @@ -40,13 +40,23 @@ steps: print(f"Failed to list remote branches: {result.stderr}") sys.exit(0) - branches = [b.strip().replace("origin/", "") for b in result.stdout.strip().split("\n") if b.strip()] + all_branches = [b.strip().replace("origin/", "") for b in result.stdout.strip().split("\n") if b.strip()] + + # Filter to canonical branches only: autoloop/{name} without hash suffixes. + # Stale branches created by the framework (e.g. autoloop/name-a1b2c3d4e5f6g7h8) + # are skipped — they are not the long-running program branches. + _hash_suffix = re.compile(r'-[0-9a-f]{16}$') + branches = [b for b in all_branches if not _hash_suffix.search(b)] + skipped_branches = [b for b in all_branches if _hash_suffix.search(b)] + + if skipped_branches: + print(f"Skipping {len(skipped_branches)} stale branch(es) with hash suffixes: {skipped_branches}") if not branches: - print("No autoloop/* branches found. Nothing to sync.") + print("No canonical autoloop/* branches found. 
Nothing to sync.") sys.exit(0) - print(f"Found {len(branches)} autoloop branch(es) to sync: {branches}") + print(f"Found {len(branches)} canonical autoloop branch(es) to sync: {branches}") failed = [] for branch in branches: diff --git a/benchmarks/pandas/bench_concat.py b/benchmarks/pandas/bench_concat.py new file mode 100644 index 00000000..3533109e --- /dev/null +++ b/benchmarks/pandas/bench_concat.py @@ -0,0 +1,28 @@ +"""Benchmark: concat — concatenate two 50k-row DataFrames""" +import json, time +import numpy as np +import pandas as pd + +ROWS = 50_000 +WARMUP = 5 +ITERATIONS = 20 + +vals1 = np.arange(ROWS, dtype=np.float64) +vals2 = np.arange(ROWS, dtype=np.float64) * 2.0 +df1 = pd.DataFrame({"value": vals1}) +df2 = pd.DataFrame({"value": vals2}) + +for _ in range(WARMUP): + pd.concat([df1, df2], ignore_index=True) + +start = time.perf_counter() +for _ in range(ITERATIONS): + pd.concat([df1, df2], ignore_index=True) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "concat", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_dataframe_apply.py b/benchmarks/pandas/bench_dataframe_apply.py new file mode 100644 index 00000000..6788d422 --- /dev/null +++ b/benchmarks/pandas/bench_dataframe_apply.py @@ -0,0 +1,27 @@ +"""Benchmark: dataframe_apply — apply a function across rows of a 10k-row DataFrame""" +import json, time +import numpy as np +import pandas as pd + +ROWS = 10_000 +WARMUP = 3 +ITERATIONS = 10 + +a = np.arange(ROWS, dtype=np.float64) +b = np.arange(ROWS, dtype=np.float64) * 2.0 +df = pd.DataFrame({"a": a, "b": b}) + +for _ in range(WARMUP): + df.apply(lambda row: row["a"] + row["b"], axis=1) + +start = time.perf_counter() +for _ in range(ITERATIONS): + df.apply(lambda row: row["a"] + row["b"], axis=1) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "dataframe_apply", + "mean_ms": total / ITERATIONS, + "iterations": 
ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_dataframe_creation.py b/benchmarks/pandas/bench_dataframe_creation.py new file mode 100644 index 00000000..706c8b13 --- /dev/null +++ b/benchmarks/pandas/bench_dataframe_creation.py @@ -0,0 +1,27 @@ +"""Benchmark: DataFrame creation from arrays (pandas equivalent)""" +import json, time +import numpy as np +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 + +nums1 = np.arange(ROWS, dtype=np.float64) * 1.1 +nums2 = np.arange(ROWS, dtype=np.float64) * 2.2 +strs = [f"label_{i % 100}" for i in range(ROWS)] + +for _ in range(WARMUP): + pd.DataFrame({"a": nums1, "b": nums2, "c": strs}) + +start = time.perf_counter() +for _ in range(ITERATIONS): + pd.DataFrame({"a": nums1, "b": nums2, "c": strs}) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "dataframe_creation", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_dataframe_dropna.py b/benchmarks/pandas/bench_dataframe_dropna.py new file mode 100644 index 00000000..08a11895 --- /dev/null +++ b/benchmarks/pandas/bench_dataframe_dropna.py @@ -0,0 +1,27 @@ +"""Benchmark: dataframe_dropna — drop rows with NaN values from 100k-row DataFrame""" +import json, time +import numpy as np +import pandas as pd + +ROWS = 100_000 +WARMUP = 5 +ITERATIONS = 20 + +a = np.where(np.arange(ROWS) % 10 == 0, np.nan, np.arange(ROWS) * 1.1) +b = np.where(np.arange(ROWS) % 7 == 0, np.nan, np.arange(ROWS) * 2.2) +df = pd.DataFrame({"a": a, "b": b}) + +for _ in range(WARMUP): + df.dropna() + +start = time.perf_counter() +for _ in range(ITERATIONS): + df.dropna() +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "dataframe_dropna", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_dataframe_filter.py b/benchmarks/pandas/bench_dataframe_filter.py new 
file mode 100644 index 00000000..112384f8 --- /dev/null +++ b/benchmarks/pandas/bench_dataframe_filter.py @@ -0,0 +1,26 @@ +"""Benchmark: DataFrame filter (boolean mask on 100k-row DataFrame)""" +import json, time +import numpy as np +import pandas as pd + +ROWS = 100_000 +WARMUP = 5 +ITERATIONS = 20 + +vals = np.arange(ROWS, dtype=np.float64) * 0.1 +df = pd.DataFrame({"value": vals}) + +for _ in range(WARMUP): + df[df["value"] > 5000] + +start = time.perf_counter() +for _ in range(ITERATIONS): + df[df["value"] > 5000] +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "dataframe_filter", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_dataframe_rename.py b/benchmarks/pandas/bench_dataframe_rename.py new file mode 100644 index 00000000..65e44626 --- /dev/null +++ b/benchmarks/pandas/bench_dataframe_rename.py @@ -0,0 +1,27 @@ +"""Benchmark: dataframe_rename — rename columns in a 100k-row DataFrame""" +import json, time +import numpy as np +import pandas as pd + +ROWS = 100_000 +WARMUP = 5 +ITERATIONS = 20 + +a = np.arange(ROWS, dtype=np.float64) * 1.1 +b = np.arange(ROWS, dtype=np.float64) * 2.2 +df = pd.DataFrame({"old_a": a, "old_b": b}) + +for _ in range(WARMUP): + df.rename(columns={"old_a": "new_a", "old_b": "new_b"}) + +start = time.perf_counter() +for _ in range(ITERATIONS): + df.rename(columns={"old_a": "new_a", "old_b": "new_b"}) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "dataframe_rename", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_dataframe_sort.py b/benchmarks/pandas/bench_dataframe_sort.py new file mode 100644 index 00000000..6ef3c84d --- /dev/null +++ b/benchmarks/pandas/bench_dataframe_sort.py @@ -0,0 +1,28 @@ +"""Benchmark: dataframe_sort — sort a 100k-row DataFrame by two columns""" +import json, time +import numpy as np 
+import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 + +rng = np.random.default_rng(42) +a = [f"group_{i % 100}" for i in range(ROWS)] +b = rng.random(ROWS) * 1000 +df = pd.DataFrame({"a": a, "b": b}) + +for _ in range(WARMUP): + df.sort_values(["a", "b"]) + +start = time.perf_counter() +for _ in range(ITERATIONS): + df.sort_values(["a", "b"]) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "dataframe_sort", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_describe.py b/benchmarks/pandas/bench_describe.py new file mode 100644 index 00000000..b9e84dcc --- /dev/null +++ b/benchmarks/pandas/bench_describe.py @@ -0,0 +1,27 @@ +"""Benchmark: describe — summary statistics on a 100k-row DataFrame""" +import json, time +import numpy as np +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 + +a = np.arange(ROWS, dtype=np.float64) * 1.1 +b = np.sqrt(np.arange(1, ROWS + 1, dtype=np.float64)) +df = pd.DataFrame({"a": a, "b": b}) + +for _ in range(WARMUP): + df.describe() + +start = time.perf_counter() +for _ in range(ITERATIONS): + df.describe() +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "describe", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_ewm_mean.py b/benchmarks/pandas/bench_ewm_mean.py new file mode 100644 index 00000000..4e6cbadd --- /dev/null +++ b/benchmarks/pandas/bench_ewm_mean.py @@ -0,0 +1,26 @@ +"""Benchmark: ewm_mean — exponentially weighted mean on 100k-element Series""" +import json, time +import numpy as np +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 + +data = np.sin(np.arange(ROWS) * 0.05) +s = pd.Series(data) + +for _ in range(WARMUP): + s.ewm(span=20).mean() + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.ewm(span=20).mean() +total = (time.perf_counter() - 
start) * 1000 + +print(json.dumps({ + "function": "ewm_mean", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_groupby_mean.py b/benchmarks/pandas/bench_groupby_mean.py new file mode 100644 index 00000000..050959af --- /dev/null +++ b/benchmarks/pandas/bench_groupby_mean.py @@ -0,0 +1,27 @@ +"""Benchmark: GroupBy mean on 100k-row DataFrame""" +import json, time +import numpy as np +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 + +keys = [f"group_{i % 100}" for i in range(ROWS)] +vals = np.arange(ROWS, dtype=np.float64) * 0.1 +df = pd.DataFrame({"key": keys, "value": vals}) + +for _ in range(WARMUP): + df.groupby("key")["value"].mean() + +start = time.perf_counter() +for _ in range(ITERATIONS): + df.groupby("key")["value"].mean() +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "groupby_mean", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_merge.py b/benchmarks/pandas/bench_merge.py new file mode 100644 index 00000000..9775f4a2 --- /dev/null +++ b/benchmarks/pandas/bench_merge.py @@ -0,0 +1,29 @@ +"""Benchmark: merge — inner join two 50k-row DataFrames on a key column""" +import json, time +import numpy as np +import pandas as pd + +ROWS = 50_000 +WARMUP = 3 +ITERATIONS = 10 + +keys = np.arange(ROWS) % 1000 +vals1 = np.arange(ROWS, dtype=np.float64) +vals2 = np.arange(ROWS, dtype=np.float64) * 2.0 +df1 = pd.DataFrame({"key": keys, "val1": vals1}) +df2 = pd.DataFrame({"key": keys, "val2": vals2}) + +for _ in range(WARMUP): + pd.merge(df1, df2, on="key", how="inner") + +start = time.perf_counter() +for _ in range(ITERATIONS): + pd.merge(df1, df2, on="key", how="inner") +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "merge", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git 
a/benchmarks/pandas/bench_pivot_table.py b/benchmarks/pandas/bench_pivot_table.py new file mode 100644 index 00000000..f65f9321 --- /dev/null +++ b/benchmarks/pandas/bench_pivot_table.py @@ -0,0 +1,28 @@ +"""Benchmark: pivot_table — pivot aggregation on 100k-row DataFrame""" +import json, time +import numpy as np +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 + +rows = [f"row_{i % 100}" for i in range(ROWS)] +cols = [f"col_{i % 50}" for i in range(ROWS)] +vals = np.arange(ROWS, dtype=np.float64) * 0.1 +df = pd.DataFrame({"row": rows, "col": cols, "value": vals}) + +for _ in range(WARMUP): + df.pivot_table(values="value", index="row", columns="col", aggfunc="mean") + +start = time.perf_counter() +for _ in range(ITERATIONS): + df.pivot_table(values="value", index="row", columns="col", aggfunc="mean") +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "pivot_table", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_read_csv.py b/benchmarks/pandas/bench_read_csv.py new file mode 100644 index 00000000..d6aa816a --- /dev/null +++ b/benchmarks/pandas/bench_read_csv.py @@ -0,0 +1,30 @@ +"""Benchmark: read_csv — parse a 100k-row CSV file""" +import json, time, os, tempfile +import numpy as np +import pandas as pd + +ROWS = 100_000 +WARMUP = 2 +ITERATIONS = 5 + +# Build CSV file +tmp_path = "/tmp/gh-aw/agent/bench_read_csv.csv" +with open(tmp_path, "w") as f: + f.write("id,value,label\n") + for i in range(ROWS): + f.write(f"{i},{i * 1.1:.4f},cat_{i % 50}\n") + +for _ in range(WARMUP): + pd.read_csv(tmp_path) + +start = time.perf_counter() +for _ in range(ITERATIONS): + pd.read_csv(tmp_path) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "read_csv", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_rolling_mean.py 
b/benchmarks/pandas/bench_rolling_mean.py new file mode 100644 index 00000000..5258fca4 --- /dev/null +++ b/benchmarks/pandas/bench_rolling_mean.py @@ -0,0 +1,26 @@ +"""Benchmark: rolling mean with window=100 on 100k-element Series""" +import json, time +import numpy as np +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 + +data = np.sin(np.arange(ROWS) * 0.01) +s = pd.Series(data) + +for _ in range(WARMUP): + s.rolling(100).mean() + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.rolling(100).mean() +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "rolling_mean", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_series_arithmetic.py b/benchmarks/pandas/bench_series_arithmetic.py new file mode 100644 index 00000000..4f0325b0 --- /dev/null +++ b/benchmarks/pandas/bench_series_arithmetic.py @@ -0,0 +1,26 @@ +"""Benchmark: Series arithmetic (add + multiply on 100k-element Series)""" +import json, time +import numpy as np +import pandas as pd + +ROWS = 100_000 +WARMUP = 5 +ITERATIONS = 20 + +data = np.arange(ROWS, dtype=np.float64) * 0.5 +s = pd.Series(data) + +for _ in range(WARMUP): + (s + 2.0) * 0.5 + +start = time.perf_counter() +for _ in range(ITERATIONS): + (s + 2.0) * 0.5 +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "series_arithmetic", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_series_creation.py b/benchmarks/pandas/bench_series_creation.py new file mode 100644 index 00000000..c27fcf87 --- /dev/null +++ b/benchmarks/pandas/bench_series_creation.py @@ -0,0 +1,47 @@ +""" +Benchmark: Series creation + +Creates a Series from a large numeric array and measures the time. 
+Outputs JSON: {"function": "series_creation", "mean_ms": ..., "iterations": ..., "total_ms": ...} +""" + +import json +import time + +import pandas as pd + +SIZE = 100_000 +WARMUP = 5 +ITERATIONS = 50 + + +def generate_data(n: int) -> "list[float]": + """Generate a deterministic numeric array of the given size.""" + return [i * 1.1 + 0.5 for i in range(n)] + + +data = generate_data(SIZE) + +# Warm-up +for _ in range(WARMUP): + pd.Series(list(data)) + +# Measured runs +times: "list[float]" = [] +for _ in range(ITERATIONS): + start = time.perf_counter() + pd.Series(list(data)) + end = time.perf_counter() + times.append((end - start) * 1000) # convert to ms + +total_ms = sum(times) +mean_ms = total_ms / ITERATIONS + +result = { + "function": "series_creation", + "mean_ms": round(mean_ms, 3), + "iterations": ITERATIONS, + "total_ms": round(total_ms, 3), +} + +print(json.dumps(result)) diff --git a/benchmarks/pandas/bench_series_cumsum.py b/benchmarks/pandas/bench_series_cumsum.py new file mode 100644 index 00000000..556e3ebd --- /dev/null +++ b/benchmarks/pandas/bench_series_cumsum.py @@ -0,0 +1,26 @@ +"""Benchmark: series_cumsum — cumulative sum on 100k-element Series""" +import json, time +import numpy as np +import pandas as pd + +ROWS = 100_000 +WARMUP = 5 +ITERATIONS = 20 + +data = np.arange(ROWS, dtype=np.float64) * 0.001 +s = pd.Series(data) + +for _ in range(WARMUP): + s.cumsum() + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.cumsum() +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "series_cumsum", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_series_fillna.py b/benchmarks/pandas/bench_series_fillna.py new file mode 100644 index 00000000..6b62f6ad --- /dev/null +++ b/benchmarks/pandas/bench_series_fillna.py @@ -0,0 +1,26 @@ +"""Benchmark: series_fillna — fill NaN values in a 100k-element Series""" +import json, time +import numpy as 
np +import pandas as pd + +ROWS = 100_000 +WARMUP = 5 +ITERATIONS = 20 + +data = np.where(np.arange(ROWS) % 5 == 0, np.nan, np.arange(ROWS) * 1.1) +s = pd.Series(data) + +for _ in range(WARMUP): + s.fillna(0.0) + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.fillna(0.0) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "series_fillna", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_series_shift.py b/benchmarks/pandas/bench_series_shift.py new file mode 100644 index 00000000..0b294485 --- /dev/null +++ b/benchmarks/pandas/bench_series_shift.py @@ -0,0 +1,26 @@ +"""Benchmark: series_shift — shift values by 1 position in a 100k-element Series""" +import json, time +import numpy as np +import pandas as pd + +ROWS = 100_000 +WARMUP = 5 +ITERATIONS = 20 + +data = np.arange(ROWS, dtype=np.float64) +s = pd.Series(data) + +for _ in range(WARMUP): + s.shift(1) + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.shift(1) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "series_shift", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_series_sort.py b/benchmarks/pandas/bench_series_sort.py new file mode 100644 index 00000000..c31de4aa --- /dev/null +++ b/benchmarks/pandas/bench_series_sort.py @@ -0,0 +1,27 @@ +"""Benchmark: Series sort (sort_values on 100k-element numeric Series)""" +import json, time +import numpy as np +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 + +rng = np.random.default_rng(42) +data = rng.random(ROWS) * 1000 +s = pd.Series(data) + +for _ in range(WARMUP): + s.sort_values() + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.sort_values() +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "series_sort", + "mean_ms": total / ITERATIONS, + "iterations": 
ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_series_string_ops.py b/benchmarks/pandas/bench_series_string_ops.py new file mode 100644 index 00000000..8744ddcc --- /dev/null +++ b/benchmarks/pandas/bench_series_string_ops.py @@ -0,0 +1,27 @@ +"""Benchmark: series_string_ops — str.upper and str.contains on 100k strings""" +import json, time +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 + +data = [f"hello_world_{i % 200}" for i in range(ROWS)] +s = pd.Series(data) + +for _ in range(WARMUP): + s.str.upper() + s.str.contains("world") + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.str.upper() + s.str.contains("world") +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "series_string_ops", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_series_value_counts.py b/benchmarks/pandas/bench_series_value_counts.py new file mode 100644 index 00000000..c156a1eb --- /dev/null +++ b/benchmarks/pandas/bench_series_value_counts.py @@ -0,0 +1,25 @@ +"""Benchmark: value_counts on a 100k-element Series with 100 distinct values""" +import json, time +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 + +data = [f"cat_{i % 100}" for i in range(ROWS)] +s = pd.Series(data) + +for _ in range(WARMUP): + s.value_counts() + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.value_counts() +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "series_value_counts", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/results.json b/benchmarks/results.json new file mode 100644 index 00000000..c883f334 --- /dev/null +++ b/benchmarks/results.json @@ -0,0 +1,247 @@ +{ + "benchmarks": [ + { + "function": "concat", + "tsb": null, + "pandas": { + "function": "concat", + "mean_ms": 0.11375509999993483, + "iterations": 
20, + "total_ms": 2.2751019999986966 + }, + "ratio": null + }, + { + "function": "dataframe_apply", + "tsb": null, + "pandas": { + "function": "dataframe_apply", + "mean_ms": 47.161531699998704, + "iterations": 10, + "total_ms": 471.61531699998704 + }, + "ratio": null + }, + { + "function": "dataframe_creation", + "tsb": null, + "pandas": { + "function": "dataframe_creation", + "mean_ms": 5.148059900000135, + "iterations": 10, + "total_ms": 51.48059900000135 + }, + "ratio": null + }, + { + "function": "dataframe_dropna", + "tsb": null, + "pandas": { + "function": "dataframe_dropna", + "mean_ms": 2.42739894999886, + "iterations": 20, + "total_ms": 48.547978999977204 + }, + "ratio": null + }, + { + "function": "dataframe_filter", + "tsb": null, + "pandas": { + "function": "dataframe_filter", + "mean_ms": 0.4964389500003108, + "iterations": 20, + "total_ms": 9.928779000006216 + }, + "ratio": null + }, + { + "function": "dataframe_rename", + "tsb": null, + "pandas": { + "function": "dataframe_rename", + "mean_ms": 0.17103454999869427, + "iterations": 20, + "total_ms": 3.4206909999738855 + }, + "ratio": null + }, + { + "function": "dataframe_sort", + "tsb": null, + "pandas": { + "function": "dataframe_sort", + "mean_ms": 33.301584399998774, + "iterations": 10, + "total_ms": 333.01584399998774 + }, + "ratio": null + }, + { + "function": "describe", + "tsb": null, + "pandas": { + "function": "describe", + "mean_ms": 5.521558600003118, + "iterations": 10, + "total_ms": 55.21558600003118 + }, + "ratio": null + }, + { + "function": "ewm_mean", + "tsb": null, + "pandas": { + "function": "ewm_mean", + "mean_ms": 1.7652839999982461, + "iterations": 10, + "total_ms": 17.65283999998246 + }, + "ratio": null + }, + { + "function": "groupby_mean", + "tsb": null, + "pandas": { + "function": "groupby_mean", + "mean_ms": 8.079756900002621, + "iterations": 10, + "total_ms": 80.79756900002621 + }, + "ratio": null + }, + { + "function": "merge", + "tsb": null, + "pandas": { + "function": 
"merge", + "mean_ms": 60.42320619999941, + "iterations": 10, + "total_ms": 604.2320619999941 + }, + "ratio": null + }, + { + "function": "pivot_table", + "tsb": null, + "pandas": { + "function": "pivot_table", + "mean_ms": 22.500251999997545, + "iterations": 10, + "total_ms": 225.00251999997545 + }, + "ratio": null + }, + { + "function": "read_csv", + "tsb": null, + "pandas": { + "function": "read_csv", + "mean_ms": 29.951929399999244, + "iterations": 5, + "total_ms": 149.75964699999622 + }, + "ratio": null + }, + { + "function": "rolling_mean", + "tsb": null, + "pandas": { + "function": "rolling_mean", + "mean_ms": 1.71982609999759, + "iterations": 10, + "total_ms": 17.1982609999759 + }, + "ratio": null + }, + { + "function": "series_arithmetic", + "tsb": null, + "pandas": { + "function": "series_arithmetic", + "mean_ms": 0.764571400000591, + "iterations": 20, + "total_ms": 15.29142800001182 + }, + "ratio": null + }, + { + "function": "series_creation", + "tsb": null, + "pandas": { + "function": "series_creation", + "mean_ms": 7.607, + "iterations": 50, + "total_ms": 380.349 + }, + "ratio": null + }, + { + "function": "series_cumsum", + "tsb": null, + "pandas": { + "function": "series_cumsum", + "mean_ms": 1.1250383499998406, + "iterations": 20, + "total_ms": 22.500766999996813 + }, + "ratio": null + }, + { + "function": "series_fillna", + "tsb": null, + "pandas": { + "function": "series_fillna", + "mean_ms": 0.18527670000025864, + "iterations": 20, + "total_ms": 3.705534000005173 + }, + "ratio": null + }, + { + "function": "series_shift", + "tsb": null, + "pandas": { + "function": "series_shift", + "mean_ms": 0.07249699999931636, + "iterations": 20, + "total_ms": 1.4499399999863272 + }, + "ratio": null + }, + { + "function": "series_sort", + "tsb": null, + "pandas": { + "function": "series_sort", + "mean_ms": 5.127767300001551, + "iterations": 10, + "total_ms": 51.27767300001551 + }, + "ratio": null + }, + { + "function": "series_string_ops", + "tsb": null, + 
"pandas": { + "function": "series_string_ops", + "mean_ms": 34.08206670000027, + "iterations": 10, + "total_ms": 340.8206670000027 + }, + "ratio": null + }, + { + "function": "series_value_counts", + "tsb": null, + "pandas": { + "function": "series_value_counts", + "mean_ms": 9.212644899997713, + "iterations": 10, + "total_ms": 92.12644899997713 + }, + "ratio": null + } + ], + "timestamp": "2026-04-12T15:46:00Z" +} \ No newline at end of file diff --git a/benchmarks/run_benchmarks.sh b/benchmarks/run_benchmarks.sh new file mode 100644 index 00000000..0f800de0 --- /dev/null +++ b/benchmarks/run_benchmarks.sh @@ -0,0 +1,129 @@ +#!/usr/bin/env bash +# +# Run all tsb (TypeScript) and pandas (Python) benchmarks and collect results. +# +# Usage: ./benchmarks/run_benchmarks.sh +# +# Outputs: benchmarks/results.json with all benchmark results +# +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" + +# Ensure Python and pandas are available +if ! command -v python3 &>/dev/null; then + echo "ERROR: python3 is required but not found" >&2 + exit 1 +fi + +python3 -c "import pandas" 2>/dev/null || { + echo "Installing pandas..." + pip3 install pandas --quiet +} + +# Ensure Bun is available +if ! command -v bun &>/dev/null; then + echo "ERROR: bun is required but not found" >&2 + exit 1 +fi + +# Collect results +results='{"benchmarks": [], "timestamp": "'$(date -u +"%Y-%m-%dT%H:%M:%SZ")'"}' + +echo "=== Running Performance Benchmarks ===" +echo "" + +# Find all TypeScript benchmark files +for ts_bench in "$SCRIPT_DIR"/tsb/bench_*.ts; do + [ -f "$ts_bench" ] || continue + bench_name=$(basename "$ts_bench" .ts | sed 's/^bench_//') + + # Check for matching Python benchmark + py_bench="$SCRIPT_DIR/pandas/bench_${bench_name}.py" + if [ ! 
-f "$py_bench" ]; then + echo "SKIP: $bench_name (no matching Python benchmark)" + continue + fi + + echo "--- Benchmarking: $bench_name ---" + + # Run TypeScript benchmark + echo " Running tsb (TypeScript)..." + ts_result=$(cd "$REPO_ROOT" && bun run "$ts_bench" 2>/dev/null) || { + echo " ERROR: TypeScript benchmark failed" + continue + } + echo " tsb result: $ts_result" + + # Run Python benchmark + echo " Running pandas (Python)..." + py_result=$(cd "$REPO_ROOT" && python3 "$py_bench" 2>/dev/null) || { + echo " ERROR: Python benchmark failed" + continue + } + echo " pandas result: $py_result" + + # Extract mean_ms from both + ts_mean=$(echo "$ts_result" | python3 -c "import sys, json; d=json.load(sys.stdin); print(d['mean_ms'])" 2>/dev/null) || { + echo " ERROR: could not parse tsb benchmark result" + continue + } + py_mean=$(echo "$py_result" | python3 -c "import sys, json; d=json.load(sys.stdin); print(d['mean_ms'])" 2>/dev/null) || { + echo " ERROR: could not parse pandas benchmark result" + continue + } + + # Calculate ratio (tsb / pandas) — < 1.0 means tsb is faster + ratio=$(python3 -c " +ts, py = $ts_mean, $py_mean +if py <= 0: + print('null') +else: + print(round(ts / py, 3)) +") + if [ "$ratio" = "null" ]; then + echo " ERROR: pandas mean_ms is zero, cannot compute ratio" + continue + fi + + echo " Ratio (tsb/pandas): ${ratio}x" + echo "" + + # Add to results JSON + results=$(echo "$results" | python3 -c " +import sys, json +data = json.load(sys.stdin) +data['benchmarks'].append({ + 'function': '$bench_name', + 'tsb': $ts_result, + 'pandas': $py_result, + 'ratio': $ratio +}) +print(json.dumps(data, indent=2)) +") +done + +# Write results +echo "$results" > "$SCRIPT_DIR/results.json" +echo "=== Results written to benchmarks/results.json ===" +echo "" + +# Summary +echo "=== Summary ===" +echo "$results" | python3 -c " +import sys, json +data = json.load(sys.stdin) +benchmarks = data.get('benchmarks', []) +if not benchmarks: + print('No benchmarks found.') 
+else: + print(f'Functions benchmarked: {len(benchmarks)}') + for b in benchmarks: + fn = b['function'] + ts = b['tsb']['mean_ms'] + py = b['pandas']['mean_ms'] + ratio = b['ratio'] + faster = 'tsb' if ratio < 1 else 'pandas' + print(f' {fn}: tsb={ts}ms, pandas={py}ms, ratio={ratio}x ({faster} is faster)') +" diff --git a/benchmarks/tsb/bench_concat.ts b/benchmarks/tsb/bench_concat.ts new file mode 100644 index 00000000..7a72f777 --- /dev/null +++ b/benchmarks/tsb/bench_concat.ts @@ -0,0 +1,32 @@ +/** + * Benchmark: concat — concatenate two 50k-row DataFrames + */ +import { DataFrame, concat } from "../../src/index.js"; + +const ROWS = 50_000; +const WARMUP = 5; +const ITERATIONS = 20; + +const vals1 = Float64Array.from({ length: ROWS }, (_, i) => i * 1.0); +const vals2 = Float64Array.from({ length: ROWS }, (_, i) => i * 2.0); +const df1 = new DataFrame({ value: vals1 }); +const df2 = new DataFrame({ value: vals2 }); + +for (let i = 0; i < WARMUP; i++) { + concat([df1, df2]); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + concat([df1, df2]); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "concat", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_dataframe_apply.ts b/benchmarks/tsb/bench_dataframe_apply.ts new file mode 100644 index 00000000..32a99a68 --- /dev/null +++ b/benchmarks/tsb/bench_dataframe_apply.ts @@ -0,0 +1,32 @@ +/** + * Benchmark: dataframe_apply — apply a function across rows of a 10k-row DataFrame + * (reduced size due to JS per-row overhead) + */ +import { DataFrame } from "../../src/index.js"; + +const ROWS = 10_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const a = Float64Array.from({ length: ROWS }, (_, i) => i * 1.0); +const b = Float64Array.from({ length: ROWS }, (_, i) => i * 2.0); +const df = new DataFrame({ a, b }); + +for (let i = 0; i < WARMUP; i++) { + df.apply((row) => (row["a"] as 
number) + (row["b"] as number), { axis: 1 }); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + df.apply((row) => (row["a"] as number) + (row["b"] as number), { axis: 1 }); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "dataframe_apply", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_dataframe_creation.ts b/benchmarks/tsb/bench_dataframe_creation.ts new file mode 100644 index 00000000..2eb8fd56 --- /dev/null +++ b/benchmarks/tsb/bench_dataframe_creation.ts @@ -0,0 +1,33 @@ +/** + * Benchmark: DataFrame creation from arrays + * Creates a 3-column (2 numeric + 1 string) 100k-row DataFrame + */ +import { DataFrame } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const nums1 = Float64Array.from({ length: ROWS }, (_, i) => i * 1.1); +const nums2 = Float64Array.from({ length: ROWS }, (_, i) => i * 2.2); +const strs = Array.from({ length: ROWS }, (_, i) => `label_${i % 100}`); + +// Warm up +for (let i = 0; i < WARMUP; i++) { + new DataFrame({ a: nums1, b: nums2, c: strs }); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + new DataFrame({ a: nums1, b: nums2, c: strs }); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "dataframe_creation", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_dataframe_dropna.ts b/benchmarks/tsb/bench_dataframe_dropna.ts new file mode 100644 index 00000000..e4fef46b --- /dev/null +++ b/benchmarks/tsb/bench_dataframe_dropna.ts @@ -0,0 +1,31 @@ +/** + * Benchmark: dataframe_dropna — drop rows with NaN values from 100k-row DataFrame + */ +import { DataFrame } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 5; +const ITERATIONS = 20; + +const a = Float64Array.from({ length: ROWS }, (_, 
i) => (i % 10 === 0 ? NaN : i * 1.1)); +const b = Float64Array.from({ length: ROWS }, (_, i) => (i % 7 === 0 ? NaN : i * 2.2)); +const df = new DataFrame({ a, b }); + +for (let i = 0; i < WARMUP; i++) { + df.dropna(); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + df.dropna(); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "dataframe_dropna", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_dataframe_filter.ts b/benchmarks/tsb/bench_dataframe_filter.ts new file mode 100644 index 00000000..57d78bd7 --- /dev/null +++ b/benchmarks/tsb/bench_dataframe_filter.ts @@ -0,0 +1,30 @@ +/** + * Benchmark: DataFrame filter (boolean mask on 100k-row DataFrame) + */ +import { DataFrame } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 5; +const ITERATIONS = 20; + +const vals = Float64Array.from({ length: ROWS }, (_, i) => i * 0.1); +const df = new DataFrame({ value: vals }); + +for (let i = 0; i < WARMUP; i++) { + df.filter((row) => (row["value"] as number) > 5000); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + df.filter((row) => (row["value"] as number) > 5000); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "dataframe_filter", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_dataframe_rename.ts b/benchmarks/tsb/bench_dataframe_rename.ts new file mode 100644 index 00000000..807b63c9 --- /dev/null +++ b/benchmarks/tsb/bench_dataframe_rename.ts @@ -0,0 +1,31 @@ +/** + * Benchmark: dataframe_rename — rename columns in a 100k-row DataFrame + */ +import { DataFrame } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 5; +const ITERATIONS = 20; + +const a = Float64Array.from({ length: ROWS }, (_, i) => i * 1.1); +const b = Float64Array.from({ length: 
ROWS }, (_, i) => i * 2.2); +const df = new DataFrame({ old_a: a, old_b: b }); + +for (let i = 0; i < WARMUP; i++) { + df.rename({ old_a: "new_a", old_b: "new_b" }); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + df.rename({ old_a: "new_a", old_b: "new_b" }); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "dataframe_rename", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_dataframe_sort.ts b/benchmarks/tsb/bench_dataframe_sort.ts new file mode 100644 index 00000000..707e4ecf --- /dev/null +++ b/benchmarks/tsb/bench_dataframe_sort.ts @@ -0,0 +1,31 @@ +/** + * Benchmark: dataframe_sort — sort a 100k-row DataFrame by two columns + */ +import { DataFrame } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const a = Array.from({ length: ROWS }, (_, i) => `group_${i % 100}`); +const b = Float64Array.from({ length: ROWS }, () => Math.random() * 1000); +const df = new DataFrame({ a, b }); + +for (let i = 0; i < WARMUP; i++) { + df.sort_values(["a", "b"]); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + df.sort_values(["a", "b"]); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "dataframe_sort", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_describe.ts b/benchmarks/tsb/bench_describe.ts new file mode 100644 index 00000000..368156a3 --- /dev/null +++ b/benchmarks/tsb/bench_describe.ts @@ -0,0 +1,31 @@ +/** + * Benchmark: describe — summary statistics on a 100k-row DataFrame + */ +import { DataFrame } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const a = Float64Array.from({ length: ROWS }, (_, i) => i * 1.1); +const b = Float64Array.from({ length: ROWS }, (_, i) => Math.sqrt(i + 
1)); +const df = new DataFrame({ a, b }); + +for (let i = 0; i < WARMUP; i++) { + df.describe(); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + df.describe(); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "describe", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_ewm_mean.ts b/benchmarks/tsb/bench_ewm_mean.ts new file mode 100644 index 00000000..8e6597f7 --- /dev/null +++ b/benchmarks/tsb/bench_ewm_mean.ts @@ -0,0 +1,30 @@ +/** + * Benchmark: ewm_mean — exponentially weighted mean on 100k-element Series + */ +import { Series } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const data = Float64Array.from({ length: ROWS }, (_, i) => Math.sin(i * 0.05)); +const s = new Series(data); + +for (let i = 0; i < WARMUP; i++) { + s.ewm({ span: 20 }).mean(); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + s.ewm({ span: 20 }).mean(); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "ewm_mean", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_groupby_mean.ts b/benchmarks/tsb/bench_groupby_mean.ts new file mode 100644 index 00000000..efecfddb --- /dev/null +++ b/benchmarks/tsb/bench_groupby_mean.ts @@ -0,0 +1,31 @@ +/** + * Benchmark: GroupBy mean on 100k-row DataFrame + */ +import { DataFrame } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const keys = Array.from({ length: ROWS }, (_, i) => `group_${i % 100}`); +const vals = Float64Array.from({ length: ROWS }, (_, i) => i * 0.1); +const df = new DataFrame({ key: keys, value: vals }); + +for (let i = 0; i < WARMUP; i++) { + df.groupby("key").mean(); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + 
df.groupby("key").mean(); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "groupby_mean", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_merge.ts b/benchmarks/tsb/bench_merge.ts new file mode 100644 index 00000000..da68b52b --- /dev/null +++ b/benchmarks/tsb/bench_merge.ts @@ -0,0 +1,33 @@ +/** + * Benchmark: merge — inner join two 50k-row DataFrames on a key column + */ +import { DataFrame, merge } from "../../src/index.js"; + +const ROWS = 50_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const keys = Array.from({ length: ROWS }, (_, i) => i % 1000); +const vals1 = Float64Array.from({ length: ROWS }, (_, i) => i * 1.0); +const vals2 = Float64Array.from({ length: ROWS }, (_, i) => i * 2.0); +const df1 = new DataFrame({ key: keys, val1: vals1 }); +const df2 = new DataFrame({ key: keys, val2: vals2 }); + +for (let i = 0; i < WARMUP; i++) { + merge(df1, df2, { on: "key", how: "inner" }); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + merge(df1, df2, { on: "key", how: "inner" }); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "merge", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_pivot_table.ts b/benchmarks/tsb/bench_pivot_table.ts new file mode 100644 index 00000000..78b94702 --- /dev/null +++ b/benchmarks/tsb/bench_pivot_table.ts @@ -0,0 +1,32 @@ +/** + * Benchmark: pivot_table — pivot aggregation on 100k-row DataFrame + */ +import { DataFrame } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const rows = Array.from({ length: ROWS }, (_, i) => `row_${i % 100}`); +const cols = Array.from({ length: ROWS }, (_, i) => `col_${i % 50}`); +const vals = Float64Array.from({ length: ROWS }, (_, i) => i * 0.1); +const df = new DataFrame({ row: rows, col: 
cols, value: vals }); + +for (let i = 0; i < WARMUP; i++) { + df.pivot_table({ values: "value", index: "row", columns: "col", aggfunc: "mean" }); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + df.pivot_table({ values: "value", index: "row", columns: "col", aggfunc: "mean" }); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "pivot_table", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_read_csv.ts b/benchmarks/tsb/bench_read_csv.ts new file mode 100644 index 00000000..0d9462bf --- /dev/null +++ b/benchmarks/tsb/bench_read_csv.ts @@ -0,0 +1,39 @@ +/** + * Benchmark: read_csv — parse a 100k-row CSV string + */ +import { read_csv } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 2; +const ITERATIONS = 5; + +// Build CSV string +const lines = ["id,value,label"]; +for (let i = 0; i < ROWS; i++) { + lines.push(`${i},${(i * 1.1).toFixed(4)},cat_${i % 50}`); +} +const csvContent = lines.join("\n"); + +// Write to a temp file +import { writeFileSync } from "node:fs"; +const tmpPath = "/tmp/gh-aw/agent/bench_read_csv.csv"; +writeFileSync(tmpPath, csvContent, "utf8"); + +for (let i = 0; i < WARMUP; i++) { + read_csv(tmpPath); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + read_csv(tmpPath); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "read_csv", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_rolling_mean.ts b/benchmarks/tsb/bench_rolling_mean.ts new file mode 100644 index 00000000..646d3100 --- /dev/null +++ b/benchmarks/tsb/bench_rolling_mean.ts @@ -0,0 +1,30 @@ +/** + * Benchmark: rolling mean with window=100 on 100k-element Series + */ +import { Series } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + 
+const data = Float64Array.from({ length: ROWS }, (_, i) => Math.sin(i * 0.01)); +const s = new Series(data); + +for (let i = 0; i < WARMUP; i++) { + s.rolling(100).mean(); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + s.rolling(100).mean(); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "rolling_mean", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_series_arithmetic.ts b/benchmarks/tsb/bench_series_arithmetic.ts new file mode 100644 index 00000000..552be2ca --- /dev/null +++ b/benchmarks/tsb/bench_series_arithmetic.ts @@ -0,0 +1,30 @@ +/** + * Benchmark: Series arithmetic (add + multiply on 100k-element Series) + */ +import { Series } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 5; +const ITERATIONS = 20; + +const data = Float64Array.from({ length: ROWS }, (_, i) => i * 0.5); +const s = new Series(data); + +for (let i = 0; i < WARMUP; i++) { + s.add(2.0).mul(0.5); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + s.add(2.0).mul(0.5); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "series_arithmetic", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_series_creation.ts b/benchmarks/tsb/bench_series_creation.ts new file mode 100644 index 00000000..c7b4e145 --- /dev/null +++ b/benchmarks/tsb/bench_series_creation.ts @@ -0,0 +1,49 @@ +/** + * Benchmark: Series creation + * + * Creates a Series from a large numeric array and measures the time. + * Outputs JSON: {"function": "series_creation", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ + +import { Series } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 50; + +/** Generate a deterministic numeric array of the given size. 
*/ +function generateData(n: number): readonly number[] { + const arr: number[] = []; + for (let i = 0; i < n; i++) { + arr.push(i * 1.1 + 0.5); + } + return arr; +} + +const data = generateData(SIZE); + +// Warm-up +for (let i = 0; i < WARMUP; i++) { + new Series({ data: [...data] }); +} + +// Measured runs +const times: number[] = []; +for (let i = 0; i < ITERATIONS; i++) { + const start = performance.now(); + new Series({ data: [...data] }); + const end = performance.now(); + times.push(end - start); +} + +const totalMs = times.reduce((a, b) => a + b, 0); +const meanMs = totalMs / ITERATIONS; + +const result = { + function: "series_creation", + mean_ms: Math.round(meanMs * 1000) / 1000, + iterations: ITERATIONS, + total_ms: Math.round(totalMs * 1000) / 1000, +}; + +console.log(JSON.stringify(result)); diff --git a/benchmarks/tsb/bench_series_cumsum.ts b/benchmarks/tsb/bench_series_cumsum.ts new file mode 100644 index 00000000..3eeba5b0 --- /dev/null +++ b/benchmarks/tsb/bench_series_cumsum.ts @@ -0,0 +1,30 @@ +/** + * Benchmark: series_cumsum — cumulative sum on 100k-element Series + */ +import { Series } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 5; +const ITERATIONS = 20; + +const data = Float64Array.from({ length: ROWS }, (_, i) => i * 0.001); +const s = new Series(data); + +for (let i = 0; i < WARMUP; i++) { + s.cumsum(); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + s.cumsum(); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "series_cumsum", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_series_fillna.ts b/benchmarks/tsb/bench_series_fillna.ts new file mode 100644 index 00000000..3e658b01 --- /dev/null +++ b/benchmarks/tsb/bench_series_fillna.ts @@ -0,0 +1,31 @@ +/** + * Benchmark: series_fillna — fill NaN/null values in a 100k-element Series + */ +import { Series } from 
"../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 5; +const ITERATIONS = 20; + +// Create series with every 5th value as NaN +const data = Float64Array.from({ length: ROWS }, (_, i) => (i % 5 === 0 ? NaN : i * 1.1)); +const s = new Series(data); + +for (let i = 0; i < WARMUP; i++) { + s.fillna(0.0); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + s.fillna(0.0); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "series_fillna", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_series_shift.ts b/benchmarks/tsb/bench_series_shift.ts new file mode 100644 index 00000000..46e79d19 --- /dev/null +++ b/benchmarks/tsb/bench_series_shift.ts @@ -0,0 +1,30 @@ +/** + * Benchmark: series_shift — shift values by 1 position in a 100k-element Series + */ +import { Series } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 5; +const ITERATIONS = 20; + +const data = Float64Array.from({ length: ROWS }, (_, i) => i * 1.0); +const s = new Series(data); + +for (let i = 0; i < WARMUP; i++) { + s.shift(1); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + s.shift(1); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "series_shift", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_series_sort.ts b/benchmarks/tsb/bench_series_sort.ts new file mode 100644 index 00000000..c6aedb93 --- /dev/null +++ b/benchmarks/tsb/bench_series_sort.ts @@ -0,0 +1,30 @@ +/** + * Benchmark: Series sort (argsort on 100k-element numeric Series) + */ +import { Series } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const data = Float64Array.from({ length: ROWS }, () => Math.random() * 1000); +const s = new Series(data); + +for (let i = 0; i < WARMUP; 
i++) { + s.sort_values(); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + s.sort_values(); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "series_sort", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_series_string_ops.ts b/benchmarks/tsb/bench_series_string_ops.ts new file mode 100644 index 00000000..c44cdefe --- /dev/null +++ b/benchmarks/tsb/bench_series_string_ops.ts @@ -0,0 +1,32 @@ +/** + * Benchmark: series_string_ops — str.upper and str.contains on 100k strings + */ +import { Series } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const data = Array.from({ length: ROWS }, (_, i) => `hello_world_${i % 200}`); +const s = new Series(data); + +for (let i = 0; i < WARMUP; i++) { + s.str.upper(); + s.str.contains("world"); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + s.str.upper(); + s.str.contains("world"); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "series_string_ops", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_series_value_counts.ts b/benchmarks/tsb/bench_series_value_counts.ts new file mode 100644 index 00000000..b5352f54 --- /dev/null +++ b/benchmarks/tsb/bench_series_value_counts.ts @@ -0,0 +1,30 @@ +/** + * Benchmark: value_counts on a 100k-element Series with 100 distinct values + */ +import { Series } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const data = Array.from({ length: ROWS }, (_, i) => `cat_${i % 100}`); +const s = new Series(data); + +for (let i = 0; i < WARMUP; i++) { + s.value_counts(); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + s.value_counts(); +} +const total = performance.now() - start; + 
+console.log( + JSON.stringify({ + function: "series_value_counts", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/docs/playground.md b/docs/playground.md index b2e64183..7f08e62b 100644 --- a/docs/playground.md +++ b/docs/playground.md @@ -120,9 +120,6 @@ The CI pipeline (`pages.yml`) runs this automatically during deployment. ## Non-Goals (Current Scope) -- **Syntax highlighting** in the editor: the current implementation uses a - plain ` + +
+
+ + +
+

Category set operations

+

+ catUnionCategories, catIntersectCategories, + catDiffCategories, and catEqualCategories let you + combine or compare the category sets of two Series. +

+ + +
+
+ + +
+

catSortByFreq(series, opts?)

+

+ Reorder categories by their frequency in the data (most frequent first by default). + Mirrors s.cat.reorder_categories(s.value_counts().index). +

+ + +
+
+ + +
+

catToOrdinal(series, order)

+

+ Create an ordered categorical from a Series using order to define both the + category set and their rank. Values not in order become null. +

+ + +
+
+ + +
+

catFreqTable(series)

+

+ Return a plain Record<string, number> of counts per category. + Zero-frequency categories are included. +

+ + +
+
+ + +
+

catCrossTab(a, b, opts?)

+

+ Cross-tabulation of two categorical Series. Rows = a's categories, + columns = b's categories, cells = co-occurrence counts. + Supports margins and normalization. +

+ + +
+
+ + +
+

catRecode(series, mapping)

+

+ Rename categories via an object map or a transform function. Unmapped categories + are left unchanged. +

+ + +
+
+ + + + + diff --git a/playground/cut_qcut.html b/playground/cut_qcut.html new file mode 100644 index 00000000..1d273a17 --- /dev/null +++ b/playground/cut_qcut.html @@ -0,0 +1,163 @@ + + + + + + tsb — cut / qcut: Binning Continuous Data + + + +

tsb — cut / qcut: Binning Continuous Data

+

+ cut and qcut partition continuous numeric values into + discrete intervals — the TypeScript equivalents of + pandas.cut + and + pandas.qcut. +

+ +

1. cut — Fixed-Width Binning

+

+ Bin values into equal-width (or user-specified) intervals. + Pass an integer for automatic bins, or an explicit edge array. +

+ +

Integer bins

+
import { cut } from "tsb";
+
+const ages = [5, 18, 25, 35, 50, 70];
+const { codes, labels, bins } = cut(ages, 3);
+
+// labels: ["(5.0, 26.7]", "(26.7, 48.3]", "(48.3, 70.0]"]
+// bins:   [4.935, 26.667, 48.333, 70]
+// codes:  [0, 0, 0, 1, 1, 2]
+console.table(ages.map((a, i) => ({ age: a, bin: labels[codes[i]!] })));
+
+ +

Explicit bin edges

+
const scores = [55, 65, 72, 80, 91, 98];
+const { codes, labels } = cut(scores, [0, 60, 70, 80, 90, 100], {
+  labels: ["F", "D", "C", "B", "A"],
+  include_lowest: true,
+});
+// codes:  [0, 1, 2, 3, 4, 4]
+// labels[codes[0]] → "F"
+// labels[codes[5]] → "A"
+
+ +

Options

+ + + + + + + + + +
OptionDefaultDescription
righttrueIntervals closed on right: (a, b]. Set false for [a, b).
include_lowestfalseMake lowest interval left-closed: [a, b].
labelsautoCustom string labels, or false for integer codes.
precision3Decimal places in auto-generated labels.
duplicates"raise""drop" to silently remove duplicate bin edges.
+ +

2. qcut — Quantile-Based Binning

+

+ Divide values into bins of (approximately) equal population using quantiles. + Useful for creating percentile buckets or roughly equal-sized groups. +

+ +

Quartile split

+
import { qcut } from "tsb";
+
+const values = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10];
+const { codes, labels, bins } = qcut(values, 4);
+
+// labels: ["[1, 3.25]", "(3.25, 5.5]", "(5.5, 7.75]", "(7.75, 10]"]
+// Every bin has ~2-3 elements
+
+ +

Custom quantile probabilities

+
const { labels } = qcut(values, [0, 0.1, 0.5, 0.9, 1], {
+  labels: ["bottom 10%", "lower middle", "upper middle", "top 10%"],
+});
+
+ +

Decile labels

+
const { codes } = qcut(data, 10, { labels: false });
+// codes[i] is 0..9 — the decile bucket index
+
+ +

3. Return Value: BinResult

+
interface BinResult {
+  codes:  ReadonlyArray<number | null>; // bin index per value; null for NaN
+  labels: readonly string[];            // ordered label per bin
+  bins:   readonly number[];            // bin edge array (labels.length + 1)
+}
+
+ +
+ Missing values: NaN and Infinity are + assigned null in the codes array and are never placed + in a bin. +
+ +

4. cut vs qcut

+ + + + + + + + +
cutqcut
Bin widthEqual (uniform edges)Varies (equal population)
Bin countDetermined by binsDetermined by q
Best forMeaningful thresholds (age groups, grade bands)Percentile buckets, rank-based analysis
Left edge of first binOpen ( unless include_lowestAlways closed [
+ +

5. pandas Compatibility

+
# Python pandas
+pd.cut([1, 2, 3, 4, 5], 2)
+# Interval(0.996, 3.0, closed='right')  ...
+
+# tsb equivalent
+cut([1, 2, 3, 4, 5], 2)
+// codes: [0, 0, 0, 1, 1]
+// labels: ["(0.996, 3.0]", "(3.0, 5.0]"]
+
+ +

+ Both cut and qcut follow pandas semantics exactly: + right-closed by default, linear interpolation for quantiles, and duplicate-edge + handling via duplicates. +

+ +

← Back to tsb feature index

+ + diff --git a/playground/format_ops.html b/playground/format_ops.html new file mode 100644 index 00000000..d72fd1ec --- /dev/null +++ b/playground/format_ops.html @@ -0,0 +1,262 @@ + + + + + + tsb — format_ops: Number Formatting + + + +

🔢 format_ops — Number Formatting

+

+ tsb provides a suite of number-formatting helpers that mirror pandas' + style.format() and Series.map() patterns. + Every function is zero-dependency and fully typed. +

+

← Back to index

+ +

Scalar formatters

+ + + + + + + + + + + + +
FunctionExample inputExample outputNotes
formatFloat(n, d)3.14159, 2"3.14"Fixed decimal places
formatPercent(n, d)0.1234, 1"12.3%"Multiplies by 100
formatScientific(n, d)12345.678, 3"1.235e+4"Exponential notation
formatEngineering(n, d)12345.678, 3"12.346e+3"Exponent multiple of 3
formatThousands(n, d, sep)1234567.89, 2"1,234,567.89"Thousands separator
formatCurrency(n, sym, d)1234.5, "$""$1,234.50"Currency prefix + thousands
formatCompact(n, d)1_234_567, 2"1.23M"K / M / B / T suffixes
+ +

Interactive demo — scalar formatting

+
+ + + + +
+
+ +

Formatter factories

+
import {
+  makeFloatFormatter,
+  makePercentFormatter,
+  makeCurrencyFormatter,
+} from "tsb";
+
+const fmtFloat   = makeFloatFormatter(3);      // (v) => formatFloat(v, 3)
+const fmtPct     = makePercentFormatter(1);     // (v) => formatPercent(v, 1)
+const fmtDollar  = makeCurrencyFormatter("$");  // (v) => formatCurrency(v, "$", 2)
+
+fmtFloat(3.14159);   // "3.142"
+fmtPct(0.0825);      // "8.3%"
+fmtDollar(9999.99);  // "$9,999.99"
+
+ +

Apply to a Series

+
import { Series, applySeriesFormatter, makePercentFormatter } from "tsb";
+
+const returns = new Series({ data: [0.05, -0.02, 0.134, 0.007], name: "returns" });
+
+const formatted = applySeriesFormatter(returns, makePercentFormatter(1));
+// Series<string> ["5.0%", "-2.0%", "13.4%", "0.7%"]
+
+ +

Apply to a DataFrame

+
import { DataFrame, applyDataFrameFormatter, makeCurrencyFormatter, makePercentFormatter } from "tsb";
+
+const df = DataFrame.fromColumns({
+  price:   [1_299.99, 899.50, 45.00],
+  change:  [0.025, -0.031, 0.102],
+  volume:  [15_000, 8_200, 230_000],
+});
+
+const formatted = applyDataFrameFormatter(df, {
+  price:  makeCurrencyFormatter("$", 2),
+  change: makePercentFormatter(2),
+});
+
+// formatted = {
+//   price:  ["$1,299.99", "$899.50", "$45.00"],
+//   change: ["2.50%", "-3.10%", "10.20%"],
+//   volume: ["15000", "8200", "230000"],      // no formatter → String(v)
+// }
+
+ +

Interactive demo — DataFrame formatting

+
+ +
+
+ +

String rendering

+
import { Series, DataFrame, seriesToString, dataFrameToString, makeFloatFormatter } from "tsb";
+
+const s = new Series({ data: [1.2, 3.4, 5.6], name: "value" });
+console.log(seriesToString(s, { formatter: makeFloatFormatter(1) }));
+// 0    1.2
+// 1    3.4
+// 2    5.6
+// Name: value, dtype: float64
+
+const df = DataFrame.fromColumns({ a: [1, 2, 3], b: [4.0, 5.0, 6.0] });
+console.log(dataFrameToString(df));
+//    a    b
+// 0  1  4.0
+// 1  2  5.0
+// 2  3  6.0
+
+ +

Interactive demo — seriesToString / dataFrameToString

+
+ +
+
+ + + + diff --git a/playground/index.html b/playground/index.html index 2b619f97..4834953d 100644 --- a/playground/index.html +++ b/playground/index.html @@ -269,6 +269,86 @@

✅ Complete +
+

📥 insertColumn / popColumn

+

Insert and remove DataFrame columns at precise positions. insertColumn(df, loc, col, values) inserts at integer position, popColumn(df, col) returns { series, df }. Also includes reorderColumns and moveColumn. Mirrors pandas.DataFrame.insert() and .pop().

+
✅ Complete
+
+
+

✂️ cut / qcut

+

Bin continuous numeric data into discrete intervals. cut() uses fixed-width or explicit bin edges; qcut() uses quantile-based bins of equal population. Both return codes, labels, and bin edges. Mirrors pandas.cut and pandas.qcut.

+
✅ Complete
+
+
+

📊 Rolling Extended Stats

+

Higher-order rolling window statistics: rollingSem (standard error of mean), rollingSkew (Fisher-Pearson skewness), rollingKurt (excess kurtosis), and rollingQuantile (arbitrary percentile with 5 interpolation methods). Mirrors pandas.Series.rolling().sem/skew/kurt/quantile().

+
✅ Complete
+
+
+

🔧 Rolling Apply & Multi-Agg

+

Standalone custom rolling-window functions: rollingApply (custom fn per window), rollingAgg (multiple named aggregations → DataFrame), dataFrameRollingApply, dataFrameRollingAgg. Supports minPeriods, center, and raw mode. Mirrors pandas.Rolling.apply() and Rolling.agg().

+
✅ Complete
+
+
+

🎭 where / mask

+

Element-wise conditional selection: seriesWhere / seriesMask and dataFrameWhere / dataFrameMask. Accepts boolean arrays, label-aligned boolean Series/DataFrame, or callables. Mirrors pandas.Series.where, pandas.DataFrame.where, and their .mask() inverses.

+
✅ Complete
+
+
+

🔍 isna / notna

+

Module-level missing-value detection: isna, notna, isnull, notnull work on scalars, arrays, Series, and DataFrames. Plus standalone fillna, dropna, countna, and countValid. Mirrors pandas.isna, pandas.notna, pandas.isnull, pandas.notnull.

+
✅ Complete
+
+
+

🏷️ attrs — User Metadata

+

Attach arbitrary key→value metadata to any Series or DataFrame via a WeakMap registry. Provides getAttrs, setAttrs, updateAttrs, copyAttrs, withAttrs, mergeAttrs, clearAttrs, getAttr, setAttr, deleteAttr, attrsCount, attrsKeys. Mirrors pandas.DataFrame.attrs / pandas.Series.attrs.

+
✅ Complete
+
+
+

🔤 string_ops — Standalone String Ops

+

Module-level string utilities: strNormalize (Unicode NFC/NFD/NFKC/NFKD), strGetDummies (one-hot DataFrame), strExtractAll (all regex matches), strRemovePrefix, strRemoveSuffix, strTranslate (char-level substitution), strCharWidth (CJK-aware display width), strByteLength. Works on Series, arrays, or scalars.

+
✅ Complete
+
+
+

🔤 string_ops_extended — Extended String Ops

+

Advanced string utilities: strSplitExpand (split → DataFrame columns), strExtractGroups (regex capture groups → DataFrame), strPartition / strRPartition (split into before/sep/after), strMultiReplace (batch replacements), strIndent / strDedent (line-level indentation). Works on Series, arrays, or scalars.

+
✅ Complete
+
+
+

🔗 pipe_apply — Pipeline & Apply Utilities

+

Standalone equivalents of pandas' pipe() / apply() / applymap(): pipe (variadic type-safe pipeline), seriesApply (element-wise with label/pos context), seriesTransform, dataFrameApply (axis 0/1), dataFrameApplyMap (cell-wise), dataFrameTransform (column-wise), dataFrameTransformRows (row-wise).

+
✅ Complete
+
+
+

🔢 numeric_extended — Numeric Utilities

+

numpy/scipy-style numeric utilities: digitize (bin values), histogram (frequency counts with density option), linspace / arange (number sequences), percentileOfScore (percentile rank of a score), zscore (z-score standardisation), minMaxNormalize (scale to [0,1] or custom range), coefficientOfVariation (std/mean). Series-aware variants included.

+
✅ Complete
+
+ +
+
+

🏷️ categorical_ops — Categorical Utilities

+

Standalone categorical helpers: catFromCodes (from integer codes), set operations (catUnionCategories, catIntersectCategories, catDiffCategories, catEqualCategories), catSortByFreq, catToOrdinal, catFreqTable, catCrossTab, catRecode.

+
✅ Complete
+
+
+
+
+

🔢 format_ops — Number Formatting

+

Number-formatting helpers for Series and DataFrame. Scalar formatters: formatFloat, formatPercent, formatScientific, formatEngineering, formatThousands, formatCurrency, formatCompact. Formatter factories: makeFloatFormatter, makePercentFormatter, makeCurrencyFormatter. Apply to collections: applySeriesFormatter, applyDataFrameFormatter. Render to string: seriesToString, dataFrameToString.

+
✅ Complete
+
+
+ + +
+

Performance

+
+
+

⚡ Benchmarks

+

Side-by-side performance comparison of tsb (TypeScript/Bun) vs pandas (Python). Timing metrics for each function.

+
🏗️ In Progress
+
diff --git a/playground/insert_pop.html b/playground/insert_pop.html new file mode 100644 index 00000000..8b724566 --- /dev/null +++ b/playground/insert_pop.html @@ -0,0 +1,172 @@ + + + + + + tsb — insertColumn / popColumn + + + +

← tsb playground

+ +

insertColumn / popColumn

+

+ Column insertion and removal for DataFrames — mirrors + + pandas.DataFrame.insert() and + + pandas.DataFrame.pop(). +

+

+ Because tsb DataFrames are immutable, both functions return a new DataFrame + rather than mutating the original. popColumn returns both the extracted + Series and the resulting DataFrame. +

+ +

API summary

+ + + + + + + + + + + + + + + + + + + + + + + + +
FunctionPandas equivalentDescription
insertColumn(df, loc, col, values)df.insert(loc, col, value)Insert a new column at integer position loc
popColumn(df, col)df.pop(col)Remove a column; returns { series, df }
reorderColumns(df, order)df[order]Reorder (and optionally subset) columns
moveColumn(df, col, newLoc)Move an existing column to a new integer position
+ +

Example 1 — insertColumn

+
import { DataFrame, insertColumn } from "tsb";
+
+const df = DataFrame.fromColumns({
+  name: ["Alice", "Bob", "Carol"],
+  age:  [30, 25, 35],
+});
+// columns: ["name", "age"]
+
+// Insert "city" between "name" and "age"
+const df2 = insertColumn(df, 1, "city", ["NY", "LA", "SF"]);
+// df2.columns.values → ["name", "city", "age"]
+// df2.col("city").values → ["NY", "LA", "SF"]
+
+// Original is unchanged
+// df.columns.values → ["name", "age"]
+
+ +

Example 2 — Insert with a Series

+
import { DataFrame, Series, insertColumn } from "tsb";
+
+const df = DataFrame.fromColumns({ a: [1, 2, 3], b: [4, 5, 6] });
+const salary = new Series({ data: [100_000, 90_000, 120_000], name: "salary" });
+
+const df2 = insertColumn(df, 0, "salary", salary);
+// df2.columns.values → ["salary", "a", "b"]
+
+ +

Example 3 — popColumn

+
import { DataFrame, popColumn } from "tsb";
+
+const df = DataFrame.fromColumns({
+  id:   [1, 2, 3],
+  name: ["Alice", "Bob", "Carol"],
+  age:  [30, 25, 35],
+});
+
+// Remove "age" and keep the Series
+const { series: ageSeries, df: df2 } = popColumn(df, "age");
+// ageSeries.values       → [30, 25, 35]
+// df2.columns.values     → ["id", "name"]
+// df.columns.values      → ["id", "name", "age"]  ← original unchanged
+
+ +

Example 4 — reorderColumns

+
import { DataFrame, reorderColumns } from "tsb";
+
+const df = DataFrame.fromColumns({ a: [1], b: [2], c: [3], d: [4] });
+
+// Reverse the column order
+const df2 = reorderColumns(df, ["d", "c", "b", "a"]);
+// df2.columns.values → ["d", "c", "b", "a"]
+
+// Select a subset (drops columns not listed)
+const df3 = reorderColumns(df, ["a", "c"]);
+// df3.columns.values → ["a", "c"]   (b and d are dropped)
+
+ +

Example 5 — moveColumn

+
import { DataFrame, moveColumn } from "tsb";
+
+const df = DataFrame.fromColumns({
+  year:  [2020, 2021, 2022],
+  value: [10, 20, 30],
+  label: ["a", "b", "c"],
+});
+// columns: ["year", "value", "label"]
+
+// Move "label" to the front
+const df2 = moveColumn(df, "label", 0);
+// df2.columns.values → ["label", "year", "value"]
+
+ +

Error cases

+
// Duplicate column name (default: not allowed)
+insertColumn(df, 1, "a", [1, 2, 3]);
+// → RangeError: Column "a" already exists. Use allowDuplicates=true to permit...
+
+// Out-of-range loc
+insertColumn(df, 99, "x", [1, 2, 3]);
+// → RangeError: loc=99 is out of range [0, 2].
+
+// Wrong number of values
+insertColumn(df, 0, "x", [1]);  // df has 3 rows
+// → RangeError: values length 1 does not match DataFrame row count 3.
+
+// Column not found
+popColumn(df, "missing");
+// → RangeError: Column "missing" not found in DataFrame.
+
+ +
+ Immutability: Like all tsb DataFrame operations, these functions never + mutate the original DataFrame. Always assign the return value to a new variable. +
+ +

pandas equivalence table

+ + + + + + + + + +
pandastsb
df.insert(1, "x", [1,2,3]) *(mutates)*insertColumn(df, 1, "x", [1,2,3])
series = df.pop("col") *(mutates)*const { series, df: df2 } = popColumn(df, "col")
df[["c","a","b"]]reorderColumns(df, ["c","a","b"])
+ + diff --git a/playground/notna_isna.html b/playground/notna_isna.html new file mode 100644 index 00000000..8002a5d9 --- /dev/null +++ b/playground/notna_isna.html @@ -0,0 +1,242 @@ + + + + + + tsb · isna / notna — Missing Value Detection + + + + + +

isna / notna

+

Module-level missing-value detection — mirrors pd.isna(), pd.notna(), pd.isnull(), pd.notnull() from pandas.

+ +

What is "missing"?

+

In tsb, the following values are considered missing:

+ +

Everything else — 0, false, "", new Date(...) — is not missing.

+ +

API Overview

+ + + + + + + + + + + + +
FunctionInputOutputPandas equivalent
isna(v)Scalarbooleanpd.isna(v)
isna(arr)Scalar[]boolean[]pd.isna(arr)
isna(series)SeriesSeries<boolean>pd.isna(series)
isna(df)DataFrameDataFramepd.isna(df)
notna(v)any of abovesame shape, invertedpd.notna(v)
isnull / notnullany of abovesame as isna/notnaaliases
fillna(obj, {value})Scalar/array/Series/DataFramesame type, no missingpd.Series.fillna()
dropna(obj, opts?)array/Series/DataFramemissing entries removedpd.Series.dropna()
countna(obj)array or Seriesnumberseries.isna().sum()
countValid(obj)array or Seriesnumberseries.count()
+ +

🔬 Try it: isna on scalars

+
+ + + +
Click "Run isna" to see results.
+
+ +

🔬 Try it: isna on arrays

+
+ + + +
Click "Run isna" to see results.
+
+ +

🔬 Try it: fillna on arrays

+
+ + + + + +
Click "Run fillna" to see results.
+
+ +

🔬 Try it: dropna on arrays

+
+ + + +
Click "Run dropna" to see results.
+
+ +

📝 Code examples

+
+
+import { isna, notna, isnull, notnull, fillna, dropna, countna, countValid } from "tsb";
+import { Series, DataFrame } from "tsb";
+
+// ── scalar ──────────────────────────────────────────────────
+isna(null);          // true
+isna(undefined);     // true
+isna(NaN);           // true
+isna(0);             // false  — zero is not missing
+isna(false);         // false  — false is not missing
+isna("");            // false  — empty string is not missing
+
+// ── array ───────────────────────────────────────────────────
+isna([1, null, NaN, 3]);     // [false, true, true, false]
+notna([1, null, NaN, 3]);    // [true, false, false, true]
+
+// ── Series ──────────────────────────────────────────────────
+const s = new Series({ data: [1, null, NaN, 4] });
+isna(s).values;   // [false, true, true, false]
+notna(s).values;  // [true, false, false, true]
+
+// ── DataFrame ───────────────────────────────────────────────
+const df = new DataFrame(new Map([
+  ["a", new Series({ data: [1, null, 3] })],
+  ["b", new Series({ data: [NaN, 5, 6] })],
+]));
+isna(df).col("a").values;  // [false, true, false]
+isna(df).col("b").values;  // [true, false, false]
+
+// ── aliases ─────────────────────────────────────────────────
+isnull(null);   // true  (same as isna)
+notnull(42);    // true  (same as notna)
+
+// ── fillna ──────────────────────────────────────────────────
+fillna([1, null, NaN, 4], { value: 0 });   // [1, 0, 0, 4]
+fillna(s, { value: -1 }).values;           // [1, -1, -1, 4]
+fillna(df, { value: 0 }).col("b").values;  // [0, 5, 6]
+
+// ── dropna ──────────────────────────────────────────────────
+dropna([1, null, NaN, 3]);   // [1, 3]
+dropna(s).values;            // [1, 4]
+dropna(df).shape;            // [2, 2]  (row 0 dropped because b[0]=NaN, row 1 dropped because a[1]=null)
+dropna(df, { how: "all" }).shape;         // drops only rows where ALL values are missing
+dropna(df, { axis: 1 }).columns.values;  // drops columns that contain any missing value
+
+// ── countna / countValid ─────────────────────────────────────
+countna([1, null, NaN, 3]);    // 2
+countValid([1, null, NaN, 3]); // 2
+
+
+ + + + diff --git a/playground/numeric_extended.html b/playground/numeric_extended.html new file mode 100644 index 00000000..14cc4990 --- /dev/null +++ b/playground/numeric_extended.html @@ -0,0 +1,353 @@ + + + + + + tsb — Numeric Utilities (digitize, histogram, linspace, arange, zscore…) + + + +

🔢 Numeric Utilities

+

+ ← back to index +

+

+ tsb ships numpy/scipy-style numeric utility functions — all implemented + from scratch with no external dependencies: + digitize, histogram, linspace, arange, + percentileOfScore, zscore, minMaxNormalize, + coefficientOfVariation. +

+ +
+

digitize — bin values

+

+ Map each value to the index of the bin it falls into. Mirrors numpy.digitize. + Indices are 0-based; values below the first edge return -1. +

+
import { digitize, seriesDigitize, Series } from "tsb";
+
+// Find which [0,33), [33,66), [66,100] bucket each score belongs to
+const scores = [15, 45, 70, 33, 100];
+const edges  = [33, 66, 100];
+
+const bins = digitize(scores, edges);
+// → [-1, 1, 2, 0, 2]
+// 15 < 33      → bin -1 (below first edge)
+// 45 ∈ [33,66) → bin  1
+// 70 ∈ [66,100)→ bin  2
+// 33 ∈ [33,66) → bin  0 (33 < 66, right=false default)
+// 100 = last   → bin  2
+
+// Series version — preserves index
+const s = new Series({ data: [15, 45, 70], index: ["Alice","Bob","Carol"] });
+seriesDigitize(s, [33, 66, 100]);
+// Series: Alice→-1, Bob→1, Carol→2
+
Running…
+
+ +
+

histogram — frequency counts

+

Count how many values fall in each bin. Mirrors numpy.histogram.

+
import { histogram } from "tsb";
+
+const data = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10];
+
+// Default: 10 equal-width bins
+const { counts, binEdges } = histogram(data);
+
+// Custom: 5 bins, density normalised
+const { counts: d, binEdges: e } = histogram(data, { bins: 5, density: true });
+
+// Explicit edges
+histogram(data, { binEdges: [1, 4, 7, 10] });
+// counts: [ 3, 3, 4 ]
+
Running…
+
+ +
+

linspace & arange — number sequences

+

Generate evenly-spaced sequences, mirroring numpy.linspace and numpy.arange.

+
import { linspace, arange } from "tsb";
+
+// 5 values from 0 to 1 (inclusive)
+linspace(0, 1, 5);
+// → [0, 0.25, 0.5, 0.75, 1]
+
+// 0..4
+arange(5);
+// → [0, 1, 2, 3, 4]
+
+// From 2 to 10, step 2
+arange(2, 10, 2);
+// → [2, 4, 6, 8]
+
+// Descending
+arange(5, 0, -1);
+// → [5, 4, 3, 2, 1]
+
Running…
+
+ +
+

percentileOfScore — percentile rank

+

+ Compute what percentile a given score falls at within a dataset. + Mirrors scipy.stats.percentileofscore. +

+
import { percentileOfScore } from "tsb";
+
+const grades = [55, 60, 70, 75, 80, 85, 90, 95];
+
+// What percentile is a score of 75?
+percentileOfScore(grades, 75);            // 50 (rank — default)
+percentileOfScore(grades, 75, "weak");    // 50 (≤ 75: 4/8 = 50%)
+percentileOfScore(grades, 75, "strict");  // 37.5 (< 75: 3/8 = 37.5%)
+
Running…
+
+ +
+

zscore — standardisation

+

+ Transform values to zero mean and unit variance. Mirrors scipy.stats.zscore. + Missing values are propagated; zero-variance data returns all NaN. +

+
import { zscore, Series } from "tsb";
+
+const s = new Series({ data: [2, 4, 4, 4, 5, 5, 7, 9], name: "values" });
+const z = zscore(s);
+
+// z.values ≈ [-1.5, -0.5, -0.5, -0.5, 0, 0, 1, 2]
+
+// With population std (ddof=0)
+const zPop = zscore(s, { ddof: 0 });
+
Running…
+
+ +
+

minMaxNormalize — scale to [0, 1]

+

+ Scale all values to the interval [0, 1] (or a custom range). + Mirrors sklearn MinMaxScaler. +

+
import { minMaxNormalize, Series } from "tsb";
+
+const s = new Series({ data: [0, 25, 50, 75, 100] });
+minMaxNormalize(s).values;
+// → [0, 0.25, 0.5, 0.75, 1]
+
+// Scale to [-1, 1]
+minMaxNormalize(s, { featureRangeMin: -1, featureRangeMax: 1 }).values;
+// → [-1, -0.5, 0, 0.5, 1]
+
Running…
+
+ +
+

coefficientOfVariation — relative spread

+

+ Dimensionless measure of dispersion: std / |mean|. + Useful for comparing spread across datasets with different units. +

+
import { coefficientOfVariation, Series } from "tsb";
+
+// Dataset A: [10, 20, 30]  mean=20, std=10  → CV=0.5
+coefficientOfVariation(new Series({ data: [10, 20, 30] }));
+
+// Dataset B: [100, 200, 300]  same shape, higher scale  → CV=0.5
+coefficientOfVariation(new Series({ data: [100, 200, 300] }));
+
+// CV with population std
+coefficientOfVariation(new Series({ data: [1, 2, 3, 4, 5] }), { ddof: 0 });
+
Running…
+
+ + + + diff --git a/playground/pipe_apply.html b/playground/pipe_apply.html new file mode 100644 index 00000000..25f10a21 --- /dev/null +++ b/playground/pipe_apply.html @@ -0,0 +1,276 @@ + + + + + + tsb — pipe_apply: functional pipeline & apply utilities + + + +

← tsb playground

+ +

pipe_apply — Functional Pipeline & Apply Utilities

+

+ Standalone equivalents of the pandas + DataFrame.pipe() + / + Series.pipe() + chaining pattern plus various + apply() + / + applymap() + operations — usable without method-call syntax. +

+ +
+ Why standalone? pandas chains operations via methods: + df.pipe(fn1).pipe(fn2). tsb provides a module-level + pipe(value, fn1, fn2, …) that works on any value, + not just DataFrames. All functions are pure — inputs are never mutated. +
+ +

API Summary

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
FunctionPandas equivalentDescription
pipe(value, fn1, fn2, …)df.pipe(fn).pipe(fn2)Variadic type-safe pipeline — passes value through fns left-to-right
seriesApply(s, fn)s.apply(fn)Element-wise; fn receives (value, label, position)
seriesTransform(s, fn)s.transform(fn)Element-wise scalar→scalar; simpler than seriesApply
dataFrameApply(df, fn, axis?)df.apply(fn, axis=0|1)Apply fn to each column (axis=0) or row (axis=1) → Series of results
dataFrameApplyMap(df, fn)df.applymap(fn) / df.map(fn)Apply fn to every cell; fn receives (value, rowLabel, colName)
dataFrameTransform(df, fn)df.transform(fn)Replace each column with fn(col) — must return same-length Series
dataFrameTransformRows(df, fn)df.apply(fn, axis=1, result_type='expand')Replace each row with fn(rowRecord) — partial updates allowed
+ +

pipe — functional pipeline

+ +
import { pipe } from "tsb";
+import { DataFrame } from "tsb";
+
+// Type-safe pipeline with up to 8 steps (return type inferred at each step)
+const result = pipe(
+  rawData,
+  (df) => df.dropna(),                                  // DataFrame → DataFrame
+  (df) => df.assign({ z: df.col("x").add(df.col("y")).values }), // DataFrame → DataFrame
+  (df) => df.head(10),                                  // DataFrame → DataFrame
+  (df) => df.sum(),                                     // DataFrame → Series
+);
+
+// Works on any value — not just DataFrames
+const n = pipe(
+  3,
+  (x) => x + 1,   // 4
+  (x) => x * x,   // 16
+  (x) => x - 1,   // 15
+);
+// n === 15
+ +

seriesApply — element-wise apply

+ +
import { seriesApply, seriesTransform } from "tsb";
+import { Series } from "tsb";
+
+const temps = new Series({ data: [22.1, 23.5, null, 21.8], name: "temp_C" });
+
+// Element-wise with (value, label, position) context
+const fahrenheit = seriesApply(temps, (v) => v === null ? null : (v as number) * 9/5 + 32);
+// [71.78, 74.3, null, 71.24]
+
+// Simple scalar transform (no label/position needed)
+const rounded = seriesTransform(temps, (v) => v === null ? null : Math.round(v as number));
+// [22, 24, null, 22]
+
+// Using position to build cumulative logic
+const withPos = seriesApply(
+  new Series({ data: [10, 20, 30] }),
+  (v, _label, pos) => (v as number) + pos * 100,
+);
+// [10, 120, 230]
+ +

dataFrameApply — column/row aggregation

+ +
import { dataFrameApply } from "tsb";
+import { DataFrame } from "tsb";
+
+const df = DataFrame.fromColumns({
+  score: [85, 92, 78, 95],
+  weight: [1.0, 1.2, 0.8, 1.5],
+});
+
+// axis=0 (default): apply fn to each column → Series indexed by column names
+const colMax = dataFrameApply(df, (col) => col.max() ?? null);
+// colMax.at("score")  === 95
+// colMax.at("weight") === 1.5
+
+// axis=1: apply fn to each row → Series indexed by row labels
+const weightedScore = dataFrameApply(
+  df,
+  (row) => (row.at("score") as number) * (row.at("weight") as number),
+  1,
+);
+// [85, 110.4, 62.4, 142.5]
+ +

dataFrameApplyMap — element-wise cell transform

+ +
import { dataFrameApplyMap } from "tsb";
+import { DataFrame } from "tsb";
+
+const df = DataFrame.fromColumns({
+  a: [1, -2, 3],
+  b: [-4, 5, -6],
+});
+
+// Zero out all negative values (like pandas df.applymap(lambda x: max(x, 0)))
+const clipped = dataFrameApplyMap(df, (v) => {
+  return typeof v === "number" && v < 0 ? 0 : v;
+});
+// a: [1, 0, 3]
+// b: [0, 5, 0]
+
+// fn receives full context: (value, rowLabel, colName)
+const tagged = dataFrameApplyMap(df, (v, row, col) => `${col}[${row}]=${v}`);
+// a: ["a[0]=1", "a[1]=-2", "a[2]=3"]
+// b: ["b[0]=-4", "b[1]=5", "b[2]=-6"]
+ +

dataFrameTransform — column-wise transform

+ +
import { dataFrameTransform, seriesTransform } from "tsb";
+import { DataFrame } from "tsb";
+
+const df = DataFrame.fromColumns({
+  x: [1, 2, 3, 4, 5],
+  y: [10, 20, 30, 40, 50],
+});
+
+// Z-score normalize each column
+const normalized = dataFrameTransform(df, (col) => {
+  const mu = col.mean();
+  const sd = col.std();
+  return seriesTransform(col, (v) =>
+    typeof v === "number" && sd > 0 ? (v - mu) / sd : v
+  );
+});
+
+// Bin each column into quartiles
+const binned = dataFrameTransform(df, (col) => {
+  const q1 = col.quantile(0.25);
+  const q2 = col.quantile(0.5);
+  const q3 = col.quantile(0.75);
+  return seriesTransform(col, (v) => {
+    const n = v as number;
+    if (n <= q1) return "Q1";
+    if (n <= q2) return "Q2";
+    if (n <= q3) return "Q3";
+    return "Q4";
+  });
+});
+ +

dataFrameTransformRows — row-wise transform

+ +
import { dataFrameTransformRows } from "tsb";
+import { DataFrame } from "tsb";
+
+const df = DataFrame.fromColumns({
+  first: ["alice", "bob", "carol"],
+  last:  ["smith", "jones", "white"],
+  score: [88, 75, 92],
+});
+
+// Normalise scores relative to the row's position (illustrative)
+const updated = dataFrameTransformRows(df, (row, _label, pos) => ({
+  // Only return keys you want to change — others are preserved as-is
+  score: (row["score"] as number) + pos,
+}));
+// scores become [88, 76, 94]
+// first and last columns are unchanged
+
+// Full row transformation (compute full name)
+const withFull = dataFrameTransformRows(df, (row) => ({
+  first: row["first"],
+  last:  row["last"],
+  score: row["score"],
+  full:  `${row["first"]} ${row["last"]}`,
+}));
+ +

Combining pipe + apply

+ +
import { pipe, dataFrameApplyMap, dataFrameTransform, seriesTransform } from "tsb";
+import { DataFrame } from "tsb";
+
+const raw = DataFrame.fromColumns({
+  price:    [9.99, -1, 24.5, null, 49.0],
+  quantity: [3, 5, null, 2, 1],
+});
+
+// Clean → impute → normalise in one readable pipeline
+const clean = pipe(
+  raw,
+  // 1. zero out invalid prices/quantities
+  (df) => dataFrameApplyMap(df, (v) =>
+    v === null || (typeof v === "number" && v < 0) ? 0 : v
+  ),
+  // 2. add derived revenue column
+  (df) => df.assign({
+    revenue: df.col("price").mul(df.col("quantity")).values,
+  }),
+  // 3. round everything to 2 dp
+  (df) => dataFrameTransform(df, (col) =>
+    seriesTransform(col, (v) =>
+      typeof v === "number" ? Math.round(v * 100) / 100 : v
+    )
+  ),
+);
+ +
+

+ pandas DataFrame.pipe docs + · + pandas DataFrame.apply docs + · + tsb on GitHub +

+ + diff --git a/playground/playground-runtime.js b/playground/playground-runtime.js index 8a6afbd5..32039ec6 100644 --- a/playground/playground-runtime.js +++ b/playground/playground-runtime.js @@ -202,6 +202,104 @@ function setEditorCode(editor, code) { } } +// ── Syntax highlighting ──────────────────────────────────────────── + +function escapeHtml(text) { + return text + .replace(/&/g, "&") + .replace(//g, ">"); +} + +var HL_KEYWORDS = + "import|from|export|default|const|let|var|function|return|if|else|for|" + + "while|do|switch|case|break|continue|throw|try|catch|finally|new|class|" + + "extends|implements|interface|type|enum|as|typeof|instanceof|in|of|void|" + + "async|await|yield|static|get|set|super|delete|namespace"; +var HL_LITERALS = "true|false|null|undefined|this|NaN|Infinity"; +var HL_LITERALS_SET = {}; +HL_LITERALS.split("|").forEach(function (w) { + HL_LITERALS_SET[w] = true; +}); + +var HL_REGEX = new RegExp( + "(" + + "\\/\\/[^\\n]*" + + "|\\/\\*[\\s\\S]*?\\*\\/" + + '|"(?:[^"\\\\]|\\\\.)*"' + + "|'(?:[^'\\\\]|\\\\.)*'" + + "|`(?:[^`\\\\]|\\\\.)*`" + + "|\\b(?:" + + HL_KEYWORDS + + ")\\b" + + "|\\b(?:" + + HL_LITERALS + + ")\\b" + + "|\\b\\d+(?:\\.\\d+)?(?:e[+-]?\\d+)?\\b" + + ")", + "g", +); + +function highlightCode(code) { + var html = ""; + var lastIndex = 0; + HL_REGEX.lastIndex = 0; + var match; + while ((match = HL_REGEX.exec(code)) !== null) { + if (match.index > lastIndex) { + html += escapeHtml(code.slice(lastIndex, match.index)); + } + var token = match[0]; + var cls; + var ch = token.charAt(0); + if (ch === "/" && (token.charAt(1) === "/" || token.charAt(1) === "*")) { + cls = "hl-comment"; + } else if (ch === '"' || ch === "'" || ch === "`") { + cls = "hl-string"; + } else if (/^\d/.test(token)) { + cls = "hl-number"; + } else if (HL_LITERALS_SET[token]) { + cls = "hl-literal"; + } else { + cls = "hl-keyword"; + } + html += '' + escapeHtml(token) + ""; + lastIndex = HL_REGEX.lastIndex; + } + if (lastIndex < code.length) { + html += 
escapeHtml(code.slice(lastIndex)); + } + return html; +} + +function injectHighlightStyles() { + var style = document.createElement("style"); + style.textContent = + ".editor-wrapper{position:relative}" + + ".editor-highlight{" + + "position:absolute;top:0;left:0;right:0;bottom:0;" + + "margin:0;pointer-events:none;z-index:1;" + + "background:transparent;" + + "border:1px solid transparent;border-top:none;border-bottom:none;" + + "border-radius:0;" + + "padding:1rem;" + + "font-family:var(--font-mono,'Cascadia Code','Fira Code','JetBrains Mono',monospace);" + + "font-size:0.875rem;line-height:1.55;" + + "white-space:pre;tab-size:2;overflow:hidden;" + + "color:#e6edf3;" + + "}" + + ".playground-editor.editor-transparent{" + + "color:transparent!important;" + + "caret-color:#e6edf3;" + + "}" + + ".hl-keyword{color:#ff7b72}" + + ".hl-string{color:#a5d6ff}" + + ".hl-comment{color:#8b949e;font-style:italic}" + + ".hl-number{color:#79c0ff}" + + ".hl-literal{color:#79c0ff}"; + document.head.appendChild(style); +} + // ── Playground block setup ───────────────────────────────────────── function setupBlock(block, ts) { @@ -214,6 +312,20 @@ function setupBlock(block, ts) { var originalCode = getEditorCode(editor); + // Normalize: convert contenteditable
 to 
+      
+      
+ + +
+ + +

+  
+
+  
+  
+

strGetDummies — one-hot encode by delimiter

+

Split each string by a delimiter and produce a binary indicator DataFrame — + one column per unique token. Equivalent to pandas.Series.str.get_dummies().

+
+
+ + +
+
+ + + + +
+
+ +

+  
+ + +
+

strExtractAll — extract all regex matches

+

Find every non-overlapping regex match in each element. Returns a JSON-encoded + array of match arrays per element — parse with JSON.parse.

+
+
+ + +
+
+ + + + +
+
+ +

+  
+ + +
+

strRemovePrefix / strRemoveSuffix

+

Strip a leading or trailing string from elements only when it is present.

+
+
+ + +
+
+ + + + +
+
+ +

+  
+ + +
+

strTranslate — character-level substitution

+

Replace or delete individual characters using a lookup table. + Format: one mapping per line as from=to or from= + to delete.

+
+
+ + +
+
+ + +
+
+ +

+  
+ + +
+

strCharWidth & strByteLength — display & byte widths

+

+ strCharWidth counts columns for terminal display (CJK chars count as 2).
+ strByteLength counts UTF-8 bytes (useful for byte-limited APIs). +

+ + + +

+  
+ + + + + diff --git a/playground/string_ops_extended.html b/playground/string_ops_extended.html new file mode 100644 index 00000000..81bdaddb --- /dev/null +++ b/playground/string_ops_extended.html @@ -0,0 +1,413 @@ + + + + + + tsb — Extended String Operations + + + +
+

tsb

+ string_ops_extended + Advanced standalone string operations: split-expand, extract, partition, multi-replace, indent, dedent +
+
+

+ string_ops_extended adds advanced string utilities that complement + string_ops and the Series.str accessor. All functions accept + a Series, an array, or a scalar string. +

+ + +
+

strSplitExpand — split and expand to DataFrame columns

+

+ Split each element by a delimiter and expand the parts into a DataFrame + with one column per position. Mirrors pandas.Series.str.split(expand=True). + Shorter rows are padded with null. +

+
+
+ + +
+
+ + + + +
+
+ +

+  
+ + +
+

strExtractGroups — extract regex capture groups

+

+ Extract regex capture groups from each element into a DataFrame. + Named groups ((?<name>...)) become column names; unnamed groups + become 0, 1, … Non-matching rows produce null. +

+
+
+ + +
+
+ + +
Use (?<name>...) for named capture groups.
+
+
+ +

+  
+ + +
+

strPartition / strRPartition — split into (before, sep, after)

+

+ strPartition splits at the first occurrence of the separator; + strRPartition splits at the last. When the separator is not + found, strPartition returns [s, "", ""] and + strRPartition returns ["", "", s]. +

+
+
+ + +
+
+ + +
+
+ +

+  
+ + +
+

strMultiReplace — apply multiple replacements in sequence

+

+ Apply an ordered list of {pat, repl} pairs to each element. + Each replacement is applied to the result of the previous one. + Patterns can be string literals (replaced globally) or RegExp objects. +

+
+
+ + +
+
+ + +
+
+ +

+  
+ + +
+

strIndent / strDedent — line-level indentation utilities

+

+ strIndent adds a prefix to every non-empty line (mirrors + textwrap.indent). + strDedent removes the common leading whitespace from all lines + (mirrors textwrap.dedent). +

+
+
+ + +
+
+ + +
+
+ +

+  
+ +
+ + + + diff --git a/playground/to_from_dict.html b/playground/to_from_dict.html new file mode 100644 index 00000000..a8ca3e88 --- /dev/null +++ b/playground/to_from_dict.html @@ -0,0 +1,122 @@ + + + + + + tsb — toDictOriented / fromDictOriented + + + +

← tsb playground

+ +

toDictOriented / fromDictOriented

+

+ Convert a DataFrame to and from dictionary structures with flexible orientation — mirrors + + pandas.DataFrame.to_dict(orient=...) and + + pandas.DataFrame.from_dict(orient=...). +

+ +

Supported orientations — toDictOriented

+ + + + + + + + + + + +
OrientReturn typeDescription
"dict" / "columns"Record<col, Record<rowLabel, value>>Nested column → row-label → value map
"list"Record<col, value[]>Column name → array of values
"series"Record<col, Series>Column name → Series object
"split"{ index, columns, data }Serialisable split structure
"tight"{ index, columns, data, index_names, column_names }Split plus axis-name metadata
"records"Record<col, value>[]Array of row objects
"index"Record<rowLabel, Record<col, value>>Row-label → column → value
+ +

Supported orientations — fromDictOriented

+ + + + + + + + +
OrientInput shape
"columns" (default){ col: value[] }
"index"{ rowLabel: { col: value } }
"split"{ index?, columns, data }
"tight"Same as "split", extra fields ignored
+ +

Example — all orientations

+
import { DataFrame } from "tsb";
+import { toDictOriented, fromDictOriented, Index } from "tsb";
+
+const df = DataFrame.fromColumns(
+  { name: ["Alice", "Bob"], score: [92, 85] },
+  { index: new Index(["r0", "r1"]) },
+);
+
+// "dict" / "columns"
+toDictOriented(df, "dict");
+// { name: { r0: "Alice", r1: "Bob" }, score: { r0: 92, r1: 85 } }
+
+// "list"
+toDictOriented(df, "list");
+// { name: ["Alice", "Bob"], score: [92, 85] }
+
+// "records"
+toDictOriented(df, "records");
+// [ { name: "Alice", score: 92 }, { name: "Bob", score: 85 } ]
+
+// "split"
+toDictOriented(df, "split");
+// { index: ["r0", "r1"], columns: ["name", "score"], data: [["Alice", 92], ["Bob", 85]] }
+
+// "index"
+toDictOriented(df, "index");
+// { r0: { name: "Alice", score: 92 }, r1: { name: "Bob", score: 85 } }
+
+// fromDictOriented — columns (default)
+fromDictOriented({ name: ["Alice", "Bob"], score: [92, 85] });
+
+// fromDictOriented — index
+fromDictOriented(
+  { r0: { name: "Alice", score: 92 }, r1: { name: "Bob", score: 85 } },
+  "index",
+);
+
+// fromDictOriented — split (round-trip)
+const split = toDictOriented(df, "split");
+const df2 = fromDictOriented(split, "split");
+// df2 is equivalent to df
+
+ +

Missing values

+
+ Missing values (null / undefined) are preserved as null + in all orientations. When using fromDictOriented with "index" + orientation, any column that is absent from a given row object is filled with null. +
+ +

Type signatures

+
function toDictOriented(df: DataFrame, orient: "dict" | "columns"): Record<string, Record<string, Scalar>>;
+function toDictOriented(df: DataFrame, orient: "list"): Record<string, Scalar[]>;
+function toDictOriented(df: DataFrame, orient: "series"): Record<string, Series<Scalar>>;
+function toDictOriented(df: DataFrame, orient: "split"): DictSplit;
+function toDictOriented(df: DataFrame, orient: "tight"): DictTight;
+function toDictOriented(df: DataFrame, orient: "records"): Record<string, Scalar>[];
+function toDictOriented(df: DataFrame, orient: "index"): Record<string, Record<string, Scalar>>;
+
+function fromDictOriented(data: Record<string, readonly Scalar[]>, orient?: "columns"): DataFrame;
+function fromDictOriented(data: Record<string, Record<string, Scalar>>, orient: "index"): DataFrame;
+function fromDictOriented(data: SplitInput, orient: "split" | "tight"): DataFrame;
+
+ + diff --git a/playground/where_mask.html b/playground/where_mask.html new file mode 100644 index 00000000..89a50a05 --- /dev/null +++ b/playground/where_mask.html @@ -0,0 +1,220 @@ + + + + + + tsb — where / mask: Conditional Selection + + + +

tsb — where / mask: Conditional Selection

+

+ seriesWhere / seriesMask and their DataFrame equivalents + allow element-wise conditional replacement — the TypeScript equivalents of + pandas.Series.where + and + pandas.Series.mask. +

+ +
+ Quick rule:
+ where(cond)keep where cond is true, replace elsewhere.
+ mask(cond)keep where cond is false, replace elsewhere.
+ They are exact inverses of each other. +
+ +

1. seriesWhere — Boolean Array Condition

+

+ Pass a boolean[] to keep values at true positions, replace + the rest with null (or a custom other value). +

+
import { Series, seriesWhere } from "tsb";
+
+const scores = new Series({ data: [42, 91, 67, 55, 88] });
+const highScores = seriesWhere(scores, [false, true, false, false, true]);
+// Series [null, 91, null, null, 88]
+
+// Custom replacement value
+const clamped = seriesWhere(scores, [false, true, false, false, true], { other: 0 });
+// Series [0, 91, 0, 0, 88]
+ +

2. seriesWhere — Callable Condition

+

+ Pass a function that receives the Series and returns a boolean[] or + Series<boolean>. This avoids computing the condition array manually. +

+
import { Series, seriesWhere } from "tsb";
+
+const temps = new Series({ data: [-5, 12, 23, -3, 8] });
+
+// Keep only values above freezing
+const aboveFreezing = seriesWhere(
+  temps,
+  (s) => s.values.map((v) => (v as number) > 0),
+);
+// Series [null, 12, 23, null, 8]
+
+// Replace with 0 instead of null
+const noFreeze = seriesWhere(
+  temps,
+  (s) => s.values.map((v) => (v as number) > 0),
+  { other: 0 },
+);
+// Series [0, 12, 23, 0, 8]
+ +

3. seriesMask — The Inverse

+

+ mask replaces positions where the condition is true + (the opposite of where). Use it to "blank out" outliers or invalid values. +

+
import { Series, seriesMask } from "tsb";
+
+const data = new Series({ data: [1, 2, 3, 4, 5] });
+
+// Mask out values greater than 3
+const masked = seriesMask(
+  data,
+  (s) => s.values.map((v) => (v as number) > 3),
+  { other: null },
+);
+// Series [1, 2, 3, null, null]
+ +

4. dataFrameWhere — Element-Wise on DataFrames

+

+ Pass a boolean DataFrame or a callable that returns one. + Columns and row labels are aligned by name. +

+
import { DataFrame, dataFrameWhere } from "tsb";
+
+const df = DataFrame.fromColumns({
+  temp_c:   [22, -3, 18, -7, 30],
+  humidity: [55, 80, 62, 75, 45],
+});
+
+// Keep only valid summer readings (temp > 0)
+const condDf = DataFrame.fromColumns({
+  temp_c:   [true, false, true, false, true],
+  humidity: [true, false, true, false, true],
+});
+
+const summer = dataFrameWhere(df, condDf);
+// DataFrame:
+//   temp_c   [22,   null, 18,   null, 30  ]
+//   humidity [55,   null, 62,   null, 45  ]
+ +

5. dataFrameWhere — Callable Condition

+
import { DataFrame, dataFrameWhere } from "tsb";
+
+const df = DataFrame.fromColumns({
+  a: [1, 2, 3, 4, 5],
+  b: [10, 20, 30, 40, 50],
+});
+
+// Keep only values > 2 (column-wise threshold)
+const result = dataFrameWhere(df, (d) => {
+  const condCols: Record<string, boolean[]> = {};
+  for (const col of d.columns) {
+    condCols[col as string] = d.col(col as string).values.map(
+      (v) => (v as number) > 2
+    );
+  }
+  return DataFrame.fromColumns(condCols);
+});
+// DataFrame:
+//   a: [null, null, 3, 4, 5]
+//   b: [10,   20,   30, 40, 50]
+ +

6. dataFrameMask — DataFrame Mask

+
import { DataFrame, dataFrameMask } from "tsb";
+
+const df = DataFrame.fromColumns({
+  sales:  [100, 200, 50,  300, 80],
+  profit: [10,  40,  -5,  60,  -2],
+});
+
+// Mask out (replace) rows with negative profit
+const cleaned = dataFrameMask(
+  df,
+  (d) => {
+    const condCols: Record<string, boolean[]> = {};
+    for (const col of d.columns) {
+      condCols[col as string] = d.col(col as string).values.map(
+        (v) => (v as number) < 0
+      );
+    }
+    return DataFrame.fromColumns(condCols);
+  },
+  { other: 0 },
+);
+// DataFrame:
+//   sales:  [100, 200, 50,  300, 80]
+//   profit: [10,  40,  0,   60,  0 ]
+ +

Label-Aligned Series Condition

+

+ When you pass a Series<boolean> as the condition, values are aligned + by label, not position. Labels absent from the condition series are treated + as false. +

+
import { Series, seriesWhere } from "tsb";
+
+const prices = new Series({ data: [10, 20, 30], index: ["a", "b", "c"] });
+const valid  = new Series<boolean>({ data: [false, true], index: ["a", "b"] });
+
+// Only "b" is in the condition with value=true; "a"=false, "c" missing→false
+const result = seriesWhere(prices, valid, { other: -1 });
+// Series { a: -1, b: 20, c: -1 }
+ +

API Reference

+ + + + + + +
FunctionKeeps when cond is…Replaces with
seriesWhere(s, cond, {other})trueother (default null)
seriesMask(s, cond, {other})falseother (default null)
dataFrameWhere(df, cond, {other})trueother (default null)
dataFrameMask(df, cond, {other})falseother (default null)
+ +

Condition types

+ + + + + + +
TypeSeries opsDataFrame ops
Boolean array✅ positional
Series<boolean>✅ label-aligned
DataFrame (boolean)✅ label-aligned
Callable✅ receives Series✅ receives DataFrame
+ +

← Back to tsb playground index

+ + diff --git a/playground/wide_to_long.html b/playground/wide_to_long.html new file mode 100644 index 00000000..b30980cd --- /dev/null +++ b/playground/wide_to_long.html @@ -0,0 +1,113 @@ + + + + + + tsb — wideToLong + + + +

← tsb playground

+ +

wideToLong

+

+ Reshape a wide-format DataFrame to long format by collapsing stub-prefixed column + groups into rows — mirrors + + pandas.wide_to_long(). +

+ +

Concept

+

+ Given a wide DataFrame where repeated measurements are spread across columns with a + common stub prefix and a numeric (or other) suffix — e.g. score_2021, + score_2022wideToLong pivots those column groups into rows. + One row per original row per unique suffix is produced. +

+ +

Example — numeric suffixes

+
import { DataFrame } from "tsb";
+import { wideToLong } from "tsb";
+
+const df = DataFrame.fromColumns({
+  id:  ["x", "y"],
+  A1:  [1, 2],
+  A2:  [3, 4],
+  B1:  [5, 6],
+  B2:  [7, 8],
+});
+
+const long = wideToLong(df, ["A", "B"], "id", "num");
+
+// long.columns.values → ["id", "num", "A", "B"]
+// long.shape          → [4, 4]
+//
+// id  num   A   B
+//  x    1   1   5
+//  y    1   2   6
+//  x    2   3   7
+//  y    2   4   8
+
+ +

Example — separator and custom suffix

+
const df = DataFrame.fromColumns({
+  country: ["US", "UK"],
+  gdp_2020: [21e12, 2.7e12],
+  gdp_2021: [23e12, 3.1e12],
+  pop_2020: [331e6, 67e6],
+  pop_2021: [332e6, 68e6],
+});
+
+const long = wideToLong(df, ["gdp", "pop"], "country", "year", { sep: "_" });
+// long.shape → [4, 4]  — 2 countries × 2 years
+// Columns: ["country", "year", "gdp", "pop"]
+
+ +

API reference

+
function wideToLong(
+  df: DataFrame,
+  stubnames: string | string[],
+  i: string | string[],
+  j: string,
+  options?: WideToLongOptions,
+): DataFrame;
+
+interface WideToLongOptions {
+  sep?: string;      // separator between stub and suffix, default ""
+  suffix?: string;   // regex string matching suffix, default "\\d+"
+}
+
+ +

Parameters

+ + + + + + + + + + +
ParameterTypeDescription
dfDataFrameSource DataFrame (not mutated)
stubnamesstring | string[]Prefix(es) shared by the wide column groups
istring | string[]Column(s) to keep as id variables (repeated per suffix)
jstringName of the new column holding the suffix values
options.sepstringSeparator between stub and suffix (default: "")
options.suffixstringRegex string matching the suffix (default: "\\d+")
+ +

Output layout

+
+ Output columns are always ordered: id cols, j, stub cols + (in the same order the stubs were passed). Suffixes are sorted numerically when they are all + integers, otherwise lexicographically. Wide columns that are absent from the DataFrame are + filled with null. +
+ + diff --git a/playground/window_extended.html b/playground/window_extended.html new file mode 100644 index 00000000..4232fa5d --- /dev/null +++ b/playground/window_extended.html @@ -0,0 +1,304 @@ + + + + + + tsb — Rolling Extended Stats: sem, skew, kurt, quantile + + + +

tsb — Rolling Extended Statistics

+

+ Higher-order rolling window statistics extending the core + + pandas.Series.rolling() + + API: + sem, skew, kurt, and + quantile. +

+ +

1. rollingSem — Standard Error of the Mean

+

+ The standard error of the mean measures how much the sample mean + would vary across repeated samples. For a window of n values: +

+
sem = std(ddof=1) / √n
+

Requires at least 2 valid observations per window.

+ +
import { rollingSem, Series } from "tsb";
+
+const s = new Series({ data: [2, 4, 4, 4, 5, 5, 7, 9], name: "x" });
+const sem3 = rollingSem(s, 3);
+// [null, null, 0.667, 0, 0.333, 0.333, 0.667, 1.155]
+
+ +
+

Live demo — sem with window=3

+

Comma-separated numbers (nulls accepted):

+ + + + +

+    
+ +

2. rollingSkew — Fisher-Pearson Skewness

+

+ Skewness measures asymmetry of the distribution in each window. + Positive = right tail heavier; negative = left tail heavier. + Uses the unbiased Fisher-Pearson formula (same as pandas): +

+
skew = [n/((n-1)(n-2))] × Σ[(xᵢ−x̄)/s]³
+

Requires ≥ 3 valid observations.

+ +
import { rollingSkew, Series } from "tsb";
+
+const s = new Series({ data: [1, 2, 3, 4, 5] });
+rollingSkew(s, 3);
+// [null, null, 0, 0, 0]   ← symmetric windows → zero skew
+
+ +
+

Live demo — skewness with window=4

+ + + +

+    
+ +

3. rollingKurt — Excess Kurtosis

+

+ Kurtosis measures how heavy the tails are relative to a normal distribution. + The excess kurtosis subtracts 3, so a normal distribution gives 0. + Uses the Fisher (1930) unbiased formula: +

+
kurt = [n(n+1)/((n-1)(n-2)(n-3))] × Σ[(xᵢ−x̄)/s]⁴ − 3(n-1)²/((n-2)(n-3))
+

Requires ≥ 4 valid observations.

+ +
import { rollingKurt, Series } from "tsb";
+
+const s = new Series({ data: [1, 2, 3, 4] });
+rollingKurt(s, 4);
+// [null, null, null, -1.2]   ← uniform distribution has kurt = -1.2
+
+ +
+

Live demo — excess kurtosis with window=5

+ + + +

+    
+ +

4. rollingQuantile — Rolling Quantile

+

+ Computes any quantile within each sliding window using configurable + interpolation. When q = 0.5 this is identical to + rolling.median(). +

+ +
import { rollingQuantile, Series } from "tsb";
+
+const s = new Series({ data: [1, 2, 3, 4, 5] });
+
+rollingQuantile(s, 0.5, 3);  // rolling median: [null, null, 2, 3, 4]
+rollingQuantile(s, 0.25, 3); // [null, null, 1.5, 2.5, 3.5]
+rollingQuantile(s, 0.75, 3); // [null, null, 2.5, 3.5, 4.5]
+
+ +

Interpolation methods

+ + + + + + + + + +
MethodBehaviour when q falls between two values
linear (default)Linear interpolation — same as NumPy / pandas default
lowerTake the lower of the two surrounding values
higherTake the higher of the two surrounding values
midpointArithmetic mean of the two surrounding values
nearestWhichever surrounding value is closest
+ +
+

Live demo — rolling quantile

+ + + + + +

+    
+ +

Common Options

+ + + + + + +
OptionTypeDefaultDescription
minPeriodsnumber= windowMinimum valid obs required per window
centerbooleanfalseCentre the window around each position
+ +
+ Note: Functions are pure — they return new Series objects + without modifying the input. Missing values (null, NaN) + are excluded from each window calculation. +
+ + + + diff --git a/src/core/api_types.ts b/src/core/api_types.ts new file mode 100644 index 00000000..860d2050 --- /dev/null +++ b/src/core/api_types.ts @@ -0,0 +1,629 @@ +/** + * api_types — runtime type-checking predicates, mirroring `pandas.api.types`. + * + * Two groups of functions are provided: + * + * **Value-level predicates** — operate on arbitrary JavaScript values, equivalent + * to `pandas.api.types.is_scalar`, `is_list_like`, `is_number`, etc. + * + * **Dtype-level predicates** — accept a `Dtype` instance or a `DtypeName` string + * and answer questions about the dtype's kind, equivalent to + * `pandas.api.types.is_numeric_dtype`, `is_float_dtype`, etc. + * + * @example + * ```ts + * import { isScalar, isNumericDtype, Dtype } from "tsb"; + * isScalar(42); // true + * isScalar([1, 2, 3]); // false + * isListLike([1, 2, 3]); // true + * isNumericDtype(Dtype.float64); // true + * isStringDtype("string"); // true + * ``` + * + * @module + */ + +import { Dtype } from "./dtype.ts"; +import type { DtypeName } from "../types.ts"; + +// ─── internal helper ────────────────────────────────────────────────────────── + +/** Resolve a Dtype | DtypeName to a Dtype instance. */ +function resolveDtype(dtype: Dtype | DtypeName): Dtype { + if (dtype instanceof Dtype) { + return dtype; + } + return Dtype.from(dtype); +} + +// ═════════════════════════════════════════════════════════════════════════════ +// VALUE-LEVEL PREDICATES +// ═════════════════════════════════════════════════════════════════════════════ + +/** + * Return `true` if `val` is a scalar (not a collection). + * + * Scalars: `string`, `number`, `bigint`, `boolean`, `symbol`, `null`, + * `undefined`, and `Date` objects. Arrays, plain objects, `Map`, `Set`, + * iterables, and class instances other than `Date` are **not** scalars. + * + * Mirrors `pandas.api.types.is_scalar`. 
+ * + * @example + * ```ts + * isScalar(42); // true + * isScalar("hello"); // true + * isScalar(null); // true + * isScalar([1, 2]); // false + * isScalar({ a: 1 }); // false + * ``` + */ +export function isScalar(val: unknown): boolean { + if (val === null || val === undefined) { + return true; + } + const t = typeof val; + if (t === "string" || t === "number" || t === "bigint" || t === "boolean" || t === "symbol") { + return true; + } + if (val instanceof Date) { + return true; + } + return false; +} + +/** + * Return `true` if `val` is "list-like" — i.e. iterable (but not a string) + * or has a non-negative integer `length` property. + * + * Mirrors `pandas.api.types.is_list_like`. + * + * @example + * ```ts + * isListLike([1, 2, 3]); // true + * isListLike(new Set([1])); // true + * isListLike("abc"); // false (strings excluded) + * isListLike(42); // false + * isListLike({ a: 1 }); // false + * ``` + */ +export function isListLike(val: unknown): boolean { + if (val === null || val === undefined) { + return false; + } + if (typeof val === "string") { + return false; + } + // Has Symbol.iterator and is not a plain number/boolean/bigint/symbol + if (typeof val === "number" || typeof val === "boolean" || typeof val === "bigint" || typeof val === "symbol") { + return false; + } + if (typeof val === "object" || typeof val === "function") { + if (Symbol.iterator in (val as object)) { + return true; + } + const len = (val as Record)["length"]; + if (typeof len === "number" && len >= 0 && Number.isInteger(len)) { + return true; + } + } + return false; +} + +/** + * Return `true` if `val` is array-like — i.e. has a non-negative integer + * `length` property. + * + * Mirrors `pandas.api.types.is_array_like`. 
+ * + * @example + * ```ts + * isArrayLike([1, 2]); // true + * isArrayLike("abc"); // true (strings have .length) + * isArrayLike(42); // false + * isArrayLike({}); // false + * ``` + */ +export function isArrayLike(val: unknown): boolean { + if (val === null || val === undefined) { + return false; + } + if (typeof val === "string") { + return true; + } + if (typeof val !== "object" && typeof val !== "function") { + return false; + } + const len = (val as Record)["length"]; + return typeof len === "number" && len >= 0 && Number.isInteger(len); +} + +/** + * Return `true` if `val` is dict-like — a plain object (not an array, not a + * `Date`, not a class instance). + * + * Mirrors `pandas.api.types.is_dict_like`. + * + * @example + * ```ts + * isDictLike({ a: 1 }); // true + * isDictLike(new Map()); // true (has .get / .set) + * isDictLike([1, 2]); // false + * isDictLike("abc"); // false + * ``` + */ +export function isDictLike(val: unknown): boolean { + if (val === null || val === undefined) { + return false; + } + if (typeof val !== "object") { + return false; + } + if (Array.isArray(val)) { + return false; + } + // Treat Map as dict-like (supports key lookup) + if (val instanceof Map) { + return true; + } + // Date is not dict-like + if (val instanceof Date) { + return false; + } + // Plain objects and other objects with properties + return true; +} + +/** + * Return `true` if `val` is an iterator — i.e. has a callable `next` method. + * + * Mirrors `pandas.api.types.is_iterator`. + * + * @example + * ```ts + * isIterator([1, 2][Symbol.iterator]()); // true + * isIterator([1, 2]); // false + * ``` + */ +export function isIterator(val: unknown): boolean { + if (val === null || val === undefined) { + return false; + } + if (typeof val !== "object" && typeof val !== "function") { + return false; + } + return typeof (val as Record)["next"] === "function"; +} + +/** + * Return `true` if `val` is a `number` (including `NaN` and `±Infinity`). 
+ * + * Mirrors `pandas.api.types.is_number`. + * + * @example + * ```ts + * isNumber(3.14); // true + * isNumber(NaN); // true + * isNumber("3"); // false + * ``` + */ +export function isNumber(val: unknown): val is number { + return typeof val === "number"; +} + +/** + * Return `true` if `val` is a `boolean`. + * + * Mirrors `pandas.api.types.is_bool`. + * + * @example + * ```ts + * isBool(true); // true + * isBool(1); // false + * ``` + */ +export function isBool(val: unknown): val is boolean { + return typeof val === "boolean"; +} + +/** + * Return `true` if `val` is a `string`. + * + * Named `isStringValue` to distinguish from the dtype-level `isStringDtype`. + * Mirrors `pandas.api.types.is_string` (not to be confused with dtype checks). + * + * @example + * ```ts + * isStringValue("hello"); // true + * isStringValue(42); // false + * ``` + */ +export function isStringValue(val: unknown): val is string { + return typeof val === "string"; +} + +/** + * Return `true` if `val` is a finite floating-point number (has a fractional + * component or is finite non-integer). `NaN`, `±Infinity` are **not** floats + * in the pandas sense. + * + * Mirrors `pandas.api.types.is_float`. + * + * @example + * ```ts + * isFloat(3.14); // true + * isFloat(3.0); // false (integer value) + * isFloat(NaN); // false + * isFloat(Infinity); // false + * ``` + */ +export function isFloat(val: unknown): boolean { + if (typeof val !== "number") { + return false; + } + if (!Number.isFinite(val)) { + return false; + } + return val !== Math.trunc(val); +} + +/** + * Return `true` if `val` is a finite integer-valued number. + * + * Mirrors `pandas.api.types.is_integer`. 
+ * + * @example + * ```ts + * isInteger(3); // true + * isInteger(3.0); // true (integer value stored as float) + * isInteger(3.14); // false + * isInteger(NaN); // false + * ``` + */ +export function isInteger(val: unknown): boolean { + return typeof val === "number" && Number.isInteger(val); +} + +/** + * Return `true` if `val` is a `bigint`. + * + * @example + * ```ts + * isBigInt(42n); // true + * isBigInt(42); // false + * ``` + */ +export function isBigInt(val: unknown): val is bigint { + return typeof val === "bigint"; +} + +/** + * Return `true` if `val` is a `RegExp`. + * + * Mirrors `pandas.api.types.is_re`. + * + * @example + * ```ts + * isRegExp(/abc/); // true + * isRegExp(new RegExp("x")); // true + * isRegExp("abc"); // false + * ``` + */ +export function isRegExp(val: unknown): val is RegExp { + return val instanceof RegExp; +} + +/** + * Return `true` if `val` can be compiled into a `RegExp` — i.e. it is either + * a `string` or already a `RegExp`. + * + * Mirrors `pandas.api.types.is_re_compilable`. + * + * @example + * ```ts + * isReCompilable("abc"); // true + * isReCompilable(/abc/); // true + * isReCompilable(42); // false + * ``` + */ +export function isReCompilable(val: unknown): boolean { + return typeof val === "string" || val instanceof RegExp; +} + +/** + * Return `true` if `val` is a "missing" value in the pandas sense: `null`, + * `undefined`, or `NaN`. + * + * @example + * ```ts + * isMissing(null); // true + * isMissing(undefined); // true + * isMissing(NaN); // true + * isMissing(0); // false + * isMissing(""); // false + * ``` + */ +export function isMissing(val: unknown): boolean { + if (val === null || val === undefined) { + return true; + } + if (typeof val === "number" && Number.isNaN(val)) { + return true; + } + return false; +} + +/** + * Return `true` if `val` is "hashable" — usable as an object-key in + * JavaScript. 
In practice this means it is a primitive (`string`, `number`, + * `bigint`, `boolean`, `symbol`, `null`, `undefined`). + * + * Mirrors the spirit of `pandas.api.types.is_hashable`. + * + * @example + * ```ts + * isHashable("key"); // true + * isHashable(42); // true + * isHashable({}); // false + * isHashable([]); // false + * ``` + */ +export function isHashable(val: unknown): boolean { + if (val === null || val === undefined) { + return true; + } + const t = typeof val; + return t === "string" || t === "number" || t === "bigint" || t === "boolean" || t === "symbol"; +} + +/** + * Return `true` if `val` is a `Date` instance. + * + * @example + * ```ts + * isDate(new Date()); // true + * isDate("2024-01-01"); // false + * ``` + */ +export function isDate(val: unknown): val is Date { + return val instanceof Date; +} + +// ═════════════════════════════════════════════════════════════════════════════ +// DTYPE-LEVEL PREDICATES +// ═════════════════════════════════════════════════════════════════════════════ + +/** + * Return `true` if the dtype is numeric (integer, unsigned integer, or float). + * + * Mirrors `pandas.api.types.is_numeric_dtype`. + * + * @example + * ```ts + * isNumericDtype(Dtype.float64); // true + * isNumericDtype("int32"); // true + * isNumericDtype("string"); // false + * ``` + */ +export function isNumericDtype(dtype: Dtype | DtypeName): boolean { + return resolveDtype(dtype).isNumeric; +} + +/** + * Return `true` if the dtype is any integer kind (signed or unsigned). + * + * Mirrors `pandas.api.types.is_integer_dtype`. + * + * @example + * ```ts + * isIntegerDtype("int64"); // true + * isIntegerDtype("uint8"); // true + * isIntegerDtype("float32"); // false + * ``` + */ +export function isIntegerDtype(dtype: Dtype | DtypeName): boolean { + return resolveDtype(dtype).isInteger; +} + +/** + * Return `true` if the dtype is a signed integer (`int8`–`int64`). + * + * Mirrors `pandas.api.types.is_signed_integer_dtype`. 
+ * + * @example + * ```ts + * isSignedIntegerDtype("int32"); // true + * isSignedIntegerDtype("uint32"); // false + * ``` + */ +export function isSignedIntegerDtype(dtype: Dtype | DtypeName): boolean { + return resolveDtype(dtype).isSignedInteger; +} + +/** + * Return `true` if the dtype is an unsigned integer (`uint8`–`uint64`). + * + * Mirrors `pandas.api.types.is_unsigned_integer_dtype`. + * + * @example + * ```ts + * isUnsignedIntegerDtype("uint64"); // true + * isUnsignedIntegerDtype("int64"); // false + * ``` + */ +export function isUnsignedIntegerDtype(dtype: Dtype | DtypeName): boolean { + return resolveDtype(dtype).isUnsignedInteger; +} + +/** + * Return `true` if the dtype is a floating-point type (`float32` or `float64`). + * + * Mirrors `pandas.api.types.is_float_dtype`. + * + * @example + * ```ts + * isFloatDtype("float64"); // true + * isFloatDtype("float32"); // true + * isFloatDtype("int32"); // false + * ``` + */ +export function isFloatDtype(dtype: Dtype | DtypeName): boolean { + return resolveDtype(dtype).isFloat; +} + +/** + * Return `true` if the dtype is boolean. + * + * Mirrors `pandas.api.types.is_bool_dtype`. + * + * @example + * ```ts + * isBoolDtype("bool"); // true + * isBoolDtype("int8"); // false + * ``` + */ +export function isBoolDtype(dtype: Dtype | DtypeName): boolean { + return resolveDtype(dtype).isBool; +} + +/** + * Return `true` if the dtype is the `string` dtype. + * + * Mirrors `pandas.api.types.is_string_dtype`. + * + * @example + * ```ts + * isStringDtype("string"); // true + * isStringDtype("object"); // false + * ``` + */ +export function isStringDtype(dtype: Dtype | DtypeName): boolean { + return resolveDtype(dtype).isString; +} + +/** + * Return `true` if the dtype is a datetime type. + * + * Mirrors `pandas.api.types.is_datetime64_dtype`. 
+ * + * @example + * ```ts + * isDatetimeDtype("datetime"); // true + * isDatetimeDtype("string"); // false + * ``` + */ +export function isDatetimeDtype(dtype: Dtype | DtypeName): boolean { + return resolveDtype(dtype).isDatetime; +} + +/** + * Return `true` if the dtype is a timedelta type. + * + * Mirrors `pandas.api.types.is_timedelta64_dtype`. + * + * @example + * ```ts + * isTimedeltaDtype("timedelta"); // true + * isTimedeltaDtype("datetime"); // false + * ``` + */ +export function isTimedeltaDtype(dtype: Dtype | DtypeName): boolean { + return resolveDtype(dtype).isTimedelta; +} + +/** + * Return `true` if the dtype is the categorical dtype. + * + * Mirrors `pandas.api.types.is_categorical_dtype`. + * + * @example + * ```ts + * isCategoricalDtype("category"); // true + * isCategoricalDtype("string"); // false + * ``` + */ +export function isCategoricalDtype(dtype: Dtype | DtypeName): boolean { + return resolveDtype(dtype).isCategory; +} + +/** + * Return `true` if the dtype is the object dtype. + * + * Mirrors `pandas.api.types.is_object_dtype`. + * + * @example + * ```ts + * isObjectDtype("object"); // true + * isObjectDtype("string"); // false + * ``` + */ +export function isObjectDtype(dtype: Dtype | DtypeName): boolean { + return resolveDtype(dtype).isObject; +} + +/** + * Return `true` if the dtype represents complex numbers. + * + * JavaScript has no native complex number type, so this always returns `false` + * (no complex dtype exists in the `tsb` dtype system). Provided for API + * parity with `pandas.api.types.is_complex_dtype`. + * + * @example + * ```ts + * isComplexDtype("float64"); // false (no complex dtype) + * ``` + */ +export function isComplexDtype(_dtype: Dtype | DtypeName): boolean { + return false; +} + +/** + * Return `true` if the dtype is an "extension array" dtype — i.e. any dtype + * beyond the numeric primitives: `string`, `object`, `datetime`, `timedelta`, + * `category`. 
+ * + * Mirrors `pandas.api.types.is_extension_array_dtype`. + * + * @example + * ```ts + * isExtensionArrayDtype("category"); // true + * isExtensionArrayDtype("datetime"); // true + * isExtensionArrayDtype("int64"); // false + * ``` + */ +export function isExtensionArrayDtype(dtype: Dtype | DtypeName): boolean { + const d = resolveDtype(dtype); + return d.isString || d.isObject || d.isDatetime || d.isTimedelta || d.isCategory; +} + +/** + * Return `true` if the dtype can hold period (date period) data. + * In the current `tsb` dtype system this maps to the `datetime` kind. + * + * Mirrors `pandas.api.types.is_period_dtype`. + * + * @example + * ```ts + * isPeriodDtype("datetime"); // true + * isPeriodDtype("float64"); // false + * ``` + */ +export function isPeriodDtype(dtype: Dtype | DtypeName): boolean { + return resolveDtype(dtype).isDatetime; +} + +/** + * Return `true` if the dtype is suitable for interval data — float or integer. + * + * Mirrors `pandas.api.types.is_interval_dtype`. + * + * @example + * ```ts + * isIntervalDtype("float64"); // true + * isIntervalDtype("int32"); // true + * isIntervalDtype("string"); // false + * ``` + */ +export function isIntervalDtype(dtype: Dtype | DtypeName): boolean { + return resolveDtype(dtype).isNumeric; +} diff --git a/src/core/attrs.ts b/src/core/attrs.ts new file mode 100644 index 00000000..81c6be1c --- /dev/null +++ b/src/core/attrs.ts @@ -0,0 +1,291 @@ +/** + * attrs — user-defined metadata dictionary for Series and DataFrame. + * + * Mirrors `pandas.DataFrame.attrs` / `pandas.Series.attrs`: an arbitrary + * key→value dictionary that travels with a data object and lets callers + * annotate it with provenance, units, descriptions, or any other metadata. + * + * Because the tsb Series and DataFrame classes are immutable by design, this + * module maintains a **WeakMap registry** that maps each object to its attrs + * record. 
The registry entries are garbage-collected automatically when the + * object itself is collected — there is no memory leak. + * + * ### Public surface + * + * ```ts + * import { getAttrs, setAttrs, updateAttrs, copyAttrs, withAttrs, clearAttrs, + * hasAttrs } from "tsb"; + * + * const df = DataFrame.fromColumns({ x: [1, 2, 3] }); + * + * // Annotate + * setAttrs(df, { source: "sensor_A", unit: "metres" }); + * getAttrs(df); // { source: "sensor_A", unit: "metres" } + * + * // Merge additional keys + * updateAttrs(df, { version: 2 }); + * getAttrs(df); // { source: "sensor_A", unit: "metres", version: 2 } + * + * // Fluent helper — sets attrs and returns the same object + * const annotated = withAttrs(df, { source: "sensor_B" }); + * annotated === df; // true — same reference + * + * // Propagate to a derived object + * const df2 = DataFrame.fromColumns({ y: [4, 5, 6] }); + * copyAttrs(df, df2); + * getAttrs(df2); // { source: "sensor_A", unit: "metres", version: 2 } + * ``` + * + * @module + */ + +// ─── types ──────────────────────────────────────────────────────────────────── + +/** + * The attrs dictionary type. Keys are strings; values may be any JSON-safe + * primitive or nested structure. Mirrors the `dict` type of `pandas.attrs`. + */ +export type Attrs = Record; + +// ─── registry ───────────────────────────────────────────────────────────────── + +/** Internal WeakMap from any object to its attrs record. */ +const registry = new WeakMap(); + +// ─── public API ─────────────────────────────────────────────────────────────── + +/** + * Retrieve the attrs dictionary for `obj`. + * + * Returns a **shallow copy** so callers cannot mutate the stored record + * accidentally. If no attrs have been set, returns an empty object `{}`. 
+ * + * @example + * ```ts + * const s = new Series({ data: [1, 2, 3] }); + * setAttrs(s, { unit: "kg" }); + * getAttrs(s); // { unit: "kg" } + * ``` + */ +export function getAttrs(obj: object): Attrs { + const stored = registry.get(obj); + return stored !== undefined ? { ...stored } : {}; +} + +/** + * **Overwrite** the attrs dictionary for `obj` with `attrs`. + * + * Any previously stored attrs are discarded. Stores a shallow copy so + * subsequent mutations to the passed-in object do not affect the stored value. + * + * @example + * ```ts + * setAttrs(df, { source: "sensor_A" }); + * getAttrs(df); // { source: "sensor_A" } + * ``` + */ +export function setAttrs(obj: object, attrs: Attrs): void { + registry.set(obj, { ...attrs }); +} + +/** + * **Merge** `updates` into the existing attrs for `obj`. + * + * Existing keys that are not present in `updates` are preserved. Keys that + * are present in both `updates` and the existing attrs are overwritten. + * + * @example + * ```ts + * setAttrs(df, { source: "A" }); + * updateAttrs(df, { version: 2 }); + * getAttrs(df); // { source: "A", version: 2 } + * ``` + */ +export function updateAttrs(obj: object, updates: Attrs): void { + const existing = registry.get(obj) ?? {}; + registry.set(obj, { ...existing, ...updates }); +} + +/** + * **Copy** the attrs from `source` to `target`, overwriting any existing attrs + * on `target`. + * + * Useful for propagating metadata from an input to a derived result. + * + * @example + * ```ts + * setAttrs(df1, { source: "sensor_A" }); + * const df2 = df1.head(5); + * copyAttrs(df1, df2); + * getAttrs(df2); // { source: "sensor_A" } + * ``` + */ +export function copyAttrs(source: object, target: object): void { + const stored = registry.get(source); + if (stored !== undefined) { + registry.set(target, { ...stored }); + } else { + registry.delete(target); + } +} + +/** + * **Fluent helper** — set attrs on `obj` and return the same object. 
+ * + * This **replaces** any previously stored attrs (same semantics as + * {@link setAttrs}). The return type is `T` so callers do not lose the + * concrete type of their object. + * + * @example + * ```ts + * const annotated = withAttrs(df, { source: "sensor_A", unit: "metres" }); + * annotated === df; // true — same reference + * getAttrs(annotated); // { source: "sensor_A", unit: "metres" } + * ``` + */ +export function withAttrs(obj: T, attrs: Attrs): T { + registry.set(obj, { ...attrs }); + return obj; +} + +/** + * **Remove** all attrs from `obj`. + * + * After calling this, {@link getAttrs} returns `{}` and {@link hasAttrs} + * returns `false`. + * + * @example + * ```ts + * setAttrs(df, { source: "A" }); + * clearAttrs(df); + * hasAttrs(df); // false + * getAttrs(df); // {} + * ``` + */ +export function clearAttrs(obj: object): void { + registry.delete(obj); +} + +/** + * Returns `true` if `obj` has any attrs set, `false` otherwise. + * + * @example + * ```ts + * hasAttrs(df); // false + * setAttrs(df, { x: 1 }); + * hasAttrs(df); // true + * clearAttrs(df); + * hasAttrs(df); // false + * ``` + */ +export function hasAttrs(obj: object): boolean { + return registry.has(obj); +} + +/** + * Retrieve a **single** attrs value by key. + * + * Returns `undefined` if the key does not exist (or no attrs are set). + * + * @example + * ```ts + * setAttrs(df, { unit: "kg" }); + * getAttr(df, "unit"); // "kg" + * getAttr(df, "missing"); // undefined + * ``` + */ +export function getAttr(obj: object, key: string): unknown { + return registry.get(obj)?.[key]; +} + +/** + * Set a **single** attrs key on `obj`, preserving all other existing attrs. + * + * @example + * ```ts + * setAttr(df, "unit", "kg"); + * setAttr(df, "source", "lab"); + * getAttrs(df); // { unit: "kg", source: "lab" } + * ``` + */ +export function setAttr(obj: object, key: string, value: unknown): void { + const existing = registry.get(obj) ?? 
{}; + registry.set(obj, { ...existing, [key]: value }); +} + +/** + * Delete a **single** attrs key from `obj`, preserving all other keys. + * + * Does nothing if the key does not exist. + * + * @example + * ```ts + * setAttrs(df, { a: 1, b: 2 }); + * deleteAttr(df, "a"); + * getAttrs(df); // { b: 2 } + * ``` + */ +export function deleteAttr(obj: object, key: string): void { + const existing = registry.get(obj); + if (existing === undefined) return; + const { [key]: _removed, ...rest } = existing; + if (Object.keys(rest).length === 0) { + registry.delete(obj); + } else { + registry.set(obj, rest); + } +} + +/** + * Return the number of attrs keys stored on `obj`. + * + * @example + * ```ts + * attrsCount(df); // 0 + * setAttrs(df, { a: 1, b: 2 }); + * attrsCount(df); // 2 + * ``` + */ +export function attrsCount(obj: object): number { + return Object.keys(registry.get(obj) ?? {}).length; +} + +/** + * Return the list of attrs keys stored on `obj`. + * + * @example + * ```ts + * setAttrs(df, { a: 1, b: 2 }); + * attrsKeys(df); // ["a", "b"] + * ``` + */ +export function attrsKeys(obj: object): string[] { + return Object.keys(registry.get(obj) ?? {}); +} + +/** + * Merge attrs from multiple source objects into a single target object. + * + * Sources are applied left-to-right; later sources overwrite earlier ones on + * key conflicts. Overwrites any existing attrs on `target`. 
+ * + * @example + * ```ts + * setAttrs(s1, { source: "A", unit: "kg" }); + * setAttrs(s2, { source: "B", scale: 2 }); + * mergeAttrs([s1, s2], df); + * getAttrs(df); // { source: "B", unit: "kg", scale: 2 } + * ``` + */ +export function mergeAttrs(sources: readonly object[], target: object): void { + const merged: Attrs = {}; + for (const src of sources) { + const stored = registry.get(src); + if (stored !== undefined) { + Object.assign(merged, stored); + } + } + if (Object.keys(merged).length > 0) { + registry.set(target, merged); + } +} diff --git a/src/core/index.ts b/src/core/index.ts index ada43b65..08713cae 100644 --- a/src/core/index.ts +++ b/src/core/index.ts @@ -15,3 +15,71 @@ export { CategoricalAccessor } from "./cat_accessor.ts"; export type { CatSeriesLike } from "./cat_accessor.ts"; export { MultiIndex } from "./multi_index.ts"; export type { MultiIndexOptions } from "./multi_index.ts"; +export { insertColumn, popColumn, reorderColumns, moveColumn, dataFrameFromPairs } from "./insert_pop.ts"; +export type { PopResult } from "./insert_pop.ts"; +export { toDictOriented, fromDictOriented } from "./to_from_dict.ts"; +export type { + ToDictOrient, + FromDictOrient, + DictSplit, + DictTight, + SplitInput, +} from "./to_from_dict.ts"; +export { + getAttrs, + setAttrs, + updateAttrs, + copyAttrs, + withAttrs, + clearAttrs, + hasAttrs, + getAttr, + setAttr, + deleteAttr, + attrsCount, + attrsKeys, + mergeAttrs, +} from "./attrs.ts"; +export type { Attrs } from "./attrs.ts"; +export { + pipe, + seriesApply, + seriesTransform, + dataFrameApply, + dataFrameApplyMap, + dataFrameTransform, + dataFrameTransformRows, +} from "./pipe_apply.ts"; +export { + isScalar, + isListLike, + isArrayLike, + isDictLike, + isIterator, + isNumber, + isBool, + isStringValue, + isFloat, + isInteger, + isBigInt, + isRegExp, + isReCompilable, + isMissing, + isHashable, + isDate, + isNumericDtype, + isIntegerDtype, + isSignedIntegerDtype, + isUnsignedIntegerDtype, + isFloatDtype, + 
isBoolDtype, + isStringDtype, + isDatetimeDtype, + isTimedeltaDtype, + isCategoricalDtype, + isObjectDtype, + isComplexDtype, + isExtensionArrayDtype, + isPeriodDtype, + isIntervalDtype, +} from "./api_types.ts"; diff --git a/src/core/insert_pop.ts b/src/core/insert_pop.ts new file mode 100644 index 00000000..d56c42bc --- /dev/null +++ b/src/core/insert_pop.ts @@ -0,0 +1,214 @@ +/** + * DataFrame.insert() and DataFrame.pop() — column insertion and removal. + * + * Mirrors `pandas.DataFrame.insert(loc, column, value)` and + * `pandas.DataFrame.pop(item)`. + * + * Since `DataFrame` in tsb is immutable, both operations return a new DataFrame. + * `popColumn` returns both the extracted `Series` and the resulting DataFrame. + * + * @example + * ```ts + * import { DataFrame, insertColumn, popColumn } from "tsb"; + * + * const df = DataFrame.fromColumns({ a: [1, 2], b: [3, 4] }); + * + * // Insert column "x" at position 1 (between "a" and "b") + * const df2 = insertColumn(df, 1, "x", [10, 20]); + * // df2.columns.values → ["a", "x", "b"] + * + * // Pop column "a" out of df2 + * const { series, df: df3 } = popColumn(df2, "a"); + * // series.values → [1, 2] + * // df3.columns.values → ["x", "b"] + * ``` + * + * @packageDocumentation + */ + +import type { Label, Scalar } from "../types.ts"; +import { Index } from "./base-index.ts"; +import { DataFrame } from "./frame.ts"; +import { Series } from "./series.ts"; + +// ─── insertColumn ───────────────────────────────────────────────────────────── + +/** + * Insert a new column into `df` at integer column position `loc`. + * + * Mirrors `pandas.DataFrame.insert(loc, column, value, allow_duplicates=False)`. + * Raises a `RangeError` if: + * - `column` already exists in `df` (no duplicates by default) + * - `loc` is out of range (must be 0 ≤ loc ≤ df.shape[1]) + * - `values` length does not match the number of rows + * + * @param df Source DataFrame (not mutated). 
+ * @param loc Zero-based integer position at which to insert the column. + * @param column Name of the new column. + * @param values Column data as an array of scalars or a `Series`. + * @param allowDuplicates When `true`, silently allow duplicate column names. Default `false`. + * @returns A new DataFrame with the column inserted. + */ +export function insertColumn( + df: DataFrame, + loc: number, + column: string, + values: readonly Scalar[] | Series, + allowDuplicates = false, +): DataFrame { + const nCols = df.shape[1]; + const nRows = df.shape[0]; + + if (!allowDuplicates && df.has(column)) { + throw new RangeError( + `Column "${column}" already exists. Use allowDuplicates=true to permit duplicate names.`, + ); + } + + if (loc < 0 || loc > nCols) { + throw new RangeError(`loc=${loc} is out of range [0, ${nCols}].`); + } + + // Resolve values to a Series aligned to df's row index. + const series: Series = + values instanceof Series + ? values + : new Series({ data: values, index: df.index, name: column }); + + if (series.size !== nRows) { + throw new RangeError( + `values length ${series.size} does not match DataFrame row count ${nRows}.`, + ); + } + + // Rebuild the column map, inserting the new column at position `loc`. + const colMap = new Map>(); + let idx = 0; + + for (const colName of df.columns.values) { + if (idx === loc) { + colMap.set(column, series); + } + colMap.set(colName, df.col(colName)); + idx++; + } + + // Handle insertion at the end (loc === nCols). + if (loc === nCols) { + colMap.set(column, series); + } + + return new DataFrame(colMap, df.index); +} + +// ─── popColumn ──────────────────────────────────────────────────────────────── + +/** Return type of {@link popColumn}. */ +export interface PopResult { + /** The extracted column as a Series. */ + readonly series: Series; + /** The DataFrame with the column removed. 
*/ + readonly df: DataFrame; +} + +/** + * Remove a column from `df` and return both the extracted `Series` and the + * resulting DataFrame. + * + * Mirrors `pandas.DataFrame.pop(item)`, but because tsb DataFrames are + * immutable this function returns the removed Series *and* the new DataFrame + * (rather than mutating in place). + * + * Raises a `RangeError` if `col` does not exist in `df`. + * + * @param df Source DataFrame (not mutated). + * @param col Name of the column to remove. + * @returns `{ series, df }` — the extracted column and the remaining DataFrame. + * + * @example + * ```ts + * const { series, df: remaining } = popColumn(df, "age"); + * // series contains the "age" column; remaining has all other columns + * ``` + */ +export function popColumn(df: DataFrame, col: string): PopResult { + const series = df.get(col); + if (series === undefined) { + throw new RangeError(`Column "${col}" not found in DataFrame.`); + } + + const colMap = new Map>(); + for (const colName of df.columns.values) { + if (colName !== col) { + colMap.set(colName, df.col(colName)); + } + } + + return { + series, + df: new DataFrame(colMap, df.index), + }; +} + +// ─── reorderColumns ────────────────────────────────────────────────────────── + +/** + * Reorder the columns of `df` to match `order`. + * + * Mirrors `df[order]` in pandas. All names in `order` must be present in `df`; + * extra names in `df` not listed in `order` are dropped. + * + * @param df Source DataFrame. + * @param order New column order (subset of `df.columns.values`). + * @returns A new DataFrame with columns in the specified order. 
+ */ +export function reorderColumns(df: DataFrame, order: readonly string[]): DataFrame { + const colMap = new Map>(); + for (const name of order) { + const s = df.get(name); + if (s === undefined) { + throw new RangeError(`Column "${name}" not found in DataFrame.`); + } + colMap.set(name, s); + } + return new DataFrame(colMap, df.index); +} + +// ─── moveColumn ────────────────────────────────────────────────────────────── + +/** + * Move an existing column to a new integer position. + * + * This is a convenience wrapper combining {@link popColumn} and + * {@link insertColumn}: it removes the column from its current position and + * re-inserts it at `newLoc` in the resulting DataFrame. + * + * @param df Source DataFrame. + * @param col Name of the column to move. + * @param newLoc Target position (0 ≤ newLoc ≤ df.shape[1] − 1). + * @returns A new DataFrame with the column at the new position. + */ +export function moveColumn(df: DataFrame, col: string, newLoc: number): DataFrame { + const { series, df: without } = popColumn(df, col); + return insertColumn(without, newLoc, col, series); +} + +// ─── internal re-export helper (used by DataFrame constructor access) ───────── + +/** + * Build a new DataFrame from an ordered iterable of `[name, Series]` pairs and + * a row index. Exported for use by other tsb modules that need to construct + * DataFrames without going through the public factory methods. + * + * @internal + */ +export function dataFrameFromPairs( + pairs: Iterable]>, + index: Index