From 39eb03e0164e1491487cdca31eee3450daa1bd3f Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Sat, 11 Apr 2026 20:32:32 +0000 Subject: [PATCH 01/14] =?UTF-8?q?Iteration=20172:=20Add=20na=5Fops=20?= =?UTF-8?q?=E2=80=94=20isna/notna/ffill/bfill?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implements pandas missing-value utilities as standalone exported functions: - `isna` / `notna` / `isnull` / `notnull` — detect missing values in scalars, Series, and DataFrames (mirrors pd.isna / pd.notna) - `ffillSeries` / `bfillSeries` — forward/backward fill for Series with optional `limit` parameter - `dataFrameFfill` / `dataFrameBfill` — column-wise or row-wise fill for DataFrames with optional `limit` and `axis` parameters Metric: 28 → 29 pandas_features_ported Run: https://github.com/githubnext/tsessebe/actions/runs/24263385922 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- playground/index.html | 80 +++++++ playground/na_ops.html | 480 +++++++++++++++++++++++++++++++++++++ src/index.ts | 85 +++++++ src/stats/index.ts | 79 ++++++ src/stats/na_ops.ts | 336 ++++++++++++++++++++++++++ tests/stats/na_ops.test.ts | 280 ++++++++++++++++++++++ 6 files changed, 1340 insertions(+) create mode 100644 playground/na_ops.html create mode 100644 src/stats/na_ops.ts create mode 100644 tests/stats/na_ops.test.ts diff --git a/playground/index.html b/playground/index.html index 48bfbcb9..5074e12b 100644 --- a/playground/index.html +++ b/playground/index.html @@ -229,6 +229,11 @@

Reshape with aggregation. pivot() for unique reshaping; pivotTable() for aggregation (mean/sum/count/min/max/first/last) with fill_value and dropna support.

✅ Complete
+
+

📊 pivotTableFull (margins)

+

Enhanced pivot table with row/column margin totals. Supports all aggfuncs, custom margins_name, sort order, fill_value, dropna, and multiple index/column columns.

+
✅ Complete
+

📐 stack & unstack

Pivot column labels to/from row index. stack() rotates columns into a compound-index Series; unstack() recovers the DataFrame. Custom sep, dropna, and fill_value support.

@@ -254,6 +259,11 @@

Element-wise transformations. clip(), seriesAbs(), seriesRound() for Series and DataFrame with min/max bounds, decimal precision, and axis support.

✅ Complete

+
+

🔍 missing-value ops

+

Detect and fill missing values. isna(), notna(), isnull(), notnull() for scalars/Series/DataFrame. ffillSeries(), bfillSeries(), dataFrameFfill(), dataFrameBfill() with optional limit and axis support.

+
✅ Complete
+

🔢 value_counts

Count unique values. valueCounts() for Series and dataFrameValueCounts() for DataFrame with normalize, sort, ascending, and dropna options.

@@ -264,6 +274,76 @@

✅ Complete

+
+

📊 pct_change

+

Fractional change between elements. pctChangeSeries() and pctChangeDataFrame() with periods, fillMethod (pad/bfill), limit, and axis options.

+
✅ Complete
+
+
+

🔎 idxmin / idxmax

+

Return the index label of the minimum or maximum value. idxminSeries(), idxmaxSeries(), idxminDataFrame(), idxmaxDataFrame() with skipna support.

+
✅ Complete
+
+
+

🔄 astype

+

Cast Series and DataFrame values to a different dtype. astypeSeries(), astype() with per-column mapping support and integer clamping.

+
✅ Complete
+
+
+

🔁 replace

+

Substitute values in Series and DataFrame. Supports scalar, array (many→one, pair-wise), Record, and Map replacement specs.

+
✅ Complete
+
+
+

🔀 where / mask

+

Conditional value selection. where keeps values where the condition is true; mask replaces them. Supports boolean arrays, Series, DataFrame, and callable conditions.

+
✅ Complete
+
+
+

📈 diff / shift

+

Discrete difference and value shifting for Series and DataFrame. diff computes element-wise differences; shift lags or leads values by a number of periods. Essential for time-series analysis.

+
✅ Complete
+
+
+

🔍 duplicated / drop_duplicates

+

Detect and remove duplicate values or rows. Supports keep="first", keep="last", and keep=false (mark all occurrences). DataFrame supports a subset of columns.

+
✅ Complete
+
+
+

🎲 sample

+

Random sampling from Series and DataFrame. Supports fixed count, fractional sampling, with/without replacement, weighted sampling, and seeded deterministic results via randomState.

+
✅ Complete
+
+
+

✂️ clip_advanced

+

Per-element clipping with scalar, array, Series, or DataFrame bounds. Supports axis-based Series broadcasting for DataFrames — mirrors pandas.Series.clip(lower, upper) with array bounds.

+
✅ Complete
+
+
+

🔧 apply / map

+

Function application and value mapping. applySeries, mapSeries (function/dict lookup), applyDataFrame (reduce per col/row), applyExpandDataFrame (transform per col/row), mapDataFrame (element-wise).

+
✅ Complete
+
+
+

🪣 cut / qcut

+

Bin continuous data into discrete intervals. cut for equal-width or user-defined bins; qcut for equal-frequency quantile bins. Custom labels, retbins, cutCodes, and cutCategories.

+
✅ Complete
+
+
+

📐 Interval / IntervalIndex

+

Bounded interval objects and an ordered index of intervals. Interval supports all four closed types; IntervalIndex supports lookup, overlap queries, and intervalRange for equal-length ranges.

+
✅ Complete
+
+
+

🎲 getDummies / fromDummies

+

One-hot encode categorical Series or DataFrame columns into binary indicator columns. getDummies supports custom prefix, separator, dropFirst, and dummyNa. fromDummies reverses the encoding.

+
✅ Complete
+
+
+

📊 crosstab

+

Cross-tabulation frequency tables for two categorical factors. Supports custom aggfunc (count, sum, mean, min, max), row/column margins, normalize (all / index / columns), and dropna.

+
✅ Complete
+
diff --git a/playground/na_ops.html b/playground/na_ops.html new file mode 100644 index 00000000..c321438f --- /dev/null +++ b/playground/na_ops.html @@ -0,0 +1,480 @@ + + + + + + tsb — missing-value operations (isna, ffill, bfill) + + + +
+
+
Loading tsb runtime…
+
+ + ← Back to playground index + +

Missing-value operations

+

+ isna / notna — detect missing values in scalars, + Series, and DataFrames.
+ ffill / bfill — propagate the last (or next) valid + value to fill gaps.
+ Mirrors pd.isna(), Series.ffill(), and + DataFrame.bfill() from pandas. +

+ + +
+

1 · isna / notna on scalars

+

+ Returns true / false for individual values. + null, undefined, and NaN are all + considered "missing". +

+
+
+
+ + +
+
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

2 · isna on a Series

+

+ When passed a Series, isna returns a boolean Series of the + same length — true where values are missing. +

+
+
+
+ + +
+
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

3 · isna on a DataFrame

+

+ Returns a DataFrame of booleans with the same shape — one column per + original column, true where missing. +

+
+
+
+ + +
+
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

4 · Forward-fill (ffillSeries)

+

+ Propagates the last valid value forward to fill gaps. Leading + nulls that have no preceding value remain null. + Use the optional limit to cap consecutive fills. +

+
+
+
+ + +
+
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

5 · Backward-fill (bfillSeries)

+

+ Propagates the next valid value backward to fill gaps. Trailing + nulls that have no following value remain null. +

+
+
+
+ + +
+
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

6 · DataFrame forward-fill & backward-fill

+

+ dataFrameFfill and dataFrameBfill apply fill + column-wise by default (axis=0). Pass axis: 1 to fill + row-wise across columns. +

+
+
+
+ + +
+
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

API Reference

+
// Module-level missing-value detection
+isna(value: Scalar): boolean
+isna(value: Series): Series<boolean>
+isna(value: DataFrame): DataFrame
+
+notna(value: Scalar): boolean
+notna(value: Series): Series<boolean>
+notna(value: DataFrame): DataFrame
+
+// Aliases
+isnull(...)  // same as isna
+notnull(...) // same as notna
+
+// Series forward / backward fill
+ffillSeries(series, options?: { limit?: number | null }): Series
+bfillSeries(series, options?: { limit?: number | null }): Series
+
+// DataFrame forward / backward fill
+dataFrameFfill(df, options?: {
+  limit?: number | null,   // max consecutive fills (default: no limit)
+  axis?: 0 | 1 | "index" | "columns",  // default 0 (column-wise)
+}): DataFrame
+
+dataFrameBfill(df, options?: {
+  limit?: number | null,
+  axis?: 0 | 1 | "index" | "columns",
+}): DataFrame
+
+ + + + + diff --git a/src/index.ts b/src/index.ts index 1dd0aa57..c1557e84 100644 --- a/src/index.ts +++ b/src/index.ts @@ -74,6 +74,8 @@ export type { } from "./reshape/index.ts"; export { stack, unstack, STACK_DEFAULT_SEP } from "./reshape/index.ts"; export type { StackOptions, UnstackOptions } from "./reshape/index.ts"; +export { pivotTableFull } from "./reshape/index.ts"; +export type { PivotTableFullOptions, PivotAggFunc } from "./reshape/index.ts"; export { MultiIndex } from "./core/index.ts"; export type { MultiIndexOptions } from "./core/index.ts"; export { rankSeries, rankDataFrame } from "./stats/index.ts"; @@ -107,3 +109,86 @@ export { export type { ClipOptions, RoundOptions, DataFrameElemOptions } from "./stats/index.ts"; export { valueCounts, dataFrameValueCounts } from "./stats/index.ts"; export type { ValueCountsOptions, DataFrameValueCountsOptions } from "./stats/index.ts"; +export { + isna, + notna, + isnull, + notnull, + ffillSeries, + bfillSeries, + dataFrameFfill, + dataFrameBfill, +} from "./stats/index.ts"; +export type { FillDirectionOptions, DataFrameFillOptions } from "./stats/index.ts"; +export { pctChangeSeries, pctChangeDataFrame } from "./stats/index.ts"; +export type { + PctChangeFillMethod, + PctChangeOptions, + DataFramePctChangeOptions, +} from "./stats/index.ts"; +export { idxminSeries, idxmaxSeries, idxminDataFrame, idxmaxDataFrame } from "./stats/index.ts"; +export type { IdxOptions, IdxDataFrameOptions } from "./stats/index.ts"; +export { astypeSeries, astype, castScalar } from "./core/index.ts"; +export type { AstypeOptions, DataFrameAstypeOptions } from "./core/index.ts"; +export { replaceSeries, replaceDataFrame } from "./stats/index.ts"; +export type { + ReplaceMapping, + ReplaceSpec, + ReplaceOptions, + DataFrameReplaceOptions, +} from "./stats/index.ts"; +export { whereSeries, maskSeries, whereDataFrame, maskDataFrame } from "./stats/index.ts"; +export type { + SeriesCond, + DataFrameCond, + WhereOptions, + WhereDataFrameOptions, +} from "./stats/index.ts"; +export { diffSeries, diffDataFrame, shiftSeries, shiftDataFrame } from "./stats/index.ts"; +export type { + DiffOptions, + DataFrameDiffOptions, + ShiftOptions, + DataFrameShiftOptions, +} from "./stats/index.ts"; +export { + duplicatedSeries, + duplicatedDataFrame, + dropDuplicatesSeries, + dropDuplicatesDataFrame, +} from "./stats/index.ts"; +export type { KeepPolicy, DuplicatedOptions, DataFrameDuplicatedOptions } from "./stats/index.ts"; +export { sampleSeries, sampleDataFrame } from "./core/index.ts"; +export type { SampleOptions } from "./core/index.ts"; +export { clipAdvancedSeries, clipAdvancedDataFrame } from "./stats/index.ts"; +export type { + SeriesBound, + DataFrameBound, + ClipAdvancedSeriesOptions, + ClipAdvancedDataFrameOptions, +} from "./stats/index.ts"; +export { + applySeries, + mapSeries, + applyDataFrame, + applyExpandDataFrame, + mapDataFrame, +} from "./stats/index.ts"; +export type { + MapLookup, + ApplyDataFrameOptions, + ApplyExpandDataFrameOptions, +} from "./stats/index.ts"; +export { cut, qcut, cutCodes, cutCategories } from "./stats/index.ts"; +export type { + CutOptions, + QcutOptions, + CutResult, + CutResultWithBins, +} from "./stats/index.ts"; +export { Interval, IntervalIndex, intervalRange } from "./stats/index.ts"; +export type { ClosedType, IntervalOptions, IntervalRangeOptions } from "./stats/index.ts"; +export { getDummies, getDummiesSeries, getDummiesDataFrame, fromDummies } from "./stats/index.ts"; +export type { GetDummiesOptions, FromDummiesOptions } from "./stats/index.ts"; +export { crosstab, crosstabSeries } from "./stats/index.ts"; +export type { CrosstabOptions, CrosstabAggFunc, CrosstabNormalize } from "./stats/index.ts"; diff --git a/src/stats/index.ts b/src/stats/index.ts index b1de48eb..a1c9bf51 100644 --- a/src/stats/index.ts +++ b/src/stats/index.ts @@ -39,3 +39,82 @@ export { nsmallestDataFrame, } from "./nlargest.ts"; export type { NKeep, NTopOptions, NTopDataFrameOptions } from "./nlargest.ts"; +export { + isna, + notna, + isnull, + notnull, + ffillSeries, + bfillSeries, + dataFrameFfill, + dataFrameBfill, +} from "./na_ops.ts"; +export type { FillDirectionOptions, DataFrameFillOptions } from "./na_ops.ts"; +export { pctChangeSeries, pctChangeDataFrame } from "./pct_change.ts"; +export type { + PctChangeFillMethod, + PctChangeOptions, + DataFramePctChangeOptions, +} from "./pct_change.ts"; +export { idxminSeries, idxmaxSeries, idxminDataFrame, idxmaxDataFrame } from "./idxmin_idxmax.ts"; +export type { IdxOptions, IdxDataFrameOptions } from "./idxmin_idxmax.ts"; +export { replaceSeries, replaceDataFrame } from "./replace.ts"; +export type { + ReplaceMapping, + ReplaceSpec, + ReplaceOptions, + DataFrameReplaceOptions, +} from "./replace.ts"; +export { whereSeries, maskSeries, whereDataFrame, maskDataFrame } from "./where_mask.ts"; +export type { + SeriesCond, + DataFrameCond, + WhereOptions, + WhereDataFrameOptions, +} from "./where_mask.ts"; +export { diffSeries, diffDataFrame, shiftSeries, shiftDataFrame } from "./diff_shift.ts"; +export type { + DiffOptions, + DataFrameDiffOptions, + ShiftOptions, + DataFrameShiftOptions, +} from "./diff_shift.ts"; +export { + duplicatedSeries, + duplicatedDataFrame, + dropDuplicatesSeries, + dropDuplicatesDataFrame, +} from "./duplicated.ts"; +export type { KeepPolicy, DuplicatedOptions, DataFrameDuplicatedOptions } from "./duplicated.ts"; +export { clipAdvancedSeries, clipAdvancedDataFrame } from "./clip_advanced.ts"; +export type { + SeriesBound, + DataFrameBound, + ClipAdvancedSeriesOptions, + ClipAdvancedDataFrameOptions, +} from "./clip_advanced.ts"; +export { + applySeries, + mapSeries, + applyDataFrame, + applyExpandDataFrame, + mapDataFrame, +} from "./apply.ts"; +export type { + MapLookup, + ApplyDataFrameOptions, + ApplyExpandDataFrameOptions, +} from "./apply.ts"; +export { cut, qcut, cutCodes, cutCategories } from "./cut.ts"; +export type { + CutOptions, + QcutOptions, + CutResult, + CutResultWithBins, +} from "./cut.ts"; +export { Interval, IntervalIndex, intervalRange } from "./interval.ts"; +export type { ClosedType, IntervalOptions, IntervalRangeOptions } from "./interval.ts"; +export { getDummies, getDummiesSeries, getDummiesDataFrame, fromDummies } from "./get_dummies.ts"; +export type { GetDummiesOptions, FromDummiesOptions } from "./get_dummies.ts"; +export { crosstab, crosstabSeries } from "./crosstab.ts"; +export type { CrosstabOptions, CrosstabAggFunc, CrosstabNormalize } from "./crosstab.ts"; diff --git a/src/stats/na_ops.ts b/src/stats/na_ops.ts new file mode 100644 index 00000000..c776bb1f --- /dev/null +++ b/src/stats/na_ops.ts @@ -0,0 +1,336 @@ +/** + * na_ops — missing-value utilities for Series and DataFrame. + * + * Mirrors the following pandas module-level functions and methods: + * - `pd.isna(obj)` / `pd.isnull(obj)` — detect missing values + * - `pd.notna(obj)` / `pd.notnull(obj)` — detect non-missing values + * - `Series.ffill()` / `DataFrame.ffill()` — forward-fill missing values + * - `Series.bfill()` / `DataFrame.bfill()` — backward-fill missing values + * + * All functions are **pure** (return new objects; inputs are unchanged). + * + * @module + */ + +import { DataFrame } from "../core/index.ts"; +import { Series } from "../core/index.ts"; +import type { Scalar } from "../types.ts"; + +// ─── public types ───────────────────────────────────────────────────────────── + +/** Options for {@link ffillSeries} and {@link bfillSeries}. */ +export interface FillDirectionOptions { + /** + * Maximum number of consecutive NaN/null values to fill. + * `null` means no limit (default). + */ + readonly limit?: number | null; +} + +/** Options for {@link dataFrameFfill} and {@link dataFrameBfill}. */ +export interface DataFrameFillOptions extends FillDirectionOptions { + /** + * - `0` or `"index"` (default): fill missing values down each **column**. + * - `1` or `"columns"`: fill missing values across each **row**. + */ + readonly axis?: 0 | 1 | "index" | "columns"; +} + +// ─── helpers ────────────────────────────────────────────────────────────────── + +/** True when `v` should be treated as missing. */ +function isMissing(v: Scalar): boolean { + return v === null || v === undefined || (typeof v === "number" && Number.isNaN(v)); +} + +/** Forward-fill an array of scalars in-place (returns a new array). */ +function ffillArray(vals: readonly Scalar[], limit: number | null): Scalar[] { + const out: Scalar[] = Array.from(vals); + let lastValid: Scalar = null; + let streak = 0; + for (let i = 0; i < out.length; i++) { + if (isMissing(out[i])) { + if (!isMissing(lastValid) && (limit === null || streak < limit)) { + out[i] = lastValid; + streak++; + } + } else { + lastValid = out[i] as Scalar; + streak = 0; + } + } + return out; +} + +/** Backward-fill an array of scalars (returns a new array). */ +function bfillArray(vals: readonly Scalar[], limit: number | null): Scalar[] { + const out: Scalar[] = Array.from(vals); + let nextValid: Scalar = null; + let streak = 0; + for (let i = out.length - 1; i >= 0; i--) { + if (isMissing(out[i])) { + if (!isMissing(nextValid) && (limit === null || streak < limit)) { + out[i] = nextValid; + streak++; + } + } else { + nextValid = out[i] as Scalar; + streak = 0; + } + } + return out; +} + +// ─── isna / notna ───────────────────────────────────────────────────────────── + +/** + * Detect missing values in a scalar, Series, or DataFrame. + * + * - For a **scalar**: returns `true` if the value is `null`, `undefined`, or `NaN`. + * - For a **Series**: returns a `Series` of the same index. + * - For a **DataFrame**: returns a `DataFrame` of boolean columns. + * + * Mirrors `pandas.isna()` / `pandas.isnull()`. + * + * @example + * ```ts + * import { isna } from "tsb"; + * isna(null); // true + * isna(42); // false + * isna(NaN); // true + * + * const s = new Series({ data: [1, null, NaN, 4] }); + * isna(s); // Series([false, true, true, false]) + * ``` + */ +export function isna(value: Scalar): boolean; +export function isna(value: Series): Series; +export function isna(value: DataFrame): DataFrame; +export function isna( + value: Scalar | Series | DataFrame, +): boolean | Series | DataFrame { + if (value instanceof DataFrame) { + return value.isna(); + } + if (value instanceof Series) { + return value.isna(); + } + return isMissing(value as Scalar); +} + +/** + * Detect non-missing values in a scalar, Series, or DataFrame. + * + * Mirrors `pandas.notna()` / `pandas.notnull()`. + * + * @example + * ```ts + * import { notna } from "tsb"; + * notna(null); // false + * notna(42); // true + * ``` + */ +export function notna(value: Scalar): boolean; +export function notna(value: Series): Series; +export function notna(value: DataFrame): DataFrame; +export function notna( + value: Scalar | Series | DataFrame, +): boolean | Series | DataFrame { + if (value instanceof DataFrame) { + return value.notna(); + } + if (value instanceof Series) { + return value.notna(); + } + return !isMissing(value as Scalar); +} + +/** Alias for {@link isna}. Mirrors `pandas.isnull()`. */ +export const isnull = isna; + +/** Alias for {@link notna}. Mirrors `pandas.notnull()`. */ +export const notnull = notna; + +// ─── ffill ──────────────────────────────────────────────────────────────────── + +/** + * Forward-fill missing values in a Series. + * + * Each `null`/`NaN` value is replaced with the last non-missing value + * that precedes it (if any). Values before the first non-missing value + * remain missing. + * + * Mirrors `pandas.Series.ffill()`. + * + * @param series - Input Series (unchanged). + * @param options - Optional `{ limit }` — max consecutive fills. + * @returns New Series with forward-filled values. + * + * @example + * ```ts + * import { ffillSeries } from "tsb"; + * const s = new Series({ data: [1, null, null, 4] }); + * ffillSeries(s); // Series([1, 1, 1, 4]) + * ``` + */ +export function ffillSeries( + series: Series, + options?: FillDirectionOptions, +): Series { + const limit = options?.limit ?? null; + const filled = ffillArray(series.values as readonly Scalar[], limit) as T[]; + return new Series({ + data: filled, + index: series.index, + dtype: series.dtype, + name: series.name ?? undefined, + }); +} + +/** + * Backward-fill missing values in a Series. + * + * Each `null`/`NaN` value is replaced with the next non-missing value + * that follows it (if any). Values after the last non-missing value + * remain missing. + * + * Mirrors `pandas.Series.bfill()`. + * + * @example + * ```ts + * import { bfillSeries } from "tsb"; + * const s = new Series({ data: [1, null, null, 4] }); + * bfillSeries(s); // Series([1, 4, 4, 4]) + * ``` + */ +export function bfillSeries( + series: Series, + options?: FillDirectionOptions, +): Series { + const limit = options?.limit ?? null; + const filled = bfillArray(series.values as readonly Scalar[], limit) as T[]; + return new Series({ + data: filled, + index: series.index, + dtype: series.dtype, + name: series.name ?? undefined, + }); +} + +// ─── DataFrame ffill / bfill ────────────────────────────────────────────────── + +/** + * Forward-fill missing values in a DataFrame. + * + * By default operates **column-wise** (axis=0): each column is independently + * forward-filled. With `axis=1` each row is forward-filled across columns. + * + * Mirrors `pandas.DataFrame.ffill()`. + * + * @example + * ```ts + * import { dataFrameFfill } from "tsb"; + * const df = new DataFrame({ data: { a: [1, null, 3], b: [null, 2, null] } }); + * dataFrameFfill(df); + * // a: [1, 1, 3] + * // b: [null, 2, 2] + * ``` + */ +export function dataFrameFfill(df: DataFrame, options?: DataFrameFillOptions): DataFrame { + const limit = options?.limit ?? null; + const axis = options?.axis ?? 0; + const byRow = axis === 1 || axis === "columns"; + + if (!byRow) { + // column-wise: fill each column independently + const colMap = new Map>(); + for (const name of df.columns.values) { + const col = df.col(name); + const filled = ffillArray(col.values, limit) as Scalar[]; + colMap.set(name, new Series({ data: filled, index: col.index, dtype: col.dtype })); + } + return new DataFrame(colMap, df.index); + } + + // row-wise: fill across columns for each row + const nRows = df.shape[0]; + const cols = df.columns.values; + const columns = cols.map((name) => df.col(name)); + const rowsFilled: Scalar[][] = columns.map((c) => Array.from(c.values)); + for (let r = 0; r < nRows; r++) { + const rowVals: Scalar[] = columns.map((_, ci) => rowsFilled[ci]?.[r] ?? null); + const filled = ffillArray(rowVals, limit); + for (let ci = 0; ci < cols.length; ci++) { + const rowsFilledCI = rowsFilled[ci]; + if (rowsFilledCI !== undefined) { + rowsFilledCI[r] = filled[ci] ?? null; + } + } + } + const colMap = new Map>(); + for (let ci = 0; ci < cols.length; ci++) { + const name = cols[ci] as string; + const col = columns[ci] as Series; + colMap.set( + name, + new Series({ + data: rowsFilled[ci] ?? [], + index: col.index, + dtype: col.dtype, + }), + ); + } + return new DataFrame(colMap, df.index); +} + +/** + * Backward-fill missing values in a DataFrame. + * + * By default operates **column-wise** (axis=0). With `axis=1` fills across rows. + * + * Mirrors `pandas.DataFrame.bfill()`. + */ +export function dataFrameBfill(df: DataFrame, options?: DataFrameFillOptions): DataFrame { + const limit = options?.limit ?? null; + const axis = options?.axis ?? 0; + const byRow = axis === 1 || axis === "columns"; + + if (!byRow) { + const colMap = new Map>(); + for (const name of df.columns.values) { + const col = df.col(name); + const filled = bfillArray(col.values, limit) as Scalar[]; + colMap.set(name, new Series({ data: filled, index: col.index, dtype: col.dtype })); + } + return new DataFrame(colMap, df.index); + } + + const nRows = df.shape[0]; + const cols = df.columns.values; + const columns = cols.map((name) => df.col(name)); + const rowsFilled: Scalar[][] = columns.map((c) => Array.from(c.values)); + for (let r = 0; r < nRows; r++) { + const rowVals: Scalar[] = columns.map((_, ci) => rowsFilled[ci]?.[r] ?? null); + const filled = bfillArray(rowVals, limit); + for (let ci = 0; ci < cols.length; ci++) { + const rowsFilledCI = rowsFilled[ci]; + if (rowsFilledCI !== undefined) { + rowsFilledCI[r] = filled[ci] ?? null; + } + } + } + const colMap = new Map>(); + for (let ci = 0; ci < cols.length; ci++) { + const name = cols[ci] as string; + const col = columns[ci] as Series; + colMap.set( + name, + new Series({ + data: rowsFilled[ci] ?? [], + index: col.index, + dtype: col.dtype, + }), + ); + } + return new DataFrame(colMap, df.index); +} diff --git a/tests/stats/na_ops.test.ts b/tests/stats/na_ops.test.ts new file mode 100644 index 00000000..340406ac --- /dev/null +++ b/tests/stats/na_ops.test.ts @@ -0,0 +1,280 @@ +/** + * Tests for na_ops — missing-value utilities (isna, notna, ffill, bfill). + */ + +import { describe, expect, it } from "bun:test"; +import fc from "fast-check"; +import { + DataFrame, + Series, + bfillSeries, + dataFrameBfill, + dataFrameFfill, + ffillSeries, + isna, + isnull, + notna, + notnull, +} from "../../src/index.ts"; + +// ─── isna / notna ───────────────────────────────────────────────────────────── + +describe("isna (scalar)", () => { + it("returns true for null", () => expect(isna(null)).toBe(true)); + it("returns true for undefined", () => expect(isna(undefined)).toBe(true)); + it("returns true for NaN", () => expect(isna(Number.NaN)).toBe(true)); + it("returns false for 0", () => expect(isna(0)).toBe(false)); + it("returns false for empty string", () => expect(isna("")).toBe(false)); + it("returns false for false", () => expect(isna(false)).toBe(false)); + it("returns false for a number", () => expect(isna(42)).toBe(false)); +}); + +describe("notna (scalar)", () => { + it("returns false for null", () => expect(notna(null)).toBe(false)); + it("returns false for NaN", () => expect(notna(Number.NaN)).toBe(false)); + it("returns true for 42", () => expect(notna(42)).toBe(true)); + it("returns true for a string", () => expect(notna("hello")).toBe(true)); +}); + +describe("isnull / notnull aliases", () => { + it("isnull equals isna for scalar", () => { + expect(isnull(null)).toBe(isna(null)); + expect(isnull(42)).toBe(isna(42)); + }); + it("notnull equals notna for scalar", () => { + expect(notnull(null)).toBe(notna(null)); + expect(notnull(42)).toBe(notna(42)); + }); +}); + +describe("isna (Series)", () => { + it("returns boolean Series of correct length", () => { + const s = new Series({ data: [1, null, Number.NaN, 4] }); + const result = isna(s); + expect(result).toBeInstanceOf(Series); + expect([...result.values]).toEqual([false, true, true, false]); + }); + + it("all present", () => { + const s = new Series({ data: [1, 2, 3] }); + expect([...isna(s).values]).toEqual([false, false, false]); + }); + + it("all missing", () => { + const s = new Series({ data: [null, null, Number.NaN] }); + expect([...isna(s).values]).toEqual([true, true, true]); + }); +}); + +describe("notna (Series)", () => { + it("is the inverse of isna", () => { + const s = new Series({ data: [1, null, Number.NaN, 4] }); + const na = isna(s).values; + const nna = notna(s).values; + for (let i = 0; i < na.length; i++) { + expect(nna[i]).toBe(!na[i]); + } + }); +}); + +describe("isna (DataFrame)", () => { + it("returns DataFrame of booleans", () => { + const df = DataFrame.fromColumns({ a: [1, null], b: [Number.NaN, 2] }); + const result = isna(df); + expect(result).toBeInstanceOf(DataFrame); + expect([...result.col("a").values]).toEqual([false, true]); + expect([...result.col("b").values]).toEqual([true, false]); + }); +}); + +describe("notna (DataFrame)", () => { + it("returns inverse of isna DataFrame", () => { + const df = DataFrame.fromColumns({ a: [1, null], b: [Number.NaN, 2] }); + expect([...notna(df).col("a").values]).toEqual([true, false]); + expect([...notna(df).col("b").values]).toEqual([false, true]); + }); +}); + +// ─── ffillSeries ────────────────────────────────────────────────────────────── + +describe("ffillSeries", () => { + it("fills nulls with preceding value", () => { + const s = new Series({ data: [1, null, null, 4] }); + expect([...ffillSeries(s).values]).toEqual([1, 1, 1, 4]); + }); + + it("leaves leading nulls untouched", () => { + const s = new Series({ data: [null, null, 3, null] }); + expect([...ffillSeries(s).values]).toEqual([null, null, 3, 3]); + }); + + it("NaN is treated as missing", () => { + const s = new Series({ data: [2, Number.NaN, 5] }); + const result = ffillSeries(s).values; + expect(result[0]).toBe(2); + expect(result[1]).toBe(2); + expect(result[2]).toBe(5); + }); + + it("respects limit option", () => { + const s = new Series({ data: [1, null, null, null, 5] }); + expect([...ffillSeries(s, { limit: 1 }).values]).toEqual([1, 1, null, null, 5]); + }); + + it("preserves original Series", () => { + const s = new Series({ data: [1, null, 3] }); + ffillSeries(s); + expect([...s.values]).toEqual([1, null, 3]); + }); + + it("empty Series returns empty", () => { + const s = new Series({ data: [] }); + expect([...ffillSeries(s).values]).toEqual([]); + }); + + it("preserves name and index", () => { + const s = new Series({ data: [1, null], name: "x" }); + const filled = ffillSeries(s); + expect(filled.name).toBe("x"); + expect(filled.index.size).toBe(2); + }); +}); + +// ─── bfillSeries ────────────────────────────────────────────────────────────── + +describe("bfillSeries", () => { + it("fills nulls with following value", () => { + const s = new Series({ data: [1, null, null, 4] }); + expect([...bfillSeries(s).values]).toEqual([1, 4, 4, 4]); + }); + + it("leaves trailing nulls untouched", () => { + const s = new Series({ data: [null, 3, null, null] }); + expect([...bfillSeries(s).values]).toEqual([3, 3, null, null]); + }); + + it("respects limit option", () => { + const s = new Series({ data: [1, null, null, null, 5] }); + expect([...bfillSeries(s, { limit: 2 }).values]).toEqual([1, null, 5, 5, 5]); + }); + + it("empty Series returns empty", () => { + const s = new Series({ data: [] }); + expect([...bfillSeries(s).values]).toEqual([]); + }); +}); + +// ─── dataFrameFfill ─────────────────────────────────────────────────────────── + +describe("dataFrameFfill (column-wise)", () => { + it("fills each column independently", () => { + const df = DataFrame.fromColumns({ a: [1, null, 3], b: [null, 2, null] }); + const result = dataFrameFfill(df); + expect([...result.col("a").values]).toEqual([1, 1, 3]); + expect([...result.col("b").values]).toEqual([null, 2, 2]); + }); + + it("preserves index", () => { + const df = DataFrame.fromColumns({ x: [1, null] }); + expect(dataFrameFfill(df).index.size).toBe(2); + }); +}); + +describe("dataFrameFfill (row-wise)", () => { + it("fills across columns per row", () => { + const df = DataFrame.fromColumns({ a: [1, null], b: [null, null], c: [3, 4] }); + const result = dataFrameFfill(df, { axis: 1 }); + expect([...result.col("a").values]).toEqual([1, null]); + expect([...result.col("b").values]).toEqual([1, null]); + expect([...result.col("c").values]).toEqual([3, 4]); + }); +}); + +// ─── dataFrameBfill ─────────────────────────────────────────────────────────── + +describe("dataFrameBfill (column-wise)", () => { + it("fills each column backward", () => { + const df = DataFrame.fromColumns({ a: [null, null, 3], b: [1, null, null] }); + const result = dataFrameBfill(df); + expect([...result.col("a").values]).toEqual([3, 3, 3]); + expect([...result.col("b").values]).toEqual([1, null, null]); + }); +}); + +describe("dataFrameBfill (row-wise)", () => { + it("fills backward across columns per row", () => { + const df = DataFrame.fromColumns({ a: [null, 1], b: [null, null], c: [3, null] }); + const result = dataFrameBfill(df, { axis: 1 }); + expect([...result.col("a").values]).toEqual([3, 1]); + expect([...result.col("b").values]).toEqual([3, null]); + expect([...result.col("c").values]).toEqual([3, null]); + }); +}); + +// ─── property-based tests ───────────────────────────────────────────────────── + +describe("property: ffill followed by bfill fills all if any non-null", () => { + it("all values filled when at least one is present", () => { + fc.assert( + fc.property( + fc.array(fc.option(fc.integer({ min: 0, max: 100 }), { nil: null }), { + minLength: 1, + maxLength: 20, + }), + (raw) => { + const hasNonNull = raw.some((v) => v !== null); + if (!hasNonNull) { + return true; + } + const s = new Series({ data: raw }); + const result = bfillSeries(ffillSeries(s)); + return result.values.every((v) => v !== null); + }, + ), + ); + }); +}); + +describe("property: ffill never introduces new non-null values beyond last valid", () => { + it("ffilled series has no nulls after first valid value", () => { + fc.assert( + fc.property( + fc.array(fc.option(fc.integer({ min: -50, max: 50 }), { nil: null }), { + minLength: 0, + maxLength: 30, + }), + (raw) => { + const s = new Series({ data: raw }); + const filled = ffillSeries(s).values; + let sawValid = false; + for (const v of filled) { + if (v !== null) { + sawValid = true; + } + if (sawValid && v === null) { + return false; + } + } + return true; + }, + ), + ); + }); +}); + +describe("property: isna is inverse of notna for scalars", () => { + it("isna(v) === !notna(v)", () => { + fc.assert( + fc.property( + fc.oneof( + fc.integer(), + fc.float({ noNaN: false }), + fc.constant(null), + fc.string(), + fc.boolean(), + ), + (v) => isna(v as Parameters[0]) === !notna(v as Parameters[0]), + ), + ); + }); +}); From b9af270fed0d7065982629e2fb8800f414d30f99 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Sat, 11 Apr 2026 20:32:33 +0000 Subject: [PATCH 02/14] Iteration 174: Add pct_change for Series and DataFrame Implements pctChangeSeries() and pctChangeDataFrame() mirroring pandas.Series.pct_change() / pandas.DataFrame.pct_change(). - periods: configurable lag (positive = backward, negative = forward) - fillMethod: "pad" (default), "bfill", or null (no fill) - limit: cap consecutive fills - axis: column-wise (default) or row-wise for DataFrame Full test coverage: unit tests, edge cases, and fast-check property tests. Interactive playground page at playground/pct_change.html. Run: https://github.com/githubnext/tsessebe/actions/runs/24266545401 --- playground/pct_change.html | 448 +++++++++++++++++++++++++++++++++ src/stats/pct_change.ts | 231 +++++++++++++++++ tests/stats/pct_change.test.ts | 252 +++++++++++++++++++ 3 files changed, 931 insertions(+) create mode 100644 playground/pct_change.html create mode 100644 src/stats/pct_change.ts create mode 100644 tests/stats/pct_change.test.ts diff --git a/playground/pct_change.html b/playground/pct_change.html new file mode 100644 index 00000000..3576797a --- /dev/null +++ b/playground/pct_change.html @@ -0,0 +1,448 @@ + + + + + + tsb — pct_change + + + +
+
+
Initializing playground…
+
+ ← Back to roadmap +

📊 pct_change — Interactive Playground

+

Compute the fractional change between each element and a prior element. + Mirrors pandas.Series.pct_change() / + pandas.DataFrame.pct_change().
+ Edit any code block below and press ▶ Run + (or Ctrl+Enter) to execute it live in your browser. +

+ + +
+

1 · Basic pct_change on a Series

+

pctChangeSeries(series) returns the fractional (not percentage) change + from each previous element. The first element is always null.

+
+
+ TypeScript +
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

2 · Multi-period change

+

The periods option controls the lag. Use periods: 2 to + compare each value to the one two steps earlier — useful for month-over-month + comparisons in quarterly data.

+
+
+ TypeScript +
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

3 · Handling missing values

+

By default, pctChangeSeries forward-fills (fillMethod: "pad") + NaN/null values before computing the ratio — so gaps don't break the chain. + Set fillMethod: null to propagate NaN instead.

+
+
+ TypeScript +
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

4 · Limit consecutive fills

+

The limit option caps how many consecutive NaN values get forward-filled. + Useful when you want to tolerate short gaps but not bridge large ones.

+
+
+ TypeScript +
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

5 · DataFrame column-wise pct_change

+

pctChangeDataFrame(df) applies pctChangeSeries to every + column independently. Ideal for comparing multiple assets or metrics simultaneously.

+
+
+ TypeScript +
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

6 · Negative periods (look-forward change)

+

A negative periods value computes the forward change: how much will + this element change by the time we reach |periods| steps ahead. + Useful for computing returns on a "hold for N periods" strategy.

+
+
+ TypeScript +
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

API Reference

+

All functions return a new Series/DataFrame of the same shape — inputs are never mutated.

+
// Series
+pctChangeSeries(series, {
+  periods?: number,           // default 1 (positive = look back, negative = look forward)
+  fillMethod?: "pad" | "bfill" | null,  // default "pad"
+  limit?: number | null,      // max consecutive fills; default unlimited
+}): Series
+
+// DataFrame
+pctChangeDataFrame(df, {
+  periods?: number,
+  fillMethod?: "pad" | "bfill" | null,
+  limit?: number | null,
+  axis?: 0 | 1 | "index" | "columns",  // default 0 (column-wise)
+}): DataFrame
+
+ + + + + diff --git a/src/stats/pct_change.ts b/src/stats/pct_change.ts new file mode 100644 index 00000000..c46c9e84 --- /dev/null +++ b/src/stats/pct_change.ts @@ -0,0 +1,231 @@ +/** + * pct_change — percentage change between current and prior element. + * + * Mirrors `pandas.Series.pct_change()` / `pandas.DataFrame.pct_change()`: + * - `pctChangeSeries(series, options)` — per-element % change + * - `pctChangeDataFrame(df, options)` — column-wise % change + * + * Formula (per element i, with shift=periods): + * `result[i] = (x[i] - x[i-periods]) / x[i-periods]` + * + * When `fillMethod` is set, NaN/null values in the source are filled *before* + * computing the ratio (matching pandas' default behaviour of `fill_method="pad"`). + * + * @module + */ + +import { DataFrame } from "../core/index.ts"; +import { Series } from "../core/index.ts"; +import type { Scalar } from "../types.ts"; + +// ─── public types ───────────────────────────────────────────────────────────── + +/** Fill method applied to NaN/null before computing pct_change. */ +export type PctChangeFillMethod = "pad" | "bfill"; + +/** Options for {@link pctChangeSeries} and {@link pctChangeDataFrame}. */ +export interface PctChangeOptions { + /** + * Number of periods (lags) to shift when computing the ratio. + * Positive values look backward; negative values look forward. + * Default `1`. + */ + readonly periods?: number; + /** + * How to fill NaN/null values *before* computing the ratio. + * - `"pad"` (default): forward-fill (last valid observation carries forward). + * - `"bfill"`: backward-fill (next valid observation fills backward). + * - `null`: no filling — NaN/null stays as-is. + */ + readonly fillMethod?: PctChangeFillMethod | null; + /** + * Maximum number of consecutive NaN/null values to fill when `fillMethod` + * is set. `undefined` / `null` means no limit. + */ + readonly limit?: number | null; +} + +/** Options for {@link pctChangeDataFrame} — adds an axis selector. */ +export interface DataFramePctChangeOptions extends PctChangeOptions { + /** + * - `0` or `"index"` (default): apply operation **column-wise** (down rows). + * - `1` or `"columns"`: apply operation **row-wise** (across columns). + */ + readonly axis?: 0 | 1 | "index" | "columns"; +} + +// ─── helpers ────────────────────────────────────────────────────────────────── + +/** True when `v` is a valid number (not null, undefined, or NaN). */ +function isNum(v: Scalar): v is number { + return typeof v === "number" && !Number.isNaN(v) && v !== null; +} + +/** + * Forward-fill an array of scalars in place, respecting an optional limit. + * Returns a NEW array. + */ +function padFill(vals: readonly Scalar[], limit: number | null | undefined): Scalar[] { + const out: Scalar[] = [...vals]; + let run = 0; + let lastValid: Scalar = null; + for (let i = 0; i < out.length; i++) { + const v = out[i] as Scalar; + if (v !== null && v !== undefined && !(typeof v === "number" && Number.isNaN(v))) { + lastValid = v; + run = 0; + } else if (lastValid !== null && (limit == null || run < limit)) { + out[i] = lastValid; + run++; + } + } + return out; +} + +/** + * Backward-fill an array of scalars, respecting an optional limit. + * Returns a NEW array. + */ +function bfillFill(vals: readonly Scalar[], limit: number | null | undefined): Scalar[] { + const tmp = padFill([...vals].reverse(), limit); + return tmp.reverse(); +} + +/** Fill NaN/null in `vals` using the requested method. */ +function applyFill( + vals: readonly Scalar[], + method: PctChangeFillMethod | null | undefined, + limit: number | null | undefined, +): Scalar[] { + if (!method) return [...vals]; + return method === "pad" ? padFill(vals, limit) : bfillFill(vals, limit); +} + +/** Compute pct_change on a flat array of scalars. */ +function computePct(vals: readonly Scalar[], periods: number): Scalar[] { + const n = vals.length; + const out: Scalar[] = new Array(n).fill(null); + const shift = periods; + if (shift >= 0) { + for (let i = shift; i < n; i++) { + const curr = vals[i] as Scalar; + const prev = vals[i - shift] as Scalar; + if (isNum(curr) && isNum(prev) && prev !== 0) { + out[i] = curr / prev - 1; + } else if (isNum(curr) && isNum(prev) && prev === 0) { + // 0 denominator → Infinity (same as pandas) + out[i] = curr === 0 ? Number.NaN : curr > 0 ? Infinity : -Infinity; + } else { + out[i] = null; + } + } + } else { + // Negative periods: look forward + const absShift = -shift; + for (let i = 0; i < n - absShift; i++) { + const curr = vals[i] as Scalar; + const fwd = vals[i + absShift] as Scalar; + if (isNum(curr) && isNum(fwd) && curr !== 0) { + out[i] = fwd / curr - 1; + } else if (isNum(curr) && isNum(fwd) && curr === 0) { + out[i] = fwd === 0 ? Number.NaN : fwd > 0 ? Infinity : -Infinity; + } else { + out[i] = null; + } + } + } + return out; +} + +// ─── public API ─────────────────────────────────────────────────────────────── + +/** + * Compute the fractional change between a Series element and the element + * `periods` positions earlier (or later, for negative `periods`). + * + * Matches `pandas.Series.pct_change()`. + * + * @example + * ```ts + * const s = new Series({ data: [100, 110, 99, 121] }); + * pctChangeSeries(s); // [null, 0.1, -0.1, 0.2222…] + * ``` + */ +export function pctChangeSeries(series: Series, options: PctChangeOptions = {}): Series { + const periods = options.periods ?? 1; + const fillMethod = options.fillMethod !== undefined ? options.fillMethod : "pad"; + const limit = options.limit ?? null; + + const filled = applyFill(series.values, fillMethod, limit); + const result = computePct(filled, periods); + + return new Series({ + data: result, + index: series.index, + name: series.name ?? undefined, + }); +} + +/** + * Compute percentage change for every column (or row) of a DataFrame. + * + * Matches `pandas.DataFrame.pct_change()`. + * + * @example + * ```ts + * const df = new DataFrame(new Map([ + * ["a", new Series({ data: [100, 110, 121] })], + * ["b", new Series({ data: [200, 180, 198] })], + * ])); + * pctChangeDataFrame(df); // fractional change per column + * ``` + */ +export function pctChangeDataFrame( + df: DataFrame, + options: DataFramePctChangeOptions = {}, +): DataFrame { + const axis = options.axis ?? 0; + const colWise = axis === 0 || axis === "index"; + + if (colWise) { + const colMap = new Map>(); + for (const name of df.columns.values) { + colMap.set(name, pctChangeSeries(df.col(name), options)); + } + return new DataFrame(colMap, df.index); + } + + // Row-wise: each row across columns + const periods = options.periods ?? 1; + const fillMethod = options.fillMethod !== undefined ? options.fillMethod : "pad"; + const limit = options.limit ?? null; + const nRows = df.index.length; + const cols = df.columns.values; + const nCols = cols.length; + + const resultCols = new Map(); + for (const name of cols) { + resultCols.set(name, new Array(nRows).fill(null)); + } + + for (let r = 0; r < nRows; r++) { + const row: Scalar[] = []; + for (const name of cols) { + row.push(df.col(name).values[r] as Scalar); + } + const filled = applyFill(row, fillMethod, limit); + const pct = computePct(filled, periods); + for (let c = 0; c < nCols; c++) { + (resultCols.get(cols[c] as string) as Scalar[])[r] = pct[c] as Scalar; + } + } + + const colMap = new Map>(); + for (const name of cols) { + colMap.set( + name, + new Series({ data: resultCols.get(name) as Scalar[], index: df.index, name }), + ); + } + return new DataFrame(colMap, df.index); +} diff --git a/tests/stats/pct_change.test.ts b/tests/stats/pct_change.test.ts new file mode 100644 index 00000000..98966e8c --- /dev/null +++ b/tests/stats/pct_change.test.ts @@ -0,0 +1,252 @@ +/** + * Tests for src/stats/pct_change.ts — pctChangeSeries, pctChangeDataFrame + */ +import { describe, expect, it } from "bun:test"; +import fc from "fast-check"; +import { + DataFrame, + Series, + pctChangeDataFrame, + pctChangeSeries, +} from "../../src/index.ts"; +import type { Scalar } from "../../src/index.ts"; + +// ─── helpers ───────────────────────────────────────────────────────────────── + +function s(data: readonly Scalar[]): Series { + return new Series({ data: [...data] }); +} + +function nanEq(a: Scalar, b: Scalar): boolean { + if (typeof a === "number" && Number.isNaN(a) && typeof b === "number" && Number.isNaN(b)) { + return true; + } + return a === b; +} + +function arrEq(a: readonly Scalar[], b: readonly Scalar[]): boolean { + if (a.length !== b.length) return false; + for (let i = 0; i < a.length; i++) { + if (!nanEq(a[i] as Scalar, b[i] as Scalar)) return false; + } + return true; +} + +function close(a: Scalar, b: Scalar, eps = 1e-9): boolean { + if (a === null && b === null) return true; + if (typeof a !== "number" || typeof b !== "number") return false; + if (Number.isNaN(a) && Number.isNaN(b)) return true; + return Math.abs(a - b) < eps; +} + +function arrClose(a: readonly Scalar[], b: readonly Scalar[], eps = 1e-9): boolean { + if (a.length !== b.length) return false; + for (let i = 0; i < a.length; i++) { + if (!close(a[i] as Scalar, b[i] as Scalar, eps)) return false; + } + return true; +} + +// ─── pctChangeSeries ───────────────────────────────────────────────────────── + +describe("pctChangeSeries", () => { + it("basic increasing sequence", () => { + const result = pctChangeSeries(s([100, 110, 121, 133.1])); + expect(result.values[0]).toBeNull(); + expect(close(result.values[1] as Scalar, 0.1)).toBe(true); + expect(close(result.values[2] as Scalar, 0.1)).toBe(true); + expect(close(result.values[3] as Scalar, 0.1)).toBe(true); + }); + + it("decreasing sequence", () => { + const result = pctChangeSeries(s([200, 180, 162])); + expect(result.values[0]).toBeNull(); + expect(close(result.values[1] as Scalar, -0.1)).toBe(true); + expect(close(result.values[2] as Scalar, -0.1)).toBe(true); + }); + + it("periods=2", () => { + const result = pctChangeSeries(s([100, 105, 110, 121]), { periods: 2 }); + expect(result.values[0]).toBeNull(); + expect(result.values[1]).toBeNull(); + expect(close(result.values[2] as Scalar, 0.1)).toBe(true); + expect(close(result.values[3] as Scalar, (121 - 105) / 105)).toBe(true); + }); + + it("negative periods (look forward)", () => { + const result = pctChangeSeries(s([100, 110, 121]), { periods: -1 }); + expect(close(result.values[0] as Scalar, 0.1)).toBe(true); + expect(close(result.values[1] as Scalar, 0.1)).toBe(true); + expect(result.values[2]).toBeNull(); + }); + + it("NaN/null propagates when fillMethod=null", () => { + const result = pctChangeSeries(s([100, null, 110]), { fillMethod: null }); + expect(result.values[0]).toBeNull(); + expect(result.values[1]).toBeNull(); + expect(result.values[2]).toBeNull(); + }); + + it("fillMethod=pad fills NaN before computing", () => { + const result = pctChangeSeries(s([100, null, 110]), { fillMethod: "pad" }); + // after pad-fill: [100, 100, 110] + // pct: [null, 0, 0.1] + expect(result.values[0]).toBeNull(); + expect(close(result.values[1] as Scalar, 0)).toBe(true); + expect(close(result.values[2] as Scalar, 0.1)).toBe(true); + }); + + it("fillMethod=bfill fills NaN backward before computing", () => { + const result = pctChangeSeries(s([100, null, 110, 121]), { fillMethod: "bfill" }); + // after bfill: [100, 110, 110, 121] + // pct: [null, 0.1, 0, 0.1] + expect(result.values[0]).toBeNull(); + expect(close(result.values[1] as Scalar, 0.1)).toBe(true); + expect(close(result.values[2] as Scalar, 0)).toBe(true); + expect(close(result.values[3] as Scalar, 0.1)).toBe(true); + }); + + it("limit=1 caps forward-fill", () => { + const result = pctChangeSeries(s([100, null, null, 130]), { + fillMethod: "pad", + limit: 1, + }); + // after pad with limit=1: [100, 100, null, 130] + // pct: [null, 0, null, null] (null/100 → null) + expect(result.values[0]).toBeNull(); + expect(close(result.values[1] as Scalar, 0)).toBe(true); + expect(result.values[2]).toBeNull(); + expect(result.values[3]).toBeNull(); + }); + + it("zero denominator returns Infinity", () => { + const result = pctChangeSeries(s([0, 10]), { fillMethod: null }); + expect(result.values[1]).toBe(Infinity); + }); + + it("zero/zero denominator returns NaN", () => { + const result = pctChangeSeries(s([0, 0]), { fillMethod: null }); + expect(Number.isNaN(result.values[1] as number)).toBe(true); + }); + + it("preserves Series name and index", () => { + const src = new Series({ data: [10, 20, 30], name: "price" }); + const result = pctChangeSeries(src); + expect(result.name).toBe("price"); + expect(result.index.length).toBe(3); + }); + + it("empty series returns empty", () => { + const result = pctChangeSeries(s([])); + expect(result.values.length).toBe(0); + }); + + it("single-element series returns [null]", () => { + const result = pctChangeSeries(s([42])); + expect(result.values[0]).toBeNull(); + }); +}); + +// ─── pctChangeDataFrame ─────────────────────────────────────────────────────── + +describe("pctChangeDataFrame", () => { + it("column-wise (default)", () => { + const df = new DataFrame( + new Map([ + ["a", new Series({ data: [100, 110, 121] })], + ["b", new Series({ data: [200, 180, 198] })], + ]), + ); + const result = pctChangeDataFrame(df); + const colA = result.col("a").values; + const colB = result.col("b").values; + expect(colA[0]).toBeNull(); + expect(close(colA[1] as Scalar, 0.1)).toBe(true); + expect(close(colA[2] as Scalar, 0.1)).toBe(true); + expect(colB[0]).toBeNull(); + expect(close(colB[1] as Scalar, -0.1)).toBe(true); + expect(close(colB[2] as Scalar, 0.1)).toBe(true); + }); + + it("row-wise (axis=1)", () => { + const df = new DataFrame( + new Map([ + ["a", new Series({ data: [100, 200] })], + ["b", new Series({ data: [110, 220] })], + ["c", new Series({ data: [121, 242] })], + ]), + ); + const result = pctChangeDataFrame(df, { axis: 1 }); + // row 0: [100, 110, 121] → [null, 0.1, 0.1] + // row 1: [200, 220, 242] → [null, 0.1, 0.1] + const row0a = result.col("a").values[0]; + const row0b = result.col("b").values[0]; + const row0c = result.col("c").values[0]; + expect(row0a).toBeNull(); + expect(close(row0b as Scalar, 0.1)).toBe(true); + expect(close(row0c as Scalar, 0.1)).toBe(true); + const row1a = result.col("a").values[1]; + const row1b = result.col("b").values[1]; + expect(row1a).toBeNull(); + expect(close(row1b as Scalar, 0.1)).toBe(true); + }); + + it("preserves column order", () => { + const df = new DataFrame( + new Map([ + ["x", new Series({ data: [1, 2] })], + ["y", new Series({ data: [3, 6] })], + ]), + ); + const result = pctChangeDataFrame(df); + expect(result.columns.values).toEqual(["x", "y"]); + }); +}); + +// ─── property-based tests ───────────────────────────────────────────────────── + +describe("pctChangeSeries — property tests", () => { + it("result length equals input length", () => { + fc.assert( + fc.property(fc.array(fc.float({ noNaN: true }), { minLength: 0, maxLength: 50 }), (arr) => { + const result = pctChangeSeries(s(arr)); + return result.values.length === arr.length; + }), + ); + }); + + it("first element is always null for periods=1", () => { + fc.assert( + fc.property( + fc.array(fc.float({ noNaN: true }), { minLength: 1, maxLength: 50 }), + (arr) => { + const result = pctChangeSeries(s(arr)); + return result.values[0] === null; + }, + ), + ); + }); + + it("pct_change(x, -p) equals pct_change_reversed pattern", () => { + // For a sequence of positive numbers with periods=1 and periods=-1: + // result[-1][i] represents the change looking forward, so result[-1][i] = (x[i+1]-x[i])/x[i] + // and result[+1][i+1] = (x[i+1]-x[i])/x[i], so they should agree on matching indices + fc.assert( + fc.property( + fc.array(fc.float({ noNaN: true, min: 1, max: 1000 }), { minLength: 3, maxLength: 20 }), + (arr) => { + const fwd = pctChangeSeries(s(arr), { periods: -1, fillMethod: null }); + const bwd = pctChangeSeries(s(arr), { periods: 1, fillMethod: null }); + // fwd[i] = (arr[i+1] - arr[i]) / arr[i] + // bwd[i+1] = (arr[i+1] - arr[i]) / arr[i] ← same ratio + for (let i = 0; i < arr.length - 1; i++) { + if (!close(fwd.values[i] as Scalar, bwd.values[i + 1] as Scalar, 1e-6)) { + return false; + } + } + return true; + }, + ), + ); + }); +}); From fca79a39c64d72d0bb831a63e93a6d209f9697e0 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Sat, 11 Apr 2026 20:32:34 +0000 Subject: [PATCH 03/14] Iteration 193: Add idxmin/idxmax for Series and DataFrame Run: https://github.com/githubnext/tsessebe/actions/runs/24281202174 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- playground/idxmin_idxmax.html | 439 ++++++++++++++++++++++++++++++ src/stats/idxmin_idxmax.ts | 234 ++++++++++++++++ tests/stats/idxmin_idxmax.test.ts | 270 ++++++++++++++++++ 3 files changed, 943 insertions(+) create mode 100644 playground/idxmin_idxmax.html create mode 100644 src/stats/idxmin_idxmax.ts create mode 100644 tests/stats/idxmin_idxmax.test.ts diff --git a/playground/idxmin_idxmax.html b/playground/idxmin_idxmax.html new file mode 100644 index 00000000..b771dd36 --- /dev/null +++ b/playground/idxmin_idxmax.html @@ -0,0 +1,439 @@ + + + + + + tsb — idxmin / idxmax + + + +
+
+
Loading TypeScript compiler…
+
+ + ← tsb playground +

idxmin / idxmax

+

+ Return the index label of the minimum or maximum value in a + Series or each column of a DataFrame. + Mirrors pandas.Series.idxmin(), idxmax(), + pandas.DataFrame.idxmin(), and DataFrame.idxmax(). +

+ + +
+

1 · Series.idxmin — label of the minimum value

+

Returns the index label at the position of the minimum value. + NaN / null values are skipped by default.

+
+
+ TypeScript +
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

2 · Series.idxmax — label of the maximum value

+

Returns the index label at the position of the maximum value.

+
+
+ TypeScript +
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

3 · NaN handling — skipna option

+

By default NaN / null values are skipped. Set skipna: false + to propagate NaN (returns null if any value is NaN).

+
+
+ TypeScript +
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

4 · DataFrame.idxmin — row label of column minima

+

Returns a Series indexed by column names. Each value is the row label + where that column achieves its minimum.

+
+
+ TypeScript +
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

5 · DataFrame.idxmax — row label of column maxima

+

Returns a Series indexed by column names, where each entry is the row + label of that column's maximum value.

+
+
+ TypeScript +
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

6 · Edge cases — empty, all-NaN, all-equal

+

Behavior for empty series, series where every value is NaN, and series + where all values are equal.

+
+
+ TypeScript +
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

API Reference

+
// Series
+idxminSeries(series, { skipna?: boolean }): Label   // default skipna=true
+idxmaxSeries(series, { skipna?: boolean }): Label
+
+// DataFrame (axis=0 — min/max per column)
+idxminDataFrame(df, { skipna?: boolean }): Series   // indexed by column names
+idxmaxDataFrame(df, { skipna?: boolean }): Series
+
+ + + + + diff --git a/src/stats/idxmin_idxmax.ts b/src/stats/idxmin_idxmax.ts new file mode 100644 index 00000000..6ee745f9 --- /dev/null +++ b/src/stats/idxmin_idxmax.ts @@ -0,0 +1,234 @@ +/** + * idxmin / idxmax — return the index label of the minimum or maximum value. + * + * Mirrors `pandas.Series.idxmin()` / `pandas.Series.idxmax()` and + * `pandas.DataFrame.idxmin()` / `pandas.DataFrame.idxmax()`: + * + * - `idxminSeries(series)` — label of the minimum value (NaN/null excluded) + * - `idxmaxSeries(series)` — label of the maximum value (NaN/null excluded) + * - `idxminDataFrame(df)` — Series of row labels where each column achieves its min + * - `idxmaxDataFrame(df)` — Series of row labels where each column achieves its max + * + * When `skipna` is true (the default), NaN / null values are ignored. + * When `skipna` is false, any NaN / null causes the result to be `null`. + * + * @module + */ + +import type { DataFrame } from "../core/index.ts"; +import { Dtype, Series } from "../core/index.ts"; +import type { Label, Scalar } from "../types.ts"; + +// ─── public types ───────────────────────────────────────────────────────────── + +/** Options for {@link idxminSeries}, {@link idxmaxSeries}. */ +export interface IdxOptions { + /** + * Whether to skip NaN / null values. + * @defaultValue `true` + */ + readonly skipna?: boolean; +} + +/** Options for {@link idxminDataFrame}, {@link idxmaxDataFrame}. */ +export interface IdxDataFrameOptions { + /** + * Whether to skip NaN / null values. + * @defaultValue `true` + */ + readonly skipna?: boolean; +} + +// ─── helpers ────────────────────────────────────────────────────────────────── + +/** True when a scalar should be treated as missing. */ +function isMissing(v: Scalar): boolean { + return v === null || v === undefined || (typeof v === "number" && Number.isNaN(v)); +} + +/** + * Find the index of the extreme value (min or max) among `values`. + * Returns `null` when all values are missing (with `skipna=true`) or when + * any value is missing (with `skipna=false`). + */ +function findExtreme( + values: readonly Scalar[], + skipna: boolean, + isBetter: (a: Scalar, b: Scalar) => boolean, +): number | null { + let bestIdx: number | null = null; + let bestVal: Scalar = null; + + for (let i = 0; i < values.length; i++) { + const v = values[i] as Scalar; + if (isMissing(v)) { + if (!skipna) { + return null; + } + continue; + } + if (bestIdx === null || isBetter(v, bestVal)) { + bestIdx = i; + bestVal = v; + } + } + return bestIdx; +} + +/** Compare scalars: returns true if `a` is less than `b`. */ +function isLess(a: Scalar, b: Scalar): boolean { + if (b === null || b === undefined) { + return false; + } + return (a as number | string | boolean) < (b as number | string | boolean); +} + +/** Compare scalars: returns true if `a` is greater than `b`. */ +function isGreater(a: Scalar, b: Scalar): boolean { + if (b === null || b === undefined) { + return false; + } + return (a as number | string | boolean) > (b as number | string | boolean); +} + +// ─── public API — Series ────────────────────────────────────────────────────── + +/** + * Return the index label of the minimum value in `series`. + * + * NaN / null values are excluded when `skipna` is true (the default). + * Returns `null` when the series is empty or all values are NaN / null. + * + * Mirrors `pandas.Series.idxmin()`. + * + * @param series - Input Series. + * @param options - Options (skipna). + * @returns The index label at the minimum value, or `null` if no valid value exists. + * + * @example + * ```ts + * import { Series, idxminSeries } from "tsb"; + * + * const s = new Series({ data: [3, 1, 4, 1, 5], index: ["a", "b", "c", "d", "e"] }); + * idxminSeries(s); // "b" (first occurrence of 1) + * ``` + */ +export function idxminSeries(series: Series, options: IdxOptions = {}): Label { + const skipna = options.skipna ?? true; + const idx = findExtreme(series.values, skipna, isLess); + if (idx === null) { + return null; + } + return series.index.at(idx); +} + +/** + * Return the index label of the maximum value in `series`. + * + * NaN / null values are excluded when `skipna` is true (the default). + * Returns `null` when the series is empty or all values are NaN / null. + * + * Mirrors `pandas.Series.idxmax()`. + * + * @param series - Input Series. + * @param options - Options (skipna). + * @returns The index label at the maximum value, or `null` if no valid value exists. + * + * @example + * ```ts + * import { Series, idxmaxSeries } from "tsb"; + * + * const s = new Series({ data: [3, 1, 4, 1, 5], index: ["a", "b", "c", "d", "e"] }); + * idxmaxSeries(s); // "e" + * ``` + */ +export function idxmaxSeries(series: Series, options: IdxOptions = {}): Label { + const skipna = options.skipna ?? true; + const idx = findExtreme(series.values, skipna, isGreater); + if (idx === null) { + return null; + } + return series.index.at(idx); +} + +// ─── public API — DataFrame ─────────────────────────────────────────────────── + +/** + * Return a Series containing the index label of the minimum value for each column. + * + * The result Series is indexed by column names. + * NaN / null values are excluded when `skipna` is true (the default). + * Columns where all values are NaN / null yield `null` in the result. + * + * Mirrors `pandas.DataFrame.idxmin()` (axis=0). + * + * @param df - Input DataFrame. + * @param options - Options (skipna). + * @returns A Series indexed by column names, containing the row label of each column's min. + * + * @example + * ```ts + * import { DataFrame, idxminDataFrame } from "tsb"; + * + * const df = DataFrame.fromColumns({ a: [3, 1, 4], b: [10, 20, 5] }, { index: ["x", "y", "z"] }); + * idxminDataFrame(df).values; // ["y", "z"] + * ``` + */ +export function idxminDataFrame(df: DataFrame, options: IdxDataFrameOptions = {}): Series { + const skipna = options.skipna ?? true; + const colNames = df.columns.values; + const result: Label[] = colNames.map((colName) => { + const s = df.col(colName); + const idx = findExtreme(s.values, skipna, isLess); + if (idx === null) { + return null; + } + return df.index.at(idx); + }); + return new Series({ + data: result, + index: colNames as unknown as Label[], + name: null, + dtype: Dtype.from("object"), + }); +} + +/** + * Return a Series containing the index label of the maximum value for each column. + * + * The result Series is indexed by column names. + * NaN / null values are excluded when `skipna` is true (the default). + * Columns where all values are NaN / null yield `null` in the result. + * + * Mirrors `pandas.DataFrame.idxmax()` (axis=0). + * + * @param df - Input DataFrame. + * @param options - Options (skipna). + * @returns A Series indexed by column names, containing the row label of each column's max. + * + * @example + * ```ts + * import { DataFrame, idxmaxDataFrame } from "tsb"; + * + * const df = DataFrame.fromColumns({ a: [3, 1, 4], b: [10, 20, 5] }, { index: ["x", "y", "z"] }); + * idxmaxDataFrame(df).values; // ["z", "y"] + * ``` + */ +export function idxmaxDataFrame(df: DataFrame, options: IdxDataFrameOptions = {}): Series { + const skipna = options.skipna ?? true; + const colNames = df.columns.values; + const result: Label[] = colNames.map((colName) => { + const s = df.col(colName); + const idx = findExtreme(s.values, skipna, isGreater); + if (idx === null) { + return null; + } + return df.index.at(idx); + }); + return new Series({ + data: result, + index: colNames as unknown as Label[], + name: null, + dtype: Dtype.from("object"), + }); +} diff --git a/tests/stats/idxmin_idxmax.test.ts b/tests/stats/idxmin_idxmax.test.ts new file mode 100644 index 00000000..05cfd459 --- /dev/null +++ b/tests/stats/idxmin_idxmax.test.ts @@ -0,0 +1,270 @@ +/** + * Tests for src/stats/idxmin_idxmax.ts + * — idxminSeries, idxmaxSeries, idxminDataFrame, idxmaxDataFrame + */ +import { describe, expect, it } from "bun:test"; +import fc from "fast-check"; +import { + DataFrame, + Series, + idxmaxDataFrame, + idxmaxSeries, + idxminDataFrame, + idxminSeries, +} from "../../src/index.ts"; +import type { Label, Scalar } from "../../src/index.ts"; + +// ─── helpers ───────────────────────────────────────────────────────────────── + +function s(data: readonly Scalar[], index?: readonly Label[]): Series { + return new Series({ data: [...data], ...(index !== undefined ? { index: [...index] } : {}) }); +} + +// ─── idxminSeries ───────────────────────────────────────────────────────────── + +describe("idxminSeries", () => { + it("returns label of the minimum value", () => { + const series = s([3, 1, 4, 1, 5], ["a", "b", "c", "d", "e"]); + expect(idxminSeries(series)).toBe("b"); // first occurrence of minimum 1 + }); + + it("returns integer index label for default index", () => { + const series = s([10, 3, 7]); + expect(idxminSeries(series)).toBe(1); + }); + + it("handles single element", () => { + const series = s([42], ["x"]); + expect(idxminSeries(series)).toBe("x"); + }); + + it("returns null for empty series", () => { + const series = s([]); + expect(idxminSeries(series)).toBeNull(); + }); + + it("skips NaN by default (skipna=true)", () => { + const series = s([Number.NaN, 2, 1, Number.NaN], ["a", "b", "c", "d"]); + expect(idxminSeries(series)).toBe("c"); + }); + + it("skips null values by default", () => { + const series = s([null, 5, 2, null], ["a", "b", "c", "d"]); + expect(idxminSeries(series)).toBe("c"); + }); + + it("returns null when all values are NaN with skipna=true", () => { + const series = s([Number.NaN, Number.NaN], ["a", "b"]); + expect(idxminSeries(series)).toBeNull(); + }); + + it("returns null when any value is NaN with skipna=false", () => { + const series = s([1, Number.NaN, 3], ["a", "b", "c"]); + expect(idxminSeries(series, { skipna: false })).toBeNull(); + }); + + it("returns correct label with skipna=false when no NaN", () => { + const series = s([5, 2, 8], ["a", "b", "c"]); + expect(idxminSeries(series, { skipna: false })).toBe("b"); + }); + + it("handles negative numbers", () => { + const series = s([-1, -5, -3], ["x", "y", "z"]); + expect(idxminSeries(series)).toBe("y"); + }); + + it("handles all equal values — returns first label", () => { + const series = s([7, 7, 7], ["p", "q", "r"]); + expect(idxminSeries(series)).toBe("p"); + }); + + it("works with string values (lexicographic min)", () => { + const series = s(["banana", "apple", "cherry"], ["a", "b", "c"]); + expect(idxminSeries(series)).toBe("b"); // "apple" < "banana" < "cherry" + }); + + it("handles NaN at the start with skipna=true", () => { + const series = s([Number.NaN, 3, 1], ["a", "b", "c"]); + expect(idxminSeries(series)).toBe("c"); + }); +}); + +// ─── idxmaxSeries ───────────────────────────────────────────────────────────── + +describe("idxmaxSeries", () => { + it("returns label of the maximum value", () => { + const series = s([3, 1, 4, 1, 5], ["a", "b", "c", "d", "e"]); + expect(idxmaxSeries(series)).toBe("e"); + }); + + it("returns integer index label for default index", () => { + const series = s([10, 3, 7]); + expect(idxmaxSeries(series)).toBe(0); + }); + + it("handles single element", () => { + const series = s([42], ["x"]); + expect(idxmaxSeries(series)).toBe("x"); + }); + + it("returns null for empty series", () => { + const series = s([]); + expect(idxmaxSeries(series)).toBeNull(); + }); + + it("skips NaN by default (skipna=true)", () => { + const series = s([Number.NaN, 2, 9, Number.NaN], ["a", "b", "c", "d"]); + expect(idxmaxSeries(series)).toBe("c"); + }); + + it("returns null when all values are NaN with skipna=true", () => { + const series = s([Number.NaN, Number.NaN], ["a", "b"]); + expect(idxmaxSeries(series)).toBeNull(); + }); + + it("returns null when any value is NaN with skipna=false", () => { + const series = s([1, Number.NaN, 3], ["a", "b", "c"]); + expect(idxmaxSeries(series, { skipna: false })).toBeNull(); + }); + + it("handles negative numbers", () => { + const series = s([-1, -5, -3], ["x", "y", "z"]); + expect(idxmaxSeries(series)).toBe("x"); + }); + + it("all equal — returns first label", () => { + const series = s([3, 3, 3], ["p", "q", "r"]); + expect(idxmaxSeries(series)).toBe("p"); + }); + + it("works with string values (lexicographic max)", () => { + const series = s(["banana", "apple", "cherry"], ["a", "b", "c"]); + expect(idxmaxSeries(series)).toBe("c"); // "cherry" > "banana" > "apple" + }); +}); + +// ─── idxminDataFrame ────────────────────────────────────────────────────────── + +describe("idxminDataFrame", () => { + it("returns row label of minimum for each column", () => { + const df = DataFrame.fromColumns({ a: [3, 1, 4], b: [10, 20, 5] }, { index: ["x", "y", "z"] }); + const result = idxminDataFrame(df); + expect(result.at("a")).toBe("y"); // min of a is 1 at row "y" + expect(result.at("b")).toBe("z"); // min of b is 5 at row "z" + }); + + it("result is indexed by column names", () => { + const df = DataFrame.fromColumns({ a: [1, 2], b: [3, 4] }); + const result = idxminDataFrame(df); + expect([...result.index.values]).toEqual(["a", "b"]); + }); + + it("skips NaN by default", () => { + const df = DataFrame.fromColumns( + { a: [Number.NaN, 2, 1], b: [5, Number.NaN, 3] }, + { index: ["x", "y", "z"] }, + ); + const result = idxminDataFrame(df); + expect(result.at("a")).toBe("z"); + expect(result.at("b")).toBe("z"); + }); + + it("returns null for column with all NaN (skipna=true)", () => { + const df = DataFrame.fromColumns( + { a: [1, 2], b: [Number.NaN, Number.NaN] }, + { index: ["x", "y"] }, + ); + const result = idxminDataFrame(df); + expect(result.at("a")).toBe("x"); + expect(result.at("b")).toBeNull(); + }); + + it("handles single row DataFrame", () => { + const df = DataFrame.fromColumns({ a: [42], b: [7] }, { index: ["row0"] }); + const result = idxminDataFrame(df); + expect(result.at("a")).toBe("row0"); + expect(result.at("b")).toBe("row0"); + }); +}); + +// ─── idxmaxDataFrame ────────────────────────────────────────────────────────── + +describe("idxmaxDataFrame", () => { + it("returns row label of maximum for each column", () => { + const df = DataFrame.fromColumns({ a: [3, 1, 4], b: [10, 20, 5] }, { index: ["x", "y", "z"] }); + const result = idxmaxDataFrame(df); + expect(result.at("a")).toBe("z"); // max of a is 4 at row "z" + expect(result.at("b")).toBe("y"); // max of b is 20 at row "y" + }); + + it("result is indexed by column names", () => { + const df = DataFrame.fromColumns({ a: [1, 2], b: [3, 4] }); + const result = idxmaxDataFrame(df); + expect([...result.index.values]).toEqual(["a", "b"]); + }); + + it("skips NaN by default", () => { + const df = DataFrame.fromColumns( + { a: [Number.NaN, 2, 1], b: [5, Number.NaN, 3] }, + { index: ["x", "y", "z"] }, + ); + const result = idxmaxDataFrame(df); + expect(result.at("a")).toBe("y"); + expect(result.at("b")).toBe("x"); + }); + + it("handles single row DataFrame", () => { + const df = DataFrame.fromColumns({ a: [42], b: [7] }, { index: ["row0"] }); + const result = idxmaxDataFrame(df); + expect(result.at("a")).toBe("row0"); + expect(result.at("b")).toBe("row0"); + }); +}); + +// ─── property-based tests ───────────────────────────────────────────────────── + +describe("idxminSeries property tests", () => { + it("idxmin label points to minimum value in series", () => { + fc.assert( + fc.property(fc.array(fc.double({ noNaN: true }), { minLength: 1, maxLength: 20 }), (data) => { + const series = s(data); + const label = idxminSeries(series); + if (label === null) { + return true; + } + const minVal = Math.min(...data); + return series.at(label as number) === minVal; + }), + ); + }); + + it("idxmax label points to maximum value in series", () => { + fc.assert( + fc.property(fc.array(fc.double({ noNaN: true }), { minLength: 1, maxLength: 20 }), (data) => { + const series = s(data); + const label = idxmaxSeries(series); + if (label === null) { + return true; + } + const maxVal = Math.max(...data); + return series.at(label as number) === maxVal; + }), + ); + }); + + it("idxmin and idxmax are consistent — min <= max", () => { + fc.assert( + fc.property(fc.array(fc.double({ noNaN: true }), { minLength: 2, maxLength: 20 }), (data) => { + const series = s(data); + const minLabel = idxminSeries(series); + const maxLabel = idxmaxSeries(series); + if (minLabel === null || maxLabel === null) { + return true; + } + const minVal = series.at(minLabel as number) as number; + const maxVal = series.at(maxLabel as number) as number; + return minVal <= maxVal; + }), + ); + }); +}); From eb62a33e0c975295fdad3b64d96a1192b072b7dc Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Sat, 11 Apr 2026 20:32:35 +0000 Subject: [PATCH 04/14] =?UTF-8?q?Iteration=20194:=20Add=20astype=20?= =?UTF-8?q?=E2=80=94=20dtype=20coercion=20for=20Series=20and=20DataFrame?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Run: https://github.com/githubnext/tsessebe/actions/runs/24282208612 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- playground/astype.html | 438 ++++++++++++++++++++++++++++++++++++++ src/core/astype.ts | 245 +++++++++++++++++++++ src/core/index.ts | 4 + tests/core/astype.test.ts | 292 +++++++++++++++++++++++++ 4 files changed, 979 insertions(+) create mode 100644 playground/astype.html create mode 100644 src/core/astype.ts create mode 100644 tests/core/astype.test.ts diff --git a/playground/astype.html b/playground/astype.html new file mode 100644 index 00000000..efd9e5ed --- /dev/null +++ b/playground/astype.html @@ -0,0 +1,438 @@ + + + + + + tsb — astype + + + +
+
+

Loading tsb runtime…

+
+ + ← tsb playground +

astype — dtype coercion

+

+ Cast Series and DataFrame values to a different dtype. + Mirrors pandas.Series.astype and pandas.DataFrame.astype. +

+ + +
+

1 · Series — float to int64

+

+ Cast floating-point values to integers via truncation (same as + pandas.Series.astype("int64")). +

+
+
+ TypeScript +
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

2 · Series — numbers to string

+

Convert every value to its string representation. Null/undefined values + become null (not the string "null").

+
+
+ TypeScript +
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

3 · Overflow clamping for bounded integer dtypes

+

+ Values that overflow the target integer dtype's range are clamped to + [min, max] — e.g. uint8 is clamped to + [0, 255]. +

+
+
+ TypeScript +
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

4 · DataFrame — cast all columns

+

Pass a single dtype name to cast every column to the same type.

+
+
+ TypeScript +
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

5 · DataFrame — per-column dtype mapping

+

Pass a Record<string, DtypeName> to cast individual + columns. Columns not listed are carried over unchanged.

+
+
+ TypeScript +
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

6 · Casting to bool

+

Zero, empty string, and NaN become false; + everything else (including non-zero numbers and non-empty strings) + becomes true.

+
+
+ TypeScript +
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

API Reference

+
// Series cast
+astypeSeries(
+  series: Series,
+  dtype: DtypeName | Dtype,
+  options?: AstypeOptions,
+): Series
+
+// DataFrame cast (all columns or per-column mapping)
+astype(
+  df: DataFrame,
+  dtype: DtypeName | Dtype | Record<string, DtypeName | Dtype>,
+  options?: DataFrameAstypeOptions,
+): DataFrame
+
+// Low-level scalar cast
+castScalar(value: Scalar, dtype: Dtype): Scalar
+
+// Options
+interface AstypeOptions {
+  errors?: "raise" | "ignore";  // default "raise"
+}
+
+// Supported dtype names
+type DtypeName =
+  | "int8" | "int16" | "int32" | "int64"
+  | "uint8" | "uint16" | "uint32" | "uint64"
+  | "float32" | "float64"
+  | "bool" | "string" | "object"
+  | "datetime" | "timedelta" | "category"
+
+ + + + + diff --git a/src/core/astype.ts b/src/core/astype.ts new file mode 100644 index 00000000..6a9403be --- /dev/null +++ b/src/core/astype.ts @@ -0,0 +1,245 @@ +/** + * astype — dtype coercion for Series and DataFrame. + * + * Mirrors `pandas.Series.astype` and `pandas.DataFrame.astype`: + * cast values to a target dtype, with null/NaN passthrough semantics + * matching pandas' default `errors="raise"` behaviour. + * + * @module + */ + +import { DataFrame } from "./frame.ts"; +import { Series } from "./series.ts"; +import { Dtype } from "./dtype.ts"; +import type { DtypeName, Scalar } from "../types.ts"; + +// ─── helpers ────────────────────────────────────────────────────────────────── + +function isNull(v: Scalar): v is null | undefined { + return v === null || v === undefined; +} + +/** Integer clamp ranges for each integer dtype name. */ +const INT_RANGES: Readonly< + Record +> = { + int8: { lo: -128, hi: 127, unsigned: false }, + int16: { lo: -32768, hi: 32767, unsigned: false }, + int32: { lo: -2147483648, hi: 2147483647, unsigned: false }, + int64: { lo: Number.MIN_SAFE_INTEGER, hi: Number.MAX_SAFE_INTEGER, unsigned: false }, + uint8: { lo: 0, hi: 255, unsigned: true }, + uint16: { lo: 0, hi: 65535, unsigned: true }, + uint32: { lo: 0, hi: 4294967295, unsigned: true }, + uint64: { lo: 0, hi: Number.MAX_SAFE_INTEGER, unsigned: true }, +}; + +/** + * Cast a single scalar value to the target dtype. + * + * Rules per dtype kind: + * - **int/uint**: `Math.trunc(Number(v))`, clamped to the dtype range. `null/undefined → null`. + * - **float32/float64**: `Number(v)`. `null/undefined → null`. Strings that + * are not parsable become `NaN` (same as pandas `errors="coerce"`-like + * number coercion). + * - **bool**: falsy values → `false`; truthy → `true`. `null/undefined → null`. + * - **string**: `String(v)`. `null/undefined → null`. + * - **datetime**: `new Date(Number(v))` for numbers; `new Date(String(v))` for + * strings; `null/undefined → null`. + * - **object/category/timedelta**: value is returned as-is (no transformation). + */ +export function castScalar(v: Scalar, dtype: Dtype): Scalar { + if (isNull(v)) { + return null; + } + + const k = dtype.kind; + + if (k === "int" || k === "uint") { + if (typeof v === "boolean") { + return v ? 1 : 0; + } + if (v instanceof Date) { + return Math.trunc(v.getTime()); + } + const n = Number(v); + if (Number.isNaN(n)) { + return null; + } + const range = INT_RANGES[dtype.name]; + if (range === undefined) { + return Math.trunc(n); + } + const t = Math.trunc(n); + return Math.max(range.lo, Math.min(range.hi, t)); + } + + if (k === "float") { + if (typeof v === "boolean") { + return v ? 1.0 : 0.0; + } + if (v instanceof Date) { + return v.getTime(); + } + return Number(v); + } + + if (k === "bool") { + if (typeof v === "number") { + return !Number.isNaN(v) && v !== 0; + } + if (v instanceof Date) { + return true; + } + return Boolean(v); + } + + if (k === "string") { + if (v instanceof Date) { + return v.toISOString(); + } + return String(v); + } + + if (k === "datetime") { + if (v instanceof Date) { + return v; + } + if (typeof v === "number") { + return new Date(v); + } + const d = new Date(String(v)); + return Number.isNaN(d.getTime()) ? null : d; + } + + // object / category / timedelta — return unchanged + return v; +} + +// ─── AstypeOptions ──────────────────────────────────────────────────────────── + +/** Options accepted by {@link astypeSeries} and {@link astype}. */ +export interface AstypeOptions { + /** + * When `true`, values that cannot be cast are silently replaced with + * `null` instead of throwing. + * + * @default false + */ + readonly errors?: "raise" | "ignore"; +} + +// ─── astypeSeries ───────────────────────────────────────────────────────────── + +/** + * Cast a Series to a different dtype. + * + * Returns a new Series whose values have been coerced to `dtype`. The index + * and name are preserved unchanged. + * + * @example + * ```ts + * const s = new Series({ data: [1.9, 2.1, 3.7], name: "x" }); + * const si = astypeSeries(s, "int64"); + * si.values; // [1, 2, 3] + * si.dtype.name; // "int64" + * ``` + */ +export function astypeSeries( + s: Series, + dtype: DtypeName | Dtype, + options: AstypeOptions = {}, +): Series { + const targetDtype = dtype instanceof Dtype ? dtype : Dtype.from(dtype as DtypeName); + const { errors = "raise" } = options; + + const casted: Scalar[] = []; + for (const v of s.values) { + let out: Scalar; + try { + out = castScalar(v, targetDtype); + } catch (e) { + if (errors === "ignore") { + out = v; + } else { + throw e; + } + } + casted.push(out); + } + + return new Series({ + data: casted, + index: s.index, + dtype: targetDtype, + name: s.name, + }); +} + +// ─── DataFrame astype ───────────────────────────────────────────────────────── + +/** + * Options for {@link astype} (DataFrame variant). + */ +export interface DataFrameAstypeOptions extends AstypeOptions { + /** + * When `true`, only the columns listed in `dtype` (when `dtype` is a + * `Record`) are recast; other columns are carried over unchanged. + * + * When `false` (default) and `dtype` is a `Record`, columns not listed + * in the map are carried over unchanged (same behaviour). + * + * This option exists for pandas API compatibility. + */ + readonly copy?: boolean; +} + +/** + * Cast one or more columns in a DataFrame to the specified dtype(s). + * + * - Pass a single `DtypeName` or `Dtype` to cast **all** columns. + * - Pass a `Record` to cast individual columns. + * Columns not listed are returned unchanged. + * + * Returns a new DataFrame; the original is not modified. + * + * @example + * ```ts + * const df = DataFrame.fromColumns({ a: [1.5, 2.7], b: ["3", "4"] }); + * + * // Cast all columns to float64 + * astype(df, "float64"); + * + * // Cast only column "b" to int64 + * astype(df, { b: "int64" }); + * ``` + */ +export function astype( + df: DataFrame, + dtype: + | DtypeName + | Dtype + | Readonly>, + options: DataFrameAstypeOptions = {}, +): DataFrame { + const colMap = new Map>(); + + const isSingleDtype = + typeof dtype === "string" || dtype instanceof Dtype; + + for (const name of df.columns.values) { + const col = df.col(name); + if (isSingleDtype) { + colMap.set(name, astypeSeries(col, dtype as DtypeName | Dtype, options)); + } else { + const mapping = dtype as Readonly>; + const target = mapping[name]; + if (target !== undefined) { + colMap.set(name, astypeSeries(col, target, options)); + } else { + colMap.set(name, col); + } + } + } + + return new DataFrame(colMap, df.index); +} diff --git a/src/core/index.ts b/src/core/index.ts index ada43b65..b8513810 100644 --- a/src/core/index.ts +++ b/src/core/index.ts @@ -15,3 +15,7 @@ export { CategoricalAccessor } from "./cat_accessor.ts"; export type { CatSeriesLike } from "./cat_accessor.ts"; export { MultiIndex } from "./multi_index.ts"; export type { MultiIndexOptions } from "./multi_index.ts"; +export { astypeSeries, astype, castScalar } from "./astype.ts"; +export type { AstypeOptions, DataFrameAstypeOptions } from "./astype.ts"; +export { sampleSeries, sampleDataFrame } from "./sample.ts"; +export type { SampleOptions } from "./sample.ts"; diff --git a/tests/core/astype.test.ts b/tests/core/astype.test.ts new file mode 100644 index 00000000..f6336137 --- /dev/null +++ b/tests/core/astype.test.ts @@ -0,0 +1,292 @@ +/** + * Tests for astype — dtype coercion for Series and DataFrame. + */ +import { describe, expect, it } from "bun:test"; +import fc from "fast-check"; +import { DataFrame, Dtype, Series, astype, astypeSeries, castScalar } from "../../src/index.ts"; + +describe("castScalar", () => { + describe("int64", () => { + it("casts float to int via truncation", () => { + expect(castScalar(3.9, Dtype.int64)).toBe(3); + expect(castScalar(-2.1, Dtype.int64)).toBe(-2); + }); + + it("casts boolean true/false", () => { + expect(castScalar(true, Dtype.int64)).toBe(1); + expect(castScalar(false, Dtype.int64)).toBe(0); + }); + + it("casts numeric string", () => { + expect(castScalar("42", Dtype.int64)).toBe(42); + }); + + it("returns null for null/undefined", () => { + expect(castScalar(null, Dtype.int64)).toBeNull(); + expect(castScalar(undefined, Dtype.int64)).toBeNull(); + }); + + it("returns null for non-numeric string", () => { + expect(castScalar("abc", Dtype.int64)).toBeNull(); + }); + }); + + describe("int8 clamping", () => { + it("clamps to [-128, 127]", () => { + expect(castScalar(200, Dtype.from("int8"))).toBe(127); + expect(castScalar(-200, Dtype.from("int8"))).toBe(-128); + expect(castScalar(100, Dtype.from("int8"))).toBe(100); + }); + }); + + describe("uint8 clamping", () => { + it("clamps to [0, 255]", () => { + expect(castScalar(-5, Dtype.from("uint8"))).toBe(0); + expect(castScalar(300, Dtype.from("uint8"))).toBe(255); + expect(castScalar(128, Dtype.from("uint8"))).toBe(128); + }); + }); + + describe("float64", () => { + it("casts integer to float", () => { + expect(castScalar(3, Dtype.float64)).toBe(3.0); + }); + + it("casts boolean to 0.0/1.0", () => { + expect(castScalar(true, Dtype.float64)).toBe(1.0); + expect(castScalar(false, Dtype.float64)).toBe(0.0); + }); + + it("returns null for null", () => { + expect(castScalar(null, Dtype.float64)).toBeNull(); + }); + + it("returns NaN for non-numeric string", () => { + expect(castScalar("hello", Dtype.float64)).toBeNaN(); + }); + + it("parses numeric string", () => { + expect(castScalar("3.14", Dtype.float64)).toBeCloseTo(3.14); + }); + }); + + describe("bool", () => { + it("truthy number → true", () => { + expect(castScalar(1, Dtype.bool)).toBe(true); + expect(castScalar(0, Dtype.bool)).toBe(false); + }); + + it("string 'hello' → true", () => { + expect(castScalar("hello", Dtype.bool)).toBe(true); + expect(castScalar("", Dtype.bool)).toBe(false); + }); + + it("null → null", () => { + expect(castScalar(null, Dtype.bool)).toBeNull(); + }); + + it("NaN → false", () => { + expect(castScalar(Number.NaN, Dtype.bool)).toBe(false); + }); + }); + + describe("string", () => { + it("converts number to string", () => { + expect(castScalar(42, Dtype.string)).toBe("42"); + }); + + it("converts boolean to string", () => { + expect(castScalar(true, Dtype.string)).toBe("true"); + }); + + it("null → null", () => { + expect(castScalar(null, Dtype.string)).toBeNull(); + }); + + it("converts Date to ISO string", () => { + const d = new Date("2024-01-15T00:00:00.000Z"); + expect(castScalar(d, Dtype.string)).toBe("2024-01-15T00:00:00.000Z"); + }); + }); + + describe("datetime", () => { + it("converts timestamp number to Date", () => { + const ts = 1705276800000; + const result = castScalar(ts, Dtype.datetime); + expect(result instanceof Date).toBe(true); + expect((result as Date).getTime()).toBe(ts); + }); + + it("converts ISO string to Date", () => { + const result = castScalar("2024-01-15T00:00:00.000Z", Dtype.datetime); + expect(result instanceof Date).toBe(true); + expect((result as Date).getFullYear()).toBe(2024); + }); + + it("returns null for invalid date string", () => { + expect(castScalar("not-a-date", Dtype.datetime)).toBeNull(); + }); + + it("passes Date through unchanged", () => { + const d = new Date(0); + expect(castScalar(d, Dtype.datetime)).toBe(d); + }); + + it("null → null", () => { + expect(castScalar(null, Dtype.datetime)).toBeNull(); + }); + }); + + describe("object passthrough", () => { + it("returns value unchanged for object dtype", () => { + const v = { x: 1 } as unknown as import("../../src/types.ts").Scalar; + expect(castScalar(v, Dtype.object)).toBe(v); + }); + }); +}); + +describe("astypeSeries", () => { + it("casts float series to int64", () => { + const s = new Series({ data: [1.9, 2.1, 3.7], name: "x" }); + const si = astypeSeries(s, "int64"); + expect(si.dtype.name).toBe("int64"); + expect([...si.values]).toEqual([1, 2, 3]); + expect(si.name).toBe("x"); + }); + + it("casts int series to float64", () => { + const s = new Series({ data: [1, 2, 3] }); + const sf = astypeSeries(s, "float64"); + expect(sf.dtype.name).toBe("float64"); + expect([...sf.values]).toEqual([1.0, 2.0, 3.0]); + }); + + it("casts int series to bool", () => { + const s = new Series({ data: [0, 1, 2] }); + const sb = astypeSeries(s, "bool"); + expect([...sb.values]).toEqual([false, true, true]); + expect(sb.dtype.name).toBe("bool"); + }); + + it("casts number series to string", () => { + const s = new Series({ data: [1, 2, 3] }); + const ss = astypeSeries(s, "string"); + expect([...ss.values]).toEqual(["1", "2", "3"]); + expect(ss.dtype.name).toBe("string"); + }); + + it("preserves index labels", () => { + const s = new Series({ data: [1.5, 2.5], index: ["a", "b"] }); + const si = astypeSeries(s, "int64"); + expect(si.index.at(0)).toBe("a"); + expect(si.index.at(1)).toBe("b"); + }); + + it("null values become null in int cast", () => { + const s = new Series({ data: [1, null, 3] }); + const si = astypeSeries(s, "int64"); + expect(si.values[1]).toBeNull(); + }); + + it("accepts a Dtype instance", () => { + const s = new Series({ data: [1.9, 2.1] }); + const si = astypeSeries(s, Dtype.int64); + expect(si.dtype).toBe(Dtype.int64); + expect([...si.values]).toEqual([1, 2]); + }); + + it("property: float→int→float recovers integer part", () => { + fc.assert( + fc.property( + fc.array(fc.float({ min: -1000, max: 1000, noNaN: true }), { minLength: 0, maxLength: 20 }), + (arr) => { + const s = new Series({ data: arr }); + const si = astypeSeries(s, "int64"); + const sf = astypeSeries(si, "float64"); + for (let i = 0; i < arr.length; i++) { + const expected = Math.trunc(arr[i] as number); + expect(sf.values[i]).toBe(expected); + } + }, + ), + ); + }); + + it("property: string→int64 for integers recovers value", () => { + fc.assert( + fc.property( + fc.array(fc.integer({ min: -1000, max: 1000 }), { minLength: 0, maxLength: 20 }), + (arr) => { + const s = new Series({ data: arr.map(String) }); + const si = astypeSeries(s, "int64"); + for (let i = 0; i < arr.length; i++) { + expect(si.values[i]).toBe(arr[i]); + } + }, + ), + ); + }); +}); + +describe("astype (DataFrame)", () => { + it("casts all columns with a single dtype name", () => { + const df = DataFrame.fromColumns({ a: [1.5, 2.5], b: [3.9, 4.1] }); + const di = astype(df, "int64"); + expect([...di.col("a").values]).toEqual([1, 2]); + expect([...di.col("b").values]).toEqual([3, 4]); + expect(di.col("a").dtype.name).toBe("int64"); + expect(di.col("b").dtype.name).toBe("int64"); + }); + + it("casts all columns with a Dtype instance", () => { + const df = DataFrame.fromColumns({ a: [1, 2], b: [3, 4] }); + const ds = astype(df, Dtype.string); + expect([...ds.col("a").values]).toEqual(["1", "2"]); + }); + + it("casts individual columns using a Record mapping", () => { + const df = DataFrame.fromColumns({ a: [1.5, 2.5], b: ["10", "20"] }); + const di = astype(df, { a: "int64", b: "float64" }); + expect([...di.col("a").values]).toEqual([1, 2]); + expect([...di.col("b").values]).toEqual([10, 20]); + }); + + it("leaves unmapped columns unchanged", () => { + const df = DataFrame.fromColumns({ a: [1.5, 2.5], b: [true, false] }); + const di = astype(df, { a: "int64" }); + expect([...di.col("a").values]).toEqual([1, 2]); + // column b is bool and unchanged + expect([...di.col("b").values]).toEqual([true, false]); + }); + + it("preserves row index", () => { + const df = DataFrame.fromColumns({ x: [10, 20, 30] }); + const di = astype(df, "float64"); + expect(di.index.size).toBe(3); + }); + + it("preserves column order", () => { + const df = DataFrame.fromColumns({ z: [1], a: [2], m: [3] }); + const di = astype(df, "float64"); + expect([...di.columns.values]).toEqual(["z", "a", "m"]); + }); + + it("does not mutate the original DataFrame", () => { + const df = DataFrame.fromColumns({ a: [1.5, 2.5] }); + astype(df, "int64"); + expect(df.col("a").dtype.name).toBe("float64"); + }); + + it("property: roundtrip int↔float preserves integer values", () => { + fc.assert( + fc.property( + fc.array(fc.integer({ min: -100, max: 100 }), { minLength: 1, maxLength: 10 }), + (arr) => { + const df = DataFrame.fromColumns({ v: arr }); + const df2 = astype(astype(df, "float64"), "int64"); + expect([...df2.col("v").values]).toEqual(arr); + }, + ), + ); + }); +}); From 62206ea4df7f1d05a7f466eeb4f9dd9c5103aad7 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Sat, 11 Apr 2026 20:32:36 +0000 Subject: [PATCH 05/14] =?UTF-8?q?Iteration=20195:=20Add=20replace=20?= =?UTF-8?q?=E2=80=94=20value=20substitution=20for=20Series=20and=20DataFra?= =?UTF-8?q?me?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Run: https://github.com/githubnext/tsessebe/actions/runs/24282791339 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- playground/replace.html | 408 ++++++++++++++++++++++++++++++++++++ src/stats/replace.ts | 237 +++++++++++++++++++++ tests/stats/replace.test.ts | 246 ++++++++++++++++++++++ 3 files changed, 891 insertions(+) create mode 100644 playground/replace.html create mode 100644 src/stats/replace.ts create mode 100644 tests/stats/replace.test.ts diff --git a/playground/replace.html b/playground/replace.html new file mode 100644 index 00000000..19da518a --- /dev/null +++ b/playground/replace.html @@ -0,0 +1,408 @@ + + + + + + tsb — replace (value substitution) + + + +
+
+
Loading tsb runtime…
+
+ + ← Back to playground index + +

replace — value substitution

+

+ replaceSeries / replaceDataFrame substitute values + matching a pattern with a new value.
+ Supports scalar, array, and mapping (Record / Map) replacement specs.
+ Mirrors Series.replace() and DataFrame.replace() from pandas. +

+ + +
+

1 · Scalar → scalar replacement

+

+ Replace every occurrence of a single value with another value. + Works on numbers, strings, booleans, and null. +

+
+
+
+ + +
+
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

2 · Array replacement

+

+ Replace a list of values with a single target, or perform pair-wise + replacement using two equal-length arrays. +

+
+
+
+ + +
+
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

3 · Mapping (Record / Map) replacement

+

+ Pass a lookup table as either a plain object (Record<string, Scalar>) + or a JavaScript Map for full type flexibility. +

+
+
+
+ + +
+
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

4 · DataFrame replacement

+

+ replaceDataFrame applies the same spec to all columns by + default. Use the columns option to restrict which columns + are affected. +

+
+
+
+ + +
+
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

API Reference

+
// Replace values in a Series
+replaceSeries(
+  series: Series,
+  spec: ReplaceSpec,
+  options?: ReplaceOptions,
+): Series
+
+// Replace values in a DataFrame
+replaceDataFrame(
+  df: DataFrame,
+  spec: ReplaceSpec,
+  options?: DataFrameReplaceOptions,
+): DataFrame
+
+// Replacement spec variants
+type ReplaceSpec =
+  | { toReplace: Scalar;              value: Scalar }               // scalar → scalar
+  | { toReplace: Scalar[];            value: Scalar }               // array  → scalar
+  | { toReplace: Scalar[];            value: Scalar[] }             // array  → array (pair-wise)
+  | { toReplace: Record<string, Scalar> }                          // Record mapping
+  | { toReplace: Map<Scalar, Scalar> }                             // Map mapping
+
+// Options
+interface ReplaceOptions {
+  matchNaN?: boolean;  // treat NaN===NaN for matching (default: true)
+}
+
+interface DataFrameReplaceOptions extends ReplaceOptions {
+  columns?: string[];  // only replace in these columns (default: all)
+}
+
+ + + + + diff --git a/src/stats/replace.ts b/src/stats/replace.ts new file mode 100644 index 00000000..54c2662e --- /dev/null +++ b/src/stats/replace.ts @@ -0,0 +1,237 @@ +/** + * replace — value substitution for Series and DataFrame. + * + * Mirrors the following pandas methods: + * - `Series.replace(to_replace, value)` / `Series.replace(mapping)` + * - `DataFrame.replace(to_replace, value)` / `DataFrame.replace(mapping)` + * + * Supported replacement specs: + * - **Scalar → Scalar**: replace every occurrence of one value with another. + * - **Array → Scalar**: replace every value in the array with a single value. + * - **Array → Array**: pair-wise replacement (must be same length). + * - **Record / Map**: lookup-table replacement (`{ old: new, ... }`). + * + * All functions are **pure** (return new objects; inputs are unchanged). + * + * @module + */ + +import { DataFrame } from "../core/index.ts"; +import { Series } from "../core/index.ts"; +import type { Scalar } from "../types.ts"; + +// ─── types ──────────────────────────────────────────────────────────────────── + +/** A lookup table mapping old values to new values. */ +export type ReplaceMapping = Readonly> | ReadonlyMap; + +/** + * Replacement specification accepted by {@link replaceSeries} / + * {@link replaceDataFrame}. + * + * Mirrors the first two positional args of `pandas.Series.replace`. + */ +export type ReplaceSpec = + | { readonly toReplace: Scalar; readonly value: Scalar } + | { readonly toReplace: readonly Scalar[]; readonly value: Scalar } + | { readonly toReplace: readonly Scalar[]; readonly value: readonly Scalar[] } + | { readonly toReplace: ReplaceMapping }; + +/** Options shared by {@link replaceSeries} and {@link replaceDataFrame}. */ +export interface ReplaceOptions { + /** + * When `true`, treat `NaN` values as equal for matching purposes. + * Default `true`. + */ + readonly matchNaN?: boolean; +} + +/** Options for {@link replaceDataFrame}. */ +export interface DataFrameReplaceOptions extends ReplaceOptions { + /** + * If provided, only replace values in these column names. + * By default all columns are processed. + */ + readonly columns?: readonly string[]; +} + +// ─── helpers ────────────────────────────────────────────────────────────────── + +/** True when `a` and `b` are equal (with optional NaN=NaN equality). */ +function scalarEq(a: Scalar, b: Scalar, matchNaN: boolean): boolean { + if ( + matchNaN && + typeof a === "number" && + typeof b === "number" && + Number.isNaN(a) && + Number.isNaN(b) + ) { + return true; + } + if (a instanceof Date && b instanceof Date) { + return a.getTime() === b.getTime(); + } + return a === b; +} + +/** + * Build a replacement function from a {@link ReplaceSpec}. + * Returns `(v) => new_value` or `v` unchanged if no match. + */ +function buildReplacer(spec: ReplaceSpec, matchNaN: boolean): (v: Scalar) => Scalar { + // Mapping variant + if ( + "toReplace" in spec && + !Array.isArray(spec.toReplace) && + typeof spec.toReplace === "object" && + spec.toReplace !== null && + !(spec.toReplace instanceof Map) && + !("value" in spec) + ) { + // Record + const rec = spec.toReplace as Readonly>; + return (v: Scalar): Scalar => { + const key = String(v); + return Object.prototype.hasOwnProperty.call(rec, key) ? (rec[key] as Scalar) : v; + }; + } + + if ("toReplace" in spec && spec.toReplace instanceof Map) { + const map = spec.toReplace as ReadonlyMap; + return (v: Scalar): Scalar => { + for (const [k, val] of map) { + if (scalarEq(v, k, matchNaN)) { + return val; + } + } + return v; + }; + } + + // Mapping passed via { toReplace: mapping } shape + if ("toReplace" in spec && !("value" in spec)) { + const mapping = spec.toReplace as ReplaceMapping; + if (mapping instanceof Map) { + const map = mapping as ReadonlyMap; + return (v: Scalar): Scalar => { + for (const [k, val] of map) { + if (scalarEq(v, k, matchNaN)) { + return val; + } + } + return v; + }; + } + const rec = mapping as Readonly>; + return (v: Scalar): Scalar => { + const key = String(v); + return Object.prototype.hasOwnProperty.call(rec, key) ? (rec[key] as Scalar) : v; + }; + } + + const s = spec as { toReplace: Scalar | readonly Scalar[]; value: Scalar | readonly Scalar[] }; + + if (!Array.isArray(s.toReplace)) { + // Scalar → Scalar + const old = s.toReplace as Scalar; + const newVal = s.value as Scalar; + return (v: Scalar): Scalar => (scalarEq(v, old, matchNaN) ? newVal : v); + } + + const oldArr = s.toReplace as readonly Scalar[]; + + if (!Array.isArray(s.value)) { + // Array → Scalar + const newVal = s.value as Scalar; + return (v: Scalar): Scalar => { + for (const old of oldArr) { + if (scalarEq(v, old, matchNaN)) { + return newVal; + } + } + return v; + }; + } + + // Array → Array (pair-wise) + const newArr = s.value as readonly Scalar[]; + if (oldArr.length !== newArr.length) { + throw new RangeError( + `replace: toReplace and value arrays must have the same length (got ${oldArr.length} and ${newArr.length})`, + ); + } + return (v: Scalar): Scalar => { + for (let i = 0; i < oldArr.length; i++) { + if (scalarEq(v, oldArr[i] as Scalar, matchNaN)) { + return newArr[i] as Scalar; + } + } + return v; + }; +} + +// ─── Series ─────────────────────────────────────────────────────────────────── + +/** + * Replace values in a Series according to `spec`. + * + * @example + * ```ts + * import { Series } from "tsb"; + * import { replaceSeries } from "tsb"; + * + * const s = new Series({ data: [1, 2, 3, 2, 1] }); + * const r = replaceSeries(s, { toReplace: 2, value: 99 }); + * // r.values → [1, 99, 3, 99, 1] + * ``` + */ +export function replaceSeries( + series: Series, + spec: ReplaceSpec, + options: ReplaceOptions = {}, +): Series { + const matchNaN = options.matchNaN ?? true; + const replacer = buildReplacer(spec, matchNaN); + const newData = Array.from({ length: series.size }, (_, i) => + replacer(series.values[i] as Scalar), + ); + return new Series({ data: newData, index: series.index, name: series.name }); +} + +// ─── DataFrame ──────────────────────────────────────────────────────────────── + +/** + * Replace values in a DataFrame according to `spec`. + * + * @example + * ```ts + * import { DataFrame } from "tsb"; + * import { replaceDataFrame } from "tsb"; + * + * const df = DataFrame.fromColumns({ a: [1, 2, 3], b: [2, 2, 4] }); + * const r = replaceDataFrame(df, { toReplace: 2, value: 0 }); + * // r.col("a").values → [1, 0, 3] + * // r.col("b").values → [0, 0, 4] + * ``` + */ +export function replaceDataFrame( + df: DataFrame, + spec: ReplaceSpec, + options: DataFrameReplaceOptions = {}, +): DataFrame { + const matchNaN = options.matchNaN ?? true; + const replacer = buildReplacer(spec, matchNaN); + const targetCols = new Set(options.columns ?? df.columns.values); + + const colMap = new Map>(); + for (const name of df.columns.values) { + const col = df.col(name) as Series; + if (targetCols.has(name)) { + const newData = Array.from({ length: col.size }, (_, i) => replacer(col.values[i] as Scalar)); + colMap.set(name, new Series({ data: newData, index: col.index, name: col.name })); + } else { + colMap.set(name, col); + } + } + return new DataFrame(colMap, df.index); +} diff --git a/tests/stats/replace.test.ts b/tests/stats/replace.test.ts new file mode 100644 index 00000000..452de062 --- /dev/null +++ b/tests/stats/replace.test.ts @@ -0,0 +1,246 @@ +/** + * Tests for stats/replace — value substitution for Series and DataFrame. + */ + +import { describe, expect, it } from "bun:test"; +import fc from "fast-check"; +import { DataFrame, Series } from "../../src/index.ts"; +import { replaceDataFrame, replaceSeries } from "../../src/stats/replace.ts"; + +// ─── replaceSeries — scalar → scalar ───────────────────────────────────────── + +describe("replaceSeries: scalar → scalar", () => { + it("replaces a matching value", () => { + const s = new Series({ data: [1, 2, 3, 2, 1] }); + const r = replaceSeries(s, { toReplace: 2, value: 99 }); + expect([...r.values]).toEqual([1, 99, 3, 99, 1]); + }); + + it("leaves non-matching values unchanged", () => { + const s = new Series({ data: [1, 2, 3] }); + const r = replaceSeries(s, { toReplace: 9, value: 0 }); + expect([...r.values]).toEqual([1, 2, 3]); + }); + + it("replaces string values", () => { + const s = new Series({ data: ["a", "b", "a", "c"] }); + const r = replaceSeries(s, { toReplace: "a", value: "z" }); + expect([...r.values]).toEqual(["z", "b", "z", "c"]); + }); + + it("replaces null values", () => { + const s = new Series({ data: [1, null, 3, null] }); + const r = replaceSeries(s, { toReplace: null, value: 0 }); + expect([...r.values]).toEqual([1, 0, 3, 0]); + }); + + it("replaces NaN values when matchNaN=true (default)", () => { + const s = new Series({ data: [1, Number.NaN, 3] }); + const r = replaceSeries(s, { toReplace: Number.NaN, value: 0 }); + expect([...r.values]).toEqual([1, 0, 3]); + }); + + it("does NOT replace NaN when matchNaN=false", () => { + const s = new Series({ data: [1, Number.NaN, 3] }); + const r = replaceSeries(s, { toReplace: Number.NaN, value: 0 }, { matchNaN: false }); + expect(Number.isNaN((r.values[1] as number))).toBe(true); + }); + + it("preserves index", () => { + const s = new Series({ data: [1, 2, 3], index: ["x", "y", "z"] }); + const r = replaceSeries(s, { toReplace: 2, value: 20 }); + expect([...r.index.values]).toEqual(["x", "y", "z"]); + }); + + it("preserves name", () => { + const s = new Series({ data: [1, 2], name: "myCol" }); + const r = replaceSeries(s, { toReplace: 1, value: 0 }); + expect(r.name).toBe("myCol"); + }); + + it("returns empty series when input is empty", () => { + const s = new Series({ data: [] }); + const r = replaceSeries(s, { toReplace: 1, value: 0 }); + expect(r.size).toBe(0); + }); +}); + +// ─── replaceSeries — array → scalar ─────────────────────────────────────────── + +describe("replaceSeries: array → scalar", () => { + it("replaces all listed values with single value", () => { + const s = new Series({ data: [1, 2, 3, 4, 5] }); + const r = replaceSeries(s, { toReplace: [1, 3, 5], value: 0 }); + expect([...r.values]).toEqual([0, 2, 0, 4, 0]); + }); + + it("handles empty toReplace array", () => { + const s = new Series({ data: [1, 2, 3] }); + const r = replaceSeries(s, { toReplace: [], value: 0 }); + expect([...r.values]).toEqual([1, 2, 3]); + }); +}); + +// ─── replaceSeries — array → array ──────────────────────────────────────────── + +describe("replaceSeries: array → array", () => { + it("performs pair-wise replacement", () => { + const s = new Series({ data: [1, 2, 3, 1, 2] }); + const r = replaceSeries(s, { toReplace: [1, 2], value: [10, 20] }); + expect([...r.values]).toEqual([10, 20, 3, 10, 20]); + }); + + it("throws when array lengths differ", () => { + const s = new Series({ data: [1, 2, 3] }); + expect(() => replaceSeries(s, { toReplace: [1, 2], value: [10] })).toThrow(RangeError); + }); +}); + +// ─── replaceSeries — mapping (Record) ───────────────────────────────────────── + +describe("replaceSeries: Record mapping", () => { + it("replaces using a Record map", () => { + const s = new Series({ data: [1, 2, 3, 4] }); + const r = replaceSeries(s, { toReplace: { "1": 10, "3": 30 } }); + expect([...r.values]).toEqual([10, 2, 30, 4]); + }); + + it("leaves values with no mapping entry unchanged", () => { + const s = new Series({ data: ["a", "b", "c"] }); + const r = replaceSeries(s, { toReplace: { "a": "A" } }); + expect([...r.values]).toEqual(["A", "b", "c"]); + }); +}); + +// ─── replaceSeries — mapping (Map) ──────────────────────────────────────────── + +describe("replaceSeries: Map mapping", () => { + it("replaces using a Map", () => { + const s = new Series({ data: [1, 2, 3, 2, 1] }); + const map = new Map([[1, 100], [2, 200]]); + const r = replaceSeries(s, { toReplace: map }); + expect([...r.values]).toEqual([100, 200, 3, 200, 100]); + }); + + it("handles NaN keys in Map with matchNaN=true", () => { + const s = new Series({ data: [1, Number.NaN, 3] }); + const map = new Map([[Number.NaN, 99]]); + const r = replaceSeries(s, { toReplace: map }); + expect([...r.values]).toEqual([1, 99, 3]); + }); +}); + +// ─── replaceDataFrame ───────────────────────────────────────────────────────── + +describe("replaceDataFrame: basic", () => { + it("replaces value in all columns", () => { + const df = DataFrame.fromColumns({ a: [1, 2, 3], b: [2, 2, 4] }); + const r = replaceDataFrame(df, { toReplace: 2, value: 0 }); + expect([...r.col("a").values]).toEqual([1, 0, 3]); + expect([...r.col("b").values]).toEqual([0, 0, 4]); + }); + + it("restricts replacement to specified columns", () => { + const df = DataFrame.fromColumns({ a: [1, 2, 3], b: [2, 2, 4] }); + const r = replaceDataFrame(df, { toReplace: 2, value: 0 }, { columns: ["a"] }); + expect([...r.col("a").values]).toEqual([1, 0, 3]); + expect([...r.col("b").values]).toEqual([2, 2, 4]); + }); + + it("preserves index", () => { + const df = DataFrame.fromColumns({ a: [1, 2, 3] }); + const r = replaceDataFrame(df, { toReplace: 1, value: 10 }); + expect([...r.index.values]).toEqual([...df.index.values]); + }); + + it("preserves columns order", () => { + const df = DataFrame.fromColumns({ a: [1], b: [2], c: [3] }); + const r = replaceDataFrame(df, { toReplace: 1, value: 99 }); + expect([...r.columns.values]).toEqual(["a", "b", "c"]); + }); + + it("uses array → scalar replacement across columns", () => { + const df = DataFrame.fromColumns({ a: [1, 2, 3], b: [3, 4, 5] }); + const r = replaceDataFrame(df, { toReplace: [1, 3], value: 0 }); + expect([...r.col("a").values]).toEqual([0, 2, 0]); + expect([...r.col("b").values]).toEqual([0, 4, 5]); + }); + + it("uses Record mapping across columns", () => { + const df = DataFrame.fromColumns({ a: [1, 2], b: [2, 3] }); + const r = replaceDataFrame(df, { toReplace: { "2": 20 } }); + expect([...r.col("a").values]).toEqual([1, 20]); + expect([...r.col("b").values]).toEqual([20, 3]); + }); +}); + +// ─── property-based tests ───────────────────────────────────────────────────── + +describe("replaceSeries: properties", () => { + it("scalar→scalar: replaced value never appears where original matched", () => { + fc.assert( + fc.property( + fc.array(fc.integer({ min: 0, max: 9 }), { minLength: 0, maxLength: 20 }), + fc.integer({ min: 0, max: 9 }), + fc.integer({ min: 10, max: 99 }), + (data, old, newVal) => { + const s = new Series({ data }); + const r = replaceSeries(s, { toReplace: old, value: newVal }); + for (let i = 0; i < s.size; i++) { + if (s.values[i] === old) { + if (r.values[i] !== newVal) return false; + } else { + if (r.values[i] !== s.values[i]) return false; + } + } + return true; + }, + ), + ); + }); + + it("size is preserved", () => { + fc.assert( + fc.property( + fc.array(fc.integer({ min: 0, max: 9 }), { minLength: 0, maxLength: 30 }), + (data) => { + const s = new Series({ data }); + const r = replaceSeries(s, { toReplace: 5, value: 0 }); + return r.size === s.size; + }, + ), + ); + }); + + it("no-op when toReplace not present", () => { + fc.assert( + fc.property( + fc.array(fc.integer({ min: 0, max: 5 }), { minLength: 1, maxLength: 20 }), + (data) => { + const s = new Series({ data }); + // 99 is never in the array since data is 0-5 + const r = replaceSeries(s, { toReplace: 99, value: -1 }); + return [...r.values].every((v, i) => v === data[i]); + }, + ), + ); + }); + + it("array→array: pair-wise replacement is consistent", () => { + fc.assert( + fc.property( + fc.array(fc.integer({ min: 0, max: 5 }), { minLength: 0, maxLength: 20 }), + (data) => { + const s = new Series({ data }); + const r = replaceSeries(s, { toReplace: [1, 2, 3], value: [10, 20, 30] }); + const mapping: Record = { 1: 10, 2: 20, 3: 30 }; + return [...r.values].every((v, i) => { + const orig = data[i] as number; + const expected = mapping[orig] ?? orig; + return v === expected; + }); + }, + ), + ); + }); +}); From a49c9885d8bc841086ff595a6dbea281b6b632ef Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Sat, 11 Apr 2026 20:32:37 +0000 Subject: [PATCH 06/14] =?UTF-8?q?Iteration=20196:=20Add=20where/mask=20?= =?UTF-8?q?=E2=80=94=20conditional=20value=20selection=20for=20Series=20an?= =?UTF-8?q?d=20DataFrame?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Run: https://github.com/githubnext/tsessebe/actions/runs/24283415842 --- playground/where_mask.html | 199 +++++++++++++++ src/stats/where_mask.ts | 430 +++++++++++++++++++++++++++++++++ tests/stats/where_mask.test.ts | 328 +++++++++++++++++++++++++ 3 files changed, 957 insertions(+) create mode 100644 playground/where_mask.html create mode 100644 src/stats/where_mask.ts create mode 100644 tests/stats/where_mask.test.ts diff --git a/playground/where_mask.html b/playground/where_mask.html new file mode 100644 index 00000000..8e3bba6a --- /dev/null +++ b/playground/where_mask.html @@ -0,0 +1,199 @@ + + + + + + tsb — where / mask + + + +

tsb — where / mask

+

+ Conditional value selection: keep or replace elements based on a boolean + condition. These are the TypeScript equivalents of + pandas.Series.where / pandas.DataFrame.where and + pandas.Series.mask / pandas.DataFrame.mask. +

+ +

Core concept

+
// where: keep where cond=true, replace with `other` where cond=false
+whereSeries(s, cond, { other: null })
+
+// mask: replace where cond=true with `other`, keep where cond=false
+maskSeries(s, cond, { other: null })
+ +
+ pandas equivalent:
+ s.where(cond, other=np.nan)
+ s.mask(cond, other=np.nan) +
+ + +

Demo 1 — whereSeries with boolean array

+
+
Code
+
const s = new Series({ data: [10, 20, 30, 40, 50], name: "prices" });
+whereSeries(s, [true, false, true, false, true]);
+// → [10, null, 30, null, 50]
+ + +
+ + +

Demo 2 — maskSeries with callable condition

+
+
Code
+
const s = new Series({ data: [1, 2, 3, 4, 5] });
+// Replace values > 3 with -1
+maskSeries(s, (v) => v > 3, { other: -1 });
+// → [1, 2, 3, -1, -1]
+ + +
+ + +

Demo 3 — whereDataFrame with 2-D condition

+
+
Code
+
const df = DataFrame.fromColumns({
+  a: [1, 2, 3],
+  b: [4, 5, 6],
+});
+const cond = [[true, false], [false, true], [true, true]];
+whereDataFrame(df, cond);
+// a: [1, null, 3]
+// b: [null, 5, 6]
+ + +
+ + +

Demo 4 — whereDataFrame with Series condition (axis=0)

+
+
Code
+
const df = DataFrame.fromColumns({
+  a: [1, 2, 3],
+  b: [10, 20, 30],
+});
+// Keep rows 0 and 2 only, replace row 1 across all columns
+const rowCond = new Series({ data: [true, false, true], index: [0, 1, 2] });
+whereDataFrame(df, rowCond, { axis: 0, other: 0 });
+// a: [1, 0, 3]
+// b: [10, 0, 30]
+ + +
+ + +

Demo 5 — maskDataFrame with DataFrame condition

+
+
Code
+
const df = DataFrame.fromColumns({ a: [1, 2, 3], b: [4, 5, 6] });
+const condDf = DataFrame.fromColumns({
+  a: [false, true, false],
+  b: [true, false, true],
+});
+maskDataFrame(df, condDf, { other: 99 });
+// a: [1, 99, 3]
+// b: [99, 5, 99]
+ + +
+ + +

Demo 6 — Interactive editor

+
+
Edit and run
+ + + +
+ + + + diff --git a/src/stats/where_mask.ts b/src/stats/where_mask.ts new file mode 100644 index 00000000..ecbefd72 --- /dev/null +++ b/src/stats/where_mask.ts @@ -0,0 +1,430 @@ +/** + * where_mask — conditional value selection for Series and DataFrame. + * + * Mirrors the following pandas methods: + * - `Series.where(cond, other=NaN)` — keep values where `cond` is True, replace with `other` where False + * - `Series.mask(cond, other=NaN)` — replace values where `cond` is True with `other`, keep where False + * - `DataFrame.where(cond, other=NaN, axis?)` — same but for DataFrames + * - `DataFrame.mask(cond, other=NaN, axis?)` — same but for DataFrames + * + * The condition can be: + * - A `boolean[]` array aligned by position + * - A `Series` (aligned by index label when axis=0, or by position) + * - A callable `(value: Scalar) => boolean` + * + * For DataFrames, `cond` may additionally be: + * - A `DataFrame` of booleans (same shape) + * - A `boolean[][]` 2-D array + * + * All functions are **pure** (return new objects; inputs are unchanged). + * Missing values in `cond` are treated as `false`. + * + * @module + */ + +import { DataFrame } from "../core/index.ts"; +import { Series } from "../core/index.ts"; +import type { Axis, Label, Scalar } from "../types.ts"; + +// ─── public types ───────────────────────────────────────────────────────────── + +/** Condition types accepted by {@link whereSeries} and {@link maskSeries}. */ +export type SeriesCond = + | readonly boolean[] + | Series + | Series + | ((value: Scalar, label: Label) => boolean); + +/** Condition types accepted by {@link whereDataFrame} and {@link maskDataFrame}. */ +export type DataFrameCond = readonly (readonly boolean[])[] | DataFrame | SeriesCond; + +/** Options for {@link whereSeries} and {@link maskSeries}. */ +export interface WhereOptions { + /** + * Value to use where the condition is `false` (for `where`) or `true` (for `mask`). + * Defaults to `null` (propagated as missing, matching pandas NaN behaviour). + */ + readonly other?: Scalar; +} + +/** Options for {@link whereDataFrame} and {@link maskDataFrame}. */ +export interface WhereDataFrameOptions extends WhereOptions { + /** + * Axis along which to align a Series condition (when `cond` is a `Series`). + * - `0` or `"index"` (default): align by **row** labels (broadcast across columns). + * - `1` or `"columns"`: align by **column** labels (broadcast across rows). + */ + readonly axis?: Axis; +} + +// ─── helpers ────────────────────────────────────────────────────────────────── + +/** True when a scalar should be treated as missing. */ +function isMissing(v: Scalar): boolean { + return v === null || v === undefined || (typeof v === "number" && Number.isNaN(v)); +} + +/** + * Resolve a boolean condition value from a position + label, given the + * various condition types for Series. + */ +function resolveSeriesCond(cond: SeriesCond, i: number, label: Label, value: Scalar): boolean { + if (typeof cond === "function") { + return cond(value, label); + } + if (Array.isArray(cond)) { + const v = (cond as readonly boolean[])[i]; + return v === true; + } + // Series or Series + const s = cond as Series; + // Try label-based lookup first, fall back to positional + const strLabel = String(label); + for (let j = 0; j < s.index.size; j++) { + if (String(s.index.at(j)) === strLabel) { + return s.iat(j) === true; + } + } + return false; +} + +// ─── Series: where ──────────────────────────────────────────────────────────── + +/** + * Return a new Series keeping values where `cond` is `true`, replacing with + * `other` (default `null`) where `cond` is `false`. + * + * Mirrors `pandas.Series.where(cond, other=np.nan)`. + * + * @example + * ```ts + * import { Series } from "tsb"; + * import { whereSeries } from "tsb"; + * + * const s = new Series({ data: [1, 2, 3, 4, 5] }); + * whereSeries(s, [true, false, true, false, true]); // [1, null, 3, null, 5] + * whereSeries(s, (v) => (v as number) > 2, { other: 0 }); // [0, 0, 3, 4, 5] + * ``` + */ +export function whereSeries( + series: Series, + cond: SeriesCond, + options?: WhereOptions, +): Series { + const other: Scalar = options?.other !== undefined ? options.other : null; + const newData: Scalar[] = []; + for (let i = 0; i < series.size; i++) { + const label = series.index.at(i); + const value = series.iat(i); + const keep = resolveSeriesCond(cond, i, label, value); + newData.push(keep ? value : other); + } + return new Series({ + data: newData, + index: series.index, + name: series.name, + }); +} + +// ─── Series: mask ───────────────────────────────────────────────────────────── + +/** + * Return a new Series replacing values where `cond` is `true` with `other` + * (default `null`), keeping values where `cond` is `false`. + * + * Mirrors `pandas.Series.mask(cond, other=np.nan)`. + * + * @example + * ```ts + * import { Series } from "tsb"; + * import { maskSeries } from "tsb"; + * + * const s = new Series({ data: [1, 2, 3, 4, 5] }); + * maskSeries(s, [true, false, true, false, true]); // [null, 2, null, 4, null] + * maskSeries(s, (v) => (v as number) > 3, { other: -1 }); // [1, 2, 3, -1, -1] + * ``` + */ +export function maskSeries( + series: Series, + cond: SeriesCond, + options?: WhereOptions, +): Series { + const other: Scalar = options?.other !== undefined ? options.other : null; + const newData: Scalar[] = []; + for (let i = 0; i < series.size; i++) { + const label = series.index.at(i); + const value = series.iat(i); + const replace = resolveSeriesCond(cond, i, label, value); + newData.push(replace ? other : value); + } + return new Series({ + data: newData, + index: series.index, + name: series.name, + }); +} + +// ─── DataFrame: helpers ─────────────────────────────────────────────────────── + +/** Set a cell in the keep matrix safely. */ +function setCell(matrix: boolean[][], r: number, c: number, v: boolean): void { + const row = matrix[r]; + if (row !== undefined) { + row[c] = v; + } +} + +/** Build keep-matrix from a DataFrame condition. */ +function buildFromDataFrameCond( + df: DataFrame, + cond: DataFrame, + matrix: boolean[][], + invert: boolean, +): void { + const colNames = df.columns.values; + for (let r = 0; r < df.index.size; r++) { + for (let c = 0; c < colNames.length; c++) { + const colName = colNames[c]; + if (colName === undefined) { + continue; + } + let val: Scalar = null; + try { + val = cond.col(colName).iat(r); + } catch { + val = null; + } + const condTrue = val === true; + setCell(matrix, r, c, invert ? !condTrue : condTrue); + } + } +} + +/** Build keep-matrix from a 2-D boolean array condition. */ +function buildFrom2DArray( + df: DataFrame, + cond2d: readonly (readonly boolean[])[], + matrix: boolean[][], + invert: boolean, +): void { + const colNames = df.columns.values; + for (let r = 0; r < df.index.size; r++) { + for (let c = 0; c < colNames.length; c++) { + const condTrue = cond2d[r]?.[c] === true; + setCell(matrix, r, c, invert ? !condTrue : condTrue); + } + } +} + +/** Build keep-matrix from a Series/array condition on axis=0 (broadcast over columns). */ +function buildFromSeriesAxis0( + df: DataFrame, + cond: SeriesCond, + matrix: boolean[][], + invert: boolean, +): void { + const nCols = df.columns.values.length; + for (let r = 0; r < df.index.size; r++) { + const label = df.index.at(r); + const condTrue = resolveSeriesCond(cond, r, label, null); + for (let c = 0; c < nCols; c++) { + setCell(matrix, r, c, invert ? !condTrue : condTrue); + } + } +} + +/** Look up the condition value for a column by name from a Series (for axis=1). */ +function seriesCondForColumn(s: Series, colName: string): boolean { + for (let j = 0; j < s.index.size; j++) { + if (String(s.index.at(j)) === colName) { + return s.iat(j) === true; + } + } + return false; +} + +/** Resolve axis=1 condition for a single column. */ +function resolveAxis1Cond(cond: SeriesCond, c: number, colName: string): boolean { + if (cond instanceof Series) { + return seriesCondForColumn(cond as Series, colName); + } + if (Array.isArray(cond)) { + return (cond as readonly boolean[])[c] === true; + } + return false; +} + +/** Build keep-matrix from a Series/array condition on axis=1 (broadcast over rows). */ +function buildFromSeriesAxis1( + df: DataFrame, + cond: SeriesCond, + matrix: boolean[][], + invert: boolean, +): void { + const colNames = df.columns.values; + for (let c = 0; c < colNames.length; c++) { + const colName = colNames[c]; + if (colName === undefined) { + continue; + } + const condTrue = resolveAxis1Cond(cond, c, colName); + for (let r = 0; r < df.index.size; r++) { + setCell(matrix, r, c, invert ? !condTrue : condTrue); + } + } +} + +/** Build keep-matrix from a callable condition (element-wise). */ +function buildFromCallable( + df: DataFrame, + cond: (v: Scalar, l: Label) => boolean, + matrix: boolean[][], + invert: boolean, +): void { + const colNames = df.columns.values; + for (let r = 0; r < df.index.size; r++) { + for (let c = 0; c < colNames.length; c++) { + const colName = colNames[c]; + if (colName === undefined) { + continue; + } + const value = df.col(colName).iat(r); + const label = df.index.at(r); + const condTrue = cond(value, label); + setCell(matrix, r, c, invert ? !condTrue : condTrue); + } + } +} + +/** + * Build a 2-D boolean matrix (nRows × nCols) from the condition, where + * matrix[row][col] = true means "keep original value" (for `where`) or + * "replace with other" (for `mask`, where invert=true flips the meaning). + */ +function buildKeepMatrix( + df: DataFrame, + cond: DataFrameCond, + axis: Axis, + invert: boolean, +): boolean[][] { + const nRows = df.index.size; + const nCols = df.columns.values.length; + + const matrix: boolean[][] = Array.from({ length: nRows }, () => + Array.from({ length: nCols }, () => false), + ); + + if (cond instanceof DataFrame) { + buildFromDataFrameCond(df, cond, matrix, invert); + } else if (Array.isArray(cond) && cond.length > 0 && Array.isArray(cond[0])) { + buildFrom2DArray(df, cond as readonly (readonly boolean[])[], matrix, invert); + } else if (typeof cond === "function") { + buildFromCallable(df, cond as (v: Scalar, l: Label) => boolean, matrix, invert); + } else { + const isRowAxis = axis === 0 || axis === "index"; + if (isRowAxis) { + buildFromSeriesAxis0(df, cond as SeriesCond, matrix, invert); + } else { + buildFromSeriesAxis1(df, cond as SeriesCond, matrix, invert); + } + } + + return matrix; +} + +// ─── DataFrame: where ───────────────────────────────────────────────────────── + +/** + * Return a new DataFrame keeping values where `cond` is `true`, replacing + * with `other` (default `null`) where `cond` is `false`. + * + * Mirrors `pandas.DataFrame.where(cond, other=np.nan, axis=None)`. + * + * @example + * ```ts + * import { DataFrame } from "tsb"; + * import { whereDataFrame } from "tsb"; + * + * const df = DataFrame.fromColumns({ a: [1, 2, 3], b: [4, 5, 6] }); + * const cond = [[true, false], [false, true], [true, true]]; + * whereDataFrame(df, cond); // a=[1,null,3], b=[null,5,6] + * ``` + */ +export function whereDataFrame( + df: DataFrame, + cond: DataFrameCond, + options?: WhereDataFrameOptions, +): DataFrame { + const other: Scalar = options?.other !== undefined ? options.other : null; + const axis: Axis = options?.axis ?? 0; + + const keepMatrix = buildKeepMatrix(df, cond, axis, false); + const colNames = df.columns.values; + + const colMap = new Map>(); + for (let c = 0; c < colNames.length; c++) { + const colName = colNames[c]; + if (colName === undefined) { + continue; + } + const col = df.col(colName); + const newData: Scalar[] = []; + for (let r = 0; r < df.index.size; r++) { + const keep = keepMatrix[r]?.[c] === true; + newData.push(keep ? col.iat(r) : other); + } + colMap.set(colName, new Series({ data: newData, index: df.index, name: colName })); + } + return new DataFrame(colMap, df.index); +} + +// ─── DataFrame: mask ────────────────────────────────────────────────────────── + +/** + * Return a new DataFrame replacing values where `cond` is `true` with + * `other` (default `null`), keeping values where `cond` is `false`. + * + * Mirrors `pandas.DataFrame.mask(cond, other=np.nan, axis=None)`. + * + * @example + * ```ts + * import { DataFrame } from "tsb"; + * import { maskDataFrame } from "tsb"; + * + * const df = DataFrame.fromColumns({ a: [1, 2, 3], b: [4, 5, 6] }); + * const cond = [[true, false], [false, true], [true, true]]; + * maskDataFrame(df, cond); // a=[null,2,null], b=[4,null,null] + * ``` + */ +export function maskDataFrame( + df: DataFrame, + cond: DataFrameCond, + options?: WhereDataFrameOptions, +): DataFrame { + const other: Scalar = options?.other !== undefined ? options.other : null; + const axis: Axis = options?.axis ?? 0; + + // invert=true means: keepMatrix[r][c] = true → replace with other (mask) + const keepMatrix = buildKeepMatrix(df, cond, axis, true); + const colNames = df.columns.values; + + const colMap = new Map>(); + for (let c = 0; c < colNames.length; c++) { + const colName = colNames[c]; + if (colName === undefined) { + continue; + } + const col = df.col(colName); + const newData: Scalar[] = []; + for (let r = 0; r < df.index.size; r++) { + const keep = keepMatrix[r]?.[c] === true; + newData.push(keep ? col.iat(r) : other); + } + colMap.set(colName, new Series({ data: newData, index: df.index, name: colName })); + } + return new DataFrame(colMap, df.index); +} + +// ─── re-export isMissing for test convenience ───────────────────────────────── + +export { isMissing as _isMissingWhere }; diff --git a/tests/stats/where_mask.test.ts b/tests/stats/where_mask.test.ts new file mode 100644 index 00000000..e97afc2b --- /dev/null +++ b/tests/stats/where_mask.test.ts @@ -0,0 +1,328 @@ +/** + * Tests for stats/where_mask.ts + * + * Covers: + * - whereSeries / maskSeries with: boolean[], Series, callable + * - whereDataFrame / maskDataFrame with: 2-D array, DataFrame, 1-D Series (axis 0 & 1), callable + * - edge cases: empty, all-true, all-false, null/NaN in cond, custom `other` value + * - property-based tests with fast-check + */ + +import { describe, expect, it } from "bun:test"; +import fc from "fast-check"; +import { DataFrame, Series } from "../../src/index.ts"; +import { maskDataFrame, maskSeries, whereDataFrame, whereSeries } from "../../src/index.ts"; + +// ─── helpers ────────────────────────────────────────────────────────────────── + +function s(data: (number | null)[], name = "x"): Series { + return new Series({ data, name }) as Series; +} + +// ─── whereSeries ────────────────────────────────────────────────────────────── + +describe("whereSeries", () => { + it("keeps values where cond=true, replaces with null by default", () => { + const result = whereSeries(s([1, 2, 3, 4, 5]), [true, false, true, false, true]); + expect(result.values).toEqual([1, null, 3, null, 5]); + }); + + it("uses custom other value", () => { + const result = whereSeries(s([1, 2, 3]), [false, true, false], { other: 0 }); + expect(result.values).toEqual([0, 2, 0]); + }); + + it("works with all-true cond (identity)", () => { + const result = whereSeries(s([1, 2, 3]), [true, true, true]); + expect(result.values).toEqual([1, 2, 3]); + }); + + it("works with all-false cond (replace all)", () => { + const result = whereSeries(s([1, 2, 3]), [false, false, false], { other: -1 }); + expect(result.values).toEqual([-1, -1, -1]); + }); + + it("accepts callable condition", () => { + const result = whereSeries(s([1, 2, 3, 4, 5]), (v) => (v as number) > 2); + expect(result.values).toEqual([null, null, 3, 4, 5]); + }); + + it("accepts callable with label argument", () => { + const data = new Series({ data: [10, 20, 30], index: ["a", "b", "c"], name: "t" }); + const result = whereSeries(data, (_v, label) => label === "a" || label === "c"); + expect(result.values).toEqual([10, null, 30]); + }); + + it("accepts Series condition (label alignment)", () => { + const data = new Series({ data: [1, 2, 3], index: ["a", "b", "c"], name: "d" }); + const cond = new Series({ data: [true, false, true], index: ["a", "b", "c"], name: "c" }); + const result = whereSeries(data, cond as Series); + expect(result.values).toEqual([1, null, 3]); + }); + + it("handles empty Series", () => { + const result = whereSeries(s([]), []); + expect(result.values).toEqual([]); + }); + + it("preserves index and name", () => { + const data = new Series({ data: [1, 2], index: ["x", "y"], name: "myname" }); + const result = whereSeries(data, [true, false]); + expect(result.name).toBe("myname"); + expect(result.index.values).toEqual(["x", "y"]); + }); + + it("handles null values in input", () => { + const result = whereSeries(s([1, null, 3]), [true, true, false], { other: -1 }); + expect(result.values).toEqual([1, null, -1]); + }); +}); + +// ─── maskSeries ─────────────────────────────────────────────────────────────── + +describe("maskSeries", () => { + it("replaces values where cond=true with null by default", () => { + const result = maskSeries(s([1, 2, 3, 4, 5]), [true, false, true, false, true]); + expect(result.values).toEqual([null, 2, null, 4, null]); + }); + + it("uses custom other value", () => { + const result = maskSeries(s([1, 2, 3]), [true, false, true], { other: 0 }); + expect(result.values).toEqual([0, 2, 0]); + }); + + it("accepts callable condition", () => { + const result = maskSeries(s([1, 2, 3, 4, 5]), (v) => (v as number) > 3, { other: -1 }); + expect(result.values).toEqual([1, 2, 3, -1, -1]); + }); + + it("all-true cond replaces all values", () => { + const result = maskSeries(s([1, 2, 3]), [true, true, true], { other: 0 }); + expect(result.values).toEqual([0, 0, 0]); + }); + + it("all-false cond is identity", () => { + const result = maskSeries(s([1, 2, 3]), [false, false, false]); + expect(result.values).toEqual([1, 2, 3]); + }); + + it("accepts Series condition", () => { + const data = new Series({ data: [10, 20, 30], index: ["a", "b", "c"] }); + const cond = new Series({ data: [false, true, false], index: ["a", "b", "c"] }); + const result = maskSeries(data, cond as Series); + expect(result.values).toEqual([10, null, 30]); + }); + + it("mask is complement of where with same cond", () => { + const data = s([1, 2, 3, 4]); + const cond = [true, false, true, false]; + const w = whereSeries(data, cond, { other: 99 }); + const m = maskSeries(data, cond, { other: 99 }); + // where keeps trues, mask keeps falses — opposite patterns + for (let i = 0; i < 4; i++) { + if (cond[i]) { + expect(w.iat(i)).toBe(data.iat(i)); + expect(m.iat(i)).toBe(99); + } else { + expect(w.iat(i)).toBe(99); + expect(m.iat(i)).toBe(data.iat(i)); + } + } + }); +}); + +// ─── whereDataFrame ─────────────────────────────────────────────────────────── + +describe("whereDataFrame", () => { + it("works with 2-D boolean array", () => { + const df = DataFrame.fromColumns({ a: [1, 2, 3], b: [4, 5, 6] }); + const cond = [ + [true, false], + [false, true], + [true, true], + ]; + const result = whereDataFrame(df, cond); + expect(result.col("a").values).toEqual([1, null, 3]); + expect(result.col("b").values).toEqual([null, 5, 6]); + }); + + it("uses custom other value", () => { + const df = DataFrame.fromColumns({ a: [1, 2], b: [3, 4] }); + const cond = [ + [false, false], + [true, true], + ]; + const result = whereDataFrame(df, cond, { other: -1 }); + expect(result.col("a").values).toEqual([-1, 2]); + expect(result.col("b").values).toEqual([-1, 4]); + }); + + it("works with DataFrame condition", () => { + const df = DataFrame.fromColumns({ a: [1, 2, 3], b: [4, 5, 6] }); + const condDf = DataFrame.fromColumns({ a: [true, false, true], b: [false, true, true] }); + const result = whereDataFrame(df, condDf); + expect(result.col("a").values).toEqual([1, null, 3]); + expect(result.col("b").values).toEqual([null, 5, 6]); + }); + + it("works with 1-D Series condition on axis=0 (broadcast across columns)", () => { + const df = DataFrame.fromColumns({ a: [1, 2, 3], b: [4, 5, 6] }); + const cond = new Series({ data: [true, false, true], index: [0, 1, 2] }); + const result = whereDataFrame(df, cond as Series, { axis: 0 }); + expect(result.col("a").values).toEqual([1, null, 3]); + expect(result.col("b").values).toEqual([4, null, 6]); + }); + + it("works with 1-D boolean array on axis=0", () => { + const df = DataFrame.fromColumns({ a: [10, 20], b: [30, 40] }); + const result = whereDataFrame(df, [false, true], { axis: 0, other: 0 }); + expect(result.col("a").values).toEqual([0, 20]); + expect(result.col("b").values).toEqual([0, 40]); + }); + + it("works with 1-D Series condition on axis=1 (broadcast across rows)", () => { + const df = DataFrame.fromColumns({ a: [1, 2, 3], b: [4, 5, 6] }); + const cond = new Series({ data: [true, false], index: ["a", "b"] }); + const result = whereDataFrame(df, cond as Series, { axis: 1 }); + // column "a" cond=true → keep; column "b" cond=false → replace + expect(result.col("a").values).toEqual([1, 2, 3]); + expect(result.col("b").values).toEqual([null, null, null]); + }); + + it("works with callable condition (element-wise)", () => { + const df = DataFrame.fromColumns({ a: [1, 2, 3], b: [4, 5, 6] }); + const result = whereDataFrame(df, (v) => (v as number) >= 3); + expect(result.col("a").values).toEqual([null, null, 3]); + expect(result.col("b").values).toEqual([4, 5, 6]); + }); + + it("preserves index and column names", () => { + const df = DataFrame.fromColumns({ x: [1, 2], y: [3, 4] }); + const result = whereDataFrame(df, [ + [true, false], + [false, true], + ]); + expect(result.columns.values).toEqual(["x", "y"]); + expect(result.index.values).toEqual([0, 1]); + }); +}); + +// ─── maskDataFrame ──────────────────────────────────────────────────────────── + +describe("maskDataFrame", () => { + it("works with 2-D boolean array", () => { + const df = DataFrame.fromColumns({ a: [1, 2, 3], b: [4, 5, 6] }); + const cond = [ + [true, false], + [false, true], + [true, true], + ]; + const result = maskDataFrame(df, cond); + expect(result.col("a").values).toEqual([null, 2, null]); + expect(result.col("b").values).toEqual([4, null, null]); + }); + + it("mask and where are complements with same DataFrame cond", () => { + const df = DataFrame.fromColumns({ a: [1, 2, 3], b: [4, 5, 6] }); + const cond = [ + [true, false], + [false, true], + [true, false], + ]; + const w = whereDataFrame(df, cond, { other: 99 }); + const m = maskDataFrame(df, cond, { other: 99 }); + for (const colName of ["a", "b"]) { + for (let r = 0; r < 3; r++) { + const wVal = w.col(colName).iat(r); + const mVal = m.col(colName).iat(r); + const orig = df.col(colName).iat(r); + // One must be orig, other must be 99 + expect([wVal, mVal].sort()).toEqual([99, orig].sort()); + } + } + }); + + it("works with DataFrame condition", () => { + const df = DataFrame.fromColumns({ a: [10, 20], b: [30, 40] }); + const condDf = DataFrame.fromColumns({ a: [false, true], b: [true, false] }); + const result = maskDataFrame(df, condDf, { other: 0 }); + expect(result.col("a").values).toEqual([10, 0]); + expect(result.col("b").values).toEqual([0, 40]); + }); + + it("works with callable condition", () => { + const df = DataFrame.fromColumns({ a: [1, 2, 3], b: [4, 5, 6] }); + const result = maskDataFrame(df, (v) => (v as number) > 4, { other: -1 }); + expect(result.col("a").values).toEqual([1, 2, 3]); + expect(result.col("b").values).toEqual([4, -1, -1]); + }); +}); + +// ─── property-based tests ───────────────────────────────────────────────────── + +describe("whereSeries property tests", () => { + it("where + mask with same cond never produce the same output when values differ and other differs from values", () => { + fc.assert( + fc.property( + fc.array(fc.integer({ min: 1, max: 100 }), { minLength: 1, maxLength: 10 }), + fc.array(fc.boolean(), { minLength: 1, maxLength: 10 }), + (arr, bools) => { + const len = Math.min(arr.length, bools.length); + const data = arr.slice(0, len); + const cond = bools.slice(0, len); + const series = new Series({ data }); + const w = whereSeries(series, cond, { other: -999 }); + const m = maskSeries(series, cond, { other: -999 }); + for (let i = 0; i < len; i++) { + if (cond[i]) { + // where keeps, mask replaces + expect(w.iat(i)).toBe(data[i]); + expect(m.iat(i)).toBe(-999); + } else { + // where replaces, mask keeps + expect(w.iat(i)).toBe(-999); + expect(m.iat(i)).toBe(data[i]); + } + } + }, + ), + ); + }); + + it("where with all-true cond is identity", () => { + fc.assert( + fc.property(fc.array(fc.integer(), { minLength: 0, maxLength: 20 }), (arr) => { + const series = new Series({ data: arr }); + const cond = arr.map(() => true); + const result = whereSeries(series, cond); + for (let i = 0; i < arr.length; i++) { + expect(result.iat(i)).toBe(arr[i]); + } + }), + ); + }); + + it("mask with all-false cond is identity", () => { + fc.assert( + fc.property(fc.array(fc.integer(), { minLength: 0, maxLength: 20 }), (arr) => { + const series = new Series({ data: arr }); + const cond = arr.map(() => false); + const result = maskSeries(series, cond); + for (let i = 0; i < arr.length; i++) { + expect(result.iat(i)).toBe(arr[i]); + } + }), + ); + }); + + it("size is preserved after where/mask", () => { + fc.assert( + fc.property(fc.array(fc.integer(), { minLength: 0, maxLength: 20 }), (arr) => { + const series = new Series({ data: arr }); + const cond = arr.map((_, i) => i % 2 === 0); + expect(whereSeries(series, cond).size).toBe(arr.length); + expect(maskSeries(series, cond).size).toBe(arr.length); + }), + ); + }); +}); From 07f43b810f8a1831d21e54add0428ee8e7fa623c Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Sat, 11 Apr 2026 20:32:38 +0000 Subject: [PATCH 07/14] =?UTF-8?q?Iteration=20197:=20Add=20diff/shift=20?= =?UTF-8?q?=E2=80=94=20discrete=20difference=20and=20value=20shifting=20fo?= =?UTF-8?q?r=20Series=20and=20DataFrame?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Run: https://github.com/githubnext/tsessebe/actions/runs/24283807306 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- playground/diff_shift.html | 443 +++++++++++++++++++++++++++++++++ src/stats/diff_shift.ts | 368 +++++++++++++++++++++++++++ tests/stats/diff_shift.test.ts | 322 ++++++++++++++++++++++++ 3 files changed, 1133 insertions(+) create mode 100644 playground/diff_shift.html create mode 100644 src/stats/diff_shift.ts create mode 100644 tests/stats/diff_shift.test.ts diff --git a/playground/diff_shift.html b/playground/diff_shift.html new file mode 100644 index 00000000..3a300fbf --- /dev/null +++ b/playground/diff_shift.html @@ -0,0 +1,443 @@ + + + + + + tsb — diff & shift (discrete difference and value shifting) + + + +
+
+
Loading tsb runtime…
+
+ + ← Back to playground index + +

diff & shift — discrete difference and value shifting

+

+ diffSeries / diffDataFrame compute the element-wise discrete + difference (value[i] - value[i-periods]).
+ shiftSeries / shiftDataFrame shift values forward or backward + by a given number of periods, filling with a configurable value.
+ Mirrors Series.diff(), Series.shift(), + DataFrame.diff(), and DataFrame.shift() from pandas. +

+ + +
+

1 · Series diff — first discrete difference

+

+ Compute s[i] - s[i - periods] for each position. + The first periods entries are null. + Non-numeric values produce null. +

+
+
+
+ + +
+
+ + +
+
+ + +
Press ▶ Run to execute
+
+

💡 Tip: diffSeries is commonly used to compute returns, velocity, or changes over time.

+
+ + +
+

2 · Series shift — lag and lead values

+

+ Shift values forward (positive periods) or backward (negative periods). + Vacated positions are filled with fillValue (default null). +

+
+
+
+ + +
+
+ + +
+
+ + +
Press ▶ Run to execute
+
+

💡 Tip: combine shiftSeries with arithmetic to compute returns, lags, or leads.

+
+ + +
+

3 · DataFrame diff — column-wise and row-wise

+

+ axis=0 (default): diff each column independently (rows over time).
+ axis=1: diff across columns within each row. +

+
+
+
+ + +
+
+ + +
+
+ + +
Press ▶ Run to execute
+
+
+ + +
+

4 · DataFrame shift — lagging a DataFrame

+

+ Shift all columns by the same number of periods. + Useful for creating lagged features in machine learning. +

+
+
+
+ + +
+
+ + +
+
+ + +
Press ▶ Run to execute
+
+

💡 Tip: creating multiple lagged columns is a common feature-engineering technique for time series forecasting.

+
+ + +
+

API Reference

+
// Discrete difference
+diffSeries(series: Series<Scalar>, options?: DiffOptions): Series<Scalar>
+diffDataFrame(df: DataFrame, options?: DataFrameDiffOptions): DataFrame
+
+interface DiffOptions {
+  periods?: number;  // default 1; negative = look forward
+}
+interface DataFrameDiffOptions extends DiffOptions {
+  axis?: 0 | 1 | "index" | "columns";  // default 0
+}
+
+// Value shifting
+shiftSeries(series: Series<Scalar>, options?: ShiftOptions): Series<Scalar>
+shiftDataFrame(df: DataFrame, options?: DataFrameShiftOptions): DataFrame
+
+interface ShiftOptions {
+  periods?:   number;  // default 1; negative = shift backward
+  fillValue?: Scalar;  // default null
+}
+interface DataFrameShiftOptions extends ShiftOptions {
+  axis?: 0 | 1 | "index" | "columns";  // default 0
+}
+
+ +
+

+ Part of tsb — a TypeScript port of pandas. + Source: src/stats/diff_shift.ts +

+
+ + + + diff --git a/src/stats/diff_shift.ts b/src/stats/diff_shift.ts new file mode 100644 index 00000000..4f62825f --- /dev/null +++ b/src/stats/diff_shift.ts @@ -0,0 +1,368 @@ +/** + * diff_shift — discrete difference and value-shift for Series and DataFrame. + * + * Mirrors the following pandas methods: + * - `Series.diff(periods=1)` — first discrete difference shifted by `periods` + * - `Series.shift(periods=1, fill_value=NaN)` — shift index by `periods` + * - `DataFrame.diff(periods=1, axis=0)` — column-wise or row-wise diff + * - `DataFrame.shift(periods=1, fill_value=NaN, axis=0)` — column-wise or row-wise shift + * + * All functions are **pure** (return new objects; inputs are unchanged). + * Non-numeric values in `diff` yield `null`. + * + * @module + */ + +import { DataFrame } from "../core/index.ts"; +import { Series } from "../core/index.ts"; +import type { Axis, Scalar } from "../types.ts"; + +// ─── public types ────────────────────────────────────────────────────────────── + +/** Options for {@link diffSeries} and {@link diffDataFrame}. */ +export interface DiffOptions { + /** + * Number of periods to shift for calculating difference. + * Negative values shift in the opposite direction. + * Default `1`. + */ + readonly periods?: number; +} + +/** Options for {@link diffDataFrame}. */ +export interface DataFrameDiffOptions extends DiffOptions { + /** + * Axis along which to compute the difference. + * - `0` or `"index"` (default): diff down each **column**. + * - `1` or `"columns"`: diff across each **row**. + */ + readonly axis?: Axis; +} + +/** Options for {@link shiftSeries} and {@link shiftDataFrame}. */ +export interface ShiftOptions { + /** + * Number of periods to shift. + * Positive: shift forward (later rows get earlier values). + * Negative: shift backward. + * Default `1`. + */ + readonly periods?: number; + /** + * Value to fill positions that fall outside the original range. + * Default `null` (treated as missing, like pandas NaN). + */ + readonly fillValue?: Scalar; +} + +/** Options for {@link shiftDataFrame}. */ +export interface DataFrameShiftOptions extends ShiftOptions { + /** + * Axis along which to shift. + * - `0` or `"index"` (default): shift down each **column**. + * - `1` or `"columns"`: shift across each **row**. + */ + readonly axis?: Axis; +} + +// ─── helpers ─────────────────────────────────────────────────────────────────── + +/** True when `v` is a finite number (not null / undefined / NaN). */ +function isFiniteNum(v: Scalar): v is number { + return typeof v === "number" && !Number.isNaN(v); +} + +/** + * Compute element-wise discrete difference for an array of scalars. + * `result[i] = arr[i] - arr[i - periods]`. + * Non-numeric positions (either current or prior) yield `null`. + */ +function diffArray(vals: readonly Scalar[], periods: number): Scalar[] { + const n = vals.length; + const out: Scalar[] = new Array(n).fill(null); + for (let i = 0; i < n; i++) { + const j = i - periods; + if (j < 0 || j >= n) { + out[i] = null; + continue; + } + const cur = vals[i] as Scalar; + const prev = vals[j] as Scalar; + if (isFiniteNum(cur) && isFiniteNum(prev)) { + out[i] = cur - prev; + } else { + out[i] = null; + } + } + return out; +} + +/** + * Shift an array of scalars by `periods` positions, filling with `fillValue`. + * Positive `periods` moves values forward (later positions get earlier values); + * negative `periods` moves values backward. + */ +function shiftArray(vals: readonly Scalar[], periods: number, fillValue: Scalar): Scalar[] { + const n = vals.length; + const out: Scalar[] = new Array(n).fill(fillValue); + if (periods >= 0) { + for (let i = periods; i < n; i++) { + out[i] = vals[i - periods] as Scalar; + } + } else { + const offset = -periods; + for (let i = 0; i < n - offset; i++) { + out[i] = vals[i + offset] as Scalar; + } + } + return out; +} + +// ─── Series: diff ────────────────────────────────────────────────────────────── + +/** + * Compute the first discrete difference of a Series. + * + * `result[i] = series[i] - series[i - periods]`. + * The first `|periods|` positions (or last, for negative) are `null`. + * Non-numeric values yield `null`. + * + * Mirrors `pandas.Series.diff(periods=1)`. + * + * @example + * ```ts + * import { Series } from "tsb"; + * import { diffSeries } from "tsb"; + * + * const s = new Series({ data: [1, 3, 6, 10, 15] }); + * diffSeries(s).values; // [null, 2, 3, 4, 5] + * diffSeries(s, { periods: 2 }).values; // [null, null, 5, 7, 9] + * ``` + */ +export function diffSeries(series: Series, options: DiffOptions = {}): Series { + const periods = options.periods ?? 1; + const data = diffArray(series.values as readonly Scalar[], periods); + return new Series({ data, index: series.index, name: series.name }); +} + +// ─── Series: shift ───────────────────────────────────────────────────────────── + +/** + * Shift the values of a Series by `periods` positions. + * + * Positive `periods` shifts values forward (down); earlier positions are filled + * with `fillValue`. Negative `periods` shifts backward (up). + * + * Mirrors `pandas.Series.shift(periods=1, fill_value=NaN)`. + * + * @example + * ```ts + * import { Series } from "tsb"; + * import { shiftSeries } from "tsb"; + * + * const s = new Series({ data: [1, 2, 3, 4, 5] }); + * shiftSeries(s).values; // [null, 1, 2, 3, 4] + * shiftSeries(s, { periods: -1 }).values; // [2, 3, 4, 5, null] + * shiftSeries(s, { periods: 2, fillValue: 0 }).values; // [0, 0, 1, 2, 3] + * ``` + */ +export function shiftSeries(series: Series, options: ShiftOptions = {}): Series { + const periods = options.periods ?? 1; + const fillValue = options.fillValue !== undefined ? options.fillValue : null; + const data = shiftArray(series.values as readonly Scalar[], periods, fillValue); + return new Series({ data, index: series.index, name: series.name }); +} + +// ─── DataFrame: diff ────────────────────────────────────────────────────────── + +/** + * Compute the first discrete difference of a DataFrame. + * + * When `axis=0` (default), diffs down each column independently. + * When `axis=1`, diffs across each row (column N minus column N-periods). + * + * Mirrors `pandas.DataFrame.diff(periods=1, axis=0)`. + * + * @example + * ```ts + * import { DataFrame } from "tsb"; + * import { diffDataFrame } from "tsb"; + * + * const df = DataFrame.fromColumns({ a: [1, 3, 6], b: [10, 20, 35] }); + * diffDataFrame(df).col("a").values; // [null, 2, 3] + * diffDataFrame(df).col("b").values; // [null, 10, 15] + * ``` + */ +export function diffDataFrame(df: DataFrame, options: DataFrameDiffOptions = {}): DataFrame { + const periods = options.periods ?? 1; + const axis = options.axis ?? 0; + const colNames = df.columns.values; + + if (axis === 1 || axis === "columns") { + return diffDataFrameRowWise(df, colNames, periods); + } + return diffDataFrameColWise(df, colNames, periods); +} + +/** Diff each column independently (axis=0). */ +function diffDataFrameColWise( + df: DataFrame, + colNames: readonly string[], + periods: number, +): DataFrame { + const colMap = new Map>(); + for (const name of colNames) { + const col = df.col(name) as Series; + const data = diffArray(col.values as readonly Scalar[], periods); + colMap.set(name, new Series({ data, index: df.index, name })); + } + return new DataFrame(colMap, df.index); +} + +/** Diff across columns (axis=1). */ +function diffDataFrameRowWise( + df: DataFrame, + colNames: readonly string[], + periods: number, +): DataFrame { + const nRows = df.index.size; + const nCols = colNames.length; + const colMap = new Map>(); + + for (let c = 0; c < nCols; c++) { + const name = colNames[c]; + if (name === undefined) { + continue; + } + const rowData: Scalar[] = new Array(nRows).fill(null); + const priorIdx = c - periods; + if (priorIdx < 0 || priorIdx >= nCols) { + colMap.set(name, new Series({ data: rowData, index: df.index, name })); + continue; + } + const priorName = colNames[priorIdx]; + if (priorName === undefined) { + colMap.set(name, new Series({ data: rowData, index: df.index, name })); + continue; + } + const curCol = df.col(name) as Series; + const priorCol = df.col(priorName) as Series; + for (let r = 0; r < nRows; r++) { + const cur = curCol.iat(r); + const prev = priorCol.iat(r); + if (isFiniteNum(cur) && isFiniteNum(prev)) { + rowData[r] = cur - prev; + } else { + rowData[r] = null; + } + } + colMap.set(name, new Series({ data: rowData, index: df.index, name })); + } + return new DataFrame(colMap, df.index); +} + +// ─── DataFrame: shift ───────────────────────────────────────────────────────── + +/** + * Shift the values of a DataFrame by `periods` positions. + * + * When `axis=0` (default), each column is shifted independently. + * When `axis=1`, each row is shifted across columns. + * + * Mirrors `pandas.DataFrame.shift(periods=1, fill_value=NaN, axis=0)`. + * + * @example + * ```ts + * import { DataFrame } from "tsb"; + * import { shiftDataFrame } from "tsb"; + * + * const df = DataFrame.fromColumns({ a: [1, 2, 3], b: [4, 5, 6] }); + * shiftDataFrame(df).col("a").values; // [null, 1, 2] + * shiftDataFrame(df, { periods: -1 }).col("b").values; // [5, 6, null] + * ``` + */ +export function shiftDataFrame(df: DataFrame, options: DataFrameShiftOptions = {}): DataFrame { + const periods = options.periods ?? 1; + const fillValue = options.fillValue !== undefined ? options.fillValue : null; + const axis = options.axis ?? 0; + const colNames = df.columns.values; + + if (axis === 1 || axis === "columns") { + return shiftDataFrameRowWise(df, colNames, periods, fillValue); + } + return shiftDataFrameColWise(df, colNames, periods, fillValue); +} + +/** Shift each column independently (axis=0). */ +function shiftDataFrameColWise( + df: DataFrame, + colNames: readonly string[], + periods: number, + fillValue: Scalar, +): DataFrame { + const colMap = new Map>(); + for (const name of colNames) { + const col = df.col(name) as Series; + const data = shiftArray(col.values as readonly Scalar[], periods, fillValue); + colMap.set(name, new Series({ data, index: df.index, name })); + } + return new DataFrame(colMap, df.index); +} + +/** Shift each row across columns (axis=1). */ +function shiftDataFrameRowWise( + df: DataFrame, + colNames: readonly string[], + periods: number, + fillValue: Scalar, +): DataFrame { + const nRows = df.index.size; + const nCols = colNames.length; + + // Build a 2D matrix [row][col] of shifted values + const matrix: Scalar[][] = Array.from({ length: nRows }, () => + new Array(nCols).fill(fillValue), + ); + + if (periods >= 0) { + for (let c = periods; c < nCols; c++) { + const srcName = colNames[c - periods]; + if (srcName === undefined) { + continue; + } + const src = df.col(srcName) as Series; + for (let r = 0; r < nRows; r++) { + const row = matrix[r]; + if (row !== undefined) { + row[c] = src.iat(r); + } + } + } + } else { + const offset = -periods; + for (let c = 0; c < nCols - offset; c++) { + const srcName = colNames[c + offset]; + if (srcName === undefined) { + continue; + } + const src = df.col(srcName) as Series; + for (let r = 0; r < nRows; r++) { + const row = matrix[r]; + if (row !== undefined) { + row[c] = src.iat(r); + } + } + } + } + + const colMap = new Map>(); + for (let c = 0; c < nCols; c++) { + const name = colNames[c]; + if (name === undefined) { + continue; + } + const data = matrix.map((row) => row[c] as Scalar); + colMap.set(name, new Series({ data, index: df.index, name })); + } + return new DataFrame(colMap, df.index); +} diff --git a/tests/stats/diff_shift.test.ts b/tests/stats/diff_shift.test.ts new file mode 100644 index 00000000..7aab648d --- /dev/null +++ b/tests/stats/diff_shift.test.ts @@ -0,0 +1,322 @@ +/** + * Tests for stats/diff_shift.ts + * + * Covers: + * - diffSeries: default (periods=1), custom periods, negative periods, non-numeric passthrough + * - shiftSeries: forward, backward, custom fillValue + * - diffDataFrame: axis=0 (col-wise), axis=1 (row-wise) + * - shiftDataFrame: axis=0 (col-wise), axis=1 (row-wise) + * - Property-based tests with fast-check + */ + +import { describe, expect, test } from "bun:test"; +import fc from "fast-check"; +import { + DataFrame, + Series, + diffDataFrame, + diffSeries, + shiftDataFrame, + shiftSeries, +} from "../../src/index.ts"; +import type { Scalar } from "../../src/index.ts"; + +// ─── helpers ────────────────────────────────────────────────────────────────── + +function makeSeries(data: Scalar[], name?: string): Series { + return new Series({ data, name: name ?? "s" }); +} + +// ─── diffSeries ─────────────────────────────────────────────────────────────── + +describe("diffSeries", () => { + test("default periods=1", () => { + const s = makeSeries([1, 3, 6, 10, 15]); + const result = diffSeries(s); + expect(result.values).toEqual([null, 2, 3, 4, 5]); + }); + + test("periods=2", () => { + const s = makeSeries([1, 3, 6, 10, 15]); + const result = diffSeries(s, { periods: 2 }); + expect(result.values).toEqual([null, null, 5, 7, 9]); + }); + + test("periods=-1 (backward)", () => { + const s = makeSeries([1, 3, 6, 10, 15]); + const result = diffSeries(s, { periods: -1 }); + expect(result.values).toEqual([-2, -3, -4, -5, null]); + }); + + test("preserves index and name", () => { + const s = makeSeries([10, 20, 30], "myname"); + const result = diffSeries(s); + expect(result.name).toBe("myname"); + expect(result.index.size).toBe(3); + }); + + test("non-numeric values produce null", () => { + const s = makeSeries([1, null, 3, "x", 5]); + const result = diffSeries(s); + // [null, null(1-null=null), null(null-null=null), null("x"-null), null(5-"x")] + expect(result.values[0]).toBe(null); + expect(result.values[1]).toBe(null); + expect(result.values[2]).toBe(null); + expect(result.values[3]).toBe(null); + expect(result.values[4]).toBe(null); + }); + + test("single element → [null]", () => { + const s = makeSeries([42]); + expect(diffSeries(s).values).toEqual([null]); + }); + + test("empty series", () => { + const s = makeSeries([]); + expect(diffSeries(s).values).toEqual([]); + }); + + test("periods larger than length → all null", () => { + const s = makeSeries([1, 2, 3]); + const result = diffSeries(s, { periods: 5 }); + expect(result.values).toEqual([null, null, null]); + }); + + test("NaN values produce null", () => { + const s = makeSeries([1, Number.NaN, 3]); + const result = diffSeries(s); + expect(result.values[1]).toBe(null); + expect(result.values[2]).toBe(null); + }); +}); + +// ─── shiftSeries ────────────────────────────────────────────────────────────── + +describe("shiftSeries", () => { + test("default periods=1, fills null", () => { + const s = makeSeries([1, 2, 3, 4, 5]); + expect(shiftSeries(s).values).toEqual([null, 1, 2, 3, 4]); + }); + + test("periods=2", () => { + const s = makeSeries([1, 2, 3, 4, 5]); + expect(shiftSeries(s, { periods: 2 }).values).toEqual([null, null, 1, 2, 3]); + }); + + test("periods=-1 (backward)", () => { + const s = makeSeries([1, 2, 3, 4, 5]); + expect(shiftSeries(s, { periods: -1 }).values).toEqual([2, 3, 4, 5, null]); + }); + + test("periods=-2", () => { + const s = makeSeries([1, 2, 3, 4, 5]); + expect(shiftSeries(s, { periods: -2 }).values).toEqual([3, 4, 5, null, null]); + }); + + test("custom fillValue", () => { + const s = makeSeries([1, 2, 3]); + expect(shiftSeries(s, { periods: 1, fillValue: 0 }).values).toEqual([0, 1, 2]); + }); + + test("periods=0 → same values", () => { + const s = makeSeries([10, 20, 30]); + expect(shiftSeries(s, { periods: 0 }).values).toEqual([10, 20, 30]); + }); + + test("preserves index and name", () => { + const s = makeSeries([1, 2, 3], "col"); + const result = shiftSeries(s); + expect(result.name).toBe("col"); + expect(result.index.size).toBe(3); + }); + + test("periods >= length → all fillValue", () => { + const s = makeSeries([1, 2, 3]); + expect(shiftSeries(s, { periods: 5, fillValue: -1 }).values).toEqual([-1, -1, -1]); + }); + + test("empty series", () => { + const s = makeSeries([]); + expect(shiftSeries(s).values).toEqual([]); + }); +}); + +// ─── diffDataFrame (axis=0) ─────────────────────────────────────────────────── + +describe("diffDataFrame axis=0 (column-wise)", () => { + test("default periods=1 each column independently", () => { + const df = DataFrame.fromColumns({ a: [1, 3, 6], b: [10, 20, 35] }); + const result = diffDataFrame(df); + expect(result.col("a").values).toEqual([null, 2, 3]); + expect(result.col("b").values).toEqual([null, 10, 15]); + }); + + test("periods=2", () => { + const df = DataFrame.fromColumns({ a: [1, 2, 4, 8] }); + const result = diffDataFrame(df, { periods: 2 }); + expect(result.col("a").values).toEqual([null, null, 3, 6]); + }); + + test("preserves index", () => { + const df = DataFrame.fromColumns({ a: [1, 2, 3] }); + const result = diffDataFrame(df); + expect(result.index.size).toBe(3); + }); +}); + +// ─── diffDataFrame (axis=1) ─────────────────────────────────────────────────── + +describe("diffDataFrame axis=1 (row-wise)", () => { + test("default periods=1 across columns", () => { + const df = DataFrame.fromColumns({ a: [1, 10], b: [4, 16], c: [9, 25] }); + const result = diffDataFrame(df, { axis: 1 }); + // col a: always null (no prior column) + expect(result.col("a").values).toEqual([null, null]); + // col b: b - a = [3, 6] + expect(result.col("b").values).toEqual([3, 6]); + // col c: c - b = [5, 9] + expect(result.col("c").values).toEqual([5, 9]); + }); +}); + +// ─── shiftDataFrame (axis=0) ───────────────────────────────────────────────── + +describe("shiftDataFrame axis=0 (column-wise)", () => { + test("default periods=1", () => { + const df = DataFrame.fromColumns({ a: [1, 2, 3], b: [4, 5, 6] }); + const result = shiftDataFrame(df); + expect(result.col("a").values).toEqual([null, 1, 2]); + expect(result.col("b").values).toEqual([null, 4, 5]); + }); + + test("periods=-1", () => { + const df = DataFrame.fromColumns({ a: [1, 2, 3] }); + expect(shiftDataFrame(df, { periods: -1 }).col("a").values).toEqual([2, 3, null]); + }); + + test("custom fillValue", () => { + const df = DataFrame.fromColumns({ a: [1, 2, 3] }); + const result = shiftDataFrame(df, { periods: 2, fillValue: 0 }); + expect(result.col("a").values).toEqual([0, 0, 1]); + }); + + test("preserves column structure", () => { + const df = DataFrame.fromColumns({ x: [1, 2], y: [3, 4] }); + const result = shiftDataFrame(df); + expect(result.columns.values).toEqual(["x", "y"]); + }); +}); + +// ─── shiftDataFrame (axis=1) ───────────────────────────────────────────────── + +describe("shiftDataFrame axis=1 (row-wise)", () => { + test("periods=1 shifts columns right", () => { + const df = DataFrame.fromColumns({ a: [1, 2], b: [3, 4], c: [5, 6] }); + const result = shiftDataFrame(df, { axis: 1, periods: 1, fillValue: 0 }); + // col a gets fillValue (no prior col) + expect(result.col("a").values).toEqual([0, 0]); + // col b gets values from col a + expect(result.col("b").values).toEqual([1, 2]); + // col c gets values from col b + expect(result.col("c").values).toEqual([3, 4]); + }); + + test("periods=-1 shifts columns left", () => { + const df = DataFrame.fromColumns({ a: [1, 2], b: [3, 4], c: [5, 6] }); + const result = shiftDataFrame(df, { axis: 1, periods: -1, fillValue: 0 }); + // col a gets values from col b + expect(result.col("a").values).toEqual([3, 4]); + // col b gets values from col c + expect(result.col("b").values).toEqual([5, 6]); + // col c gets fillValue + expect(result.col("c").values).toEqual([0, 0]); + }); +}); + +// ─── property-based tests ───────────────────────────────────────────────────── + +describe("property-based: diffSeries", () => { + test("diff(periods=1) length equals input length", () => { + fc.assert( + fc.property(fc.array(fc.float({ noNaN: true }), { minLength: 0, maxLength: 50 }), (arr) => { + const s = makeSeries(arr); + const result = diffSeries(s); + expect(result.size).toBe(s.size); + }), + ); + }); + + test("diff[0] is always null for periods >= 1", () => { + fc.assert( + fc.property(fc.array(fc.float({ noNaN: true }), { minLength: 1, maxLength: 30 }), (arr) => { + const s = makeSeries(arr); + const result = diffSeries(s, { periods: 1 }); + expect(result.values[0]).toBe(null); + }), + ); + }); + + test("shift+diff reconstructs original for numeric arrays (first element is null)", () => { + fc.assert( + fc.property(fc.array(fc.integer({ min: -1000, max: 1000 }), { minLength: 2, maxLength: 20 }), (arr) => { + const data = arr as Scalar[]; + const s = makeSeries(data); + const shifted = shiftSeries(s, { periods: 1, fillValue: 0 }); + const d = diffSeries(s); + // sum of diffs [1..n] + first value ≈ last value (numeric check) + // More directly: diff[i] + shifted[i] = s[i] for i >= 1 + for (let i = 1; i < arr.length; i++) { + const diffVal = d.values[i] as number; + const shiftedVal = shifted.values[i] as number; + expect(diffVal + shiftedVal).toBeCloseTo(arr[i] as number, 10); + } + }), + ); + }); +}); + +describe("property-based: shiftSeries", () => { + test("shift preserves length", () => { + fc.assert( + fc.property( + fc.array(fc.integer(), { minLength: 0, maxLength: 50 }), + fc.integer({ min: -20, max: 20 }), + (arr, periods) => { + const s = makeSeries(arr as Scalar[]); + const result = shiftSeries(s, { periods }); + expect(result.size).toBe(s.size); + }, + ), + ); + }); + + test("shift(0) is identity", () => { + fc.assert( + fc.property(fc.array(fc.integer(), { minLength: 0, maxLength: 30 }), (arr) => { + const s = makeSeries(arr as Scalar[]); + const result = shiftSeries(s, { periods: 0 }); + for (let i = 0; i < arr.length; i++) { + expect(result.values[i]).toBe(arr[i]); + } + }), + ); + }); + + test("shift(n) then shift(-n) recovers original in the middle region", () => { + fc.assert( + fc.property( + fc.array(fc.integer({ min: -100, max: 100 }), { minLength: 4, maxLength: 30 }), + fc.integer({ min: 1, max: 5 }), + (arr, n) => { + const s = makeSeries(arr as Scalar[]); + const shifted = shiftSeries(s, { periods: n, fillValue: null }); + const recovered = shiftSeries(shifted, { periods: -n, fillValue: null }); + // middle region (indices n..len-n) should match original + for (let i = n; i < arr.length - n; i++) { + expect(recovered.values[i]).toBe(arr[i]); + } + }, + ), + ); + }); +}); From ac6f905e887161626bc7a1deff93a8d1a575fa69 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Sat, 11 Apr 2026 20:32:39 +0000 Subject: [PATCH 08/14] Iteration 199: Add duplicated/drop_duplicates and sample - stats/duplicated.ts: duplicatedSeries, duplicatedDataFrame, dropDuplicatesSeries, dropDuplicatesDataFrame with keep='first'/'last'/false and subset support - core/sample.ts: sampleSeries, sampleDataFrame with n/frac, replace, weighted sampling, and seeded RNG (randomState) - 35 tests each (unit + fast-check properties) - Playground pages: duplicated.html, sample.html Run: https://github.com/githubnext/tsessebe/actions/runs/24285279820 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- playground/duplicated.html | 198 +++++++++++++++++++ playground/sample.html | 187 ++++++++++++++++++ src/core/sample.ts | 334 +++++++++++++++++++++++++++++++++ src/stats/duplicated.ts | 274 +++++++++++++++++++++++++++ tests/core/sample.test.ts | 202 ++++++++++++++++++++ tests/stats/duplicated.test.ts | 247 ++++++++++++++++++++++++ 6 files changed, 1442 insertions(+) create mode 100644 playground/duplicated.html create mode 100644 playground/sample.html create mode 100644 src/core/sample.ts create mode 100644 src/stats/duplicated.ts create mode 100644 tests/core/sample.test.ts create mode 100644 tests/stats/duplicated.test.ts diff --git a/playground/duplicated.html b/playground/duplicated.html new file mode 100644 index 00000000..d562c003 --- /dev/null +++ b/playground/duplicated.html @@ -0,0 +1,198 @@ + + + + + + tsb — duplicated / drop_duplicates + + + +

tsb — duplicated / drop_duplicates

+

+ Detect and remove duplicate values or rows. + duplicatedSeries / duplicatedDataFrame return a boolean + Series marking which items are duplicates. + dropDuplicatesSeries / dropDuplicatesDataFrame return + a new object with duplicates removed. +

+ +

Core concept

+
// keep="first" (default): mark later duplicates as true
+duplicatedSeries(s)
+
+// keep="last": mark earlier duplicates as true
+duplicatedSeries(s, { keep: "last" })
+
+// keep=false: mark ALL occurrences of any duplicate
+duplicatedSeries(s, { keep: false })
+ +
+ pandas equivalent:
+ s.duplicated(keep='first')
+ df.duplicated(subset=['a', 'b'], keep='first')
+ s.drop_duplicates() / df.drop_duplicates() +
+ + +

Demo 1 — duplicatedSeries with keep="first"

+
+
Code
+
const s = new Series({ data: [1, 2, 1, 3, 2] });
+duplicatedSeries(s).values;
+// → [false, false, true, false, true]
+ + +
+ + +

Demo 2 — duplicatedSeries with keep=false (mark all)

+
+
Code
+
const s = new Series({ data: ["a", "b", "a", "c", "b"] });
+duplicatedSeries(s, { keep: false }).values;
+// → [true, true, true, false, true]
+ + +
+ + +

Demo 3 — dropDuplicatesSeries

+
+
Code
+
const s = new Series({ data: [10, 20, 10, 30, 20], name: "prices" });
+dropDuplicatesSeries(s).values;
+// → [10, 20, 30]
+ + +
+ + +

Demo 4 — duplicatedDataFrame with subset

+
+
Code
+
const df = DataFrame.fromRecords([
+  { name: "Alice", dept: "Eng" },
+  { name: "Bob",   dept: "Eng" },
+  { name: "Alice", dept: "HR" },
+  { name: "Bob",   dept: "Eng" }, // ← duplicate of row 1 on "name"+"dept"
+]);
+// Only consider "name" column for duplicates:
+duplicatedDataFrame(df, { subset: ["name"] }).values;
+// → [false, false, true, true]  (Alice and Bob each appear twice)
+ + +
+ + +

Demo 5 — dropDuplicatesDataFrame

+
+
Code
+
const df = DataFrame.fromRecords([
+  { a: 1, b: 2 },
+  { a: 1, b: 2 },
+  { a: 3, b: 4 },
+  { a: 3, b: 4 },
+]);
+const deduped = dropDuplicatesDataFrame(df);
+// shape: [2, 2]
+// a: [1, 3]  b: [2, 4]
+ + +
+ + +

Interactive editor

+
+
Edit and run:
+ + + +
+ + + + diff --git a/playground/sample.html b/playground/sample.html new file mode 100644 index 00000000..d29ed43a --- /dev/null +++ b/playground/sample.html @@ -0,0 +1,187 @@ + + + + + + tsb — sample + + + +

tsb — sample

+

+ Randomly sample items from a Series or rows/columns from a DataFrame. + Supports fixed count (n), fractional sampling (frac), + sampling with replacement (replace), weighted sampling, and + deterministic seeding via randomState. +

+ +

Core concept

+
// Sample 3 items (without replacement by default)
+sampleSeries(s, { n: 3 })
+
+// Sample 50% of rows
+sampleDataFrame(df, { frac: 0.5 })
+
+// Reproducible sample with seed
+sampleSeries(s, { n: 2, randomState: 42 })
+
+// Sample with replacement (bootstrap)
+sampleSeries(s, { n: 10, replace: true })
+
+// Sample columns instead of rows
+sampleDataFrame(df, { n: 2, axis: 1 })
+ +
+ pandas equivalent:
+ s.sample(n=3, random_state=42)
+ df.sample(frac=0.5, replace=False, axis=0) +
+ + +

Demo 1 — sampleSeries (n)

+
+
Code
+
const s = new Series({ data: [10, 20, 30, 40, 50], name: "scores" });
+sampleSeries(s, { n: 3, randomState: 7 }).values;
+// deterministic result with seed 7
+ + +
+ + +

Demo 2 — sampleSeries with frac

+
+
Code
+
const s = new Series({ data: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] });
+sampleSeries(s, { frac: 0.3, randomState: 42 }).values;
+// 30% of 10 items = 3 items
+ + +
+ + +

Demo 3 — bootstrap sampling (replace=true)

+
+
Code
+
const s = new Series({ data: ["a", "b", "c"] });
+// Sample more items than pool size — only possible with replace=true
+sampleSeries(s, { n: 7, replace: true, randomState: 0 }).values;
+ + +
+ + +

Demo 4 — weighted sampling

+
+
Code
+
const s = new Series({ data: ["rare", "common", "very_common"] });
+// "very_common" has 10× the weight of "rare"
+sampleSeries(s, { n: 1, weights: [1, 5, 10], randomState: 3 }).values;
+// most likely: ["very_common"]
+ + +
+ + +

Demo 5 — sampleDataFrame (rows)

+
+
Code
+
const df = DataFrame.fromRecords([
+  { city: "NYC",    pop: 8_336_817 },
+  { city: "LA",     pop: 3_979_576 },
+  { city: "Chicago",pop: 2_693_976 },
+  { city: "Houston",pop: 2_320_268 },
+  { city: "Phoenix",pop: 1_680_992 },
+]);
+const sample = sampleDataFrame(df, { n: 3, randomState: 1 });
+sample.col("city").values;
+ + +
+ + +

Interactive editor

+
+
Edit and run:
+ + + +
+ + + + diff --git a/src/core/sample.ts b/src/core/sample.ts new file mode 100644 index 00000000..869ce7b8 --- /dev/null +++ b/src/core/sample.ts @@ -0,0 +1,334 @@ +/** + * sample — random sampling from Series and DataFrame. + * + * Mirrors: + * - `pandas.Series.sample(n, frac, replace, weights, random_state, axis)` + * - `pandas.DataFrame.sample(n, frac, replace, weights, random_state, axis)` + * + * @module + */ + +import { DataFrame } from "./frame.ts"; +import { Index } from "./base-index.ts"; +import { Series } from "./series.ts"; +import type { Label, Scalar } from "../types.ts"; + +// ─── public types ───────────────────────────────────────────────────────────── + +/** Options for {@link sampleSeries} and {@link sampleDataFrame}. */ +export interface SampleOptions { + /** + * Number of items to return. Mutually exclusive with `frac`. + * @defaultValue `1` (when neither `n` nor `frac` is provided) + */ + readonly n?: number; + /** + * Fraction of items to return (e.g. `0.5` for 50%). + * Mutually exclusive with `n`. + */ + readonly frac?: number; + /** + * Allow sampling with replacement (the same item may appear multiple times). + * @defaultValue `false` + */ + readonly replace?: boolean; + /** + * Weights for each item. Must have the same length as the Series/DataFrame. + * Weights do not need to sum to 1 — they are normalized internally. + * Missing weights (null/undefined/NaN) are treated as 0. + */ + readonly weights?: readonly (number | null | undefined)[]; + /** + * Seed for the random number generator. When provided, sampling is + * deterministic (same seed + same data → same result). + * Uses a simple LCG (linear congruential generator). + */ + readonly randomState?: number; + /** + * Axis to sample along (DataFrame only). + * - `0` or `"index"` (default): sample rows. + * - `1` or `"columns"`: sample columns. + */ + readonly axis?: 0 | 1 | "index" | "columns"; +} + +// ─── seeded RNG ─────────────────────────────────────────────────────────────── + +/** + * Minimal LCG-based PRNG (Knuth constants). + * Returns a new seed and a float in [0, 1). + */ +function lcgNext(seed: number): [number, number] { + // LCG parameters (Numerical Recipes) + const a = 1664525; + const c = 1013904223; + const m = 2 ** 32; + const nextSeed = ((a * seed + c) >>> 0) % m; + return [nextSeed, nextSeed / m]; +} + +/** Build a seeded random float generator that returns [0,1). */ +function makeRng(seed: number | undefined): () => number { + if (seed === undefined) { + return () => Math.random(); + } + let s = seed >>> 0; // ensure 32-bit unsigned + return () => { + const [ns, r] = lcgNext(s); + s = ns; + return r; + }; +} + +// ─── helpers ────────────────────────────────────────────────────────────────── + +/** Resolve how many items to sample from a pool of size `poolSize`. */ +function resolveN(poolSize: number, n: number | undefined, frac: number | undefined): number { + if (n !== undefined && frac !== undefined) { + throw new Error("Sample: specify either `n` or `frac`, not both."); + } + if (frac !== undefined) { + if (frac < 0) { + throw new RangeError("Sample: `frac` must be >= 0."); + } + return Math.floor(frac * poolSize); + } + if (n !== undefined) { + if (n < 0) { + throw new RangeError("Sample: `n` must be >= 0."); + } + return n; + } + return 1; +} + +/** Normalize weights to probabilities summing to 1. */ +function normalizeWeights( + rawWeights: readonly (number | null | undefined)[], + poolSize: number, +): number[] { + if (rawWeights.length !== poolSize) { + throw new RangeError( + `Sample: weights length (${rawWeights.length}) must equal pool size (${poolSize}).`, + ); + } + const ws = rawWeights.map((w) => { + const v = w ?? 0; + if (typeof v !== "number" || Number.isNaN(v) || v < 0) { + return 0; + } + return v; + }); + const total = ws.reduce((s, v) => s + v, 0); + if (total === 0) { + throw new Error("Sample: all weights are zero."); + } + return ws.map((w) => w / total); +} + +/** + * Weighted random sample without replacement using the alias method. + * Falls back to basic weighted sampling when `replace=true`. + */ +function weightedSampleWithoutReplacement( + poolSize: number, + k: number, + probs: number[], + rng: () => number, +): number[] { + // Use reservoir sampling with exponential keys: assign key = rand^(1/w), take top-k + const keys: Array<[number, number]> = probs.map((p, i) => { + const r = rng(); + const key = p > 0 ? Math.pow(r, 1 / p) : 0; + return [key, i]; + }); + keys.sort((a, b) => b[0] - a[0]); + return keys.slice(0, k).map(([, i]) => i); +} + +/** + * Weighted sample WITH replacement: pick `k` indices based on cumulative probabilities. + */ +function weightedSampleWithReplacement( + k: number, + probs: number[], + rng: () => number, +): number[] { + const cumulative: number[] = []; + let sum = 0; + for (const p of probs) { + sum += p; + cumulative.push(sum); + } + + const result: number[] = []; + for (let i = 0; i < k; i++) { + const r = rng(); + let idx = cumulative.findIndex((c) => c >= r); + if (idx < 0) { + idx = probs.length - 1; + } + result.push(idx); + } + return result; +} + +/** + * Fisher-Yates shuffle (unweighted, without replacement) — pick the first `k` elements. + */ +function fisherYatesSample(poolSize: number, k: number, rng: () => number): number[] { + const indices = Array.from({ length: poolSize }, (_, i) => i); + for (let i = 0; i < k; i++) { + const j = i + Math.floor(rng() * (poolSize - i)); + const tmp = indices[i]; + const jVal = indices[j]; + if (tmp !== undefined && jVal !== undefined) { + indices[i] = jVal; + indices[j] = tmp; + } + } + return indices.slice(0, k); +} + +/** + * Sample with replacement (unweighted): draw `k` integers in [0, poolSize). + */ +function uniformSampleWithReplacement(poolSize: number, k: number, rng: () => number): number[] { + const result: number[] = []; + for (let i = 0; i < k; i++) { + result.push(Math.floor(rng() * poolSize)); + } + return result; +} + +/** Core sampling logic: return an array of selected positions. */ +function samplePositions( + poolSize: number, + k: number, + replace: boolean, + weights: readonly (number | null | undefined)[] | undefined, + rng: () => number, +): number[] { + if (poolSize === 0 || k === 0) { + return []; + } + if (!replace && k > poolSize) { + throw new RangeError( + `Sample: cannot sample ${k} items without replacement from a pool of ${poolSize}.`, + ); + } + + if (weights !== undefined) { + const probs = normalizeWeights(weights, poolSize); + if (replace) { + return weightedSampleWithReplacement(k, probs, rng); + } + return weightedSampleWithoutReplacement(poolSize, k, probs, rng); + } + + if (replace) { + return uniformSampleWithReplacement(poolSize, k, rng); + } + return fisherYatesSample(poolSize, k, rng); +} + +// ─── Series sample ──────────────────────────────────────────────────────────── + +/** + * Return a random sample of items from a Series. + * + * @example + * ```ts + * const s = new Series({ data: [10, 20, 30, 40, 50] }); + * sampleSeries(s, { n: 3, randomState: 42 }).values; // [30, 10, 50] (deterministic) + * ``` + */ +export function sampleSeries(series: Series, options?: SampleOptions): Series { + const opts = options ?? {}; + const k = resolveN(series.values.length, opts.n, opts.frac); + const replace = opts.replace ?? false; + const rng = makeRng(opts.randomState); + + const positions = samplePositions(series.values.length, k, replace, opts.weights, rng); + const newValues: Scalar[] = positions.map((i) => series.values[i] ?? null); + const newLabels: Label[] = positions.map((i) => series.index.at(i) ?? null); + + return new Series({ + data: newValues, + index: new Index