diff --git a/.autoloop/programs/example.md b/.autoloop/programs/example.md new file mode 100644 index 00000000..eebc3289 --- /dev/null +++ b/.autoloop/programs/example.md @@ -0,0 +1,35 @@ + + + + +# Autoloop Program + + + +## Goal + + + +REPLACE THIS with your optimization goal. + +## Target + + + +Only modify these files: +- `REPLACE_WITH_FILE` -- (describe what this file does) + +Do NOT modify: +- (list files that must not be touched) + +## Evaluation + + + +```bash +REPLACE_WITH_YOUR_EVALUATION_COMMAND +``` + +The metric is `REPLACE_WITH_METRIC_NAME`. **Lower/Higher is better.** (pick one) diff --git a/playground/diff_shift.html b/playground/diff_shift.html new file mode 100644 index 00000000..3a300fbf --- /dev/null +++ b/playground/diff_shift.html @@ -0,0 +1,443 @@ + + + + + + tsb — diff & shift (discrete difference and value shifting) + + + +
+
+
Loading tsb runtime…
+
+ + ← Back to playground index + +

diff & shift — discrete difference and value shifting

+

+ diffSeries / diffDataFrame compute the element-wise discrete + difference (value[i] - value[i-periods]).
+ shiftSeries / shiftDataFrame shift values forward or backward + by a given number of periods, filling with a configurable value.
+ Mirrors Series.diff(), Series.shift(), + DataFrame.diff(), and DataFrame.shift() from pandas. +

+ + +
+

1 · Series diff — first discrete difference

+

+ Compute s[i] - s[i - periods] for each position. + The first periods entries are null. + Non-numeric values produce null. +

+
+
+
+ + +
+
+ + +
+
+ + +
Press ▶ Run to execute
+
+

💡 Tip: diffSeries is commonly used to compute returns, velocity, or changes over time.

+
+ + +
+

2 · Series shift — lag and lead values

+

+ Shift values forward (positive periods) or backward (negative periods). + Vacated positions are filled with fillValue (default null). +

+
+
+
+ + +
+
+ + +
+
+ + +
Press ▶ Run to execute
+
+

💡 Tip: combine shiftSeries with arithmetic to compute returns, lags, or leads.

+
+ + +
+

3 · DataFrame diff — column-wise and row-wise

+

+ axis=0 (default): diff each column independently (rows over time).
+ axis=1: diff across columns within each row. +

+
+
+
+ + +
+
+ + +
+
+ + +
Press ▶ Run to execute
+
+
+ + +
+

4 · DataFrame shift — lagging a DataFrame

+

+ Shift all columns by the same number of periods. + Useful for creating lagged features in machine learning. +

+
+
+
+ + +
+
+ + +
+
+ + +
Press ▶ Run to execute
+
+

💡 Tip: creating multiple lagged columns is a common feature-engineering technique for time series forecasting.

+
+ + +
+

API Reference

+
// Discrete difference
+diffSeries(series: Series<Scalar>, options?: DiffOptions): Series<Scalar>
+diffDataFrame(df: DataFrame, options?: DataFrameDiffOptions): DataFrame
+
+interface DiffOptions {
+  periods?: number;  // default 1; negative = look forward
+}
+interface DataFrameDiffOptions extends DiffOptions {
+  axis?: 0 | 1 | "index" | "columns";  // default 0
+}
+
+// Value shifting
+shiftSeries(series: Series<Scalar>, options?: ShiftOptions): Series<Scalar>
+shiftDataFrame(df: DataFrame, options?: DataFrameShiftOptions): DataFrame
+
+interface ShiftOptions {
+  periods?:   number;  // default 1; negative = shift backward
+  fillValue?: Scalar;  // default null
+}
+interface DataFrameShiftOptions extends ShiftOptions {
+  axis?: 0 | 1 | "index" | "columns";  // default 0
+}
+
+ + + + + + diff --git a/playground/index.html b/playground/index.html index bf6823ba..752db9ca 100644 --- a/playground/index.html +++ b/playground/index.html @@ -334,6 +334,31 @@

✅ Complete +
+

📗 Excel I/O

+

XLSX file reading. readExcel() parses Excel files from a Uint8Array/ArrayBuffer — ZIP+XML parsing from scratch, shared strings, number/string/boolean cells, sheet selection, header, indexCol, skipRows, nrows.

+
✅ Complete
+
+
+

🔍 missing-value ops

+

Detect and fill missing values. isna(), notna(), isnull(), notnull() for scalars/Series/DataFrame. ffillSeries(), bfillSeries(), dataFrameFfill(), dataFrameBfill() with optional limit and axis support.

+
✅ Complete
+
+
+

📈 diff / shift

+

Discrete difference and value shifting for Series and DataFrame. diff computes element-wise differences; shift lags or leads values by a number of periods. Essential for time-series analysis.

+
✅ Complete
+
+
+

🔢 NaN-Ignoring Aggregates

+

Top-level nan-ignoring aggregate functions: nansum, nanmean, nanmedian, nanstd, nanvar, nanmin, nanmax, nanprod, nancount. Mirrors numpy.nan* functions. Works on arrays and Series.

+
✅ Complete
+
+
+

⏱️ toTimedelta

+

Convert scalars, arrays, or Series to Timedelta objects. Accepts pandas-style strings, ISO 8601, human-readable, and numeric values. Timedelta class with arithmetic: add/subtract/scale/abs/lt/gt/eq.

+
✅ Complete
+
diff --git a/playground/na_ops.html b/playground/na_ops.html new file mode 100644 index 00000000..c321438f --- /dev/null +++ b/playground/na_ops.html @@ -0,0 +1,480 @@ + + + + + + tsb — missing-value operations (isna, ffill, bfill) + + + +
+
+
Loading tsb runtime…
+
+ + ← Back to playground index + +

Missing-value operations

+

+ isna / notna — detect missing values in scalars, + Series, and DataFrames.
+ ffill / bfill — propagate the last (or next) valid + value to fill gaps.
+ Mirrors pd.isna(), Series.ffill(), and + DataFrame.bfill() from pandas. +

+ + +
+

1 · isna / notna on scalars

+

+ Returns true / false for individual values. + null, undefined, and NaN are all + considered "missing". +

+
+
+
+ + +
+
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

2 · isna on a Series

+

+ When passed a Series, isna returns a boolean Series of the + same length — true where values are missing. +

+
+
+
+ + +
+
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

3 · isna on a DataFrame

+

+ Returns a DataFrame of booleans with the same shape — one column per + original column, true where missing. +

+
+
+
+ + +
+
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

4 · Forward-fill (ffillSeries)

+

+ Propagates the last valid value forward to fill gaps. Leading + nulls that have no preceding value remain null. + Use the optional limit to cap consecutive fills. +

+
+
+
+ + +
+
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

5 · Backward-fill (bfillSeries)

+

+ Propagates the next valid value backward to fill gaps. Trailing + nulls that have no following value remain null. +

+
+
+
+ + +
+
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

6 · DataFrame forward-fill & backward-fill

+

+ dataFrameFfill and dataFrameBfill apply fill + column-wise by default (axis=0). Pass axis: 1 to fill + row-wise across columns. +

+
+
+
+ + +
+
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

API Reference

+
// Module-level missing-value detection
+isna(value: Scalar): boolean
+isna(value: Series): Series<boolean>
+isna(value: DataFrame): DataFrame
+
+notna(value: Scalar): boolean
+notna(value: Series): Series<boolean>
+notna(value: DataFrame): DataFrame
+
+// Aliases
+isnull(...)  // same as isna
+notnull(...) // same as notna
+
+// Series forward / backward fill
+ffillSeries(series, options?: { limit?: number | null }): Series
+bfillSeries(series, options?: { limit?: number | null }): Series
+
+// DataFrame forward / backward fill
+dataFrameFfill(df, options?: {
+  limit?: number | null,   // max consecutive fills (default: no limit)
+  axis?: 0 | 1 | "index" | "columns",  // default 0 (column-wise)
+}): DataFrame
+
+dataFrameBfill(df, options?: {
+  limit?: number | null,
+  axis?: 0 | 1 | "index" | "columns",
+}): DataFrame
+
+ + + + + diff --git a/playground/reduce_ops.html b/playground/reduce_ops.html new file mode 100644 index 00000000..bafc9bf8 --- /dev/null +++ b/playground/reduce_ops.html @@ -0,0 +1,128 @@ + + + + + + tsb — nunique / any / all + + + +

tsb — nunique / any / all

+

+ Reduction operations that summarise a Series or DataFrame column/row into a scalar or + boolean result — mirroring + pandas.Series.nunique, + DataFrame.any, + and + DataFrame.all. +

+ +

nuniqueSeries — count distinct values

+
import { Series, nuniqueSeries } from "tsb";
+
+const s = new Series([1, 2, 2, null, 3]);
+nuniqueSeries(s);                    // 3   (null excluded by default)
+nuniqueSeries(s, { dropna: false }); // 4   (null counted as a distinct value)
+
+ +

nunique — count distinct per column (axis=0, default)

+
import { DataFrame, nunique } from "tsb";
+
+const df = DataFrame.fromColumns({
+  brand:  ["apple", "banana", "apple", "cherry"],
+  rating: [5, 3, 5, 4],
+  flag:   [true, false, true, null],
+});
+nunique(df);
+// Series { brand: 3, rating: 3, flag: 2 }
+
+ +

nunique — count distinct per row (axis=1)

+
nunique(df, { axis: 1 });
+// Series [3, 3, 3, 2]  — the last row ("cherry", 4, null) has only 2 distinct values because null is excluded by default
+
+ +

anySeries / allSeries

+
import { Series, anySeries, allSeries } from "tsb";
+
+const flags = new Series([false, false, true]);
+anySeries(flags);  // true  — at least one truthy
+allSeries(flags);  // false — not all truthy
+
+const ones = new Series([1, 2, 3]);
+allSeries(ones);   // true  — all truthy
+
+// skipna option
+const withNull = new Series([1, null, 2]);
+allSeries(withNull);                   // true  (null skipped)
+allSeries(withNull, { skipna: false }); // false (null is falsy)
+
+ +

anyDataFrame / allDataFrame

+
import { DataFrame, anyDataFrame, allDataFrame } from "tsb";
+
+const df2 = DataFrame.fromColumns({
+  a: [0, 0, 1],
+  b: [1, 1, 1],
+  c: [0, 0, 0],
+});
+
+anyDataFrame(df2);
+// Series { a: true, b: true, c: false }
+
+allDataFrame(df2);
+// Series { a: false, b: true, c: false }
+
+// axis=1: reduce across columns → one boolean per row
+anyDataFrame(df2, { axis: 1 });
+// Series [true, true, true]  (row 0: 0,1,0 → any=true via b)
+
+ +

boolOnly option

+
const mixed = DataFrame.fromColumns({
+  nums: [1, 2, 3],
+  flag: [true, false, true],
+});
+
+// Only consider boolean columns
+anyDataFrame(mixed, { boolOnly: true });
+// Series { flag: true }   — 'nums' column excluded
+
+ +
+ Pandas parity note: + nunique, any, and all follow pandas' default + behaviour: missing values (null, undefined, NaN) + are excluded by default (dropna/skipna = true). Use + { dropna: false } or { skipna: false } to include them. +
+ +

← Back to playground index

+ + + + diff --git a/playground/to_timedelta.html b/playground/to_timedelta.html new file mode 100644 index 00000000..a09b42ca --- /dev/null +++ b/playground/to_timedelta.html @@ -0,0 +1,132 @@ + + + + + + tsb — toTimedelta + + + +

← tsb playground

+

toTimedelta stats

+

+ Convert scalars, arrays, or Series values to + Timedelta objects — mirroring + pandas.to_timedelta(). +

+ +

Supported input formats

+ + + + + + + + + + +
FormatExampleResult (ms)
Pandas-style"1 days 02:03:04"93 784 000 ms
Clock (HH:MM:SS)"01:30:00"5 400 000 ms
ISO 8601"P1DT2H"93 600 000 ms
Human-readable"1h 30m 20s"5 420 000 ms
number (unit="ns")1_000_000_0001 000 ms
number (unit="ms")50005 000 ms
Timedeltanew Timedelta(1000)unchanged
null / undefined / NaNnullnull
+ +

Timedelta class

+ + + + + + + + + + + + + + + + +
Property / MethodDescription
.totalMsTotal duration in milliseconds (signed)
.daysWhole days
.hoursHours within the current day (0–23)
.minutesMinutes within the current hour (0–59)
.secondsSeconds within the current minute (0–59)
.msMilliseconds within the current second (0–999)
.abs()Absolute value
.add(other)Add two Timedeltas
.subtract(other)Subtract a Timedelta
.scale(n)Multiply by a scalar
.lt(other)Less-than comparison
.gt(other)Greater-than comparison
.eq(other)Equality comparison
.toString()Pandas-style string representation
+ +

Error handling

+ + + + + +
errors=Behaviour
"raise" (default)Throws TypeError on unparseable input
"coerce"Returns null on unparseable input
"ignore"Returns the original value unchanged
+ +

Quick examples

+
import { toTimedelta, Timedelta, formatTimedelta, Series } from "tsb";
+
+// Scalar — various string formats
+toTimedelta("1 days 02:03:04");       // Timedelta(93_784_000 ms)
+toTimedelta("01:30:00");              // Timedelta(5_400_000 ms)
+toTimedelta("P1DT2H3M4S");            // ISO 8601
+toTimedelta("1h 30m 20s 500ms");      // human-readable
+
+// Scalar — numeric
+toTimedelta(1_000_000_000);           // default unit "ns" → 1000 ms
+toTimedelta(5000, { unit: "ms" });    // 5000 ms
+toTimedelta(2, { unit: "D" });        // 2 days
+
+// Missing values
+toTimedelta(null);                    // null
+toTimedelta("nope", { errors: "coerce" }); // null
+toTimedelta("nope", { errors: "ignore" }); // "nope" (unchanged)
+
+// Timedelta arithmetic
+const a = toTimedelta("1h") as Timedelta;
+const b = toTimedelta("30m") as Timedelta;
+a.add(b).toString();          // "0 days 01:30:00"
+a.subtract(b).totalMs;        // 1_800_000
+
+// Array
+toTimedelta(["1h", "30m", null]);
+// => [Timedelta(3_600_000), Timedelta(1_800_000), null]
+
+// Series
+const s = new Series({ data: ["1h", "30m", null] });
+toTimedelta(s);
+// => Series<Timedelta | null> with dtype=timedelta
+
+// formatTimedelta
+formatTimedelta(new Timedelta(86_400_000 + 3_661_000));
+// => "1 day 01:01:01"
+ +

Python / pandas equivalent

+ + + diff --git a/src/core/sample.ts b/src/core/sample.ts new file mode 100644 index 00000000..7cd8d529 --- /dev/null +++ b/src/core/sample.ts @@ -0,0 +1,328 @@ +/** + * sample — random sampling from Series and DataFrame. + * + * Mirrors: + * - `pandas.Series.sample(n, frac, replace, weights, random_state, axis)` + * - `pandas.DataFrame.sample(n, frac, replace, weights, random_state, axis)` + * + * @module + */ + +import type { Label, Scalar } from "../types.ts"; +import { Index } from "./base-index.ts"; +import { DataFrame } from "./frame.ts"; +import { Series } from "./series.ts"; + +// ─── public types ───────────────────────────────────────────────────────────── + +/** Options for {@link sampleSeries} and {@link sampleDataFrame}. */ +export interface SampleOptions { + /** + * Number of items to return. Mutually exclusive with `frac`. + * @defaultValue `1` (when neither `n` nor `frac` is provided) + */ + readonly n?: number; + /** + * Fraction of items to return (e.g. `0.5` for 50%). + * Mutually exclusive with `n`. + */ + readonly frac?: number; + /** + * Allow sampling with replacement (the same item may appear multiple times). + * @defaultValue `false` + */ + readonly replace?: boolean; + /** + * Weights for each item. Must have the same length as the Series/DataFrame. + * Weights do not need to sum to 1 — they are normalized internally. + * Missing weights (null/undefined/NaN) are treated as 0. + */ + readonly weights?: readonly (number | null | undefined)[]; + /** + * Seed for the random number generator. When provided, sampling is + * deterministic (same seed + same data → same result). + * Uses a simple LCG (linear congruential generator). + */ + readonly randomState?: number; + /** + * Axis to sample along (DataFrame only). + * - `0` or `"index"` (default): sample rows. + * - `1` or `"columns"`: sample columns. 
+ */ + readonly axis?: 0 | 1 | "index" | "columns"; +} + +// ─── seeded RNG ─────────────────────────────────────────────────────────────── + +/** + * Minimal LCG-based PRNG (Knuth constants). + * Returns a new seed and a float in [0, 1). + */ +function lcgNext(seed: number): [number, number] { + // LCG parameters (Numerical Recipes) + const a = 1664525; + const c = 1013904223; + const m = 2 ** 32; + const nextSeed = ((a * seed + c) >>> 0) % m; + return [nextSeed, nextSeed / m]; +} + +/** Build a seeded random float generator that returns [0,1). */ +function makeRng(seed: number | undefined): () => number { + if (seed === undefined) { + return () => Math.random(); + } + let s = seed >>> 0; // ensure 32-bit unsigned + return () => { + const [ns, r] = lcgNext(s); + s = ns; + return r; + }; +} + +// ─── helpers ────────────────────────────────────────────────────────────────── + +/** Resolve how many items to sample from a pool of size `poolSize`. */ +function resolveN(poolSize: number, n: number | undefined, frac: number | undefined): number { + if (n !== undefined && frac !== undefined) { + throw new Error("Sample: specify either `n` or `frac`, not both."); + } + if (frac !== undefined) { + if (frac < 0) { + throw new RangeError("Sample: `frac` must be >= 0."); + } + return Math.floor(frac * poolSize); + } + if (n !== undefined) { + if (n < 0) { + throw new RangeError("Sample: `n` must be >= 0."); + } + return n; + } + return 1; +} + +/** Normalize weights to probabilities summing to 1. */ +function normalizeWeights( + rawWeights: readonly (number | null | undefined)[], + poolSize: number, +): number[] { + if (rawWeights.length !== poolSize) { + throw new RangeError( + `Sample: weights length (${rawWeights.length}) must equal pool size (${poolSize}).`, + ); + } + const ws = rawWeights.map((w) => { + const v = w ?? 
0; + if (typeof v !== "number" || Number.isNaN(v) || v < 0) { + return 0; + } + return v; + }); + const total = ws.reduce((s, v) => s + v, 0); + if (total === 0) { + throw new Error("Sample: all weights are zero."); + } + return ws.map((w) => w / total); +} + +/** + * Weighted random sample without replacement using exponential keys + * (Efraimidis–Spirakis): each item gets key = rand^(1/w); the `k` largest keys are kept. + */ +function weightedSampleWithoutReplacement( + poolSize: number, + k: number, + probs: number[], + rng: () => number, +): number[] { + // Use reservoir sampling with exponential keys: assign key = rand^(1/w), take top-k + const keys: Array<[number, number]> = probs.map((p, i) => { + const r = rng(); + const key = p > 0 ? r ** (1 / p) : 0; + return [key, i]; + }); + keys.sort((a, b) => b[0] - a[0]); + return keys.slice(0, k).map(([, i]) => i); +} + +/** + * Weighted sample WITH replacement: pick `k` indices based on cumulative probabilities. + */ +function weightedSampleWithReplacement(k: number, probs: number[], rng: () => number): number[] { + const cumulative: number[] = []; + let sum = 0; + for (const p of probs) { + sum += p; + cumulative.push(sum); + } + + const result: number[] = []; + for (let i = 0; i < k; i++) { + const r = rng(); + let idx = cumulative.findIndex((c) => c >= r); + if (idx < 0) { + idx = probs.length - 1; + } + result.push(idx); + } + return result; +} + +/** + * Fisher-Yates shuffle (unweighted, without replacement) — pick the first `k` elements. 
+ */ +function fisherYatesSample(poolSize: number, k: number, rng: () => number): number[] { + const indices = Array.from({ length: poolSize }, (_, i) => i); + for (let i = 0; i < k; i++) { + const j = i + Math.floor(rng() * (poolSize - i)); + const tmp = indices[i]; + const jVal = indices[j]; + if (tmp !== undefined && jVal !== undefined) { + indices[i] = jVal; + indices[j] = tmp; + } + } + return indices.slice(0, k); +} + +/** + * Sample with replacement (unweighted): draw `k` integers in [0, poolSize). + */ +function uniformSampleWithReplacement(poolSize: number, k: number, rng: () => number): number[] { + const result: number[] = []; + for (let i = 0; i < k; i++) { + result.push(Math.floor(rng() * poolSize)); + } + return result; +} + +/** Core sampling logic: return an array of selected positions. */ +function samplePositions( + poolSize: number, + k: number, + replace: boolean, + weights: readonly (number | null | undefined)[] | undefined, + rng: () => number, +): number[] { + if (poolSize === 0 || k === 0) { + return []; + } + if (!replace && k > poolSize) { + throw new RangeError( + `Sample: cannot sample ${k} items without replacement from a pool of ${poolSize}.`, + ); + } + + if (weights !== undefined) { + const probs = normalizeWeights(weights, poolSize); + if (replace) { + return weightedSampleWithReplacement(k, probs, rng); + } + return weightedSampleWithoutReplacement(poolSize, k, probs, rng); + } + + if (replace) { + return uniformSampleWithReplacement(poolSize, k, rng); + } + return fisherYatesSample(poolSize, k, rng); +} + +// ─── Series sample ──────────────────────────────────────────────────────────── + +/** + * Return a random sample of items from a Series. + * + * @example + * ```ts + * const s = new Series({ data: [10, 20, 30, 40, 50] }); + * sampleSeries(s, { n: 3, randomState: 42 }).values; // [20, 10, 40] (deterministic) + * ``` + */ +export function sampleSeries(series: Series, options?: SampleOptions): Series { + const opts = options ?? 
{}; + const k = resolveN(series.values.length, opts.n, opts.frac); + const replace = opts.replace ?? false; + const rng = makeRng(opts.randomState); + + const positions = samplePositions(series.values.length, k, replace, opts.weights, rng); + const newValues: Scalar[] = positions.map((i) => series.values[i] ?? null); + const newLabels: Label[] = positions.map((i) => series.index.at(i) ?? null); + + return new Series({ + data: newValues, + index: new Index