diff --git a/playground/apply.html b/playground/apply.html new file mode 100644 index 00000000..41fc7a65 --- /dev/null +++ b/playground/apply.html @@ -0,0 +1,186 @@ + + + + + + tsb — apply / map + + + +

tsb — apply / map

+

+ Apply functions element-wise or per-column/row. + applySeries maps a function over each element. + mapSeries supports function, Map, or plain-object lookup. + applyDataFrame reduces each column or row to a scalar. + applyExpandDataFrame transforms each column/row into a new Series. + mapDataFrame applies a function element-wise across the whole DataFrame. +

+ +

Core concept

+
// Element-wise apply on a Series
+applySeries(s, (v) => Math.sqrt(v as number))
+
+// Map via lookup table
+mapSeries(s, { a: 1, b: 2, c: 3 })
+
+// Reduce each column to a scalar
+applyDataFrame(df, (col) => col.values.reduce((a, b) => a + b, 0))
+
+// Transform each column, return a DataFrame
+applyExpandDataFrame(df, (col) => new Series({ data: col.values.map(v => v * 2), index: col.index }))
+
+// Element-wise map on a DataFrame
+mapDataFrame(df, (v) => (v as number) ** 2)
+ +
+ pandas equivalent:
+ s.apply(func) / s.map(func_or_dict)
+ df.apply(func, axis=0) / df.applymap(func) (now df.map(func)) +
+ + +

Demo 1 — applySeries element-wise

+
+
Code
+
const s = new Series({ data: [1, 4, 9, 16] });
+applySeries(s, (v) => Math.sqrt(v as number)).values;
+// → [1, 2, 3, 4]
+ + +
+ + +

Demo 2 — mapSeries with object lookup

+
+
Code
+
const s = new Series({ data: ["a", "b", "c", "d"] });
+mapSeries(s, { a: 1, b: 2, c: 3 }).values;
+// → [1, 2, 3, null]  ("d" not in lookup → null)
+ + +
+ + +

Demo 3 — applyDataFrame: sum of each column (axis=0)

+
+
Code
+
const df = DataFrame.fromColumns({ a: [1, 2, 3], b: [10, 20, 30] });
+applyDataFrame(df, (col) =>
+  (col.values as number[]).reduce((acc, v) => acc + v, 0)
+).values;
+// → [6, 60]  (indexed by column names)
+ + +
+ + +

Demo 4 — applyDataFrame: sum of each row (axis=1)

+
+
Code
+
const df = DataFrame.fromColumns({ a: [1, 2, 3], b: [4, 5, 6] });
+applyDataFrame(df, (row) =>
+  (row.values as number[]).reduce((acc, v) => acc + v, 0),
+  { axis: 1 }
+).values;
+// → [5, 7, 9]
+ + +
+ + +

Demo 5 — applyExpandDataFrame: double each column

+
+
Code
+
const df = DataFrame.fromColumns({ a: [1, 2, 3], b: [4, 5, 6] });
+applyExpandDataFrame(df, (col) =>
+  new Series({ data: (col.values as number[]).map(v => v * 2), index: col.index })
+);
+// a: [2, 4, 6]  b: [8, 10, 12]
+ + +
+ + +

Demo 6 — mapDataFrame: element-wise square

+
+
Code
+
const df = DataFrame.fromColumns({ a: [1, 2, 3], b: [4, 5, 6] });
+mapDataFrame(df, (v) => (v as number) ** 2);
+// a: [1, 4, 9]  b: [16, 25, 36]
+ + +
+ + + + diff --git a/playground/astype.html b/playground/astype.html new file mode 100644 index 00000000..efd9e5ed --- /dev/null +++ b/playground/astype.html @@ -0,0 +1,438 @@ + + + + + + tsb — astype + + + +
+
+

Loading tsb runtime…

+
+ + ← tsb playground +

astype — dtype coercion

+

+ Cast Series and DataFrame values to a different dtype. + Mirrors pandas.Series.astype and pandas.DataFrame.astype. +

+ + +
+

1 · Series — float to int64

+

+ Cast floating-point values to integers via truncation (same as + pandas.Series.astype("int64")). +

+
+
+ TypeScript +
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

2 · Series — numbers to string

+

Convert every value to its string representation. Null/undefined values + become null (not the string "null").

+
+
+ TypeScript +
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

3 · Overflow clamping for bounded integer dtypes

+

+ Values that overflow the target integer dtype's range are clamped to + [min, max] — e.g. uint8 is clamped to + [0, 255]. +

+
+
+ TypeScript +
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

4 · DataFrame — cast all columns

+

Pass a single dtype name to cast every column to the same type.

+
+
+ TypeScript +
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

5 · DataFrame — per-column dtype mapping

+

Pass a Record<string, DtypeName> to cast individual + columns. Columns not listed are carried over unchanged.

+
+
+ TypeScript +
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

6 · Casting to bool

+

Zero, empty string, and NaN become false; + everything else (including non-zero numbers and non-empty strings) + becomes true.

+
+
+ TypeScript +
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

API Reference

+
// Series cast
+astypeSeries(
+  series: Series,
+  dtype: DtypeName | Dtype,
+  options?: AstypeOptions,
+): Series
+
+// DataFrame cast (all columns or per-column mapping)
+astype(
+  df: DataFrame,
+  dtype: DtypeName | Dtype | Record<string, DtypeName | Dtype>,
+  options?: DataFrameAstypeOptions,
+): DataFrame
+
+// Low-level scalar cast
+castScalar(value: Scalar, dtype: Dtype): Scalar
+
+// Options
+interface AstypeOptions {
+  errors?: "raise" | "ignore";  // default "raise"
+}
+
+// Supported dtype names
+type DtypeName =
+  | "int8" | "int16" | "int32" | "int64"
+  | "uint8" | "uint16" | "uint32" | "uint64"
+  | "float32" | "float64"
+  | "bool" | "string" | "object"
+  | "datetime" | "timedelta" | "category"
+
+ + + + + diff --git a/playground/clip_advanced.html b/playground/clip_advanced.html new file mode 100644 index 00000000..eb200294 --- /dev/null +++ b/playground/clip_advanced.html @@ -0,0 +1,163 @@ + + + + + + tsb — clip_advanced (per-element clipping) + + + +

tsb — clip_advanced (per-element clipping)

+

+ Clip Series and DataFrame values to per-element bounds. + Unlike the simple scalar clip, clipAdvancedSeries and + clipAdvancedDataFrame support array, Series, and DataFrame bounds — + enabling per-position or element-wise bound specification. +

+ +

Core concept

+
// Scalar bounds (like pandas s.clip(lower=0, upper=5))
+clipAdvancedSeries(s, { lower: 0, upper: 5 })
+
+// Per-element array bounds
+clipAdvancedSeries(s, { lower: [1, 2, 3], upper: [4, 5, 6] })
+
+// Series bounds (positional alignment)
+clipAdvancedSeries(s, { lower: loSeries, upper: hiSeries })
+
+// DataFrame element-wise bounds
+clipAdvancedDataFrame(df, { lower: loDf, upper: hiDf })
+
+// Series broadcast on DataFrame (axis=0: one bound per column; axis=1: one per row)
+clipAdvancedDataFrame(df, { lower: loSeries, axis: 1 })
+ +
+ pandas equivalent:
+ s.clip(lower=lo_array, upper=hi_array)
+ df.clip(lower=lo_df, upper=hi_df) +
+ + +

Demo 1 — clipAdvancedSeries with scalar bounds

+
+
Code
+
const s = new Series({ data: [-3, 1, 5, 10] });
+clipAdvancedSeries(s, { lower: 0, upper: 6 }).values;
+// → [0, 1, 5, 6]
+ + +
+ + +

Demo 2 — clipAdvancedSeries with per-element array bounds

+
+
Code
+
const s = new Series({ data: [-1, 0, 5, 12] });
+const lo = [2, -1, 4, 10];
+const hi = [5,  3, 8, 11];
+clipAdvancedSeries(s, { lower: lo, upper: hi }).values;
+// → [2, 0, 5, 11]
+ + +
+ + +

Demo 3 — clipAdvancedSeries with Series bounds

+
+
Code
+
const s = new Series({ data: [0, 5, 10, 15] });
+const loBound = new Series({ data: [1, 3, 8, 12] });
+const hiBound = new Series({ data: [2, 7, 9, 20] });
+clipAdvancedSeries(s, { lower: loBound, upper: hiBound }).values;
+// → [1, 5, 9, 15]
+ + +
+ + +

Demo 4 — clipAdvancedDataFrame with DataFrame bounds

+
+
Code
+
const df = DataFrame.fromColumns({ a: [1, 5, 9], b: [2, 6, 10] });
+const lo = DataFrame.fromColumns({ a: [2, 3, 4], b: [1, 4, 8] });
+const hi = DataFrame.fromColumns({ a: [3, 7, 8], b: [5, 9, 12] });
+const result = clipAdvancedDataFrame(df, { lower: lo, upper: hi });
+result.col("a").values; // → [2, 5, 8]
+result.col("b").values; // → [2, 6, 10]
+ + +
+ + +

Demo 5 — clipAdvancedDataFrame with Series broadcast (axis=1)

+
+
Code
+
// axis=1: one lower bound per row
+const df = DataFrame.fromColumns({ a: [1, 5, 9], b: [2, 6, 10] });
+const loPerRow = new Series({ data: [0, 4, 10] });
+const result = clipAdvancedDataFrame(df, { lower: loPerRow, axis: 1 });
+result.col("a").values; // → [1, 5, 10]
+result.col("b").values; // → [2, 6, 10]
+ + +
+ + + + diff --git a/playground/cut.html b/playground/cut.html new file mode 100644 index 00000000..24ee65a1 --- /dev/null +++ b/playground/cut.html @@ -0,0 +1,343 @@ + + + + + + tsb — cut / qcut + + + +

tsb — cut / qcut

+

+ Bin continuous numeric data into discrete intervals. + cut uses equal-width (or user-defined) bins; + qcut uses equal-frequency (quantile-based) bins. + Both return a Series<string | null> of bin labels. +

+ +

Core concept

+
// Equal-width bins
+cut(s, 4)                      // 4 bins of equal width
+cut(s, [0, 10, 50, 100])       // explicit edges
+
+// Equal-frequency bins (quartiles)
+qcut(s, 4)                     // 4 bins, each with ~25% of data
+qcut(s, [0, 0.25, 0.5, 0.75, 1])  // explicit quantile levels
+
+// Custom labels
+cut(s, 3, { labels: ["low", "mid", "high"] })
+
+// Return bin edges too
+const [binned, edges] = cut(s, 3, { retbins: true })
+
+// Integer bin codes
+cutCodes(s, 4)  // → Series of 0, 1, 2, 3 integers
+ +
+ pandas equivalent:
+ pd.cut(x, bins, right=True, labels=None, retbins=False, precision=3, include_lowest=False)
+ pd.qcut(x, q, labels=None, retbins=False, precision=3, duplicates='raise') +
+ + +

Demo 1 — cut: equal-width bins

+
+
Code
+
const s = new Series({ data: [1, 7, 5, 4, 2, 3, 8, 6], name: "score" });
+cut(s, 4).values;
+// Each value assigned to one of 4 equal-width bins
+ + +
+ + +

Demo 2 — cut: explicit bin edges

+
+
Code
+
const s = new Series({ data: [15, 35, 55, 75, 95] });
+cut(s, [0, 25, 50, 75, 100]).values;
+// → ["(0, 25]", "(25, 50]", "(50, 75]", "(50, 75]", "(75, 100]"]
+// (75 falls in "(50, 75]" because bins are right-closed by default)
+ + +
+ + +

Demo 3 — cut: custom labels

+
+
Code
+
const grades = new Series({ data: [45, 62, 78, 91, 55] });
+cut(grades, [0, 60, 70, 80, 100], { labels: ["F", "D", "C", "B/A"] }).values;
+// → grade letter for each score
+ + +
+ + +

Demo 4 — qcut: quartile bins

+
+
Code
+
const s = new Series({ data: [3, 1, 7, 2, 9, 4, 6, 8, 5, 10] });
+qcut(s, 4).values;
+// Equal-frequency quartile bins — each bin contains ~25% of values
+ + +
+ + +

Demo 5 — retbins: get bin edges back

+
+
Code
+
const s = new Series({ data: [10, 30, 50, 70, 90] });
+const [binned, edges] = cut(s, 3, { retbins: true });
+// edges: the computed bin boundaries
+ + +
+ + +

Demo 6 — cutCodes: integer bin codes

+
+
Code
+
const s = new Series({ data: [5, 15, 25, 35, 45] });
+cutCodes(s, [0, 10, 20, 30, 40, 50]).values;
+// → [0, 1, 2, 3, 4]  (integer bin indices)
+ + +
+ + +

Demo 7 — null / NaN handling

+
+
Code
+
const s = new Series({ data: [1, null, NaN, 5, 10] });
+cut(s, 3).values;
+// null and NaN stay as null in the output
+ + +
+ + +

Demo 8 — qcut: handling duplicate edges with ties

+
+
Code
+
const s = new Series({ data: [1, 1, 1, 2, 3, 4, 5, 5, 5] });
+qcut(s, 4, { duplicates: "drop" }).values;
+// Ties cause duplicate quantile boundaries — "drop" removes them
+ + +
+ + + + diff --git a/playground/diff_shift.html b/playground/diff_shift.html new file mode 100644 index 00000000..3a300fbf --- /dev/null +++ b/playground/diff_shift.html @@ -0,0 +1,443 @@ + + + + + + tsb — diff & shift (discrete difference and value shifting) + + + +
+
+
Loading tsb runtime…
+
+ + ← Back to playground index + +

diff & shift — discrete difference and value shifting

+

+ diffSeries / diffDataFrame compute the element-wise discrete + difference (value[i] - value[i-periods]).
+ shiftSeries / shiftDataFrame shift values forward or backward + by a given number of periods, filling with a configurable value.
+ Mirrors Series.diff(), Series.shift(), + DataFrame.diff(), and DataFrame.shift() from pandas. +

+ + +
+

1 · Series diff — first discrete difference

+

+ Compute s[i] - s[i - periods] for each position. + The first periods entries are null. + Non-numeric values produce null. +

+
+
+
+ + +
+
+ + +
+
+ + +
Press ▶ Run to execute
+
+

💡 Tip: diffSeries is commonly used to compute returns, velocity, or changes over time.

+
+ + +
+

2 · Series shift — lag and lead values

+

+ Shift values forward (positive periods) or backward (negative periods). + Vacated positions are filled with fillValue (default null). +

+
+
+
+ + +
+
+ + +
+
+ + +
Press ▶ Run to execute
+
+

💡 Tip: combine shiftSeries with arithmetic to compute returns, lags, or leads.

+
+ + +
+

3 · DataFrame diff — column-wise and row-wise

+

+ axis=0 (default): diff each column independently (rows over time).
+ axis=1: diff across columns within each row. +

+
+
+
+ + +
+
+ + +
+
+ + +
Press ▶ Run to execute
+
+
+ + +
+

4 · DataFrame shift — lagging a DataFrame

+

+ Shift all columns by the same number of periods. + Useful for creating lagged features in machine learning. +

+
+
+
+ + +
+
+ + +
+
+ + +
Press ▶ Run to execute
+
+

💡 Tip: creating multiple lagged columns is a common feature-engineering technique for time series forecasting.

+
+ + +
+

API Reference

+
// Discrete difference
+diffSeries(series: Series<Scalar>, options?: DiffOptions): Series<Scalar>
+diffDataFrame(df: DataFrame, options?: DataFrameDiffOptions): DataFrame
+
+interface DiffOptions {
+  periods?: number;  // default 1; negative = look forward
+}
+interface DataFrameDiffOptions extends DiffOptions {
+  axis?: 0 | 1 | "index" | "columns";  // default 0
+}
+
+// Value shifting
+shiftSeries(series: Series<Scalar>, options?: ShiftOptions): Series<Scalar>
+shiftDataFrame(df: DataFrame, options?: DataFrameShiftOptions): DataFrame
+
+interface ShiftOptions {
+  periods?:   number;  // default 1; negative = shift backward
+  fillValue?: Scalar;  // default null
+}
+interface DataFrameShiftOptions extends ShiftOptions {
+  axis?: 0 | 1 | "index" | "columns";  // default 0
+}
+
+ + + + + + diff --git a/playground/duplicated.html b/playground/duplicated.html new file mode 100644 index 00000000..d562c003 --- /dev/null +++ b/playground/duplicated.html @@ -0,0 +1,198 @@ + + + + + + tsb — duplicated / drop_duplicates + + + +

tsb — duplicated / drop_duplicates

+

+ Detect and remove duplicate values or rows. + duplicatedSeries / duplicatedDataFrame return a boolean + Series marking which items are duplicates. + dropDuplicatesSeries / dropDuplicatesDataFrame return + a new object with duplicates removed. +

+ +

Core concept

+
// keep="first" (default): mark later duplicates as true
+duplicatedSeries(s)
+
+// keep="last": mark earlier duplicates as true
+duplicatedSeries(s, { keep: "last" })
+
+// keep=false: mark ALL occurrences of any duplicate
+duplicatedSeries(s, { keep: false })
+ +
+ pandas equivalent:
+ s.duplicated(keep='first')
+ df.duplicated(subset=['a', 'b'], keep='first')
+ s.drop_duplicates() / df.drop_duplicates() +
+ + +

Demo 1 — duplicatedSeries with keep="first"

+
+
Code
+
const s = new Series({ data: [1, 2, 1, 3, 2] });
+duplicatedSeries(s).values;
+// → [false, false, true, false, true]
+ + +
+ + +

Demo 2 — duplicatedSeries with keep=false (mark all)

+
+
Code
+
const s = new Series({ data: ["a", "b", "a", "c", "b"] });
+duplicatedSeries(s, { keep: false }).values;
+// → [true, true, true, false, true]
+ + +
+ + +

Demo 3 — dropDuplicatesSeries

+
+
Code
+
const s = new Series({ data: [10, 20, 10, 30, 20], name: "prices" });
+dropDuplicatesSeries(s).values;
+// → [10, 20, 30]
+ + +
+ + +

Demo 4 — duplicatedDataFrame with subset

+
+
Code
+
const df = DataFrame.fromRecords([
+  { name: "Alice", dept: "Eng" },
+  { name: "Bob",   dept: "Eng" },
+  { name: "Alice", dept: "HR" },
+  { name: "Bob",   dept: "Eng" }, // ← duplicate of row 1 on "name"+"dept"
+]);
+// Only consider "name" column for duplicates:
+duplicatedDataFrame(df, { subset: ["name"] }).values;
+// → [false, false, true, true]  (Alice and Bob each appear twice)
+ + +
+ + +

Demo 5 — dropDuplicatesDataFrame

+
+
Code
+
const df = DataFrame.fromRecords([
+  { a: 1, b: 2 },
+  { a: 1, b: 2 },
+  { a: 3, b: 4 },
+  { a: 3, b: 4 },
+]);
+const deduped = dropDuplicatesDataFrame(df);
+// shape: [2, 2]
+// a: [1, 3]  b: [2, 4]
+ + +
+ + +

Interactive editor

+
+
Edit and run:
+ + + +
+ + + + diff --git a/playground/get_dummies.html b/playground/get_dummies.html new file mode 100644 index 00000000..efa302e9 --- /dev/null +++ b/playground/get_dummies.html @@ -0,0 +1,259 @@ + + + + + + tsb — getDummies / fromDummies + + + +

getDummies one-hot encoding

+

Convert categorical variables into binary indicator columns — mirrors pandas.get_dummies and pandas.from_dummies.

+ + +
+

1. Basic Series → dummy DataFrame

+
+
+
Input Series
+
const s = new Series({
+  data: ["cat", "dog", "cat", "fish"],
+  name: "animal"
+});
+getDummies(s);
+
+
+
Output
+
+
+
+
+ + +
+

2. Custom prefix and separator

+
+
+
Code
+
getDummies(s, {
+  prefix: "pet",
+  prefixSep: "__"
+});
+
+
+
Columns
+
+
+
+
+ + +
+

3. Drop first level (avoid multicollinearity)

+
+
+
Code
+
const s2 = new Series({
+  data: ["a","b","c","a"],
+  name: "x"
+});
+getDummies(s2, { dropFirst: true });
+
+
+
Columns (a dropped)
+
+
+
+
+ + +
+

4. Include NaN indicator column

+
+
+
Code
+
const s3 = new Series({
+  data: ["a", null, "b", null],
+  name: "x"
+});
+getDummies(s3, { dummyNa: true });
+
+
+
Result (with x_nan column)
+
+
+
+
+ + +
+

5. DataFrame — encode categorical columns automatically

+
+
+
Code
+
const df = DataFrame.fromColumns({
+  score: [90, 85, 72],
+  grade: ["A", "B", "C"],
+  pass: [true, true, false]
+});
+getDummies(df);
+
+
+
Columns
+
+
+
+
+ + +
+

6. Encode only specified columns

+
+
+
Code
+
const df2 = DataFrame.fromColumns({
+  color: ["r","g","b"],
+  shape: ["sq","ci","sq"],
+  n: [1,2,3]
+});
+getDummies(df2, { columns: ["color"] });
+
+
+
Result
+
+
+
+
+ + +
+

7. fromDummies — reverse one-hot encoding

+
+
+
Code
+
const original = new Series({
+  data: ["cat","dog","cat","fish"],
+  name: "pet"
+});
+const dummies = getDummies(original);
+const recovered = fromDummies(dummies, { sep: "_" });
+
+
+
Recovered values
+
+
+
+
+ + +
+

8. Interactive encoder

+
Enter comma-separated values:
+
+ + + + +
+
+
+ + + + diff --git a/playground/idxmin_idxmax.html b/playground/idxmin_idxmax.html new file mode 100644 index 00000000..b771dd36 --- /dev/null +++ b/playground/idxmin_idxmax.html @@ -0,0 +1,439 @@ + + + + + + tsb — idxmin / idxmax + + + +
+
+
Loading TypeScript compiler…
+
+ + ← tsb playground +

idxmin / idxmax

+

+ Return the index label of the minimum or maximum value in a + Series or each column of a DataFrame. + Mirrors pandas.Series.idxmin(), idxmax(), + pandas.DataFrame.idxmin(), and DataFrame.idxmax(). +

+ + +
+

1 · Series.idxmin — label of the minimum value

+

Returns the index label at the position of the minimum value. + NaN / null values are skipped by default.

+
+
+ TypeScript +
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

2 · Series.idxmax — label of the maximum value

+

Returns the index label at the position of the maximum value.

+
+
+ TypeScript +
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

3 · NaN handling — skipna option

+

By default NaN / null values are skipped. Set skipna: false + to propagate NaN (returns null if any value is NaN).

+
+
+ TypeScript +
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

4 · DataFrame.idxmin — row label of column minima

+

Returns a Series indexed by column names. Each value is the row label + where that column achieves its minimum.

+
+
+ TypeScript +
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

5 · DataFrame.idxmax — row label of column maxima

+

Returns a Series indexed by column names, where each entry is the row + label of that column's maximum value.

+
+
+ TypeScript +
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

6 · Edge cases — empty, all-NaN, all-equal

+

Behavior for empty series, series where every value is NaN, and series + where all values are equal.

+
+
+ TypeScript +
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

API Reference

+
// Series
+idxminSeries(series, { skipna?: boolean }): Label   // default skipna=true
+idxmaxSeries(series, { skipna?: boolean }): Label
+
+// DataFrame (axis=0 — min/max per column)
+idxminDataFrame(df, { skipna?: boolean }): Series   // indexed by column names
+idxmaxDataFrame(df, { skipna?: boolean }): Series
+
+ + + + + diff --git a/playground/index.html b/playground/index.html index bf6823ba..1222d8ed 100644 --- a/playground/index.html +++ b/playground/index.html @@ -254,6 +254,11 @@

Element-wise transformations. clip(), seriesAbs(), seriesRound() for Series and DataFrame with min/max bounds, decimal precision, and axis support.

✅ Complete
+
+

🔍 missing-value ops

+

Detect and fill missing values. isna(), notna(), isnull(), notnull() for scalars/Series/DataFrame. ffillSeries(), bfillSeries(), dataFrameFfill(), dataFrameBfill() with optional limit and axis support.

+
✅ Complete
+

🔢 value_counts

Count unique values. valueCounts() for Series and dataFrameValueCounts() for DataFrame with normalize, sort, ascending, and dropna options.

@@ -265,6 +270,70 @@

✅ Complete

+

📊 pct_change

+

Fractional change between elements. pctChangeSeries() and pctChangeDataFrame() with periods, fillMethod (pad/bfill), limit, and axis options.

+
✅ Complete
+
+
+

🔎 idxmin / idxmax

+

Return the index label of the minimum or maximum value. idxminSeries(), idxmaxSeries(), idxminDataFrame(), idxmaxDataFrame() with skipna support.

+
✅ Complete
+
+
+

🔄 astype

+

Cast Series and DataFrame values to a different dtype. astypeSeries(), astype() with per-column mapping support and integer clamping.

+
✅ Complete
+
+
+

🔁 replace

+

Substitute values in Series and DataFrame. Supports scalar, array (many→one, pair-wise), Record, and Map replacement specs.

+
✅ Complete
+
+
+

🔀 where / mask

+

Conditional value selection. where keeps values where the condition is true; mask replaces them. Supports boolean arrays, Series, DataFrame, and callable conditions.

+
✅ Complete
+
+
+

📈 diff / shift

+

Discrete difference and value shifting for Series and DataFrame. diff computes element-wise differences; shift lags or leads values by a number of periods. Essential for time-series analysis.

+
✅ Complete
+
+
+

🔍 duplicated / drop_duplicates

+

Detect and remove duplicate values or rows. Supports keep="first", keep="last", and keep=false (mark all occurrences). DataFrame supports a subset of columns.

+
✅ Complete
+
+
+

🎲 sample

+

Random sampling from Series and DataFrame. Supports fixed count, fractional sampling, with/without replacement, weighted sampling, and seeded deterministic results via randomState.

+
✅ Complete
+
+
+

✂️ clip_advanced

+

Per-element clipping with scalar, array, Series, or DataFrame bounds. Supports axis-based Series broadcasting for DataFrames — mirrors pandas.Series.clip(lower, upper) with array bounds.

+
✅ Complete
+
+
+

🔧 apply / map

+

Function application and value mapping. applySeries, mapSeries (function/dict lookup), applyDataFrame (reduce per col/row), applyExpandDataFrame (transform per col/row), mapDataFrame (element-wise).

+
✅ Complete
+
+
+

🪣 cut / qcut

+

Bin continuous data into discrete intervals. cut for equal-width or user-defined bins; qcut for equal-frequency quantile bins. Custom labels, retbins, cutCodes, and cutCategories.

+
✅ Complete
+
+
+

📐 Interval / IntervalIndex

+

Bounded interval objects and an ordered index of intervals. Interval supports all four closed types; IntervalIndex supports lookup, overlap queries, and intervalRange for equal-length ranges.

+
✅ Complete
+
+
+

🎲 getDummies / fromDummies

+

One-hot encode categorical Series or DataFrame columns into binary indicator columns. getDummies supports custom prefix, separator, dropFirst, and dummyNa. fromDummies reverses the encoding.

+
✅ Complete
+

📥 insertColumn / popColumn

Insert and remove DataFrame columns at precise positions. insertColumn(df, loc, col, values) inserts at integer position, popColumn(df, col) returns { series, df }. Also includes reorderColumns and moveColumn. Mirrors pandas.DataFrame.insert() and .pop().

✅ Complete
diff --git a/playground/interval.html b/playground/interval.html new file mode 100644 index 00000000..412965fb --- /dev/null +++ b/playground/interval.html @@ -0,0 +1,267 @@ + + + + + + tsb — Interval / IntervalIndex + + + +

tsb — Interval / IntervalIndex

+

+ A single bounded interval and an ordered array of intervals as an axis label. + Mirrors pandas.Interval and pandas.IntervalIndex. + Works seamlessly with cut() and qcut(). +

+ +

Core concept

+
// Single interval — closed on right by default: (left, right]
+const iv = new Interval(0, 5);
+iv.contains(3);           // true
+iv.contains(0);           // false  (open on left)
+iv.contains(5);           // true   (closed on right)
+iv.overlaps(new Interval(4, 10));  // true
+
+// IntervalIndex from break points
+const idx = IntervalIndex.fromBreaks([0, 25, 50, 75, 100]);
+idx.indexOf(60);          // 2  → (50, 75]
+
+// Equal-length intervals via intervalRange
+const rng = intervalRange(0, 1, { periods: 4 });
+// [(0.0, 0.25], (0.25, 0.5], (0.5, 0.75], (0.75, 1.0]]
+ +
+ pandas equivalent:
+ pd.Interval(left, right, closed='right')
+ pd.IntervalIndex.from_breaks(breaks, closed='right')
+ pd.interval_range(start, end, periods=N, freq=step) +
+ + +

Demo 1 — Interval: construction and membership

+
+
Code
+
const iv = new Interval(0, 5);            // (0, 5]
+[iv.left, iv.right, iv.closed, iv.length, iv.mid]
+// → [0, 5, "right", 5, 2.5]
+
+iv.contains(0)   // false — left is open
+iv.contains(5)   // true  — right is closed
+iv.contains(2.5) // true
+ + +
+ + +

Demo 2 — Interval: closed="left" | "both" | "neither"

+
+
Code
+
const left    = new Interval(0, 5, "left");    // [0, 5)
+const both    = new Interval(0, 5, "both");    // [0, 5]
+const neither = new Interval(0, 5, "neither"); // (0, 5)
+
+// endpoint membership
+[left.contains(0), left.contains(5)]       // true, false
+[both.contains(0), both.contains(5)]       // true, true
+[neither.contains(0), neither.contains(5)] // false, false
+ + +
+ + +

Demo 3 — Interval.overlaps

+
+
Code
+
const a = new Interval(0, 3);
+const b = new Interval(2, 5);
+const c = new Interval(4, 8);
+
+a.overlaps(b)  // true  — overlap region is (2, 3]
+b.overlaps(c)  // true  — overlap region is (4, 5]
+a.overlaps(c)  // false — disjoint
+ + +
+ + +

Demo 4 — IntervalIndex.fromBreaks

+
+
Code
+
const idx = IntervalIndex.fromBreaks([0, 25, 50, 75, 100]);
+idx.size           // 4
+idx.get(1)         // (25, 50]
+idx.indexOf(60)    // 2  — 60 falls in (50, 75]
+idx.indexOf(-5)    // -1 — out of range
+idx.left           // [0, 25, 50, 75]
+idx.mid            // [12.5, 37.5, 62.5, 87.5]
+ + +
+ + +

Demo 5 — IntervalIndex.fromArrays

+
+
Code
+
// Build from separate left and right arrays (non-uniform intervals)
+const idx = IntervalIndex.fromArrays(
+  [0, 1, 3, 6],   // left endpoints
+  [1, 3, 6, 10],  // right endpoints
+  { closed: "left" },
+);
+idx.values.map(iv => iv.toString())
+// ["[0, 1)", "[1, 3)", "[3, 6)", "[6, 10)"]
+ + +
+ + +

Demo 6 — intervalRange: equal-width by period count

+
+
Code
+
const rng = intervalRange(0, 1, { periods: 4 });
+rng.values.map(iv => iv.toString())
+// ["(0, 0.25]", "(0.25, 0.5]", "(0.5, 0.75]", "(0.75, 1]"]
+
+rng.length   // [0.25, 0.25, 0.25, 0.25]
+ + +
+ + +

Demo 7 — intervalRange: equal-width by step size (freq)

+
+
Code
+
const rng = intervalRange(0, 10, { freq: 2.5 });
+rng.values.map(iv => iv.toString())
+// ["(0, 2.5]", "(2.5, 5]", "(5, 7.5]", "(7.5, 10]"]
+ + +
+ + +

Demo 8 — IntervalIndex.overlapping

+
+
Code
+
const idx = IntervalIndex.fromBreaks([0, 2, 4, 6, 8, 10]);
+const query = new Interval(3, 7);
+const hits = idx.overlapping(query);
+hits.values.map(iv => iv.toString())
+// All intervals that share any interior point with (3, 7]
+ + +
+ + + + diff --git a/playground/na_ops.html b/playground/na_ops.html new file mode 100644 index 00000000..c321438f --- /dev/null +++ b/playground/na_ops.html @@ -0,0 +1,480 @@ + + + + + + tsb — missing-value operations (isna, ffill, bfill) + + + +
+
+
Loading tsb runtime…
+
+ + ← Back to playground index + +

Missing-value operations

+

+ isna / notna — detect missing values in scalars, + Series, and DataFrames.
+ ffill / bfill — propagate the last (or next) valid + value to fill gaps.
+ Mirrors pd.isna(), Series.ffill(), and + DataFrame.bfill() from pandas. +

+ + +
+

1 · isna / notna on scalars

+

+ Returns true / false for individual values. + null, undefined, and NaN are all + considered "missing". +

+
+
+
+ + +
+
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

2 · isna on a Series

+

+ When passed a Series, isna returns a boolean Series of the + same length — true where values are missing. +

+
+
+
+ + +
+
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

3 · isna on a DataFrame

+

+ Returns a DataFrame of booleans with the same shape — one column per + original column, true where missing. +

+
+
+
+ + +
+
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

4 · Forward-fill (ffillSeries)

+

+ Propagates the last valid value forward to fill gaps. Leading + nulls that have no preceding value remain null. + Use the optional limit to cap consecutive fills. +

+
+
+
+ + +
+
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

5 · Backward-fill (bfillSeries)

+

+ Propagates the next valid value backward to fill gaps. Trailing + nulls that have no following value remain null. +

+
+
+
+ + +
+
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

6 · DataFrame forward-fill & backward-fill

+

+ dataFrameFfill and dataFrameBfill apply fill + column-wise by default (axis=0). Pass axis: 1 to fill + row-wise across columns. +

+
+
+
+ + +
+
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

API Reference

+
// Module-level missing-value detection
+isna(value: Scalar): boolean
+isna(value: Series): Series<boolean>
+isna(value: DataFrame): DataFrame
+
+notna(value: Scalar): boolean
+notna(value: Series): Series<boolean>
+notna(value: DataFrame): DataFrame
+
+// Aliases
+isnull(...)  // same as isna
+notnull(...) // same as notna
+
+// Series forward / backward fill
+ffillSeries(series, options?: { limit?: number | null }): Series
+bfillSeries(series, options?: { limit?: number | null }): Series
+
+// DataFrame forward / backward fill
+dataFrameFfill(df, options?: {
+  limit?: number | null,   // max consecutive fills (default: no limit)
+  axis?: 0 | 1 | "index" | "columns",  // default 0 (column-wise)
+}): DataFrame
+
+dataFrameBfill(df, options?: {
+  limit?: number | null,
+  axis?: 0 | 1 | "index" | "columns",
+}): DataFrame
+
+ + + + + diff --git a/playground/pct_change.html b/playground/pct_change.html new file mode 100644 index 00000000..3576797a --- /dev/null +++ b/playground/pct_change.html @@ -0,0 +1,448 @@ + + + + + + tsb — pct_change + + + +
+
+
Initializing playground…
+
+ ← Back to roadmap +

📊 pct_change — Interactive Playground

+

Compute the fractional change between each element and a prior element. + Mirrors pandas.Series.pct_change() / + pandas.DataFrame.pct_change().
+ Edit any code block below and press ▶ Run + (or Ctrl+Enter) to execute it live in your browser. +

+ + +
+

1 · Basic pct_change on a Series

+

pctChangeSeries(series) returns the fractional (not percentage) change + from each previous element. The first element is always null.

+
+
+ TypeScript +
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

2 · Multi-period change

+

The periods option controls the lag. Use periods: 2 to + compare each value to the one two steps earlier — useful for month-over-month + comparisons in quarterly data.

+
+
+ TypeScript +
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

3 · Handling missing values

+

By default, pctChangeSeries forward-fills (fillMethod: "pad") + NaN/null values before computing the ratio — so gaps don't break the chain. + Set fillMethod: null to propagate NaN instead.

+
+
+ TypeScript +
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

4 · Limit consecutive fills

+

The limit option caps how many consecutive NaN values get forward-filled. + Useful when you want to tolerate short gaps but not bridge large ones.

+
+
+ TypeScript +
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

5 · DataFrame column-wise pct_change

+

pctChangeDataFrame(df) applies pctChangeSeries to every + column independently. Ideal for comparing multiple assets or metrics simultaneously.

+
+
+ TypeScript +
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

6 · Negative periods (look-forward change)

+

A negative periods value computes the forward change: how much will + this element change by the time we reach |periods| steps ahead. + Useful for computing returns on a "hold for N periods" strategy.

+
+
+ TypeScript +
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

API Reference

+

All functions return a new Series/DataFrame of the same shape — inputs are never mutated.

+
// Series
+pctChangeSeries(series, {
+  periods?: number,           // default 1 (positive = look back, negative = look forward)
+  fillMethod?: "pad" | "bfill" | null,  // default "pad"
+  limit?: number | null,      // max consecutive fills; default unlimited
+}): Series
+
+// DataFrame
+pctChangeDataFrame(df, {
+  periods?: number,
+  fillMethod?: "pad" | "bfill" | null,
+  limit?: number | null,
+  axis?: 0 | 1 | "index" | "columns",  // default 0 (column-wise)
+}): DataFrame
+
+ + + + + diff --git a/playground/replace.html b/playground/replace.html new file mode 100644 index 00000000..19da518a --- /dev/null +++ b/playground/replace.html @@ -0,0 +1,408 @@ + + + + + + tsb — replace (value substitution) + + + +
+
+
Loading tsb runtime…
+
+ + ← Back to playground index + +

replace — value substitution

+

+ replaceSeries / replaceDataFrame substitute values + matching a pattern with a new value.
+ Supports scalar, array, and mapping (Record / Map) replacement specs.
+ Mirrors Series.replace() and DataFrame.replace() from pandas. +

+ + +
+

1 · Scalar → scalar replacement

+

+ Replace every occurrence of a single value with another value. + Works on numbers, strings, booleans, and null. +

+
+
+
+ + +
+
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

2 · Array replacement

+

+ Replace a list of values with a single target, or perform pair-wise + replacement using two equal-length arrays. +

+
+
+
+ + +
+
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

3 · Mapping (Record / Map) replacement

+

+ Pass a lookup table as either a plain object (Record<string, Scalar>) + or a JavaScript Map for full type flexibility. +

+
+
+
+ + +
+
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

4 · DataFrame replacement

+

+ replaceDataFrame applies the same spec to all columns by + default. Use the columns option to restrict which columns + are affected. +

+
+
+
+ + +
+
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

API Reference

+
// Replace values in a Series
+replaceSeries(
+  series: Series,
+  spec: ReplaceSpec,
+  options?: ReplaceOptions,
+): Series
+
+// Replace values in a DataFrame
+replaceDataFrame(
+  df: DataFrame,
+  spec: ReplaceSpec,
+  options?: DataFrameReplaceOptions,
+): DataFrame
+
+// Replacement spec variants
+type ReplaceSpec =
+  | { toReplace: Scalar;              value: Scalar }               // scalar → scalar
+  | { toReplace: Scalar[];            value: Scalar }               // array  → scalar
+  | { toReplace: Scalar[];            value: Scalar[] }             // array  → array (pair-wise)
+  | { toReplace: Record<string, Scalar> }                          // Record mapping
+  | { toReplace: Map<Scalar, Scalar> }                             // Map mapping
+
+// Options
+interface ReplaceOptions {
+  matchNaN?: boolean;  // treat NaN===NaN for matching (default: true)
+}
+
+interface DataFrameReplaceOptions extends ReplaceOptions {
+  columns?: string[];  // only replace in these columns (default: all)
+}
+
+ + + + + diff --git a/playground/sample.html b/playground/sample.html new file mode 100644 index 00000000..d29ed43a --- /dev/null +++ b/playground/sample.html @@ -0,0 +1,187 @@ + + + + + + tsb — sample + + + +

tsb — sample

+

+ Randomly sample items from a Series or rows/columns from a DataFrame. + Supports fixed count (n), fractional sampling (frac), + sampling with replacement (replace), weighted sampling, and + deterministic seeding via randomState. +

+ +

Core concept

+
// Sample 3 items (without replacement by default)
+sampleSeries(s, { n: 3 })
+
+// Sample 50% of rows
+sampleDataFrame(df, { frac: 0.5 })
+
+// Reproducible sample with seed
+sampleSeries(s, { n: 2, randomState: 42 })
+
+// Sample with replacement (bootstrap)
+sampleSeries(s, { n: 10, replace: true })
+
+// Sample columns instead of rows
+sampleDataFrame(df, { n: 2, axis: 1 })
+ +
+ pandas equivalent:
+ s.sample(n=3, random_state=42)
+ df.sample(frac=0.5, replace=False, axis=0) +
+ + +

Demo 1 — sampleSeries (n)

+
+
Code
+
const s = new Series({ data: [10, 20, 30, 40, 50], name: "scores" });
+sampleSeries(s, { n: 3, randomState: 7 }).values;
+// deterministic result with seed 7
+ + +
+ + +

Demo 2 — sampleSeries with frac

+
+
Code
+
const s = new Series({ data: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] });
+sampleSeries(s, { frac: 0.3, randomState: 42 }).values;
+// 30% of 10 items = 3 items
+ + +
+ + +

Demo 3 — bootstrap sampling (replace=true)

+
+
Code
+
const s = new Series({ data: ["a", "b", "c"] });
+// Sample more items than pool size — only possible with replace=true
+sampleSeries(s, { n: 7, replace: true, randomState: 0 }).values;
+ + +
+ + +

Demo 4 — weighted sampling

+
+
Code
+
const s = new Series({ data: ["rare", "common", "very_common"] });
+// "very_common" has 10× the weight of "rare"
+sampleSeries(s, { n: 1, weights: [1, 5, 10], randomState: 3 }).values;
+// most likely: ["very_common"]
+ + +
+ + +

Demo 5 — sampleDataFrame (rows)

+
+
Code
+
const df = DataFrame.fromRecords([
+  { city: "NYC",    pop: 8_336_817 },
+  { city: "LA",     pop: 3_979_576 },
+  { city: "Chicago",pop: 2_693_976 },
+  { city: "Houston",pop: 2_320_268 },
+  { city: "Phoenix",pop: 1_680_992 },
+]);
+const sample = sampleDataFrame(df, { n: 3, randomState: 1 });
+sample.col("city").values;
+ + +
+ + +

Interactive editor

+
+
Edit and run:
+ + + +
+ + + + diff --git a/src/core/astype.ts b/src/core/astype.ts new file mode 100644 index 00000000..6a9403be --- /dev/null +++ b/src/core/astype.ts @@ -0,0 +1,245 @@ +/** + * astype — dtype coercion for Series and DataFrame. + * + * Mirrors `pandas.Series.astype` and `pandas.DataFrame.astype`: + * cast values to a target dtype, with null/NaN passthrough semantics + * matching pandas' default `errors="raise"` behaviour. + * + * @module + */ + +import { DataFrame } from "./frame.ts"; +import { Series } from "./series.ts"; +import { Dtype } from "./dtype.ts"; +import type { DtypeName, Scalar } from "../types.ts"; + +// ─── helpers ────────────────────────────────────────────────────────────────── + +function isNull(v: Scalar): v is null | undefined { + return v === null || v === undefined; +} + +/** Integer clamp ranges for each integer dtype name. */ +const INT_RANGES: Readonly< + Record +> = { + int8: { lo: -128, hi: 127, unsigned: false }, + int16: { lo: -32768, hi: 32767, unsigned: false }, + int32: { lo: -2147483648, hi: 2147483647, unsigned: false }, + int64: { lo: Number.MIN_SAFE_INTEGER, hi: Number.MAX_SAFE_INTEGER, unsigned: false }, + uint8: { lo: 0, hi: 255, unsigned: true }, + uint16: { lo: 0, hi: 65535, unsigned: true }, + uint32: { lo: 0, hi: 4294967295, unsigned: true }, + uint64: { lo: 0, hi: Number.MAX_SAFE_INTEGER, unsigned: true }, +}; + +/** + * Cast a single scalar value to the target dtype. + * + * Rules per dtype kind: + * - **int/uint**: `Math.trunc(Number(v))`, clamped to the dtype range. `null/undefined → null`. + * - **float32/float64**: `Number(v)`. `null/undefined → null`. Strings that + * are not parsable become `NaN` (same as pandas `errors="coerce"`-like + * number coercion). + * - **bool**: falsy values → `false`; truthy → `true`. `null/undefined → null`. + * - **string**: `String(v)`. `null/undefined → null`. + * - **datetime**: `new Date(Number(v))` for numbers; `new Date(String(v))` for + * strings; `null/undefined → null`. 
+ * - **object/category/timedelta**: value is returned as-is (no transformation).
+ */
+export function castScalar(v: Scalar, dtype: Dtype): Scalar {
+  if (isNull(v)) {
+    return null;
+  }
+
+  const k = dtype.kind;
+
+  if (k === "int" || k === "uint") {
+    if (typeof v === "boolean") {
+      return v ? 1 : 0;
+    }
+    if (v instanceof Date) {
+      return Math.trunc(v.getTime()); // epoch milliseconds, truncated
+    }
+    const n = Number(v);
+    if (Number.isNaN(n)) {
+      return null; // unparsable → missing, rather than throwing
+    }
+    const range = INT_RANGES[dtype.name];
+    if (range === undefined) {
+      return Math.trunc(n);
+    }
+    const t = Math.trunc(n);
+    return Math.max(range.lo, Math.min(range.hi, t)); // clamp into dtype range
+  }
+
+  if (k === "float") {
+    if (typeof v === "boolean") {
+      return v ? 1.0 : 0.0;
+    }
+    if (v instanceof Date) {
+      return v.getTime();
+    }
+    return Number(v); // unparsable strings become NaN by design (see doc above)
+  }
+
+  if (k === "bool") {
+    if (typeof v === "number") {
+      return !Number.isNaN(v) && v !== 0; // NaN is falsy here, unlike Boolean(NaN)===false anyway
+    }
+    if (v instanceof Date) {
+      return true;
+    }
+    return Boolean(v); // note: any non-empty string (incl. "false") is truthy
+  }
+
+  if (k === "string") {
+    if (v instanceof Date) {
+      return v.toISOString();
+    }
+    return String(v);
+  }
+
+  if (k === "datetime") {
+    if (v instanceof Date) {
+      return v;
+    }
+    if (typeof v === "number") {
+      return new Date(v); // interpreted as epoch milliseconds
+    }
+    const d = new Date(String(v));
+    return Number.isNaN(d.getTime()) ? null : d; // invalid date string → missing
+  }
+
+  // object / category / timedelta — return unchanged
+  return v;
+}
+
+// ─── AstypeOptions ────────────────────────────────────────────────────────────
+
+/** Options accepted by {@link astypeSeries} and {@link astype}. */
+export interface AstypeOptions {
+  /**
+   * `"ignore"`: a value whose cast throws is kept unchanged instead of the
+   * error propagating. (Note `castScalar` returns `null` for most bad input.)
+   *
+   * @default "raise"
+   */
+  readonly errors?: "raise" | "ignore";
+}
+
+// ─── astypeSeries ─────────────────────────────────────────────────────────────
+
+/**
+ * Cast a Series to a different dtype.
+ *
+ * Returns a new Series whose values have been coerced to `dtype`. The index
+ * and name are preserved unchanged. 
+ *
+ * @example
+ * ```ts
+ * const s = new Series({ data: [1.9, 2.1, 3.7], name: "x" });
+ * const si = astypeSeries(s, "int64");
+ * si.values; // [1, 2, 3]
+ * si.dtype.name; // "int64"
+ * ```
+ */
+export function astypeSeries(
+  s: Series,
+  dtype: DtypeName | Dtype,
+  options: AstypeOptions = {},
+): Series {
+  const targetDtype = dtype instanceof Dtype ? dtype : Dtype.from(dtype as DtypeName);
+  const { errors = "raise" } = options;
+
+  const casted: Scalar[] = [];
+  for (const v of s.values) {
+    let out: Scalar;
+    try {
+      out = castScalar(v, targetDtype);
+    } catch (e) {
+      if (errors === "ignore") {
+        out = v; // "ignore": keep the original value on a failed cast
+      } else {
+        throw e;
+      }
+    }
+    casted.push(out);
+  }
+
+  return new Series({
+    data: casted,
+    index: s.index,
+    dtype: targetDtype,
+    name: s.name,
+  });
+}
+
+// ─── DataFrame astype ─────────────────────────────────────────────────────────
+
+/**
+ * Options for {@link astype} (DataFrame variant).
+ */
+export interface DataFrameAstypeOptions extends AstypeOptions {
+  /**
+   * Accepted for pandas API compatibility but currently ignored: the
+   * implementation never reads this flag, and `astype` always builds and
+   * returns a new DataFrame regardless of its value.
+   *
+   * Columns not listed in a `Record` dtype map are always carried over
+   * unchanged, whether `copy` is `true` or `false` — there is no in-place
+   * mutation path.
+   */
+  readonly copy?: boolean;
+}
+
+/**
+ * Cast one or more columns in a DataFrame to the specified dtype(s).
+ *
+ * - Pass a single `DtypeName` or `Dtype` to cast **all** columns.
+ * - Pass a `Record` of column name → dtype to cast individual columns.
+ *   Columns not listed are returned unchanged.
+ *
+ * Returns a new DataFrame; the original is not modified. 
+ * + * @example + * ```ts + * const df = DataFrame.fromColumns({ a: [1.5, 2.7], b: ["3", "4"] }); + * + * // Cast all columns to float64 + * astype(df, "float64"); + * + * // Cast only column "b" to int64 + * astype(df, { b: "int64" }); + * ``` + */ +export function astype( + df: DataFrame, + dtype: + | DtypeName + | Dtype + | Readonly>, + options: DataFrameAstypeOptions = {}, +): DataFrame { + const colMap = new Map>(); + + const isSingleDtype = + typeof dtype === "string" || dtype instanceof Dtype; + + for (const name of df.columns.values) { + const col = df.col(name); + if (isSingleDtype) { + colMap.set(name, astypeSeries(col, dtype as DtypeName | Dtype, options)); + } else { + const mapping = dtype as Readonly>; + const target = mapping[name]; + if (target !== undefined) { + colMap.set(name, astypeSeries(col, target, options)); + } else { + colMap.set(name, col); + } + } + } + + return new DataFrame(colMap, df.index); +} diff --git a/src/core/index.ts b/src/core/index.ts index 255aade6..b0a45a5a 100644 --- a/src/core/index.ts +++ b/src/core/index.ts @@ -15,6 +15,10 @@ export { CategoricalAccessor } from "./cat_accessor.ts"; export type { CatSeriesLike } from "./cat_accessor.ts"; export { MultiIndex } from "./multi_index.ts"; export type { MultiIndexOptions } from "./multi_index.ts"; +export { astypeSeries, astype, castScalar } from "./astype.ts"; +export type { AstypeOptions, DataFrameAstypeOptions } from "./astype.ts"; +export { sampleSeries, sampleDataFrame } from "./sample.ts"; +export type { SampleOptions } from "./sample.ts"; export { insertColumn, popColumn, diff --git a/src/core/sample.ts b/src/core/sample.ts new file mode 100644 index 00000000..869ce7b8 --- /dev/null +++ b/src/core/sample.ts @@ -0,0 +1,334 @@ +/** + * sample — random sampling from Series and DataFrame. 
+ * + * Mirrors: + * - `pandas.Series.sample(n, frac, replace, weights, random_state, axis)` + * - `pandas.DataFrame.sample(n, frac, replace, weights, random_state, axis)` + * + * @module + */ + +import { DataFrame } from "./frame.ts"; +import { Index } from "./base-index.ts"; +import { Series } from "./series.ts"; +import type { Label, Scalar } from "../types.ts"; + +// ─── public types ───────────────────────────────────────────────────────────── + +/** Options for {@link sampleSeries} and {@link sampleDataFrame}. */ +export interface SampleOptions { + /** + * Number of items to return. Mutually exclusive with `frac`. + * @defaultValue `1` (when neither `n` nor `frac` is provided) + */ + readonly n?: number; + /** + * Fraction of items to return (e.g. `0.5` for 50%). + * Mutually exclusive with `n`. + */ + readonly frac?: number; + /** + * Allow sampling with replacement (the same item may appear multiple times). + * @defaultValue `false` + */ + readonly replace?: boolean; + /** + * Weights for each item. Must have the same length as the Series/DataFrame. + * Weights do not need to sum to 1 — they are normalized internally. + * Missing weights (null/undefined/NaN) are treated as 0. + */ + readonly weights?: readonly (number | null | undefined)[]; + /** + * Seed for the random number generator. When provided, sampling is + * deterministic (same seed + same data → same result). + * Uses a simple LCG (linear congruential generator). + */ + readonly randomState?: number; + /** + * Axis to sample along (DataFrame only). + * - `0` or `"index"` (default): sample rows. + * - `1` or `"columns"`: sample columns. + */ + readonly axis?: 0 | 1 | "index" | "columns"; +} + +// ─── seeded RNG ─────────────────────────────────────────────────────────────── + +/** + * Minimal LCG-based PRNG (Knuth constants). + * Returns a new seed and a float in [0, 1). 
+ */
+function lcgNext(seed: number): [number, number] {
+  // LCG multiplier/increment from Numerical Recipes (32-bit modulus)
+  const a = 1664525;
+  const c = 1013904223;
+  const m = 2 ** 32;
+  const nextSeed = ((a * seed + c) >>> 0) % m; // `>>> 0` already reduces mod 2^32, so `% m` is a no-op
+  return [nextSeed, nextSeed / m];
+}
+
+/** Build a seeded random float generator that returns [0,1). */
+function makeRng(seed: number | undefined): () => number {
+  if (seed === undefined) {
+    return () => Math.random(); // no seed → non-deterministic sampling
+  }
+  let s = seed >>> 0; // ensure 32-bit unsigned
+  return () => {
+    const [ns, r] = lcgNext(s);
+    s = ns;
+    return r;
+  };
+}
+
+// ─── helpers ──────────────────────────────────────────────────────────────────
+
+/** Resolve how many items to sample from a pool of size `poolSize`. */
+function resolveN(poolSize: number, n: number | undefined, frac: number | undefined): number {
+  if (n !== undefined && frac !== undefined) {
+    throw new Error("Sample: specify either `n` or `frac`, not both.");
+  }
+  if (frac !== undefined) {
+    if (frac < 0) {
+      throw new RangeError("Sample: `frac` must be >= 0.");
+    }
+    return Math.floor(frac * poolSize); // fractional counts round down
+  }
+  if (n !== undefined) {
+    if (n < 0) {
+      throw new RangeError("Sample: `n` must be >= 0.");
+    }
+    return n;
+  }
+  return 1; // neither n nor frac given → single item, as documented
+}
+
+/** Normalize weights to probabilities summing to 1. */
+function normalizeWeights(
+  rawWeights: readonly (number | null | undefined)[],
+  poolSize: number,
+): number[] {
+  if (rawWeights.length !== poolSize) {
+    throw new RangeError(
+      `Sample: weights length (${rawWeights.length}) must equal pool size (${poolSize}).`,
+    );
+  }
+  const ws = rawWeights.map((w) => {
+    const v = w ?? 0; // null/undefined weight → 0
+    if (typeof v !== "number" || Number.isNaN(v) || v < 0) {
+      return 0; // NaN and negative weights are also coerced to 0
+    }
+    return v;
+  });
+  const total = ws.reduce((s, v) => s + v, 0);
+  if (total === 0) {
+    throw new Error("Sample: all weights are zero.");
+  }
+  return ws.map((w) => w / total);
+}
+
+/**
+ * Weighted random sample without replacement using exponential sort keys (Efraimidis–Spirakis). 
+ * Only used when `replace` is false — `samplePositions` routes replace=true sampling elsewhere.
+ */
+function weightedSampleWithoutReplacement(
+  poolSize: number,
+  k: number,
+  probs: number[],
+  rng: () => number,
+): number[] {
+  // Use reservoir sampling with exponential keys: assign key = rand^(1/w), take top-k
+  const keys: Array<[number, number]> = probs.map((p, i) => {
+    const r = rng();
+    const key = p > 0 ? Math.pow(r, 1 / p) : 0; // zero-weight items get key 0 → sort last
+    return [key, i];
+  });
+  keys.sort((a, b) => b[0] - a[0]);
+  return keys.slice(0, k).map(([, i]) => i);
+}
+
+/**
+ * Weighted sample WITH replacement: pick `k` indices based on cumulative probabilities.
+ */
+function weightedSampleWithReplacement(
+  k: number,
+  probs: number[],
+  rng: () => number,
+): number[] {
+  const cumulative: number[] = [];
+  let sum = 0;
+  for (const p of probs) {
+    sum += p;
+    cumulative.push(sum);
+  }
+
+  const result: number[] = [];
+  for (let i = 0; i < k; i++) {
+    const r = rng();
+    let idx = cumulative.findIndex((c) => c >= r); // linear scan of the CDF
+    if (idx < 0) {
+      idx = probs.length - 1; // guard against FP rounding when r ≈ 1
+    }
+    result.push(idx);
+  }
+  return result;
+}
+
+/**
+ * Fisher-Yates shuffle (unweighted, without replacement) — pick the first `k` elements.
+ */
+function fisherYatesSample(poolSize: number, k: number, rng: () => number): number[] {
+  const indices = Array.from({ length: poolSize }, (_, i) => i);
+  for (let i = 0; i < k; i++) {
+    const j = i + Math.floor(rng() * (poolSize - i)); // partial shuffle: only first k slots needed
+    const tmp = indices[i];
+    const jVal = indices[j];
+    if (tmp !== undefined && jVal !== undefined) {
+      indices[i] = jVal;
+      indices[j] = tmp;
+    }
+  }
+  return indices.slice(0, k);
+}
+
+/**
+ * Sample with replacement (unweighted): draw `k` integers in [0, poolSize).
+ */
+function uniformSampleWithReplacement(poolSize: number, k: number, rng: () => number): number[] {
+  const result: number[] = [];
+  for (let i = 0; i < k; i++) {
+    result.push(Math.floor(rng() * poolSize));
+  }
+  return result;
+}
+
+/** Core sampling logic: return an array of selected positions. 
*/ +function samplePositions( + poolSize: number, + k: number, + replace: boolean, + weights: readonly (number | null | undefined)[] | undefined, + rng: () => number, +): number[] { + if (poolSize === 0 || k === 0) { + return []; + } + if (!replace && k > poolSize) { + throw new RangeError( + `Sample: cannot sample ${k} items without replacement from a pool of ${poolSize}.`, + ); + } + + if (weights !== undefined) { + const probs = normalizeWeights(weights, poolSize); + if (replace) { + return weightedSampleWithReplacement(k, probs, rng); + } + return weightedSampleWithoutReplacement(poolSize, k, probs, rng); + } + + if (replace) { + return uniformSampleWithReplacement(poolSize, k, rng); + } + return fisherYatesSample(poolSize, k, rng); +} + +// ─── Series sample ──────────────────────────────────────────────────────────── + +/** + * Return a random sample of items from a Series. + * + * @example + * ```ts + * const s = new Series({ data: [10, 20, 30, 40, 50] }); + * sampleSeries(s, { n: 3, randomState: 42 }).values; // [30, 10, 50] (deterministic) + * ``` + */ +export function sampleSeries(series: Series, options?: SampleOptions): Series { + const opts = options ?? {}; + const k = resolveN(series.values.length, opts.n, opts.frac); + const replace = opts.replace ?? false; + const rng = makeRng(opts.randomState); + + const positions = samplePositions(series.values.length, k, replace, opts.weights, rng); + const newValues: Scalar[] = positions.map((i) => series.values[i] ?? null); + const newLabels: Label[] = positions.map((i) => series.index.at(i) ?? null); + + return new Series({ + data: newValues, + index: new Index