diff --git a/biome.json b/biome.json index 81748897..7c43b424 100644 --- a/biome.json +++ b/biome.json @@ -27,17 +27,24 @@ "rules": { "recommended": true, "complexity": { - "all": true + "all": true, + "noExcessiveCognitiveComplexity": "warn", + "noForEach": "warn", + "useLiteralKeys": "warn", + "noUselessSwitchCase": "warn" }, "correctness": { - "all": true + "all": true, + "noNodejsModules": "warn", + "noUnusedVariables": "warn" }, "nursery": { "all": true }, "performance": { "all": true, - "noBarrelFile": "off" + "noBarrelFile": "off", + "useTopLevelRegex": "warn" }, "security": { "all": true @@ -45,10 +52,18 @@ "style": { "all": true, "noDefaultExport": "off", - "useNamingConvention": "off" + "useNamingConvention": "off", + "noNonNullAssertion": "warn", + "noNamespaceImport": "warn", + "noParameterProperties": "warn", + "useDefaultSwitchClause": "warn", + "useCollapsedElseIf": "warn" }, "suspicious": { - "all": true + "all": true, + "noAssignInExpressions": "warn", + "noMisplacedAssertion": "warn", + "noApproximativeNumericConstant": "warn" } } }, diff --git a/playground/astype.html b/playground/astype.html new file mode 100644 index 00000000..efd9e5ed --- /dev/null +++ b/playground/astype.html @@ -0,0 +1,438 @@ + + + + + + tsb — astype + + + +
+
+

Loading tsb runtime…

+
+ + ← tsb playground +

astype — dtype coercion

+

+ Cast Series and DataFrame values to a different dtype. + Mirrors pandas.Series.astype and pandas.DataFrame.astype. +

+ + +
+

1 · Series — float to int64

+

+ Cast floating-point values to integers via truncation (same as + pandas.Series.astype("int64")). +

+
+
+ TypeScript +
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

2 · Series — numbers to string

+

Convert every value to its string representation. Null/undefined values + become null (not the string "null").

+
+
+ TypeScript +
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

3 · Overflow clamping for bounded integer dtypes

+

+ Values that overflow the target integer dtype's range are clamped to + [min, max] — e.g. uint8 is clamped to + [0, 255]. +

+
+
+ TypeScript +
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

4 · DataFrame — cast all columns

+

Pass a single dtype name to cast every column to the same type.

+
+
+ TypeScript +
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

5 · DataFrame — per-column dtype mapping

+

Pass a Record<string, DtypeName> to cast individual + columns. Columns not listed are carried over unchanged.

+
+
+ TypeScript +
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

6 · Casting to bool

+

Zero, empty string, and NaN become false; + everything else (including non-zero numbers and non-empty strings) + becomes true.

+
+
+ TypeScript +
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

API Reference

+
// Series cast
+astypeSeries(
+  series: Series,
+  dtype: DtypeName | Dtype,
+  options?: AstypeOptions,
+): Series
+
+// DataFrame cast (all columns or per-column mapping)
+astype(
+  df: DataFrame,
+  dtype: DtypeName | Dtype | Record<string, DtypeName | Dtype>,
+  options?: DataFrameAstypeOptions,
+): DataFrame
+
+// Low-level scalar cast
+castScalar(value: Scalar, dtype: Dtype): Scalar
+
+// Options
+interface AstypeOptions {
+  errors?: "raise" | "ignore";  // default "raise"
+}
+
+// Supported dtype names
+type DtypeName =
+  | "int8" | "int16" | "int32" | "int64"
+  | "uint8" | "uint16" | "uint32" | "uint64"
+  | "float32" | "float64"
+  | "bool" | "string" | "object"
+  | "datetime" | "timedelta" | "category"
+
+ + + + + diff --git a/playground/clip_advanced.html b/playground/clip_advanced.html new file mode 100644 index 00000000..eb200294 --- /dev/null +++ b/playground/clip_advanced.html @@ -0,0 +1,163 @@ + + + + + + tsb — clip_advanced (per-element clipping) + + + +

tsb — clip_advanced (per-element clipping)

+

+ Clip Series and DataFrame values to per-element bounds. + Unlike the simple scalar clip, clipAdvancedSeries and + clipAdvancedDataFrame support array, Series, and DataFrame bounds — + enabling per-position or element-wise bound specification. +

+ +

Core concept

+
// Scalar bounds (like pandas s.clip(lower=0, upper=5))
+clipAdvancedSeries(s, { lower: 0, upper: 5 })
+
+// Per-element array bounds
+clipAdvancedSeries(s, { lower: [1, 2, 3], upper: [4, 5, 6] })
+
+// Series bounds (positional alignment)
+clipAdvancedSeries(s, { lower: loSeries, upper: hiSeries })
+
+// DataFrame element-wise bounds
+clipAdvancedDataFrame(df, { lower: loDf, upper: hiDf })
+
+// Series broadcast on DataFrame (axis=0: one bound per column; axis=1: one per row)
+clipAdvancedDataFrame(df, { lower: loSeries, axis: 1 })
+ +
+ pandas equivalent:
+ s.clip(lower=lo_array, upper=hi_array)
+ df.clip(lower=lo_df, upper=hi_df) +
+ + +

Demo 1 — clipAdvancedSeries with scalar bounds

+
+
Code
+
const s = new Series({ data: [-3, 1, 5, 10] });
+clipAdvancedSeries(s, { lower: 0, upper: 6 }).values;
+// → [0, 1, 5, 6]
+ + +
+ + +

Demo 2 — clipAdvancedSeries with per-element array bounds

+
+
Code
+
const s = new Series({ data: [-1, 0, 5, 12] });
+const lo = [2, -1, 4, 10];
+const hi = [5,  3, 8, 11];
+clipAdvancedSeries(s, { lower: lo, upper: hi }).values;
+// → [2, 0, 5, 11]
+ + +
+ + +

Demo 3 — clipAdvancedSeries with Series bounds

+
+
Code
+
const s = new Series({ data: [0, 5, 10, 15] });
+const loBound = new Series({ data: [1, 3, 8, 12] });
+const hiBound = new Series({ data: [2, 7, 9, 20] });
+clipAdvancedSeries(s, { lower: loBound, upper: hiBound }).values;
+// → [1, 5, 9, 15]
+ + +
+ + +

Demo 4 — clipAdvancedDataFrame with DataFrame bounds

+
+
Code
+
const df = DataFrame.fromColumns({ a: [1, 5, 9], b: [2, 6, 10] });
+const lo = DataFrame.fromColumns({ a: [2, 3, 4], b: [1, 4, 8] });
+const hi = DataFrame.fromColumns({ a: [3, 7, 8], b: [5, 9, 12] });
+const result = clipAdvancedDataFrame(df, { lower: lo, upper: hi });
+result.col("a").values; // → [2, 5, 8]
+result.col("b").values; // → [2, 6, 10]
+ + +
+ + +

Demo 5 — clipAdvancedDataFrame with Series broadcast (axis=1)

+
+
Code
+
// axis=1: one lower bound per row
+const df = DataFrame.fromColumns({ a: [1, 5, 9], b: [2, 6, 10] });
+const loPerRow = new Series({ data: [0, 4, 10] });
+const result = clipAdvancedDataFrame(df, { lower: loPerRow, axis: 1 });
+result.col("a").values; // → [1, 5, 10]
+result.col("b").values; // → [2, 6, 10]
+ + +
+ + + + diff --git a/playground/excel.html b/playground/excel.html new file mode 100644 index 00000000..563736cf --- /dev/null +++ b/playground/excel.html @@ -0,0 +1,561 @@ + + + + + + tsb — readExcel playground + + + + +

📊 readExcel — XLSX file reading

+

+ tsb can read Excel XLSX files natively — no dependencies. The + readExcel() function accepts a Uint8Array or + ArrayBuffer and returns a DataFrame. +

+ +
+ Python equivalent: + pd.read_excel("data.xlsx") +
+ +

Basic usage

+
import { readExcel, xlsxSheetNames } from "tsb";
+
+// Read first sheet (default)
+const df = readExcel(buffer);
+console.log(df.shape);         // [rows, cols]
+console.log(df.columns.toArray()); // column names
+
+// List all sheet names
+const sheets = xlsxSheetNames(buffer);
+// → ["Sheet1", "Summary", "Data"]
+
+// Read a specific sheet by name
+const df2 = readExcel(buffer, { sheetName: "Summary" });
+
+// Read a specific sheet by index
+const df3 = readExcel(buffer, { sheetName: 1 });
+
+ +

Options

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
OptionTypeDefaultDescription
sheetNamestring | number0Sheet to read (name or 0-based index)
headernumber | null0Row index of the header, or null for no header
indexColstring | number | nullnullColumn to use as the row index
skipRowsnumber0Data rows to skip after the header
nrowsnumberunlimitedMaximum number of data rows to read
naValuesstring[][]Additional strings to treat as NA
+ +

Interactive demo

+

Upload an .xlsx file to inspect it, or use the demo data below.

+ +
+ +

or

+ +
+ +
+ +   + +   + +   + +

+ +
+ +
Upload a file or click "Load demo data" to start.
+ +

Advanced example

+
// Use a named column as the row index
+const df = readExcel(buffer, { indexCol: "ID" });
+
+// Skip 2 rows and read at most 100 rows
+const df2 = readExcel(buffer, { skipRows: 2, nrows: 100 });
+
+// Treat custom strings as missing
+const df3 = readExcel(buffer, { naValues: ["N/A", "MISSING", "-"] });
+
+// DataFrame operations work immediately
+df.describe();
+df.col("revenue").sum();
+df.groupby("region").mean();
+
+ +

Python equivalent

+
# pandas
+import pandas as pd
+
+df = pd.read_excel("data.xlsx", sheet_name=0)
+df = pd.read_excel("data.xlsx", sheet_name="Summary")
+df = pd.read_excel("data.xlsx", header=None)
+df = pd.read_excel("data.xlsx", index_col="ID")
+df = pd.read_excel("data.xlsx", skiprows=2, nrows=100)
+
+ + + + diff --git a/playground/idxmin_idxmax.html b/playground/idxmin_idxmax.html new file mode 100644 index 00000000..4ae4e7d3 --- /dev/null +++ b/playground/idxmin_idxmax.html @@ -0,0 +1,446 @@ + + + + + + tsb — idxmin / idxmax + + + +
+
+
Loading TypeScript compiler…
+
+ + ← tsb playground +

idxmin / idxmax

+

+ Return the index label of the minimum or maximum value in a + Series or each column of a DataFrame. + Mirrors pandas.Series.idxmin(), idxmax(), + pandas.DataFrame.idxmin(), and DataFrame.idxmax(). +

+ + +
+

1 · Series.idxmin — label of the minimum value

+

Returns the index label at the position of the minimum value. + NaN / null values are skipped by default.

+
+
+ TypeScript +
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

2 · Series.idxmax — label of the maximum value

+

Returns the index label at the position of the maximum value.

+
+
+ TypeScript +
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

3 · NaN handling — skipna option

+

By default NaN / null values are skipped. Set skipna: false + to propagate NaN (returns null if any value is NaN).

+
+
+ TypeScript +
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

4 · DataFrame.idxmin — row label of column minima

+

Returns a Series indexed by column names. Each value is the row label + where that column achieves its minimum.

+
+
+ TypeScript +
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

5 · DataFrame.idxmax — row label of column maxima

+

Returns a Series indexed by column names, where each entry is the row + label of that column's maximum value.

+
+
+ TypeScript +
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

6 · Edge cases — empty, all-NaN, all-equal

+

Behavior for empty series, series where every value is NaN, and series + where all values are equal.

+
+
+ TypeScript +
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

API Reference

+
// Series
+idxminSeries(series, { skipna?: boolean }): Label   // default skipna=true
+idxmaxSeries(series, { skipna?: boolean }): Label
+
+// DataFrame (axis=0 — min/max per column)
+idxminDataFrame(df, { skipna?: boolean }): Series   // indexed by column names
+idxmaxDataFrame(df, { skipna?: boolean }): Series
+
+ + + + + diff --git a/playground/mode.html b/playground/mode.html new file mode 100644 index 00000000..0a149227 --- /dev/null +++ b/playground/mode.html @@ -0,0 +1,125 @@ + + + + + + tsb — mode + + + +

← tsb playground

+

📊 mode

+

+ modeSeries / modeDataFrame — + return the most-frequent value(s), mirroring + Series.mode() and + DataFrame.mode(). +

+

Equivalent Python: series.mode()

+ +

1 · Single mode

+
+
const s = new Series({ data: [1, 2, 2, 3] });
+modeSeries(s).values;
+// → [2]
+
+
+ +

2 · Tied modes — all returned sorted

+
+
const s = new Series({ data: [1, 1, 2, 2, 3] });
+modeSeries(s).values;
+// → [1, 2]
+
+
+ +

3 · String values

+
+
const s = new Series({ data: ["cat", "dog", "dog", "bird"] });
+modeSeries(s).values;
+// → ["dog"]
+
+
+ +

4 · Null values excluded (dropna=true default)

+
+
const s = new Series({ data: [null, 1, 1, null, null] });
+modeSeries(s).values;
+// → [1]
+
+
+ +

5 · DataFrame column-wise (axis=0)

+
+
const df = DataFrame.fromColumns({ a: [1, 1, 2, 2], b: [5, 5, 5, 6] });
+modeDataFrame(df);
+// a: [1, 2], b: [5, null]  (null-padded)
+
+
+ +

6 · DataFrame row-wise (axis=1)

+
+
const df = DataFrame.fromColumns({ a: [1, 2], b: [1, 3], c: [2, 3] });
+modeDataFrame(df, { axis: 1 });
+// row 0: mode=1, row 1: mode=3
+
+
+ + + + diff --git a/playground/nancumops.html b/playground/nancumops.html new file mode 100644 index 00000000..d7014593 --- /dev/null +++ b/playground/nancumops.html @@ -0,0 +1,295 @@ + + + + + + tsb — NaN-Ignoring Aggregates (nancumops) + + + + +

🔢 NaN-Ignoring Aggregates

+

+ nansum, nanmean, nanmedian, nanstd, nanvar, + nanmin, nanmax, nanprod, nancount + — mirrors numpy.nan* functions in pandas workflows. +

+ + +
+

🧮 Live Calculator

+

Enter a comma-separated list of numbers (use NaN, null for missing).

+ + + + + +
+
+ + +
+

📖 Function Reference

+ + + + + + + + + + + + + + + + + + + + +
FunctionDescriptionEmpty/all-NaN returnspandas / numpy equivalent
nancount(input)Count of valid (non-NaN) numeric values0np.count_nonzero(~np.isnan(a))
nansum(input)Sum, ignoring NaN/null0np.nansum(a)
nanmean(input)Mean, ignoring NaN/nullNaNnp.nanmean(a)
nanmedian(input)Median, ignoring NaN/nullNaNnp.nanmedian(a)
nanvar(input, {ddof})Variance (ddof=1 default)NaNnp.nanvar(a, ddof=1)
nanstd(input, {ddof})Std deviation (ddof=1 default)NaNnp.nanstd(a, ddof=1)
nanmin(input)Minimum, ignoring NaN/nullNaNnp.nanmin(a)
nanmax(input)Maximum, ignoring NaN/nullNaNnp.nanmax(a)
nanprod(input)Product, ignoring NaN/null1np.nanprod(a)
+
+ + +
+

💡 Usage Examples

+ +
+ Basic array usage +
+import { nansum, nanmean, nanmedian, nanstd } from "tsb";
+
+const data = [1, 2, NaN, null, 3, 5];
+
+nansum(data);     // 11
+nanmean(data);    // 2.75
+nanmedian(data);  // 2.5
+nanstd(data);     // 1.708...
+
+
+# Python / pandas equivalent +import numpy as np + +data = [1, 2, np.nan, np.nan, 3, 5] + +np.nansum(data) # 11.0 +np.nanmean(data) # 2.75 +np.nanmedian(data) # 2.5 +np.nanstd(data, ddof=1) # 1.708... +
+
+ +
+ Using with Series +
+import { Series, nansum, nanmean, nancount } from "tsb";
+
+const s = new Series({ data: [10, null, 30, NaN, 50] });
+
+nancount(s);  // 3
+nansum(s);    // 90
+nanmean(s);   // 30
+
+
+# Python / pandas equivalent +import pandas as pd, numpy as np + +s = pd.Series([10, np.nan, 30, np.nan, 50]) + +s.count() # 3 +s.sum() # 90.0 +s.mean() # 30.0 +
+
+ +
+ Variance and std with ddof +
+import { nanvar, nanstd } from "tsb";
+
+const xs = [2, 4, 4, 4, 5, 5, 7, 9];
+
+// Sample (ddof=1, default)
+nanvar(xs);           // ≈ 4.571
+nanstd(xs);           // ≈ 2.138
+
+// Population (ddof=0)
+nanvar(xs, { ddof: 0 });  // 4.0
+nanstd(xs, { ddof: 0 });  // 2.0
+
+
+# Python / pandas equivalent +import numpy as np + +xs = [2, 4, 4, 4, 5, 5, 7, 9] + +np.nanvar(xs, ddof=1) # 4.571... +np.nanstd(xs, ddof=1) # 2.138... + +np.nanvar(xs, ddof=0) # 4.0 +np.nanstd(xs, ddof=0) # 2.0 +
+
+
+ + +
+

⚡ NaN Impact Demo

+

See how NaN values affect results with and without nan-ignoring functions.

+ +
+
+ + + + + diff --git a/playground/nunique.html b/playground/nunique.html new file mode 100644 index 00000000..add4399d --- /dev/null +++ b/playground/nunique.html @@ -0,0 +1,112 @@ + + + + + + tsb — nunique / any / all + + + +

← tsb playground

+

🔢 nunique / any / all

+

+ Count unique values and perform boolean reductions, mirroring + Series.nunique(), + Series.any(), and + Series.all(). +

+ +

1 · nunique — count distinct values

+
+
import { Series, nuniqueSeries } from "tsb";
+
+const s = new Series({ data: [1, 2, 2, 3, 3, 3, null] });
+
+nuniqueSeries(s);                 // 3 (null excluded by default)
+nuniqueSeries(s, { dropna: false }); // 4 (null counted as a distinct value)
+
nuniqueSeries(s) → 3 +nuniqueSeries(s, {dropna:false}) → 4
+
+ +

2 · any — is any element truthy?

+
+
import { anySeries } from "tsb";
+
+const allZero = new Series({ data: [0, 0, 0] });
+const hasOne  = new Series({ data: [0, 0, 1] });
+
+anySeries(allZero); // false
+anySeries(hasOne);  // true
+
+// With nulls (skipna=true by default)
+const withNull = new Series({ data: [null, 0, null] });
+anySeries(withNull); // false — null skipped, 0 is falsy
+
anySeries(allZero) → false +anySeries(hasOne) → true +anySeries(withNull) → false
+
+ +

3 · all — are all elements truthy?

+
+
import { allSeries } from "tsb";
+
+const allTrue = new Series({ data: [1, 2, 3] });
+const hasFalsy = new Series({ data: [1, 0, 3] });
+
+allSeries(allTrue);  // true
+allSeries(hasFalsy); // false
+
+// Empty or all-null series vacuously returns true
+allSeries(new Series({ data: [] }));              // true
+allSeries(new Series({ data: [null, null] }));    // true
+
allSeries(allTrue) → true +allSeries(hasFalsy) → false +allSeries([]) → true (vacuous) +allSeries([null]) → true (vacuous)
+
+ +

4 · DataFrame nunique

+
+
import { DataFrame, nuniqueDataFrame } from "tsb";
+
+const df = DataFrame.fromColumns({
+  category: ["A", "B", "A", "C"],
+  value:    [1,   2,   1,   3  ],
+});
+
+nuniqueDataFrame(df);          // per-column: category→3, value→3
+nuniqueDataFrame(df, { axis: 1 }); // per-row: how many distinct values in each row
+
nuniqueDataFrame(df) → category: 3, value: 3 +nuniqueDataFrame(df, {axis:1}) → row0: 2, row1: 2, row2: 2, row3: 2
+
+ +

5 · DataFrame any / all

+
+
import { anyDataFrame, allDataFrame } from "tsb";
+
+const df2 = DataFrame.fromColumns({
+  a: [0, 0, 1],
+  b: [1, 1, 1],
+});
+
+anyDataFrame(df2); // a: true, b: true  (each col has at least one truthy)
+allDataFrame(df2); // a: false, b: true (col a has a 0)
+
+// axis=1: reduce across columns per row
+anyDataFrame(df2, { axis: 1 }); // row0: true, row1: true, row2: true
+allDataFrame(df2, { axis: 1 }); // row0: false, row1: false, row2: true
+
anyDataFrame(df2) → a: true, b: true +allDataFrame(df2) → a: false, b: true +anyDataFrame(df2,{axis:1}) → [true, true, true] +allDataFrame(df2,{axis:1}) → [false, false, true]
+
+ + diff --git a/playground/pct_change.html b/playground/pct_change.html new file mode 100644 index 00000000..ec1b4e3b --- /dev/null +++ b/playground/pct_change.html @@ -0,0 +1,452 @@ + + + + + + tsb — pct_change + + + +
+
+
Initializing playground…
+
+ ← Back to roadmap +

📊 pct_change — Interactive Playground

+

Compute the fractional change between each element and a prior element. + Mirrors pandas.Series.pct_change() / + pandas.DataFrame.pct_change().
+ Edit any code block below and press ▶ Run + (or Ctrl+Enter) to execute it live in your browser. +

+ + +
+

1 · Basic pct_change on a Series

+

pctChangeSeries(series) returns the fractional (not percentage) change + from each previous element. The first element is always null.

+
+
+ TypeScript +
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

2 · Multi-period change

+

The periods option controls the lag. Use periods: 2 to + compare each value to the one two steps earlier — useful for month-over-month + comparisons in quarterly data.

+
+
+ TypeScript +
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

3 · Handling missing values

+

By default, pctChangeSeries forward-fills (fillMethod: "pad") + NaN/null values before computing the ratio — so gaps don't break the chain. + Set fillMethod: null to propagate NaN instead.

+
+
+ TypeScript +
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

4 · Limit consecutive fills

+

The limit option caps how many consecutive NaN values get forward-filled. + Useful when you want to tolerate short gaps but not bridge large ones.

+
+
+ TypeScript +
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

5 · DataFrame column-wise pct_change

+

pctChangeDataFrame(df) applies pctChangeSeries to every + column independently. Ideal for comparing multiple assets or metrics simultaneously.

+
+
+ TypeScript +
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

6 · Negative periods (look-forward change)

+

A negative periods value computes the forward change: how much will + this element change by the time we reach |periods| steps ahead. + Useful for computing returns on a "hold for N periods" strategy.

+
+
+ TypeScript +
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

API Reference

+

All functions return a new Series/DataFrame of the same shape — inputs are never mutated.

+
// Series
+pctChangeSeries(series, {
+  periods?: number,           // default 1 (positive = look back, negative = look forward)
+  fillMethod?: "pad" | "bfill" | null,  // default "pad"
+  limit?: number | null,      // max consecutive fills; default unlimited
+}): Series
+
+// DataFrame
+pctChangeDataFrame(df, {
+  periods?: number,
+  fillMethod?: "pad" | "bfill" | null,
+  limit?: number | null,
+  axis?: 0 | 1 | "index" | "columns",  // default 0 (column-wise)
+}): DataFrame
+
+ + + + + diff --git a/playground/quantile.html b/playground/quantile.html new file mode 100644 index 00000000..fb019d88 --- /dev/null +++ b/playground/quantile.html @@ -0,0 +1,182 @@ + + + + + + tsb — quantile + + + +

← tsb playground

+

📐 quantile

+

+ quantileSeries / quantileDataFrame — + compute quantile(s) / percentile(s), mirroring + Series.quantile() and + DataFrame.quantile(). +

+

Equivalent Python: series.quantile(q=0.5) / df.quantile(q=0.5)

+ +

1 · Scalar quantile (median)

+
+
const s = new Series({ data: [1, 2, 3, 4, 5] });
+quantileSeries(s);          // default q=0.5 → 3
+quantileSeries(s, { q: 0.25 }); // → 2
+quantileSeries(s, { q: 0.75 }); // → 4
+
+
+ +

2 · Multiple quantile levels

+
+
const s = new Series({ data: [1, 2, 3, 4, 5] });
+const q = quantileSeries(s, { q: [0.25, 0.5, 0.75] });
+// Series indexed by q-values: { 0.25: 2, 0.5: 3, 0.75: 4 }
+
+
+ +

3 · Interpolation methods

+
+
const s = new Series({ data: [0, 10] });
+// q=0.5 → position 0.5 between indices 0 and 1
+quantileSeries(s, { q: 0.5, interpolation: "linear" });   // 5
+quantileSeries(s, { q: 0.5, interpolation: "lower" });    // 0
+quantileSeries(s, { q: 0.5, interpolation: "higher" });   // 10
+quantileSeries(s, { q: 0.5, interpolation: "midpoint" }); // 5
+quantileSeries(s, { q: 0.5, interpolation: "nearest" });  // 0
+
+
+ +

4 · NaN handling (skipna=true by default)

+
+
const s = new Series({ data: [1, null, 3, NaN, 5] });
+quantileSeries(s, { q: 0.5 });             // ignores null/NaN → 3
+quantileSeries(s, { q: 0.5, skipna: false }); // NaN propagates → NaN
+
+
+ +

5 · DataFrame — axis=0 (per-column quantiles)

+
+
const df = DataFrame.fromColumns({ a: [1, 2, 3, 4], b: [10, 20, 30, 40] });
+quantileDataFrame(df, { q: 0.5 });
+// Series { a: 2.5, b: 25 }
+
+quantileDataFrame(df, { q: [0.25, 0.5, 0.75] });
+// DataFrame 3×2: rows=[0.25, 0.5, 0.75], cols=[a, b]
+
+
+ +

6 · DataFrame — axis=1 (per-row quantiles)

+
+
const df = DataFrame.fromColumns({ a: [1, 2, 3], b: [3, 4, 5], c: [5, 6, 7] });
+quantileDataFrame(df, { axis: 1, q: 0.5 });
+// Series — median of each row: [3, 4, 5]
+
+
+ +

7 · Q=[0, 0.25, 0.5, 0.75, 1] summary table

+
+
const df = DataFrame.fromColumns({ score: [55, 70, 80, 88, 92, 95, 99] });
+quantileDataFrame(df, { q: [0, 0.25, 0.5, 0.75, 1] });
+// → summary statistics table
+
+
+ + + + diff --git a/playground/replace.html b/playground/replace.html new file mode 100644 index 00000000..19da518a --- /dev/null +++ b/playground/replace.html @@ -0,0 +1,408 @@ + + + + + + tsb — replace (value substitution) + + + +
+
+
Loading tsb runtime…
+
+ + ← Back to playground index + +

replace — value substitution

+

+ replaceSeries / replaceDataFrame substitute values + matching a pattern with a new value.
+ Supports scalar, array, and mapping (Record / Map) replacement specs.
+ Mirrors Series.replace() and DataFrame.replace() from pandas. +

+ + +
+

1 · Scalar → scalar replacement

+

+ Replace every occurrence of a single value with another value. + Works on numbers, strings, booleans, and null. +

+
+
+
+ + +
+
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

2 · Array replacement

+

+ Replace a list of values with a single target, or perform pair-wise + replacement using two equal-length arrays. +

+
+
+
+ + +
+
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

3 · Mapping (Record / Map) replacement

+

+ Pass a lookup table as either a plain object (Record<string, Scalar>) + or a JavaScript Map for full type flexibility. +

+
+
+
+ + +
+
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

4 · DataFrame replacement

+

+ replaceDataFrame applies the same spec to all columns by + default. Use the columns option to restrict which columns + are affected. +

+
+
+
+ + +
+
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

API Reference

+
// Replace values in a Series
+replaceSeries(
+  series: Series,
+  spec: ReplaceSpec,
+  options?: ReplaceOptions,
+): Series
+
+// Replace values in a DataFrame
+replaceDataFrame(
+  df: DataFrame,
+  spec: ReplaceSpec,
+  options?: DataFrameReplaceOptions,
+): DataFrame
+
+// Replacement spec variants
+type ReplaceSpec =
+  | { toReplace: Scalar;              value: Scalar }               // scalar → scalar
+  | { toReplace: Scalar[];            value: Scalar }               // array  → scalar
+  | { toReplace: Scalar[];            value: Scalar[] }             // array  → array (pair-wise)
+  | { toReplace: Record<string, Scalar> }                          // Record mapping
+  | { toReplace: Map<Scalar, Scalar> }                             // Map mapping
+
+// Options
+interface ReplaceOptions {
+  matchNaN?: boolean;  // treat NaN===NaN for matching (default: true)
+}
+
+interface DataFrameReplaceOptions extends ReplaceOptions {
+  columns?: string[];  // only replace in these columns (default: all)
+}
+
+ + + + + diff --git a/playground/sem_var.html b/playground/sem_var.html new file mode 100644 index 00000000..a3114054 --- /dev/null +++ b/playground/sem_var.html @@ -0,0 +1,90 @@ + + + + + + tsb — sem_var + + + +

← tsb playground

+

📊 Variance & Standard Error (sem_var)

+

+ varSeries / semSeries / + varDataFrame / semDataFrame — + compute sample/population variance and standard error of the mean, mirroring + Series.var() and + Series.sem(). +

+

Equivalent Python: series.var(ddof=1) / series.sem()

+ +

1 · Sample variance (ddof=1)

+
+
import { Series, varSeries } from "tsb";
+
+const s = new Series({ data: [2, 4, 4, 4, 5, 5, 7, 9] });
+varSeries(s);           // ≈ 4.5714  (sample variance, ddof=1)
+varSeries(s, { ddof: 0 }); // 4.0  (population variance, ddof=0)
+
varSeries(s) → ≈4.5714 +varSeries(s, {ddof:0}) → 4.0
+
+ +

2 · Standard error of the mean

+
+
import { semSeries } from "tsb";
+
+// SEM = sqrt(var / n)
+semSeries(s); // sqrt(4.5714 / 8) ≈ 0.7559
+
semSeries(s) ≈ 0.7559
+
+ +

3 · Handling missing values

+
+
const s2 = new Series({ data: [1, 2, 3, null, 5] });
+
+varSeries(s2);                 // skipna=true (default): ignores null
+varSeries(s2, { skipna: false }); // propagates NaN when null present
+varSeries(s2, { minCount: 5 });   // NaN: need 5 valid values but only 4
+
varSeries(s2) → 2.9167 (approx) +varSeries(s2, {skipna:false}) → NaN +varSeries(s2, {minCount:5}) → NaN
+
+ +

4 · DataFrame column-wise variance

+
+
import { DataFrame, varDataFrame, semDataFrame } from "tsb";
+
+const df = DataFrame.fromColumns({
+  a: [1, 2, 3],
+  b: [10, 20, 30],
+});
+
+varDataFrame(df);          // Series { a: 1, b: 100 }
+semDataFrame(df);          // Series { a: sqrt(1/3), b: sqrt(100/3) }
+varDataFrame(df, { axis: 1 }); // row-wise variance
+
varDataFrame(df) → a: 1.0, b: 100.0 +semDataFrame(df) → a: ≈0.577, b: ≈5.774 +varDataFrame(df, {axis:1}) → row0: 40.5, row1: 162.0, row2: 364.5
+
+ +

5 · numericOnly — skip non-numeric columns

+
+
const df2 = DataFrame.fromColumns({
+  score: [10, 20, 30],
+  label: ["A", "B", "C"],
+});
+
+varDataFrame(df2, { numericOnly: true });
+// Only includes "score", excludes "label"
+
varDataFrame(df2, {numericOnly:true}) → score: 100.0
+
+ + diff --git a/playground/skew_kurt.html b/playground/skew_kurt.html new file mode 100644 index 00000000..bec28a8b --- /dev/null +++ b/playground/skew_kurt.html @@ -0,0 +1,137 @@ + + + + + + tsb — skew & kurtosis + + + +

← tsb playground

+

📐 skewSeries / kurtSeries

+

+ skewSeries / kurtSeries — + compute the adjusted Fisher–Pearson skewness and excess kurtosis (bias-corrected), mirroring + Series.skew() and + Series.kurt(). +

+

Equivalent Python: series.skew() / series.kurt()

+ +

1 · Symmetric distribution — skew ≈ 0

+
+
const s = new Series({ data: [1, 2, 3, 4, 5] });
+skewSeries(s);
+// → 0
+
+
+ +

2 · Right-skewed distribution — positive skew

+
+
const s = new Series({ data: [1, 2, 3, 4, 100] });
+skewSeries(s);
+// → large positive value
+
+
+ +

3 · Kurtosis — uniform-like (platykurtic, negative excess)

+
+
const s = new Series({ data: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] });
+kurtSeries(s);
+// → negative (flatter than normal)
+
+
+ +

4 · NaN propagation — too few values

+
+
skewSeries(new Series({ data: [1, 2] }));  // NaN — need ≥ 3
+kurtSeries(new Series({ data: [1, 2, 3] })); // NaN — need ≥ 4
+
+
+ +

5 · DataFrame column-wise skewness

+
+
const df = DataFrame.fromColumns({
+  symmetric: [1, 2, 3, 4, 5],
+  right_skew: [1, 2, 3, 4, 100],
+});
+skewDataFrame(df).values;
+
+
+ +

6 · DataFrame row-wise kurtosis

+
+
const df = DataFrame.fromColumns({
+  a: [1, 10], b: [2, 10], c: [3, 10], d: [4, 10], e: [100, 10], f: [5, 10],
+});
+kurtDataFrame(df, { axis: 1 }).values;
+
+
+ + + + diff --git a/playground/to_datetime.html b/playground/to_datetime.html new file mode 100644 index 00000000..9ed06810 --- /dev/null +++ b/playground/to_datetime.html @@ -0,0 +1,118 @@ + + + + + + tsb — toDatetime + + + +

← tsb playground

+

toDatetime stats

+

+ Convert scalars, arrays, or Series values to JavaScript + Date objects — mirroring + pandas.to_datetime(). +

+ +

Supported input formats

+ + + + + + + + + + + +
FormatExampleResult
ISO 8601 date"2024-03-15"Mar 15 2024
ISO 8601 datetime"2024-03-15T12:00:00Z"Mar 15 2024 12:00 UTC
US format (MM/DD/YYYY)"01/15/2024"Jan 15 2024
European (DD-MM-YYYY)"15-03-2024"Mar 15 2024
Compact (YYYYMMDD)"20240315"Mar 15 2024
Unix ms (number)1710460800000Mar 15 2024 00:00 UTC
Unix s (unit="s")1710460800Mar 15 2024 00:00 UTC
Date objectnew Date(2024,2,15)unchanged
null / undefined / NaNnullnull
+ +

Error handling

+ + + + + +
errors=Behaviour
"raise" (default)Throws TypeError on unparseable input
"coerce"Returns null on unparseable input
"ignore"Returns the original value unchanged
+ +

Quick examples

+
import { toDatetime, Series } from "tsb";
+
+// Scalar
+toDatetime("2024-03-15");         // Date: Mar 15 2024
+toDatetime(1710460800000);        // Date from Unix ms
+toDatetime(1710460800, { unit: "s" }); // Date from Unix seconds
+toDatetime(null);                 // null
+toDatetime("nope", { errors: "coerce" }); // null
+toDatetime("nope", { errors: "ignore" }); // "nope"
+
+// Array
+toDatetime(["2024-01-01", null, "2024-06-15"]);
+// => [Date, null, Date]
+
+// Series
+const s = new Series({ data: ["2024-01-01", "2024-06-15", null] });
+toDatetime(s);
+// => Series<Date | null> with dtype=datetime
+ +

Python / pandas equivalent

+ + +

Live demo

+

Enter a date string or number and click Convert:

+ + +
+ + + + diff --git a/src/core/astype.ts b/src/core/astype.ts new file mode 100644 index 00000000..572352c4 --- /dev/null +++ b/src/core/astype.ts @@ -0,0 +1,239 @@ +/** + * astype — dtype coercion for Series and DataFrame. + * + * Mirrors `pandas.Series.astype` and `pandas.DataFrame.astype`: + * cast values to a target dtype, with null/NaN passthrough semantics + * matching pandas' default `errors="raise"` behaviour. + * + * @module + */ + +import type { DtypeName, Scalar } from "../types.ts"; +import { Dtype } from "./dtype.ts"; +import { DataFrame } from "./frame.ts"; +import { Series } from "./series.ts"; + +// ─── helpers ────────────────────────────────────────────────────────────────── + +function isNull(v: Scalar): v is null | undefined { + return v === null || v === undefined; +} + +/** Integer clamp ranges for each integer dtype name. */ +const INT_RANGES: Readonly> = { + int8: { lo: -128, hi: 127, unsigned: false }, + int16: { lo: -32768, hi: 32767, unsigned: false }, + int32: { lo: -2147483648, hi: 2147483647, unsigned: false }, + int64: { lo: Number.MIN_SAFE_INTEGER, hi: Number.MAX_SAFE_INTEGER, unsigned: false }, + uint8: { lo: 0, hi: 255, unsigned: true }, + uint16: { lo: 0, hi: 65535, unsigned: true }, + uint32: { lo: 0, hi: 4294967295, unsigned: true }, + uint64: { lo: 0, hi: Number.MAX_SAFE_INTEGER, unsigned: true }, +}; + +/** + * Cast a single scalar value to the target dtype. + * + * Rules per dtype kind: + * - **int/uint**: `Math.trunc(Number(v))`, clamped to the dtype range. `null/undefined → null`. + * - **float32/float64**: `Number(v)`. `null/undefined → null`. Strings that + * are not parsable become `NaN` (same as pandas `errors="coerce"`-like + * number coercion). + * - **bool**: falsy values → `false`; truthy → `true`. `null/undefined → null`. + * - **string**: `String(v)`. `null/undefined → null`. + * - **datetime**: `new Date(Number(v))` for numbers; `new Date(String(v))` for + * strings; `null/undefined → null`. 
+ * - **object/category/timedelta**: value is returned as-is (no transformation). + */ +export function castScalar(v: Scalar, dtype: Dtype): Scalar { + if (isNull(v)) { + return null; + } + + const k = dtype.kind; + + if (k === "int" || k === "uint") { + if (typeof v === "boolean") { + return v ? 1 : 0; + } + if (v instanceof Date) { + return Math.trunc(v.getTime()); + } + const n = Number(v); + if (Number.isNaN(n)) { + return null; + } + const range = INT_RANGES[dtype.name]; + if (range === undefined) { + return Math.trunc(n); + } + const t = Math.trunc(n); + return Math.max(range.lo, Math.min(range.hi, t)); + } + + if (k === "float") { + if (typeof v === "boolean") { + return v ? 1.0 : 0.0; + } + if (v instanceof Date) { + return v.getTime(); + } + return Number(v); + } + + if (k === "bool") { + if (typeof v === "number") { + return !Number.isNaN(v) && v !== 0; + } + if (v instanceof Date) { + return true; + } + return Boolean(v); + } + + if (k === "string") { + if (v instanceof Date) { + return v.toISOString(); + } + return String(v); + } + + if (k === "datetime") { + if (v instanceof Date) { + return v; + } + if (typeof v === "number") { + return new Date(v); + } + const d = new Date(String(v)); + return Number.isNaN(d.getTime()) ? null : d; + } + + // object / category / timedelta — return unchanged + return v; +} + +// ─── AstypeOptions ──────────────────────────────────────────────────────────── + +/** Options accepted by {@link astypeSeries} and {@link astype}. */ +export interface AstypeOptions { + /** + * When `true`, values that cannot be cast are silently replaced with + * `null` instead of throwing. + * + * @default false + */ + readonly errors?: "raise" | "ignore"; +} + +// ─── astypeSeries ───────────────────────────────────────────────────────────── + +/** + * Cast a Series to a different dtype. + * + * Returns a new Series whose values have been coerced to `dtype`. The index + * and name are preserved unchanged. 
+ * + * @example + * ```ts + * const s = new Series({ data: [1.9, 2.1, 3.7], name: "x" }); + * const si = astypeSeries(s, "int64"); + * si.values; // [1, 2, 3] + * si.dtype.name; // "int64" + * ``` + */ +export function astypeSeries( + s: Series, + dtype: DtypeName | Dtype, + options: AstypeOptions = {}, +): Series { + const targetDtype = dtype instanceof Dtype ? dtype : Dtype.from(dtype as DtypeName); + const { errors = "raise" } = options; + + const casted: Scalar[] = []; + for (const v of s.values) { + let out: Scalar; + try { + out = castScalar(v, targetDtype); + } catch (e) { + if (errors === "ignore") { + out = v; + } else { + throw e; + } + } + casted.push(out); + } + + return new Series({ + data: casted, + index: s.index, + dtype: targetDtype, + name: s.name, + }); +} + +// ─── DataFrame astype ───────────────────────────────────────────────────────── + +/** + * Options for {@link astype} (DataFrame variant). + */ +export interface DataFrameAstypeOptions extends AstypeOptions { + /** + * When `true`, only the columns listed in `dtype` (when `dtype` is a + * `Record`) are recast; other columns are carried over unchanged. + * + * When `false` (default) and `dtype` is a `Record`, columns not listed + * in the map are carried over unchanged (same behaviour). + * + * This option exists for pandas API compatibility. + */ + readonly copy?: boolean; +} + +/** + * Cast one or more columns in a DataFrame to the specified dtype(s). + * + * - Pass a single `DtypeName` or `Dtype` to cast **all** columns. + * - Pass a `Record` to cast individual columns. + * Columns not listed are returned unchanged. + * + * Returns a new DataFrame; the original is not modified. 
+ * + * @example + * ```ts + * const df = DataFrame.fromColumns({ a: [1.5, 2.7], b: ["3", "4"] }); + * + * // Cast all columns to float64 + * astype(df, "float64"); + * + * // Cast only column "b" to int64 + * astype(df, { b: "int64" }); + * ``` + */ +export function astype( + df: DataFrame, + dtype: DtypeName | Dtype | Readonly>, + options: DataFrameAstypeOptions = {}, +): DataFrame { + const colMap = new Map>(); + + const isSingleDtype = typeof dtype === "string" || dtype instanceof Dtype; + + for (const name of df.columns.values) { + const col = df.col(name); + if (isSingleDtype) { + colMap.set(name, astypeSeries(col, dtype as DtypeName | Dtype, options)); + } else { + const mapping = dtype as Readonly>; + const target = mapping[name]; + if (target !== undefined) { + colMap.set(name, astypeSeries(col, target, options)); + } else { + colMap.set(name, col); + } + } + } + + return new DataFrame(colMap, df.index); +} diff --git a/src/core/index.ts b/src/core/index.ts index e737ec8f..3fd31e7c 100644 --- a/src/core/index.ts +++ b/src/core/index.ts @@ -126,3 +126,5 @@ export { isPeriodDtype, isIntervalDtype, } from "./api_types.ts"; +export { astypeSeries, astype, castScalar } from "./astype.ts"; +export type { AstypeOptions, DataFrameAstypeOptions } from "./astype.ts"; diff --git a/src/index.ts b/src/index.ts index b95ad4be..37c6e62e 100644 --- a/src/index.ts +++ b/src/index.ts @@ -465,3 +465,73 @@ export type { SeriesToStringOptions, DataFrameToStringOptions, } from "./stats/index.ts"; + +// PR #120 unique modules — re-exported from sub-barrels +export { astypeSeries, astype, castScalar } from "./core/index.ts"; +export type { AstypeOptions, DataFrameAstypeOptions } from "./core/index.ts"; +// readExcel / xlsxSheetNames use node:zlib — import from "tsb/io/read_excel" directly +export { clipAdvancedSeries, clipAdvancedDataFrame } from "./stats/index.ts"; +export type { + SeriesBound, + DataFrameBound, + ClipAdvancedSeriesOptions, + ClipAdvancedDataFrameOptions, +} 
from "./stats/index.ts"; +export { idxminSeries, idxmaxSeries, idxminDataFrame, idxmaxDataFrame } from "./stats/index.ts"; +export type { IdxOptions, IdxDataFrameOptions } from "./stats/index.ts"; +export { modeSeries, modeDataFrame } from "./stats/index.ts"; +export type { ModeSeriesOptions, ModeDataFrameOptions } from "./stats/index.ts"; +export { + nancount, + nansum, + nanmean, + nanmedian, + nanvar, + nanstd, + nanmin, + nanmax, + nanprod, +} from "./stats/index.ts"; +export type { NanInput, NanAggOptions } from "./stats/index.ts"; +export { + nuniqueSeries, + nuniqueDataFrame, + anySeries, + allSeries, + anyDataFrame, + allDataFrame, +} from "./stats/index.ts"; +export type { + NuniqueSeriesOptions, + NuniqueDataFrameOptions, + AnyAllSeriesOptions, + AnyAllDataFrameOptions, +} from "./stats/index.ts"; +export { pctChangeSeries, pctChangeDataFrame } from "./stats/index.ts"; +export type { + PctChangeFillMethod, + PctChangeOptions, + DataFramePctChangeOptions, +} from "./stats/index.ts"; +export { quantileSeries, quantileDataFrame } from "./stats/index.ts"; +export type { + QuantileInterpolation, + QuantileSeriesOptions, + QuantileDataFrameOptions, +} from "./stats/index.ts"; +export { replaceSeries, replaceDataFrame } from "./stats/index.ts"; +export type { + ReplaceMapping, + ReplaceSpec, + ReplaceOptions, + DataFrameReplaceOptions, +} from "./stats/index.ts"; +export { varSeries, semSeries, varDataFrame, semDataFrame } from "./stats/index.ts"; +export type { VarSemSeriesOptions, VarSemDataFrameOptions } from "./stats/index.ts"; +export { skewSeries, kurtSeries, skewDataFrame, kurtDataFrame } from "./stats/index.ts"; +export type { + SkewKurtSeriesOptions, + SkewKurtDataFrameOptions, +} from "./stats/index.ts"; +export { toDatetime } from "./stats/index.ts"; +export type { DatetimeUnit, DatetimeErrors, ToDatetimeOptions } from "./stats/index.ts"; diff --git a/src/io/index.ts b/src/io/index.ts index d4f27f3b..e868c4c8 100644 --- a/src/io/index.ts +++ 
b/src/io/index.ts @@ -10,3 +10,6 @@ export { readJson, toJson } from "./json.ts"; export type { ReadJsonOptions, ToJsonOptions, JsonOrient } from "./json.ts"; export { jsonNormalize } from "./json_normalize.ts"; export type { JsonPath, JsonNormalizeOptions } from "./json_normalize.ts"; +// readExcel / xlsxSheetNames use node:zlib and cannot be bundled for the +// browser. Import them directly from "tsb/io/read_excel" when running in +// Node / Bun. diff --git a/src/io/read_excel.ts b/src/io/read_excel.ts new file mode 100644 index 00000000..97d06065 --- /dev/null +++ b/src/io/read_excel.ts @@ -0,0 +1,645 @@ +/** + * readExcel — XLSX file reading for DataFrame. + * + * Mirrors `pandas.read_excel()`: + * - `readExcel(data, options?)` — parse an XLSX binary buffer into a DataFrame. + * - `xlsxSheetNames(data)` — list sheet names without parsing cell data. + * + * Supports: + * - Shared string table (type `"s"`) + * - Inline strings (type `"inlineStr"`) + * - Numbers (type absent or `"n"`) + * - Booleans (type `"b"`) + * - Formula cached values (type `"str"`) + * - Error cells (type `"e"`) — returned as null + * - ZIP STORED (method 0) and DEFLATED (method 8) entries + * + * Limitations (deferred): + * - XLSX only — not XLS (legacy binary format) + * - No ZIP64 support (up to ~4 GB) + * - Date serial numbers are not converted (returned as numeric) + * + * @module + */ + +// biome-ignore lint/correctness/noNodejsModules: raw DEFLATE decompression for ZIP/XLSX requires node:zlib +import { inflateRawSync } from "node:zlib"; +import { DataFrame } from "../core/index.ts"; +import { Index } from "../core/index.ts"; +import { RangeIndex } from "../core/index.ts"; +import { Series } from "../core/index.ts"; +import { Dtype } from "../core/index.ts"; +import type { DtypeName, Label, Scalar } from "../types.ts"; + +// ─── public types ───────────────────────────────────────────────────────────── + +/** Options for {@link readExcel}. 
*/ +export interface ReadExcelOptions { + /** + * Which sheet to read. + * - `string`: exact sheet name + * - `number`: 0-based sheet index + * - Default: `0` (first sheet) + */ + readonly sheetName?: string | number; + /** + * Row index of the header row, or `null` for no header (columns become + * `"0"`, `"1"`, `"2"`, …). + * Default: `0`. + */ + readonly header?: number | null; + /** + * Column name or 0-based index of the column to use as the row index. + * Default: `null` (use a default `RangeIndex`). + */ + readonly indexCol?: string | number | null; + /** + * Number of data rows to skip after the header row. + * Default: `0`. + */ + readonly skipRows?: number; + /** + * Maximum number of data rows to read. + */ + readonly nrows?: number; + /** + * Additional strings to treat as NA (beyond the built-in set: + * `""`, `"NA"`, `"N/A"`, `"null"`, `"NaN"`, `"nan"`, `"#N/A"`). + */ + readonly naValues?: readonly string[]; + /** + * Explicit dtype overrides per column name. + */ + readonly dtype?: Readonly>; +} + +// ─── ZIP low-level helpers ──────────────────────────────────────────────────── + +/** Read a little-endian uint16 from a buffer. */ +function readU16(buf: Uint8Array, off: number): number { + return ((buf[off] ?? 0) | ((buf[off + 1] ?? 0) << 8)) >>> 0; +} + +/** Read a little-endian uint32 from a buffer. */ +function readU32(buf: Uint8Array, off: number): number { + return ( + ((buf[off] ?? 0) | + ((buf[off + 1] ?? 0) << 8) | + ((buf[off + 2] ?? 0) << 16) | + ((buf[off + 3] ?? 0) << 24)) >>> + 0 + ); +} + +const ZIP_EOCD_SIG = 0x06054b50; +const ZIP_CD_SIG = 0x02014b50; +const ZIP_COMP_STORED = 0; +const ZIP_COMP_DEFLATE = 8; + +interface ZipEntry { + readonly name: string; + readonly compressedSize: number; + readonly uncompressedSize: number; + readonly method: number; + readonly dataOffset: number; +} + +/** Search for the End-of-Central-Directory record. 
*/ +function findEocd(buf: Uint8Array): number { + const minOff = Math.max(0, buf.length - 65558); + for (let i = buf.length - 22; i >= minOff; i--) { + if (readU32(buf, i) === ZIP_EOCD_SIG) { + return i; + } + } + throw new Error("Not a valid XLSX file: no ZIP end-of-central-directory found"); +} + +/** Compute the actual data offset from the local file header. */ +function localDataOffset(buf: Uint8Array, localOff: number): number { + const nameLen = readU16(buf, localOff + 26); + const extraLen = readU16(buf, localOff + 28); + return localOff + 30 + nameLen + extraLen; +} + +/** Parse the ZIP central directory and return a name→entry map. */ +function parseZipEntries(buf: Uint8Array): Map { + const eocd = findEocd(buf); + const cdOffset = readU32(buf, eocd + 16); + const cdSize = readU32(buf, eocd + 12); + const dec = new TextDecoder("utf-8"); + const entries = new Map(); + let pos = cdOffset; + while (pos < cdOffset + cdSize && pos + 46 <= buf.length) { + if (readU32(buf, pos) !== ZIP_CD_SIG) { + break; + } + const method = readU16(buf, pos + 10); + const compressedSize = readU32(buf, pos + 20); + const uncompressedSize = readU32(buf, pos + 24); + const nameLen = readU16(buf, pos + 28); + const extraLen = readU16(buf, pos + 30); + const commentLen = readU16(buf, pos + 32); + const localOff = readU32(buf, pos + 42); + const name = dec.decode(buf.subarray(pos + 46, pos + 46 + nameLen)); + const dataOffset = localDataOffset(buf, localOff); + entries.set(name, { name, compressedSize, uncompressedSize, method, dataOffset }); + pos += 46 + nameLen + extraLen + commentLen; + } + return entries; +} + +/** Decompress a ZIP entry and decode it as a UTF-8 string. 
*/ +function extractEntry(buf: Uint8Array, entry: ZipEntry): string { + const raw = buf.subarray(entry.dataOffset, entry.dataOffset + entry.compressedSize); + let bytes: Uint8Array; + if (entry.method === ZIP_COMP_STORED) { + bytes = raw; + } else if (entry.method === ZIP_COMP_DEFLATE) { + bytes = inflateRawSync(raw); + } else { + throw new Error(`Unsupported ZIP compression method: ${entry.method}`); + } + return new TextDecoder("utf-8").decode(bytes); +} + +/** Extract a named entry or return null if absent. */ +function getZipEntry(buf: Uint8Array, entries: Map, name: string): string | null { + const entry = entries.get(name); + if (entry === undefined) { + return null; + } + return extractEntry(buf, entry); +} + +// ─── XML helpers ────────────────────────────────────────────────────────────── + +// Top-level regex constants (Biome useTopLevelRegex) +const RE_XML_ENTITY = /&(?:amp|lt|gt|quot|apos);/g; +const RE_SST_SI = /([\s\S]*?)<\/si>/g; +const RE_SST_T = /]*)>([\s\S]*?)<\/t>/g; +const RE_WB_SHEET = /]*)>/g; +const RE_REL = /]*)>/g; +const RE_ROW = /]*)>([\s\S]*?)<\/row>/g; +const RE_CELL = /]*)>([\s\S]*?)<\/c>/g; +const RE_CELL_V = /([\s\S]*?)<\/v>/; +const RE_CELL_IS = /[\s\S]*?]*)>([\s\S]*?)<\/t>/; +const RE_COL_LETTERS = /^([A-Z]+)(\d+)$/; + +/** Replace XML character references with their literal characters. */ +function xmlUnescape(s: string): string { + return s.replace(RE_XML_ENTITY, (m) => { + if (m === "&") { + return "&"; + } + if (m === "<") { + return "<"; + } + if (m === ">") { + return ">"; + } + if (m === """) { + return '"'; + } + return "'"; + }); +} + +/** + * Extract the value of a single named XML attribute from an attribute string. + * Uses `new RegExp` (not a literal) to support dynamic attribute names. + */ +function attrVal(attrStr: string, key: string): string { + const re = new RegExp(`\\b${key}="([^"]*)"`); + return re.exec(attrStr)?.[1] ?? 
""; +} + +// ─── XLSX-specific XML parsing ──────────────────────────────────────────────── + +/** Iterate all non-overlapping matches of a global regex against a string. */ +function* regexAll(re: RegExp, str: string): Generator { + re.lastIndex = 0; + let m = re.exec(str); + while (m !== null) { + yield m; + m = re.exec(str); + } +} + +/** Parse the shared string table XML into an array of strings. */ +function parseSiText(siContent: string): string { + let text = ""; + for (const t of regexAll(RE_SST_T, siContent)) { + text += xmlUnescape(t[1] ?? ""); + } + return text; +} + +/** Parse the shared string table XML into an array of strings. */ +function parseSharedStrings(xml: string): string[] { + const strings: string[] = []; + for (const si of regexAll(RE_SST_SI, xml)) { + strings.push(parseSiText(si[1] ?? "")); + } + return strings; +} + +interface SheetInfo { + readonly name: string; + readonly rid: string; +} + +/** Parse the workbook XML and return a list of sheet descriptors. */ +function parseWorkbookSheets(xml: string): SheetInfo[] { + const sheets: SheetInfo[] = []; + for (const m of regexAll(RE_WB_SHEET, xml)) { + const attrs = m[1] ?? ""; + const name = xmlUnescape(attrVal(attrs, "name")); + const rid = attrVal(attrs, "r:id"); + if (name !== "") { + sheets.push({ name, rid }); + } + } + return sheets; +} + +/** Parse the workbook relationships XML and return a rid→target map. */ +function parseRelationships(xml: string): Map { + const map = new Map(); + for (const m of regexAll(RE_REL, xml)) { + const attrs = m[1] ?? ""; + const id = attrVal(attrs, "Id"); + const target = attrVal(attrs, "Target"); + if (id !== "") { + map.set(id, target); + } + } + return map; +} + +// ─── Cell parsing ───────────────────────────────────────────────────────────── + +/** Convert a column letter string (e.g. "A", "AB") to a 0-based index. 
*/ +function colLetterToIndex(col: string): number { + let idx = 0; + for (const ch of col) { + idx = idx * 26 + (ch.charCodeAt(0) - 64); + } + return idx - 1; +} + +/** + * Parse a cell reference (e.g. "A1") into [rowIndex, colIndex] (both 0-based). + */ +function parseCellRef(ref: string): readonly [number, number] { + const m = RE_COL_LETTERS.exec(ref); + if (m === null) { + throw new Error(`Invalid cell reference: ${ref}`); + } + const colLetters = m[1] ?? ""; + const rowNum = Number.parseInt(m[2] ?? "1", 10); + return [rowNum - 1, colLetterToIndex(colLetters)]; +} + +/** Resolve a cell value given its type tag and raw text. */ +function resolveCellValue( + cellType: string, + vText: string, + isText: string, + sharedStrings: readonly string[], +): Scalar { + if (cellType === "s") { + const idx = Number.parseInt(vText, 10); + return sharedStrings[idx] ?? null; + } + if (cellType === "b") { + return vText === "1"; + } + if (cellType === "inlineStr") { + return xmlUnescape(isText); + } + if (cellType === "e") { + return null; + } + // "str" (formula string), "n" (number), or absent (number) + if (vText === "") { + return null; + } + const n = Number(vText); + return Number.isNaN(n) ? xmlUnescape(vText) : n; +} + +interface RawRow { + readonly rowIndex: number; + readonly cells: ReadonlyMap; +} + +/** Parse a single `` element into a RawRow. */ +function parseOneRow( + rowAttrs: string, + rowContent: string, + sharedStrings: readonly string[], +): RawRow { + const rowIdxStr = attrVal(rowAttrs, "r"); + const rowIndex = rowIdxStr === "" ? 0 : Number.parseInt(rowIdxStr, 10) - 1; + const cells = new Map(); + for (const cellMatch of regexAll(RE_CELL, rowContent)) { + const cellAttrs = cellMatch[1] ?? ""; + const cellContent = cellMatch[2] ?? ""; + const ref = attrVal(cellAttrs, "r"); + if (ref === "") { + continue; + } + const cellType = attrVal(cellAttrs, "t"); + const vMatch = RE_CELL_V.exec(cellContent); + const vText = vMatch !== null ? xmlUnescape(vMatch[1] ?? 
"") : ""; + const isMatch = RE_CELL_IS.exec(cellContent); + const isText = isMatch?.[1] ?? ""; + const [, colIdx] = parseCellRef(ref); + cells.set(colIdx, resolveCellValue(cellType, vText, isText, sharedStrings)); + } + return { rowIndex, cells }; +} + +/** Parse all `` elements from a worksheet XML string. */ +function parseWorksheetRows(xml: string, sharedStrings: readonly string[]): RawRow[] { + const rows: RawRow[] = []; + for (const rowMatch of regexAll(RE_ROW, xml)) { + rows.push(parseOneRow(rowMatch[1] ?? "", rowMatch[2] ?? "", sharedStrings)); + } + return rows; +} + +// ─── DataFrame construction ─────────────────────────────────────────────────── + +const BUILTIN_NA = new Set(["", "NA", "N/A", "null", "NaN", "nan", "#N/A"]); + +/** True when a string value should be coerced to null. */ +function isNaStr(s: string, extraNa: ReadonlySet): boolean { + return BUILTIN_NA.has(s) || extraNa.has(s); +} + +/** Coerce a raw cell value to null when it matches an NA sentinel. */ +function coerceNa(val: Scalar, extraNa: ReadonlySet): Scalar { + if (typeof val === "string" && isNaStr(val, extraNa)) { + return null; + } + return val; +} + +/** Compute the maximum column index across all rows. */ +function maxColIndex(rows: readonly RawRow[]): number { + let max = 0; + for (const row of rows) { + for (const col of row.cells.keys()) { + if (col > max) { + max = col; + } + } + } + return max; +} + +interface ColumnarData { + readonly columns: string[]; + readonly data: Scalar[][]; +} + +/** Pad header labels array to `numCols` with numeric fallback names. */ +function padHeaderLabels(labels: string[], numCols: number): void { + while (labels.length < numCols) { + labels.push(String(labels.length)); + } +} + +/** Extract header labels from the header row. 
*/ +function extractHeaderLabels( + rows: readonly RawRow[], + headerRow: number, + numCols: number, +): string[] { + const labels: string[] = []; + const hRow = rows.find((r) => r.rowIndex === headerRow); + if (hRow !== undefined) { + for (let c = 0; c < numCols; c++) { + const v = hRow.cells.get(c) ?? null; + labels.push(v !== null ? String(v) : String(c)); + } + } + return labels; +} + +/** Pivot sliced data rows into per-column arrays. */ +function pivotToColumns( + sliced: readonly RawRow[], + numCols: number, + extraNa: ReadonlySet, +): Scalar[][] { + const data: Scalar[][] = Array.from({ length: numCols }, (): Scalar[] => []); + for (const row of sliced) { + for (let c = 0; c < numCols; c++) { + const val = coerceNa(row.cells.get(c) ?? null, extraNa); + (data[c] as Scalar[]).push(val); + } + } + return data; +} + +/** Separate header and data rows, then pivot to column-oriented arrays. */ +function buildColumnarData( + rows: readonly RawRow[], + headerRow: number | null, + skipRows: number, + nrows: number | undefined, + extraNa: ReadonlySet, +): ColumnarData { + const numCols = rows.length === 0 ? 0 : maxColIndex(rows) + 1; + const dataRows = rows.filter((r) => headerRow === null || r.rowIndex !== headerRow); + const headerLabels = headerRow !== null ? extractHeaderLabels(rows, headerRow, numCols) : []; + padHeaderLabels(headerLabels, numCols); + const sliced = dataRows.slice(skipRows, nrows !== undefined ? skipRows + nrows : undefined); + const data = pivotToColumns(sliced, numCols, extraNa); + return { columns: headerLabels, data }; +} + +/** Infer a dtype from a column's scalar values. 
*/ +function inferColDtype(values: readonly Scalar[], override: DtypeName | undefined): DtypeName { + if (override !== undefined) { + return override; + } + let allNum = true; + let allBool = true; + let allStr = true; + for (const v of values) { + if (v === null || v === undefined) { + continue; + } + if (typeof v !== "number") { + allNum = false; + } + if (typeof v !== "boolean") { + allBool = false; + } + if (typeof v !== "string") { + allStr = false; + } + } + if (allBool) { + return "bool"; + } + if (allNum) { + return "float64"; + } + if (allStr) { + return "string"; + } + return "object"; +} + +/** Build a DataFrame from parsed rows and options. */ +function buildDataFrame(rows: readonly RawRow[], options: ReadExcelOptions): DataFrame { + const headerRow = options.header !== undefined ? (options.header ?? null) : 0; + const skipRows = options.skipRows ?? 0; + const extraNa = new Set(options.naValues ?? []); + const dtypeOvr: Readonly> = options.dtype ?? {}; + const { columns, data } = buildColumnarData(rows, headerRow, skipRows, options.nrows, extraNa); + const indexColOpt = options.indexCol ?? null; + const indexColIdx = resolveIndexColIdx(columns, indexColOpt); + const rowCount = (data[0] ?? []).length; + const colMap = new Map>(); + for (let c = 0; c < columns.length; c++) { + if (c === indexColIdx) { + continue; + } + const colName = columns[c] ?? String(c); + const colData = data[c] ?? []; + const dtypeName = inferColDtype(colData, dtypeOvr[colName]); + colMap.set(colName, new Series({ data: colData, dtype: Dtype.from(dtypeName), name: colName })); + } + const toLabel = (v: Scalar): Label => + v === undefined || typeof v === "bigint" || v instanceof Date ? null : v; + const rowIndex = + indexColIdx >= 0 + ? new Index