Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
109 changes: 109 additions & 0 deletions playground/str_get_dummies.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
<!doctype html>
<html lang="en">
<head>
<meta charset="UTF-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>tsb — str.get_dummies: multi-label string encoding</title>
<style>
* { box-sizing: border-box; margin: 0; padding: 0; }
body { font-family: system-ui, sans-serif; background: #0d1117; color: #c9d1d9; line-height: 1.6; padding: 2rem; }
h1 { color: #58a6ff; font-size: 1.8rem; margin-bottom: .5rem; }
h2 { color: #79c0ff; font-size: 1.2rem; margin: 2rem 0 .75rem; }
p { color: #8b949e; margin-bottom: 1rem; max-width: 800px; }
code { background: #161b22; padding: .1rem .4rem; border-radius: 4px; font-family: monospace; font-size: .9em; color: #a5d6ff; }
.card { background: #161b22; border: 1px solid #30363d; border-radius: 8px; padding: 1.5rem; margin-bottom: 1.5rem; max-width: 900px; }
textarea { width: 100%; background: #0d1117; border: 1px solid #30363d; border-radius: 6px; color: #c9d1d9; font-family: monospace; font-size: .85rem; padding: .75rem; resize: vertical; min-height: 140px; }
button { background: #238636; color: #fff; border: none; border-radius: 6px; padding: .5rem 1.25rem; cursor: pointer; font-size: .9rem; margin-top: .75rem; }
button:hover { background: #2ea043; }
pre { background: #0d1117; border: 1px solid #21262d; border-radius: 6px; padding: 0.75rem 1rem; overflow-x: auto; font-size: 0.85rem; white-space: pre-wrap; margin-top: 0.5rem; color: #7ee787; font-family: monospace; }
a { color: #58a6ff; }
</style>
</head>
<body>
<h1>str.get_dummies — multi-label string encoding</h1>
<p>
Port of <code>pandas.Series.str.get_dummies(sep)</code>. Splits each
string by a separator (default <code>"|"</code>) and returns a
<code>DataFrame</code> of binary indicator columns — one per unique token,
sorted lexicographically. <code>null</code> / <code>undefined</code> /
<code>NaN</code> values produce a row of all zeros.
</p>
<p><a href="./index.html">← back to index</a></p>

<div class="card">
<h2>Example 1 — basic split on <code>|</code></h2>
<textarea id="ex1-code">
const { Series, strGetDummies } = tsb;
const s = new Series({ data: ["a|b", "b|c", "a"], name: "tags" });
const df = strGetDummies(s);
console.log(JSON.stringify(df.toRecords(), null, 2));
console.log("columns =", df.columns.values.join(", "));
</textarea>
<button onclick="run('ex1')">Run</button>
<pre id="ex1-out">(click Run)</pre>
</div>

<div class="card">
<h2>Example 2 — custom separator</h2>
<textarea id="ex2-code">
const { Series, strGetDummies } = tsb;
const s = new Series({ data: ["red,green", "green,blue", "red"] });
const df = strGetDummies(s, { sep: "," });
console.log(JSON.stringify(df.toRecords(), null, 2));
</textarea>
<button onclick="run('ex2')">Run</button>
<pre id="ex2-out">(click Run)</pre>
</div>

<div class="card">
<h2>Example 3 — null / undefined / NaN → all-zero rows</h2>
<textarea id="ex3-code">
const { Series, strGetDummies } = tsb;
const s = new Series({ data: ["a|b", null, undefined, NaN, "b"] });
const df = strGetDummies(s);
console.log(JSON.stringify(df.toRecords(), null, 2));
</textarea>
<button onclick="run('ex3')">Run</button>
<pre id="ex3-out">(click Run)</pre>
</div>

<div class="card">
<h2>Example 4 — preserved Series index</h2>
<textarea id="ex4-code">
const { Series, strGetDummies } = tsb;
const s = new Series({ data: ["python|pandas", "python|numpy", "pandas|numpy|scipy"], index: ["row-1", "row-2", "row-3"] });
const df = strGetDummies(s);
console.log("index =", df.index.values.join(", "));
console.log(JSON.stringify(df.toRecords(), null, 2));
</textarea>
<button onclick="run('ex4')">Run</button>
<pre id="ex4-out">(click Run)</pre>
</div>

<script type="module">
// Load the library from the sibling source tree first; if that dynamic import
// rejects, fall back to the `tsb@latest` build served by esm.sh. The resolved
// module namespace is also exposed on `window` for use from the dev console.
const tsb = await import("../src/index.ts").catch(
  () => import("https://esm.sh/tsb@latest"),
);
window.tsb = tsb;

/**
 * Execute the snippet held in the `<id>-code` textarea and render everything
 * it passed to `console.log` into the `<id>-out` element.
 *
 * `console.log` is temporarily replaced while the snippet runs and is always
 * restored afterwards, even when the snippet throws.
 *
 * @param id Example identifier, e.g. "ex1"; used to look up both elements.
 */
window.run = function run(id) {
  const code = document.getElementById(`${id}-code`).value;
  const out = document.getElementById(`${id}-out`);
  const logs = [];
  const origLog = console.log;
  console.log = (...args) => logs.push(args.map(String).join(" "));
  try {
    // The snippet receives the library namespace as its single `tsb` argument.
    new Function("tsb", code)(tsb);
    out.textContent = logs.join("\n") || "(no output)";
  } catch (e) {
    // User code may throw anything (strings, numbers, ...), not only Error
    // instances, so do not assume a `.message` property exists.
    const reason = e instanceof Error ? e.message : String(e);
    // Keep whatever was logged before the failure so partial output is visible.
    out.textContent = [...logs, "Error: " + reason].join("\n");
  } finally {
    console.log = origLog;
  }
};
</script>
</body>
</html>
3 changes: 2 additions & 1 deletion src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -386,7 +386,6 @@ export {
} from "./core/index.ts";
export {
strNormalize,
strGetDummies,
strExtractAll,
strRemovePrefix,
strRemoveSuffix,
Expand All @@ -401,6 +400,8 @@ export {
strIndent,
strDedent,
} from "./stats/index.ts";
export { strGetDummies } from "./stats/index.ts";
export type { StrGetDummiesOptions } from "./stats/index.ts";
export type {
NormalizeForm,
StrInput,
Expand Down
3 changes: 2 additions & 1 deletion src/stats/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -203,7 +203,6 @@ export { fillna, countna, countValid } from "./notna_isna.ts";
export type { IsnaInput, FillnaOptions, DropnaOptions } from "./notna_isna.ts";
export {
strNormalize,
strGetDummies,
strExtractAll,
strRemovePrefix,
strRemoveSuffix,
Expand All @@ -212,6 +211,8 @@ export {
strByteLength,
} from "./string_ops.ts";
export type { NormalizeForm, StrInput, ExtractAllOptions } from "./string_ops.ts";
export { strGetDummies } from "./str_get_dummies.ts";
export type { StrGetDummiesOptions } from "./str_get_dummies.ts";
export {
strSplitExpand,
strExtractGroups,
Expand Down
129 changes: 129 additions & 0 deletions src/stats/str_get_dummies.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
/**
* str_get_dummies — split string Series by separator and return a DataFrame of
* binary dummy/indicator variables.
*
* Mirrors `pandas.Series.str.get_dummies(sep='|')`.
*
* Each element is split by `sep`; the unique tokens across all elements become
* columns. A cell is **1** if the token appeared in that row, **0** otherwise.
* Missing values (`null` / `undefined` / `NaN`) contribute no tokens and
* produce a row of all zeros. Columns are sorted lexicographically and the
* original Series index is preserved on the returned DataFrame.
*
* @example
* ```ts
* import { Series, strGetDummies } from "tsb";
*
* const s = new Series({ data: ["a|b", "b|c", "a"], name: "flags" });
* const df = strGetDummies(s);
* // DataFrame:
* // a b c
* // 0 1 1 0
* // 1 0 1 1
* // 2 1 0 0
* ```
*
* @module
*/

import { DataFrame, Series } from "../core/index.ts";
import type { Scalar } from "../types.ts";

// ─── Options ─────────────────────────────────────────────────────────────────

/**
* Options for {@link strGetDummies}.
*
* Every field is optional; an omitted field falls back to the default noted
* on that field.
*/
export interface StrGetDummiesOptions {
/**
* Separator string used to split each element into tokens.
* @default "|"
*/
readonly sep?: string;

/**
* Optional prefix prepended to every column name. When this is the empty
* string (the default), columns are named by the bare token and
* `prefixSep` is not used.
* @default ""
*/
readonly prefix?: string;

/**
* Separator placed between `prefix` and the token name. Only takes effect
* when `prefix` is non-empty.
* @default "_"
*/
readonly prefixSep?: string;
}

// ─── Implementation ───────────────────────────────────────────────────────────

/**
 * Split each string in `series` by `sep` and return a DataFrame of binary
 * dummy/indicator variables — one column per unique, non-empty token.
 *
 * Mirrors `pandas.Series.str.get_dummies(sep)`.
 *
 * Behaviour notes:
 * - Missing values (`null` / `undefined` / `NaN`) yield a row of all zeros.
 * - Non-string, non-missing values are converted with `String(v)` before
 *   being split.
 * - Empty tokens — produced by an empty string or by leading, trailing or
 *   repeated separators such as `"a|"` or `"a||b"` — are ignored and never
 *   become a column, matching pandas, which discards the empty tag.
 *
 * @param series  A Series whose values are strings (or null/undefined/NaN).
 * @param options Separator (`sep`, default `"|"`) plus optional column-name
 *                `prefix` (default `""`) and `prefixSep` (default `"_"`).
 * @returns       A DataFrame with the same index as `series` and integer
 *                (`0`/`1`) columns — one per unique token, sorted
 *                lexicographically.
 *
 * @example
 * ```ts
 * import { Series, strGetDummies } from "tsb";
 *
 * const s = new Series({ data: ["a|b", "b|c|", null], name: "tags" });
 * const df = strGetDummies(s, { sep: "|" });
 * //    a  b  c
 * // 0  1  1  0
 * // 1  0  1  1
 * // 2  0  0  0
 * ```
 */
export function strGetDummies(
  series: Series<Scalar>,
  options: StrGetDummiesOptions = {},
): DataFrame {
  const sep = options.sep ?? "|";
  const prefix = options.prefix ?? "";
  const prefixSep = options.prefixSep ?? "_";
  const colName = (token: string): string =>
    prefix === "" ? token : `${prefix}${prefixSep}${token}`;
  const vals = series.values;
  const n = vals.length;

  // Collect all unique tokens and per-row token sets.
  const tokenSet = new Set<string>();
  const rowTokens: Set<string>[] = [];

  for (let i = 0; i < n; i++) {
    const v = vals[i];
    const tokens = new Set<string>();
    const isMissing =
      v === null || v === undefined || (typeof v === "number" && Number.isNaN(v));
    if (!isMissing) {
      const str = typeof v === "string" ? v : String(v);
      for (const tok of str.split(sep)) {
        // Skip empty tokens ("" itself, "a|", "|a", "a||b"): pandas removes
        // the empty tag, so it must not turn into a column named "".
        if (tok === "") {
          continue;
        }
        tokens.add(tok);
        tokenSet.add(tok);
      }
    }
    rowTokens.push(tokens);
  }

  // Sort tokens lexicographically (pandas sorts columns for get_dummies).
  const columns = [...tokenSet].sort();

  // Build one Series per column. Use a Map (rather than a plain object)
  // so that lexicographic order is preserved even for integer-like token
  // names (plain object keys re-order numeric strings).
  const idx = series.index;
  const colMap = new Map<string, Series<Scalar>>();
  for (const col of columns) {
    const arr: Scalar[] = rowTokens.map((tokens) => (tokens.has(col) ? 1 : 0));
    colMap.set(colName(col), new Series<Scalar>({ data: arr, index: idx }));
  }

  return new DataFrame(colMap, idx);
}
88 changes: 1 addition & 87 deletions src/stats/string_ops.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
* the accessor or better expressed as pure standalone utilities:
*
* - `strNormalize` — Unicode normalization (NFC / NFD / NFKC / NFKD)
* - `strGetDummies` — split strings by delimiter → one-hot DataFrame
* - `strExtractAll` — extract ALL regex matches per element
* - `strRemovePrefix` — remove a leading prefix
* - `strRemoveSuffix` — remove a trailing suffix
Expand All @@ -21,7 +20,7 @@
* @module
*/

import { DataFrame, Series } from "../core/index.ts";
import { Series } from "../core/index.ts";
import type { Scalar } from "../types.ts";

// ─── public types ─────────────────────────────────────────────────────────────
Expand All @@ -32,27 +31,6 @@ export type NormalizeForm = "NFC" | "NFD" | "NFKC" | "NFKD";
/** Input accepted by all string-op functions. */
export type StrInput = Series<Scalar> | readonly Scalar[] | readonly string[] | string;

/**
* Options for {@link strGetDummies}.
*
* Every field is optional; an omitted field falls back to the default noted
* on that field.
*/
export interface StrGetDummiesOptions {
/**
* The delimiter used to split each element into tokens.
* @default "|"
*/
readonly sep?: string;

/**
* Prefix prepended to every column name in the output DataFrame. When this
* is the empty string (the default), columns are named by the bare token
* and `prefixSep` is not used.
* @default ""
*/
readonly prefix?: string;

/**
* Separator placed between `prefix` and the token name. Only takes effect
* when `prefix` is non-empty.
* @default "_"
*/
readonly prefixSep?: string;
}

/** Options for {@link strExtractAll}. */
export interface ExtractAllOptions {
/**
Expand Down Expand Up @@ -138,70 +116,6 @@ export function strNormalize(
return buildSeries(data, input);
}

// ─── strGetDummies ────────────────────────────────────────────────────────────

/**
 * One-hot encode delimiter-separated strings: every distinct token found
 * across the input becomes a 0/1 column of the returned DataFrame.
 *
 * Mirrors `pandas.Series.str.get_dummies(sep)`.
 *
 * @param input - Series or string array; converted to strings via
 *   `toStringArray` before splitting.
 * @param options - `sep` (default `"|"`), plus `prefix` (default `""`) and
 *   `prefixSep` (default `"_"`) used to build column names.
 * @returns A `DataFrame` of 0/1 values with one column per unique token,
 *   tokens sorted with the default `Array.prototype.sort` ordering. When
 *   `input` is a Series its index is passed through to the result.
 *
 * @example
 * ```ts
 * const s = new Series({ data: ["a|b", "b|c", "a"] });
 * strGetDummies(s);
 * // DataFrame
 * //    a  b  c
 * // 0  1  1  0
 * // 1  0  1  1
 * // 2  1  0  0
 * ```
 */
export function strGetDummies(
  input: readonly string[] | Series<Scalar>,
  options: StrGetDummiesOptions = {},
): DataFrame {
  const delimiter = options.sep ?? "|";
  const namePrefix = options.prefix ?? "";
  const joiner = options.prefixSep ?? "_";

  // Tokenise every row; an empty string contributes no tokens at all.
  const perRow: string[][] = toStringArray(input).map((text) =>
    text === "" ? [] : text.split(delimiter),
  );

  // Gather the vocabulary row by row, then order it alphabetically.
  const vocabulary = new Set<string>();
  for (const parts of perRow) {
    for (const part of parts) {
      vocabulary.add(part);
    }
  }
  const orderedTokens = Array.from(vocabulary).sort();

  // One indicator array per token, keyed by the (optionally prefixed) name.
  const columns: Record<string, Scalar[]> = {};
  for (const token of orderedTokens) {
    const label = namePrefix === "" ? token : `${namePrefix}${joiner}${token}`;
    const flags: Scalar[] = [];
    for (const parts of perRow) {
      flags.push(parts.includes(token) ? 1 : 0);
    }
    columns[label] = flags;
  }

  // A Series input carries its row index over to the resulting frame.
  return input instanceof Series
    ? DataFrame.fromColumns(columns, { index: input.index })
    : DataFrame.fromColumns(columns);
}

// ─── strExtractAll ────────────────────────────────────────────────────────────

/**
Expand Down
Loading
Loading