From 8dff3f1c6bb46e57dd90dcd3c802bd7982d56616 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Sat, 25 Apr 2026 14:59:03 +0000
Subject: [PATCH 1/3] Initial plan


From eec09812be59db225043a99b86b8cd1840940a25 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Sat, 25 Apr 2026 15:09:34 +0000
Subject: [PATCH 2/3] Iteration 281: extract strGetDummies into dedicated
 module

Agent-Logs-Url: https://github.com/githubnext/tsessebe/sessions/615193c1-c4f0-473c-bd90-e155a60846a7

Co-authored-by: mrjf <180956+mrjf@users.noreply.github.com>
---
 playground/str_get_dummies.html     | 109 ++++++++++
 src/index.ts                        |   3 +-
 src/stats/index.ts                  |   3 +-
 src/stats/str_get_dummies.ts        | 129 ++++++++++++
 src/stats/string_ops.ts             |  88 +-------
 tests/stats/str_get_dummies.test.ts | 304 ++++++++++++++++++++++++++++
 tests/stats/string_ops.test.ts      |  65 +-----
 7 files changed, 548 insertions(+), 153 deletions(-)
 create mode 100644 playground/str_get_dummies.html
 create mode 100644 src/stats/str_get_dummies.ts
 create mode 100644 tests/stats/str_get_dummies.test.ts
diff --git a/playground/str_get_dummies.html b/playground/str_get_dummies.html
new file mode 100644
index 00000000..467403eb
--- /dev/null
+++ b/playground/str_get_dummies.html
@@ -0,0 +1,109 @@
+<!doctype html>
+<html lang="en">
+  <head>
+    <meta charset="UTF-8" />
+    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
+    <title>tsb — str.get_dummies: multi-label string encoding</title>
+    <style>
+      * { box-sizing: border-box; margin: 0; padding: 0; }
+      body { font-family: system-ui, sans-serif; background: #0d1117; color: #c9d1d9; line-height: 1.6; padding: 2rem; }
+      h1 { color: #58a6ff; font-size: 1.8rem; margin-bottom: .5rem; }
+      h2 { color: #79c0ff; font-size: 1.2rem; margin: 2rem 0 .75rem; }
+      p  { color: #8b949e; margin-bottom: 1rem; max-width: 800px; }
+      code { background: #161b22; padding: .1rem .4rem; border-radius: 4px; font-family: monospace; font-size: .9em; color: #a5d6ff; }
+      .card { background: #161b22; border: 1px solid #30363d; border-radius: 8px; padding: 1.5rem; margin-bottom: 1.5rem; max-width: 900px; }
+      textarea { width: 100%; background: #0d1117; border: 1px solid #30363d; border-radius: 6px; color: #c9d1d9; font-family: monospace; font-size: .85rem; padding: .75rem; resize: vertical; min-height: 140px; }
+      button { background: #238636; color: #fff; border: none; border-radius: 6px; padding: .5rem 1.25rem; cursor: pointer; font-size: .9rem; margin-top: .75rem; }
+      button:hover { background: #2ea043; }
+      pre { background: #0d1117; border: 1px solid #21262d; border-radius: 6px; padding: 0.75rem 1rem; overflow-x: auto; font-size: 0.85rem; white-space: pre-wrap; margin-top: 0.5rem; color: #7ee787; font-family: monospace; }
+      a { color: #58a6ff; }
+    </style>
+  </head>
+  <body>
+    <h1>str.get_dummies — multi-label string encoding</h1>
+    <p>
+      Port of <code>pandas.Series.str.get_dummies(sep)</code>.  Splits each
+      string by a separator (default <code>"|"</code>) and returns a
+      <code>DataFrame</code> of binary indicator columns — one per unique token,
+      sorted lexicographically.  <code>null</code> / <code>undefined</code> /
+      <code>NaN</code> values produce a row of all zeros.
+    </p>
+    <p><a href="./index.html">← back to index</a></p>
+
+    <div class="card">
+      <h2>Example 1 — basic split on <code>|</code></h2>
+      <textarea id="ex1-code">
+const { Series, strGetDummies } = tsb;
+const s = new Series({ data: ["a|b", "b|c", "a"], name: "tags" });
+const df = strGetDummies(s);
+console.log(JSON.stringify(df.toRecords(), null, 2));
+console.log("columns =", df.columns.values.join(", "));
+</textarea>
+      <button onclick="run('ex1')">Run</button>
+      <pre id="ex1-out">(click Run)</pre>
+    </div>
+
+    <div class="card">
+      <h2>Example 2 — custom separator</h2>
+      <textarea id="ex2-code">
+const { Series, strGetDummies } = tsb;
+const s = new Series({ data: ["red,green", "green,blue", "red"] });
+const df = strGetDummies(s, { sep: "," });
+console.log(JSON.stringify(df.toRecords(), null, 2));
+</textarea>
+      <button onclick="run('ex2')">Run</button>
+      <pre id="ex2-out">(click Run)</pre>
+    </div>
+
+    <div class="card">
+      <h2>Example 3 — null / undefined / NaN → all-zero rows</h2>
+      <textarea id="ex3-code">
+const { Series, strGetDummies } = tsb;
+const s = new Series({ data: ["a|b", null, undefined, NaN, "b"] });
+const df = strGetDummies(s);
+console.log(JSON.stringify(df.toRecords(), null, 2));
+</textarea>
+      <button onclick="run('ex3')">Run</button>
+      <pre id="ex3-out">(click Run)</pre>
+    </div>
+
+    <div class="card">
+      <h2>Example 4 — preserved Series index</h2>
+      <textarea id="ex4-code">
+const { Series, strGetDummies } = tsb;
+const s = new Series({ data: ["python|pandas", "python|numpy", "pandas|numpy|scipy"], index: ["row-1", "row-2", "row-3"] });
+const df = strGetDummies(s);
+console.log("index =", df.index.values.join(", "));
+console.log(JSON.stringify(df.toRecords(), null, 2));
+</textarea>
+      <button onclick="run('ex4')">Run</button>
+      <pre id="ex4-out">(click Run)</pre>
+    </div>
+
+    <script type="module">
+      let tsb;
+      try {
+        tsb = await import("../src/index.ts");
+      } catch {
+        tsb = await import("https://esm.sh/tsb@latest");
+      }
+      window.tsb = tsb;
+
+      window.run = function run(id) { // Runs the snippet in #<id>-code and shows its console.log output in #<id>-out.
+        const code = document.getElementById(`${id}-code`).value;
+        const out = document.getElementById(`${id}-out`);
+        const logs = [];
+        const origLog = console.log; // console.log is captured only for the duration of the snippet
+        console.log = (...args) => logs.push(args.map(String).join(" "));
+        try {
+          new Function("tsb", code)(tsb); // deliberate eval of the user-editable playground snippet
+          out.textContent = logs.join("\n") || "(no output)";
+        } catch (e) {
+          out.textContent = "Error: " + (e instanceof Error ? e.message : String(e)); // snippets may throw non-Error values
+        } finally {
+          console.log = origLog;
+        }
+      };
+    </script>
+  </body>
+</html>
diff --git a/src/index.ts b/src/index.ts
index 411cb787..27a15d16 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -386,7 +386,6 @@ export {
 } from "./core/index.ts";
 export {
   strNormalize,
-  strGetDummies,
   strExtractAll,
   strRemovePrefix,
   strRemoveSuffix,
@@ -401,6 +400,8 @@ export {
   strIndent,
   strDedent,
 } from "./stats/index.ts";
+export { strGetDummies } from "./stats/index.ts";
+export type { StrGetDummiesOptions } from "./stats/index.ts";
 export type {
   NormalizeForm,
   StrInput,
diff --git a/src/stats/index.ts b/src/stats/index.ts
index cd2c46fe..454582aa 100644
--- a/src/stats/index.ts
+++ b/src/stats/index.ts
@@ -203,7 +203,6 @@ export { fillna, countna, countValid } from "./notna_isna.ts";
 export type { IsnaInput, FillnaOptions, DropnaOptions } from "./notna_isna.ts";
 export {
   strNormalize,
-  strGetDummies,
   strExtractAll,
   strRemovePrefix,
   strRemoveSuffix,
@@ -212,6 +211,8 @@ export {
   strByteLength,
 } from "./string_ops.ts";
 export type { NormalizeForm, StrInput, ExtractAllOptions } from "./string_ops.ts";
+export { strGetDummies } from "./str_get_dummies.ts";
+export type { StrGetDummiesOptions } from "./str_get_dummies.ts";
 export {
   strSplitExpand,
   strExtractGroups,
diff --git a/src/stats/str_get_dummies.ts b/src/stats/str_get_dummies.ts
new file mode 100644
index 00000000..cdda92ec
--- /dev/null
+++ b/src/stats/str_get_dummies.ts
@@ -0,0 +1,129 @@
+/**
+ * str_get_dummies — split string Series by separator and return a DataFrame of
+ * binary dummy/indicator variables.
+ *
+ * Mirrors `pandas.Series.str.get_dummies(sep='|')`.
+ *
+ * Each element is split by `sep`; the unique tokens across all elements become
+ * columns.  A cell is **1** if the token appeared in that row, **0** otherwise.
+ * Missing values (`null` / `undefined` / `NaN`) contribute no tokens and
+ * produce a row of all zeros.  Columns are sorted lexicographically and the
+ * original Series index is preserved on the returned DataFrame.
+ *
+ * @example
+ * ```ts
+ * import { Series, strGetDummies } from "tsb";
+ *
+ * const s = new Series({ data: ["a|b", "b|c", "a"], name: "flags" });
+ * const df = strGetDummies(s);
+ * // DataFrame:
+ * //    a  b  c
+ * // 0  1  1  0
+ * // 1  0  1  1
+ * // 2  1  0  0
+ * ```
+ *
+ * @module
+ */
+
+import { DataFrame, Series } from "../core/index.ts";
+import type { Scalar } from "../types.ts";
+
+// ─── Options ─────────────────────────────────────────────────────────────────
+
+/** Options for {@link strGetDummies}. */
+export interface StrGetDummiesOptions {
+  /**
+   * Separator string used to split each element into tokens.
+   * @default "|"
+   */
+  readonly sep?: string;
+
+  /**
+   * Optional prefix prepended to every column name; `""` leaves token names unchanged.
+   * @default ""
+   */
+  readonly prefix?: string;
+
+  /**
+   * Separator placed between the prefix and the token name; unused when `prefix` is `""`.
+   * @default "_"
+   */
+  readonly prefixSep?: string;
+}
+
+// ─── Implementation ───────────────────────────────────────────────────────────
+
+/**
+ * Split each string in `series` by `sep` and return a DataFrame of binary
+ * dummy/indicator variables — one column per unique non-empty token.
+ *
+ * Mirrors `pandas.Series.str.get_dummies(sep)`, which also drops empty tokens.
+ *
+ * @param series  A Series whose values are strings (or null/undefined/NaN).
+ * @param options `sep` (default `"|"`), plus `prefix` / `prefixSep` for column names.
+ * @returns       A DataFrame with the same index as `series` and integer
+ *                (`0`/`1`) columns — one per unique token, sorted
+ *                lexicographically.
+ *
+ * @example
+ * ```ts
+ * import { Series, strGetDummies } from "tsb";
+ *
+ * const s = new Series({ data: ["a|b", "b|c", null], name: "tags" });
+ * const df = strGetDummies(s, { sep: "|" });
+ * //    a  b  c
+ * // 0  1  1  0
+ * // 1  0  1  1
+ * // 2  0  0  0
+ * ```
+ */
+export function strGetDummies(
+  series: Series<Scalar>,
+  options: StrGetDummiesOptions = {},
+): DataFrame {
+  const sep = options.sep ?? "|";
+  const prefix = options.prefix ?? "";
+  const prefixSep = options.prefixSep ?? "_";
+  const colName = (token: string): string =>
+    prefix === "" ? token : `${prefix}${prefixSep}${token}`;
+  const vals = series.values;
+  const n = vals.length;
+
+  // Collect unique tokens and per-row token sets; "" tokens (e.g. from "a||b" or "a|") are dropped.
+  const tokenSet = new Set<string>();
+  const rowTokens: Set<string>[] = new Array<Set<string>>(n);
+
+  for (let i = 0; i < n; i++) {
+    const v = vals[i];
+    const tokens = new Set<string>();
+    if (v !== null && v !== undefined && !(typeof v === "number" && Number.isNaN(v))) {
+      const str = typeof v === "string" ? v : String(v); // non-string scalars are stringified
+      for (const tok of str.split(sep)) {
+        if (tok !== "") {
+          tokens.add(tok);
+          tokenSet.add(tok);
+        }
+      }
+    }
+    rowTokens[i] = tokens;
+  }
+
+  // Sort tokens lexicographically (pandas sorts columns for get_dummies).
+  const columns = [...tokenSet].sort();
+
+  // Build one Series per column.  Use a Map (rather than a plain object)
+  // so that lexicographic order is preserved even for integer-like token
+  // names (plain object keys re-order numeric strings).
+  const idx = series.index;
+  const colMap = new Map<string, Series<Scalar>>();
+  for (const col of columns) {
+    const arr: Scalar[] = new Array<Scalar>(n);
+    for (let i = 0; i < n; i++) {
+      arr[i] = rowTokens[i]?.has(col) === true ? 1 : 0;
+    }
+    colMap.set(colName(col), new Series<Scalar>({ data: arr, index: idx }));
+  }
+
+  return new DataFrame(colMap, idx);
+}
diff --git a/src/stats/string_ops.ts b/src/stats/string_ops.ts
index b5b486fb..5a444c50 100644
--- a/src/stats/string_ops.ts
+++ b/src/stats/string_ops.ts
@@ -10,7 +10,6 @@
  * the accessor or better expressed as pure standalone utilities:
  *
  * - `strNormalize`  — Unicode normalization (NFC / NFD / NFKC / NFKD)
- * - `strGetDummies` — split strings by delimiter → one-hot DataFrame
  * - `strExtractAll` — extract ALL regex matches per element
  * - `strRemovePrefix` — remove a leading prefix
  * - `strRemoveSuffix` — remove a trailing suffix
@@ -21,7 +20,7 @@
  * @module
  */
 
-import { DataFrame, Series } from "../core/index.ts";
+import { Series } from "../core/index.ts";
 import type { Scalar } from "../types.ts";
 
 // ─── public types ─────────────────────────────────────────────────────────────
@@ -32,27 +31,6 @@ export type NormalizeForm = "NFC" | "NFD" | "NFKC" | "NFKD";
 /** Input accepted by all string-op functions. */
 export type StrInput = Series<Scalar> | readonly Scalar[] | readonly string[] | string;
 
-/** Options for {@link strGetDummies}. */
-export interface StrGetDummiesOptions {
-  /**
-   * The delimiter used to split each element into tokens.
-   * @default "|"
-   */
-  readonly sep?: string;
-
-  /**
-   * Prefix prepended to every column name in the output DataFrame.
-   * @default ""
-   */
-  readonly prefix?: string;
-
-  /**
-   * Separator between the prefix and the token name.
-   * @default "_"
-   */
-  readonly prefixSep?: string;
-}
-
 /** Options for {@link strExtractAll}. */
 export interface ExtractAllOptions {
   /**
@@ -138,70 +116,6 @@ export function strNormalize(
   return buildSeries(data, input);
 }
 
-// ─── strGetDummies ────────────────────────────────────────────────────────────
-
-/**
- * Encode each string element as a row in a one-hot DataFrame by splitting on a
- * delimiter.
- *
- * Mirrors `pandas.Series.str.get_dummies(sep)`.
- *
- * @param input   - Series or string array.
- * @param options - `sep` (default `"|"`), `prefix` and `prefixSep` for column names.
- * @returns A `DataFrame` of 0/1 integer values, one column per unique token.
- *
- * @example
- * ```ts
- * const s = new Series({ data: ["a|b", "b|c", "a"] });
- * strGetDummies(s);
- * // DataFrame
- * //    a  b  c
- * // 0  1  1  0
- * // 1  0  1  1
- * // 2  1  0  0
- * ```
- */
-export function strGetDummies(
-  input: readonly string[] | Series<Scalar>,
-  options: StrGetDummiesOptions = {},
-): DataFrame {
-  const sep = options.sep ?? "|";
-  const prefix = options.prefix ?? "";
-  const prefixSep = options.prefixSep ?? "_";
-
-  const strs = toStringArray(input);
-
-  // 1. Collect all unique tokens in first-seen order.
-  const seen = new Set<string>();
-  const tokenRows: string[][] = strs.map((s) => {
-    const tokens = s === "" ? [] : s.split(sep);
-    for (const t of tokens) {
-      seen.add(t);
-    }
-    return tokens;
-  });
-
-  const allTokens = [...seen].sort(); // stable alphabetical order
-
-  // 2. Build column name with optional prefix.
-  const colName = (token: string): string =>
-    prefix === "" ? token : `${prefix}${prefixSep}${token}`;
-
-  // 3. Build one Scalar[] per column.
-  const columns: Record<string, Scalar[]> = {};
-  for (const token of allTokens) {
-    const name = colName(token);
-    columns[name] = tokenRows.map((row) => (row.includes(token) ? 1 : 0));
-  }
-
-  // 4. Preserve the row index from a Series input.
-  if (input instanceof Series) {
-    const rowIndex = input.index;
-    return DataFrame.fromColumns(columns, { index: rowIndex });
-  }
-  return DataFrame.fromColumns(columns);
-}
-
 // ─── strExtractAll ────────────────────────────────────────────────────────────
 
 /**
diff --git a/tests/stats/str_get_dummies.test.ts b/tests/stats/str_get_dummies.test.ts
new file mode 100644
index 00000000..30f39718
--- /dev/null
+++ b/tests/stats/str_get_dummies.test.ts
@@ -0,0 +1,304 @@
+/**
+ * Tests for strGetDummies — pandas.Series.str.get_dummies(sep) port.
+ */
+
+import { describe, expect, test } from "bun:test";
+import * as fc from "fast-check";
+import { DataFrame, Series, strGetDummies } from "../../src/index.ts";
+import type { Scalar } from "../../src/index.ts";
+
+function s(data: readonly Scalar[]): Series<Scalar> {
+  return new Series<Scalar>({ data: [...data] });
+}
+
+// ─── Basic functionality ──────────────────────────────────────────────────────
+
+describe("strGetDummies — basic", () => {
+  test("splits by default pipe separator", () => {
+    const df = strGetDummies(s(["a|b", "b|c", "a"]));
+    expect(df).toBeInstanceOf(DataFrame);
+    expect([...df.columns.values]).toEqual(["a", "b", "c"]);
+    expect(df.shape).toEqual([3, 3]);
+  });
+
+  test("correct indicator values", () => {
+    const df = strGetDummies(s(["a|b", "b|c", "a"]));
+    expect([...df.col("a").values]).toEqual([1, 0, 1]);
+    expect([...df.col("b").values]).toEqual([1, 1, 0]);
+    expect([...df.col("c").values]).toEqual([0, 1, 0]);
+  });
+
+  test("custom separator", () => {
+    const df = strGetDummies(s(["x,y", "y,z", "x"]), { sep: "," });
+    expect([...df.columns.values]).toEqual(["x", "y", "z"]);
+    expect([...df.col("x").values]).toEqual([1, 0, 1]);
+  });
+
+  test("single-value elements", () => {
+    const df = strGetDummies(s(["a", "b", "c"]));
+    expect([...df.columns.values]).toEqual(["a", "b", "c"]);
+    expect([...df.col("a").values]).toEqual([1, 0, 0]);
+  });
+
+  test("columns are sorted lexicographically", () => {
+    const df = strGetDummies(s(["z|a|m", "a|z"]));
+    expect([...df.columns.values]).toEqual(["a", "m", "z"]);
+  });
+
+  test("null values produce all-zero rows", () => {
+    const df = strGetDummies(s(["a|b", null, "b"]));
+    expect([...df.col("a").values]).toEqual([1, 0, 0]);
+    expect([...df.col("b").values]).toEqual([1, 0, 1]);
+  });
+
+  test("undefined values produce all-zero rows", () => {
+    const df = strGetDummies(s(["a", undefined as unknown as Scalar, "b"]));
+    expect([...df.col("a").values]).toEqual([1, 0, 0]);
+    expect([...df.col("b").values]).toEqual([0, 0, 1]);
+  });
+
+  test("NaN values produce all-zero rows", () => {
+    const df = strGetDummies(s(["a|b", Number.NaN, "b"]));
+    expect([...df.col("a").values]).toEqual([1, 0, 0]);
+    expect([...df.col("b").values]).toEqual([1, 0, 1]);
+  });
+
+  test("empty-string elements produce all-zero rows", () => {
+    const df = strGetDummies(s(["a|b", ""]));
+    expect([...df.col("a").values]).toEqual([1, 0]);
+    expect([...df.col("b").values]).toEqual([1, 0]);
+  });
+
+  test("all nulls returns empty-column DataFrame with preserved row count", () => {
+    const df = strGetDummies(s([null, null]));
+    expect([...df.columns.values]).toEqual([]);
+    expect(df.shape).toEqual([2, 0]);
+  });
+
+  test("empty series returns empty DataFrame", () => {
+    const df = strGetDummies(s([]));
+    expect(df.shape).toEqual([0, 0]);
+  });
+
+  test("preserves original index", () => {
+    const ser = new Series<Scalar>({ data: ["a|b", "c"], index: ["r1", "r2"] });
+    const df = strGetDummies(ser);
+    expect([...df.index.values]).toEqual(["r1", "r2"]);
+  });
+
+  test("duplicate tokens in same element are counted once", () => {
+    const df = strGetDummies(s(["a|a|b"]));
+    expect([...df.col("a").values]).toEqual([1]);
+    expect([...df.col("b").values]).toEqual([1]);
+  });
+
+  test("whitespace tokens are preserved as-is (not stripped)", () => {
+    const df = strGetDummies(s([" a | b "]));
+    expect([...df.columns.values]).toEqual([" a ", " b "]);
+  });
+
+  test("prefix and prefixSep option", () => {
+    const df = strGetDummies(s(["x|y"]), { prefix: "tag", prefixSep: "-" });
+    expect([...df.columns.values]).toEqual(["tag-x", "tag-y"]);
+  });
+});
+
+// ─── Result shape and types ───────────────────────────────────────────────────
+
+describe("strGetDummies — result shape", () => {
+  test("row count matches input series length", () => {
+    const df = strGetDummies(s(["a|b", "c", "d|e|f"]));
+    expect(df.shape[0]).toBe(3);
+  });
+
+  test("column count equals unique token count", () => {
+    const df = strGetDummies(s(["a|b|c", "b|d"]));
+    expect(df.shape[1]).toBe(4);
+  });
+
+  test("all values are 0 or 1", () => {
+    const df = strGetDummies(s(["x|y", "y|z", "x|z"]));
+    for (const col of df.columns.values as readonly string[]) {
+      for (const v of df.col(col).values) {
+        expect(v === 0 || v === 1).toBe(true);
+      }
+    }
+  });
+
+  test("sum of row values equals unique-token count per row", () => {
+    const df = strGetDummies(s(["a|b|c", "a", "b|c"]));
+    const cols = df.columns.values as readonly string[];
+    const rowSums = [0, 1, 2].map((i) =>
+      cols.reduce((acc, col) => {
+        const v = df.col(col).values[i];
+        return acc + (typeof v === "number" ? v : 0);
+      }, 0),
+    );
+    expect(rowSums).toEqual([3, 1, 2]);
+  });
+});
+
+// ─── Pandas parity examples ───────────────────────────────────────────────────
+
+describe("strGetDummies — pandas parity", () => {
+  test("pandas example: a|b, b|c, a", () => {
+    // >>> pd.Series(['a|b', 'b|c', 'a']).str.get_dummies()
+    //    a  b  c
+    // 0  1  1  0
+    // 1  0  1  1
+    // 2  1  0  0
+    const df = strGetDummies(s(["a|b", "b|c", "a"]));
+    expect(df.toRecords()).toEqual([
+      { a: 1, b: 1, c: 0 },
+      { a: 0, b: 1, c: 1 },
+      { a: 1, b: 0, c: 0 },
+    ]);
+  });
+
+  test("pandas example: custom sep comma", () => {
+    const df = strGetDummies(s(["a,b", "b,c", "a"]), { sep: "," });
+    expect(df.toRecords()).toEqual([
+      { a: 1, b: 1, c: 0 },
+      { a: 0, b: 1, c: 1 },
+      { a: 1, b: 0, c: 0 },
+    ]);
+  });
+
+  test("multi-label tags scenario", () => {
+    const df = strGetDummies(
+      s(["python|pandas", "python|numpy", "pandas|numpy|scipy"]),
+    );
+    expect([...df.columns.values]).toEqual(["numpy", "pandas", "python", "scipy"]);
+    expect([...df.col("python").values]).toEqual([1, 1, 0]);
+    expect([...df.col("scipy").values]).toEqual([0, 0, 1]);
+  });
+});
+
+// ─── Property-based tests ─────────────────────────────────────────────────────
+
+describe("strGetDummies — property-based", () => {
+  test("row count always equals series length", () => {
+    fc.assert(
+      fc.property(
+        fc.array(fc.option(fc.string({ maxLength: 10 }), { nil: null }), {
+          minLength: 1,
+          maxLength: 20,
+        }),
+        (arr) => {
+          const df = strGetDummies(s(arr as Scalar[]));
+          return df.shape[0] === arr.length;
+        },
+      ),
+    );
+  });
+
+  test("all cell values are 0 or 1", () => {
+    fc.assert(
+      fc.property(
+        fc.array(
+          fc.option(
+            fc
+              .array(fc.string({ minLength: 1, maxLength: 5 }), {
+                minLength: 1,
+                maxLength: 4,
+              })
+              .map((parts) => parts.join("|")),
+            { nil: null },
+          ),
+          { minLength: 1, maxLength: 15 },
+        ),
+        (arr) => {
+          const df = strGetDummies(s(arr as Scalar[]));
+          for (const col of df.columns.values as readonly string[]) {
+            for (const v of df.col(col).values) {
+              if (v !== 0 && v !== 1) {
+                return false;
+              }
+            }
+          }
+          return true;
+        },
+      ),
+    );
+  });
+
+  test("index is preserved", () => {
+    fc.assert(
+      fc.property(
+        fc.array(fc.string({ maxLength: 10 }), { minLength: 1, maxLength: 15 }),
+        (arr) => {
+          const ser = s(arr as Scalar[]);
+          const df = strGetDummies(ser);
+          const origIdx = [...ser.index.values];
+          const dfIdx = [...df.index.values];
+          return (
+            origIdx.length === dfIdx.length && origIdx.every((v, i) => v === dfIdx[i])
+          );
+        },
+      ),
+    );
+  });
+
+  test("columns are always sorted lexicographically", () => {
+    fc.assert(
+      fc.property(
+        fc.array(
+          fc
+            .array(fc.string({ minLength: 1, maxLength: 5 }), {
+              minLength: 1,
+              maxLength: 4,
+            })
+            .map((parts) => parts.join("|")),
+          { minLength: 1, maxLength: 10 },
+        ),
+        (arr) => {
+          const df = strGetDummies(s(arr as Scalar[]));
+          const cols = (df.columns.values as readonly string[]).map(String);
+          const sorted = [...cols].sort();
+          return cols.every((c, i) => c === sorted[i]);
+        },
+      ),
+    );
+  });
+
+  test("null/undefined/NaN rows always have row-sum 0", () => {
+    fc.assert(
+      fc.property(
+        fc.array(
+          fc.oneof(
+            fc.constant(null),
+            fc.constant(undefined),
+            fc.constant(Number.NaN),
+            fc
+              .array(fc.string({ minLength: 1, maxLength: 4 }), {
+                minLength: 1,
+                maxLength: 3,
+              })
+              .map((parts) => parts.join("|")),
+          ),
+          { minLength: 1, maxLength: 10 },
+        ),
+        (arr) => {
+          const df = strGetDummies(s(arr as Scalar[]));
+          const cols = df.columns.values as readonly string[];
+          for (let i = 0; i < arr.length; i++) {
+            const v = arr[i];
+            const isMissing =
+              v === null ||
+              v === undefined ||
+              (typeof v === "number" && Number.isNaN(v));
+            if (!isMissing) {
+              continue;
+            }
+            for (const col of cols) {
+              if (df.col(col).values[i] !== 0) {
+                return false;
+              }
+            }
+          }
+          return true;
+        },
+      ),
+    );
+  });
+});
diff --git a/tests/stats/string_ops.test.ts b/tests/stats/string_ops.test.ts
index 0f5b84cc..435af513 100644
--- a/tests/stats/string_ops.test.ts
+++ b/tests/stats/string_ops.test.ts
@@ -1,6 +1,6 @@
 /**
  * Tests for src/stats/string_ops.ts
- * — strNormalize, strGetDummies, strExtractAll, strRemovePrefix,
+ * — strNormalize, strExtractAll, strRemovePrefix,
  *   strRemoveSuffix, strTranslate, strCharWidth, strByteLength
  */
 import { describe, expect, it } from "bun:test";
@@ -11,7 +11,6 @@ import {
   strByteLength,
   strCharWidth,
   strExtractAll,
-  strGetDummies,
   strNormalize,
   strRemovePrefix,
   strRemoveSuffix,
@@ -78,68 +77,6 @@ describe("strNormalize", () => {
   });
 });
 
-// ─── strGetDummies ────────────────────────────────────────────────────────────
-
-describe("strGetDummies", () => {
-  it("basic | separator", () => {
-    const df = strGetDummies(s(["a|b", "b|c", "a"]));
-    expect(df.shape[0]).toBe(3);
-    expect([...df.columns.values].sort()).toEqual(["a", "b", "c"]);
-    expect(df.col("a").values[0]).toBe(1);
-    expect(df.col("a").values[1]).toBe(0);
-    expect(df.col("a").values[2]).toBe(1);
-    expect(df.col("b").values[0]).toBe(1);
-    expect(df.col("b").values[1]).toBe(1);
-    expect(df.col("b").values[2]).toBe(0);
-    expect(df.col("c").values[0]).toBe(0);
-    expect(df.col("c").values[1]).toBe(1);
-    expect(df.col("c").values[2]).toBe(0);
-  });
-
-  it("custom separator", () => {
-    const df = strGetDummies(s(["a,b", "b,c"]), { sep: "," });
-    expect([...df.columns.values].sort()).toEqual(["a", "b", "c"]);
-  });
-
-  it("prefix option", () => {
-    const df = strGetDummies(s(["x|y"]), { prefix: "tag", prefixSep: "-" });
-    expect([...df.columns.values].sort()).toEqual(["tag-x", "tag-y"]);
-  });
-
-  it("empty string element maps to no tokens", () => {
-    const df = strGetDummies(s(["a|b", ""]));
-    expect(df.col("a").values[1]).toBe(0);
-    expect(df.col("b").values[1]).toBe(0);
-  });
-
-  it("single-token element", () => {
-    const df = strGetDummies(s(["a", "b", "a"]));
-    expect(df.shape[0]).toBe(3);
-    expect(df.col("a").values[0]).toBe(1);
-    expect(df.col("a").values[1]).toBe(0);
-    expect(df.col("a").values[2]).toBe(1);
-  });
-
-  it("all same token → single column of ones", () => {
-    const df = strGetDummies(s(["x", "x", "x"]));
-    expect(df.shape[1]).toBe(1);
-    expect([...df.col("x").values]).toEqual([1, 1, 1]);
-  });
-
-  it("preserves Series index in output rows", () => {
-    const ser = new Series({ data: ["a|b", "b"] as Scalar[], index: [10, 20] });
-    const df = strGetDummies(ser);
-    expect(df.index.values[0]).toBe(10);
-    expect(df.index.values[1]).toBe(20);
-  });
-
-  it("array input (not Series)", () => {
-    const df = strGetDummies(["a|b", "c"]);
-    expect(df.shape[0]).toBe(2);
-    expect([...df.columns.values].sort()).toEqual(["a", "b", "c"]);
-  });
-});
-
 // ─── strExtractAll ────────────────────────────────────────────────────────────
 
 describe("strExtractAll", () => {

From 917c7e69a485cd11d6c89b09fa2f59ad59d600a9 Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
 <41898282+github-actions[bot]@users.noreply.github.com>
Date: Sat, 25 Apr 2026 19:14:18 +0000
Subject: [PATCH 3/3] fix: apply biome formatting to str_get_dummies test

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 tests/stats/str_get_dummies.test.ts | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/tests/stats/str_get_dummies.test.ts b/tests/stats/str_get_dummies.test.ts
index 30f39718..924f2c1e 100644
--- a/tests/stats/str_get_dummies.test.ts
+++ b/tests/stats/str_get_dummies.test.ts
@@ -165,9 +165,7 @@ describe("strGetDummies — pandas parity", () => {
   });
 
   test("multi-label tags scenario", () => {
-    const df = strGetDummies(
-      s(["python|pandas", "python|numpy", "pandas|numpy|scipy"]),
-    );
+    const df = strGetDummies(s(["python|pandas", "python|numpy", "pandas|numpy|scipy"]));
     expect([...df.columns.values]).toEqual(["numpy", "pandas", "python", "scipy"]);
     expect([...df.col("python").values]).toEqual([1, 1, 0]);
     expect([...df.col("scipy").values]).toEqual([0, 0, 1]);
@@ -231,9 +229,7 @@ describe("strGetDummies — property-based", () => {
           const df = strGetDummies(ser);
           const origIdx = [...ser.index.values];
           const dfIdx = [...df.index.values];
-          return (
-            origIdx.length === dfIdx.length && origIdx.every((v, i) => v === dfIdx[i])
-          );
+          return origIdx.length === dfIdx.length && origIdx.every((v, i) => v === dfIdx[i]);
         },
       ),
     );
@@ -284,9 +280,7 @@ describe("strGetDummies — property-based", () => {
           for (let i = 0; i < arr.length; i++) {
             const v = arr[i];
             const isMissing =
-              v === null ||
-              v === undefined ||
-              (typeof v === "number" && Number.isNaN(v));
+              v === null || v === undefined || (typeof v === "number" && Number.isNaN(v));
             if (!isMissing) {
               continue;
             }