Merged
11 changes: 11 additions & 0 deletions .autoloop/programs/tsb-perf-evolve/code/README.md
@@ -0,0 +1,11 @@
# tsb-perf-evolve — code/

This directory holds the **fixed inputs** for the program: the benchmark scripts and a small config. The autoloop iterations should rarely touch these files. The thing that *evolves* is `src/core/series.ts` (specifically the `sortValues` method) — see `../program.md` for the full picture.

## Files

- `config.yaml` — tunables read by the AlphaEvolve playbook (`exploitation_ratio`, `num_islands`, `population_size`, `archive_size`, dataset size).
- `benchmark.ts` — tsb-side benchmark. Builds a Series of `dataset_size` random floats with ~5% NaN, calls `sortValues` in a tight loop, prints `{"function": "Series.sortValues", "mean_ms": …, "iterations": …, "total_ms": …}`.
- `benchmark.py` — pandas-side benchmark. Builds an equivalent `pd.Series`, calls `.sort_values()` in the same loop structure, prints the same JSON shape.

The two benchmarks must stay aligned: same dataset size, same NaN ratio, same warm-up + measured iteration counts. If you tweak one, tweak the other.
60 changes: 60 additions & 0 deletions .autoloop/programs/tsb-perf-evolve/code/benchmark.py
@@ -0,0 +1,60 @@
"""pandas-side benchmark for Series.sort_values.

Output: a single JSON line on stdout with the shape
{"function": "Series.sort_values", "mean_ms": <number>,
"iterations": <number>, "total_ms": <number>}

Dataset shape and iteration counts mirror ./benchmark.ts — keep the two in
lockstep. Fixed seed for reproducibility across runs.
"""

from __future__ import annotations

import json
import sys
import time

import numpy as np
import pandas as pd

# Inlined from config.yaml (kept in sync with benchmark.ts).
DATASET_SIZE = 100_000
NAN_RATIO = 0.05
WARMUP_ITERATIONS = 5
MEASURED_ITERATIONS = 50
RANDOM_SEED = 42


def build_data() -> pd.Series:
rng = np.random.default_rng(RANDOM_SEED)
values = rng.uniform(-500_000.0, 500_000.0, size=DATASET_SIZE)
nan_mask = rng.random(size=DATASET_SIZE) < NAN_RATIO
values[nan_mask] = np.nan
return pd.Series(values, dtype="float64")


def main() -> None:
series = build_data()

# Warm-up.
for _ in range(WARMUP_ITERATIONS):
series.sort_values()

start = time.perf_counter()
for _ in range(MEASURED_ITERATIONS):
series.sort_values()
total_s = time.perf_counter() - start
total_ms = total_s * 1000.0
mean_ms = total_ms / MEASURED_ITERATIONS

result = {
"function": "Series.sort_values",
"mean_ms": mean_ms,
"iterations": MEASURED_ITERATIONS,
"total_ms": total_ms,
}
sys.stdout.write(json.dumps(result) + "\n")


if __name__ == "__main__":
main()
75 changes: 75 additions & 0 deletions .autoloop/programs/tsb-perf-evolve/code/benchmark.ts
@@ -0,0 +1,75 @@
// tsb-side benchmark for Series.sortValues.
// Output: a single JSON line on stdout with the shape
// {"function": "Series.sortValues", "mean_ms": <number>, "iterations": <number>, "total_ms": <number>}
//
// Dataset shape and iteration counts come from ./config.yaml — keep this file
// and ./benchmark.py in lockstep.

import { Series } from "../../../../src/index.ts";

// Inlined from config.yaml — the autoloop agent should keep these in sync.
// (No YAML parser dependency to keep this benchmark hermetic.)
const DATASET_SIZE = 100_000;
const NAN_RATIO = 0.05;
const WARMUP_ITERATIONS = 5;
const MEASURED_ITERATIONS = 50;
const RANDOM_SEED = 42;

// A tiny deterministic PRNG (mulberry32). Note: this is *not* the same
// algorithm as numpy's default_rng on the Python side, so for any given seed
// the two benchmarks will see different concrete values. They will still see
// the same *distribution* (uniform over [-500_000, 500_000) with the same NaN
// fraction), and that is what matters for a sorting micro-benchmark — the
// dataset shape, not the exact bit pattern. If you ever need byte-identical
// inputs across the two sides, swap mulberry32 for a portable PRNG that has a
// matching numpy implementation (e.g. PCG64).
function mulberry32(seed: number): () => number {
let a = seed >>> 0;
return () => {
a = (a + 0x6d2b79f5) >>> 0;
let t = a;
t = Math.imul(t ^ (t >>> 15), t | 1);
t ^= t + Math.imul(t ^ (t >>> 7), t | 61);
return ((t ^ (t >>> 14)) >>> 0) / 4294967296;
};
}

function buildData(): readonly (number | null)[] {
const rng = mulberry32(RANDOM_SEED);
const out: (number | null)[] = new Array(DATASET_SIZE);
for (let i = 0; i < DATASET_SIZE; i++) {
out[i] = rng() < NAN_RATIO ? null : rng() * 1_000_000 - 500_000;
}
return out;
}

function nowMs(): number {
return performance.now();
}

function main(): void {
const data = buildData();
const series = new Series<number | null>({ data, dtype: "float64" });

// Warm-up — let the JIT specialize.
for (let i = 0; i < WARMUP_ITERATIONS; i++) {
series.sortValues();
}

const start = nowMs();
for (let i = 0; i < MEASURED_ITERATIONS; i++) {
series.sortValues();
}
const totalMs = nowMs() - start;
const meanMs = totalMs / MEASURED_ITERATIONS;

const result = {
function: "Series.sortValues",
mean_ms: meanMs,
iterations: MEASURED_ITERATIONS,
total_ms: totalMs,
};
process.stdout.write(`${JSON.stringify(result)}\n`);
}

main();
22 changes: 22 additions & 0 deletions .autoloop/programs/tsb-perf-evolve/code/config.yaml
@@ -0,0 +1,22 @@
# AlphaEvolve tunables — read by strategy/alphaevolve.md every iteration.

# Operator weights. Must sum to 1.0. Defaults bias toward exploitation.
exploitation_ratio: 0.50
exploration_ratio: 0.30
crossover_ratio: 0.15
migration_ratio: 0.05

# Island count. Should match the number of islands enumerated in
# strategy/alphaevolve.md's "Pick parent(s)" section.
num_islands: 5

# MAP-Elites population caps.
population_size: 40
archive_size: 10

# Benchmark dataset shape. Both benchmark.ts and benchmark.py read this.
dataset_size: 100000
nan_ratio: 0.05
warmup_iterations: 5
measured_iterations: 50
random_seed: 42
80 changes: 80 additions & 0 deletions .autoloop/programs/tsb-perf-evolve/program.md
@@ -0,0 +1,80 @@
---
schedule: every 6h
---

# tsb perf evolve — Series.sortValues vs pandas Series.sort_values

## Goal

Evolve the implementation of `Series.sortValues` (`src/core/series.ts`) so that, on the synthetic benchmark in `code/benchmark.ts`, tsb runs **at least as fast as pandas** on the equivalent `Series.sort_values` call (`code/benchmark.py`).

Concretely, we minimize the **ratio**

fitness = mean_ms_tsb / mean_ms_pandas

`fitness < 1.0` means tsb is faster than pandas; lower is better. We will keep iterating as long as fitness keeps improving.

This is a **performance-evolution program** — there is one self-contained artifact (`Series.sortValues`), one scalar fitness (the ratio), and many plausible algorithmic families to try (comparison sort, typed-array indirect sort, dtype-dispatched non-comparison sort, batched/SoA, etc.). It is the canonical case for the AlphaEvolve strategy.
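For concreteness, the fitness can be computed directly from the two JSON lines the benchmarks print. A minimal sketch (the numbers below are made-up examples, not real measurements; the field names match the output shapes of `benchmark.ts` and `benchmark.py`):

```python
import json

# One JSON line from each side of the benchmark (example numbers only).
tsb_line = '{"function": "Series.sortValues", "mean_ms": 12.3, "iterations": 50, "total_ms": 615.0}'
pandas_line = '{"function": "Series.sort_values", "mean_ms": 8.2, "iterations": 50, "total_ms": 410.0}'

# fitness = mean_ms_tsb / mean_ms_pandas; lower is better.
fitness = json.loads(tsb_line)["mean_ms"] / json.loads(pandas_line)["mean_ms"]
print(round(fitness, 3))  # 1.5 here: tsb is 1.5x slower; < 1.0 would mean tsb wins
```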

### Validity invariants

A candidate is valid iff:

1. The existing test suite for `sortValues` passes: `bun test tests/core/series.sortValues.test.ts` (and any property tests that exercise it).
2. The function signature is unchanged: `sortValues(ascending = true, naPosition: "first" | "last" = "last"): Series<T>`.
3. No new runtime dependencies (devDependencies for benchmarking are fine).
4. TypeScript strict mode is satisfied — no `any`, no `as` casts, no `@ts-ignore`.
5. Behaviour is identical to the current implementation for: numeric (with NaN), string, mixed dtypes, ascending and descending, both `naPosition` values, and an empty Series.

The evaluator runs the test suite and the benchmark; if either fails, the candidate is rejected.

## Target

Only modify these files:
- `src/core/series.ts` — the `sortValues` method body (and any small private helpers inside `series.ts` that it calls). Keep the public signature unchanged.
- `.autoloop/programs/tsb-perf-evolve/code/**` — benchmark scripts and config. (You will rarely need to touch these — the evaluator is fixed; the benchmark dataset is fixed; only tweak if a candidate genuinely needs a new bench scenario.)

Do NOT modify:
- `tests/**` — test files (they are the validity oracle; do not weaken them).
- `README.md` — read-only.
- `.autoloop/programs/**` other than this program's `code/` dir.
- `.github/workflows/autoloop*` — autoloop workflow files.
- Any `src/**` file other than `src/core/series.ts`.

## Evolution Strategy

This program uses the **AlphaEvolve** strategy. On every iteration, read `strategy/alphaevolve.md` and follow it literally — it supersedes the generic analyze/accept/reject steps in the default autoloop loop.

Support files:
- `strategy/alphaevolve.md` — the runtime playbook (operators, parent selection, population rules).
- `strategy/prompts/mutation.md` — framing for exploitation and exploration operators.
- `strategy/prompts/crossover.md` — framing for crossover and migration operators.

Population state lives in the state file on the `memory/autoloop` branch under the `## 🧬 Population` subsection (see the playbook for the schema).

## Evaluation

```bash
set -euo pipefail

# 1. Validity — existing tests for sortValues must still pass.
bun test tests/core/series.sortValues.test.ts >/tmp/perf-evolve-tests.log 2>&1 || {
echo '{"fitness": null, "rejected_reason": "tests failed"}'
exit 0
}

# 2. Benchmark — tsb side.
tsb_ms=$(bun run .autoloop/programs/tsb-perf-evolve/code/benchmark.ts | python3 -c "import json,sys; print(json.load(sys.stdin)['mean_ms'])")

# 3. Benchmark — pandas side. Install pandas on demand; bail out gracefully if
# it still isn't importable afterwards.
if ! python3 -c 'import pandas' 2>/dev/null; then
  pip3 install pandas --quiet 2>/dev/null || true
fi
if ! python3 -c 'import pandas' 2>/dev/null; then
  echo '{"fitness": null, "rejected_reason": "pandas unavailable"}'
  exit 0
fi
pd_ms=$(python3 .autoloop/programs/tsb-perf-evolve/code/benchmark.py | python3 -c "import json,sys; print(json.load(sys.stdin)['mean_ms'])")

# 4. Fitness = ratio. Lower is better.
ratio=$(python3 -c "print(${tsb_ms} / ${pd_ms})")
echo "{\"fitness\": ${ratio}, \"tsb_mean_ms\": ${tsb_ms}, \"pandas_mean_ms\": ${pd_ms}}"
```

The metric is `fitness` (= `tsb_mean_ms / pandas_mean_ms`). **Lower is better.** A value below `1.0` means tsb is now faster than pandas on this workload.
142 changes: 142 additions & 0 deletions .autoloop/programs/tsb-perf-evolve/strategy/alphaevolve.md
@@ -0,0 +1,142 @@
# AlphaEvolve Strategy — tsb-perf-evolve

This file is the **runtime playbook** for this program. The autoloop agent reads it at the start of every iteration and follows it literally. It supersedes the generic "Analyze and Propose" / "Accept or Reject" steps in the default autoloop iteration loop — all other steps (state read, branch management, state file updates) still apply.

## Problem framing

The target artifact is the body of `Series.sortValues` in `src/core/series.ts`. Fitness is the ratio `tsb_mean_ms / pandas_mean_ms` measured on the fixed benchmark in `code/benchmark.ts` (and its pandas mirror, `code/benchmark.py`); **lower is better**, with `< 1.0` meaning tsb is faster than pandas. A candidate is valid iff the existing tests for `sortValues` pass, the public signature is unchanged, no new runtime dependencies are added, TypeScript strict mode is satisfied, and behaviour matches the reference for numeric/string/mixed dtypes, both sort directions (ascending and descending), and both `naPosition` settings.

## Per-iteration loop

### Step 1. Load state

1. Read `program.md` — Goal, Target, Evaluation.
2. Read the program's state file from the repo-memory folder (`tsb-perf-evolve.md`). Locate the `## 🧬 Population` subsection. If it does not exist, create it using the schema in [Population schema](#population-schema).
3. Read `code/config.yaml` for tunables (`exploitation_ratio`, `num_islands`, `population_size`, `archive_size`, `dataset_size`, etc.). Do not hard-code values you can read from config — the maintainer may have tuned them.
4. Read both prompt templates in `strategy/prompts/`. These frame how you reason about mutations and crossovers for sorting code.

### Step 2. Pick operator

Sample one operator using these weights (tuned for a perf problem with a small handful of plausible algorithmic families — exploitation-heavy because once an island has a working candidate, refinement usually pays):

| Operator | Default weight | When it fires |
|---|---|---|
| Exploitation | 0.50 | Refine one of the elites — the current best or a near-best. |
| Exploration | 0.30 | Generate a candidate from an **under-represented island** or a novel family. |
| Crossover | 0.15 | Combine ideas from two parents on different islands. |
| Migration | 0.05 | Take a technique that works on island A and port it into a solution on island B. |

Deterministic overrides (apply *before* sampling):

- If the population is empty or has one member → **Exploration** (seed diversity).
- If the last 3 statuses in `recent_statuses` are all `rejected` → force **Exploration** with a previously-unused island.
- If the last 5 statuses are all `rejected` → force **Migration** or a radically new island; also revisit any domain knowledge in `prompts/mutation.md` that has not yet been applied.

Record your chosen operator in the iteration's reasoning — the state file's Iteration History entry must include it.
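The table plus the deterministic overrides amount to a small picker. A sketch, with the overrides checked in order of severity before any sampling (`pick_operator` and its argument names are invented here for illustration; in practice the weights should be read from `code/config.yaml`, not hard-coded):

```python
import random

# Mirrors the default weights in the operator table / config.yaml.
OPERATOR_WEIGHTS = {
    "exploitation": 0.50,
    "exploration": 0.30,
    "crossover": 0.15,
    "migration": 0.05,
}

def pick_operator(population_size: int, recent_statuses: list[str], rng=random) -> str:
    # Deterministic overrides apply before weighted sampling.
    if population_size <= 1:
        return "exploration"  # seed diversity
    if len(recent_statuses) >= 5 and all(s == "rejected" for s in recent_statuses[-5:]):
        return "migration"  # or a radically new island
    if len(recent_statuses) >= 3 and all(s == "rejected" for s in recent_statuses[-3:]):
        return "exploration"  # with a previously-unused island
    ops, weights = zip(*OPERATOR_WEIGHTS.items())
    return rng.choices(ops, weights=weights, k=1)[0]
```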

### Step 3. Pick parent(s)

**Islands** for this program (algorithmic families for sorting a 1-D numeric Series with NaN):

- **Island 0 — Comparison sort (objects)**: the current implementation — `Array.prototype.sort` over `{v, i}` pairs with a comparator that handles NaN.
- **Island 1 — Indirect typed-array sort**: copy values into a `Float64Array`, sort an index `Uint32Array` by that, then gather. NaN handled by partition.
- **Island 2 — Decorate-sort-undecorate with packed keys**: encode `(value, index)` into a single sortable representation (e.g. pack into a `BigInt64Array` or use parallel typed arrays), sort once, gather.
- **Island 3 — Non-comparison / radix**: dispatch on dtype; for finite floats, transform to a sortable unsigned representation and run an LSD radix sort, then untransform.
- **Island 4 — Hybrid**: small-input fast path (`Array.prototype.sort`) plus large-input dispatch into one of the above families based on input length and dtype.
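Island 3's "sortable unsigned representation" is the standard IEEE-754 total-order trick: reinterpret the float64 bits as a uint64, flip all bits for negatives and just the sign bit for non-negatives. A minimal Python sketch (the function name is ours; a real tsb candidate would do this branch-free over a `Float64Array`/`BigUint64Array`, and NaNs are assumed to have been partitioned out first, per the island notes):

```python
import struct

def float_to_sortable_u64(x: float) -> int:
    """Map a float64 to a uint64 whose unsigned order matches numeric order."""
    # Reinterpret the float's bit pattern as an unsigned 64-bit integer.
    bits = struct.unpack("<Q", struct.pack("<d", x))[0]
    if bits >> 63:
        # Negative: flip every bit, so larger magnitudes sort lower.
        return bits ^ 0xFFFF_FFFF_FFFF_FFFF
    # Non-negative: set the sign bit, so positives sort above all negatives.
    return bits | (1 << 63)

vals = [3.5, -500_000.0, 0.0, -1.25, 499_999.9]
keys = sorted(vals, key=float_to_sortable_u64)  # same order as sorted(vals)
```

Once every finite value has such a key, an LSD radix sort over the uint64 keys replaces the comparator entirely.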

Parent selection by operator:

- **Exploitation** — pick the best scorer; break ties by picking the most recent.
- **Exploration** — pick the island with the fewest members (or a brand-new island number if all are full), then either start from its best member or from scratch.
- **Crossover** — pick two parents on **different islands**. Bias toward one elite (top quartile) and one diverse (any island with a distinct feature-cell — see [Feature dimensions](#feature-dimensions)).
- **Migration** — pick one donor island (the source of the technique) and one recipient island (where the technique will be grafted in). The parent you actually edit is on the recipient island.

### Step 4. Apply the operator

Frame your reasoning using the matching prompt template:

- Exploitation or Exploration → `strategy/prompts/mutation.md`
- Crossover or Migration → `strategy/prompts/crossover.md`

Before writing any code, state (in your visible reasoning):

1. Chosen operator + why.
2. Parent(s) picked — their IDs, island, score, and a one-line summary of each parent's approach.
3. What specifically you're changing, and your hypothesis for *why* it should improve the fitness.
4. Validity pre-check — walk through why the proposed candidate will satisfy each invariant:
- Existing tests for `sortValues` will pass (numeric + NaN, string, ascending/descending, both `naPosition` values, empty Series).
- Public signature unchanged: `sortValues(ascending = true, naPosition: "first" | "last" = "last"): Series<T>`.
- No new runtime dependency added to `package.json`.
- No `any`, no `as`, no `@ts-ignore`.
- Index alignment preserved — every output value is paired with the original index of the input row it came from.
5. Novelty check: confirm this is not a near-duplicate of an existing population member or of anything in the state file's 🚧 Foreclosed Avenues.

### Step 5. Implement

Edit only the files listed in `program.md`'s Target section. The diff style for this program is **minimal diff** — `series.ts` is a large file and only the body of `sortValues` (plus, occasionally, a small private helper added immediately above it) should change. Do not reformat unrelated parts of the file.

### Step 6. Evaluate

Run the evaluation command from `program.md`. Parse the `fitness` field from the JSON output (along with `tsb_mean_ms` and `pandas_mean_ms` for the population entry).

### Step 7. Update the population

Regardless of whether the iteration is accepted or rejected at the branch level, the candidate has been tried and should be recorded in the population — the population is a memory of what's been explored, not just what's been kept.

Append a new entry to the `## 🧬 Population` subsection in the state file using the schema below. Then enforce these caps:

- **Population cap**: `population_size` from `code/config.yaml` (default 40). If exceeded, evict the *worst* member in the most-crowded feature cell (MAP-Elites style — never evict the best of any cell).
- **Elite archive**: the top `archive_size` from `code/config.yaml` (default 10) by fitness are always preserved regardless of cell crowding.
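The two caps interact: crowding decides who is evicted, but cell-best members and archive elites are immune. One eviction step can be sketched like this (all names here are illustrative — the real population lives in the markdown state file, not a Python list; fitness is the tsb/pandas ratio, lower is better):

```python
from collections import defaultdict

def evict_one(population, population_size=40, archive_size=10):
    """Return the id of the member to evict, or None if under the cap.

    Each member is a dict with 'id', 'fitness' (lower is better), and
    'cell' (the (storage, algorithm) feature tuple).
    """
    if len(population) <= population_size:
        return None
    # Top archive_size by fitness are always preserved.
    elite_ids = {m["id"] for m in sorted(population, key=lambda m: m["fitness"])[:archive_size]}
    cells = defaultdict(list)
    for m in population:
        cells[m["cell"]].append(m)
    # Most-crowded cell; within it, consider worst fitness first.
    crowded = max(cells.values(), key=len)
    crowded.sort(key=lambda m: m["fitness"])
    for victim in reversed(crowded):
        # Never evict the best of its cell, and never evict an archive elite.
        if victim is not crowded[0] and victim["id"] not in elite_ids:
            return victim["id"]
    return None
```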

### Step 8. Fold through to the default loop

Continue with the normal autoloop Step 5 (Accept or Reject → commit / discard, update state file's Machine State, Iteration History, Lessons Learned, etc.) as defined in the workflow. The only additional requirements from AlphaEvolve are:

- The Iteration History entry must include `operator`, `parent_id(s)`, `island`, and `fitness` fields (in addition to the normal status/change/metric/notes).
- Lessons Learned additions should be phrased as *transferable heuristics* about the problem space, not as reports of what this iteration did. (E.g. "Indirect sort over `Uint32Array` indices beats object-pair sort above n≈10k" — not "Iteration 17 tried indirect sort.")

## Feature dimensions

MAP-Elites partitions the population into **feature cells**. Each candidate is described by a small tuple of qualitative features, and the population keeps the best candidate per cell — this is what creates diversity pressure even when many candidates have similar fitness.

For this program, use these feature dimensions:

- **Dimension 1 — Storage**: `boxed-pairs` / `parallel-typed-arrays` / `packed-typed-array` / `wasm-buffer`
- **Dimension 2 — Algorithm class**: `comparison` / `non-comparison` / `hybrid`

When evaluating a candidate, classify it into one cell per dimension. The combined `(storage, algorithm)` tuple is its **feature cell**. Record the cell in the population entry (see schema).

## Population schema

The population lives in the state file `tsb-perf-evolve.md` on the `memory/autoloop` branch as a subsection. Use this exact layout so maintainers can read and edit it:

```markdown
## 🧬 Population

> 🤖 *Managed by the AlphaEvolve strategy. One entry per candidate that has been evaluated (accepted or rejected). Newest first.*

### Candidate <id> · island <n> · fitness <score> · gen <iter>

- **Operator**: exploitation / exploration / crossover / migration
- **Parent(s)**: [<id1>, <id2>]
- **Feature cell**: <storage-bucket> · <algorithm-bucket>
- **Approach**: <one-line summary of the technique>
- **Status**: ✅ accepted / ❌ rejected
- **Notes**: <what worked or didn't, anything worth remembering — e.g. "tsb=12.3ms / pandas=8.7ms / ratio=1.41">

Code:

\`\`\`typescript
<the candidate sortValues body, or a diff against parent if too large to inline>
\`\`\`

---
```

Identifiers:
- `<id>` is `c{NNN}` zero-padded, monotonically increasing across the program's lifetime.
- `<n>` is the island number (0-indexed, 0..4 for this program).
- `<score>` is the raw `fitness` (the tsb/pandas ms ratio).
- `<iter>` is the iteration number from the Machine State table.

When evicting members under the population cap, **never** delete an entry. Instead, rewrite its header with strikethrough (`### ~~Candidate c042~~ (evicted, gen 87)`) and remove the entire `Code:` block (the `Code:` label plus the fenced `typescript` snippet) to keep the file size bounded. The metadata stays so future iterations can see what was tried.