forked from KillingSpark/zstd-rs
test(bench): expand zstd benchmark suite #38
Merged: polaz merged 30 commits into `main` from `test/#24-comprehensive-benchmark-suite-against-c-zstd` on Mar 28, 2026.
Commits (30):
- `5138d7a` test(bench): expand zstd benchmark suite
- `3a419fa` docs(readme): add benchmark dashboard link
- `15a51bd` fix(bench): harden matrix scripts and edge scenarios
- `480a307` fix(bench): tighten flamegraph and decode benchmarks
- `11f779b` docs(bench): clarify decode benchmark asymmetry rationale
- `65cda3a` perf(bench): remove redundant decode buffer fill
- `aa774de` fix(bench): scope large default to CI and enforce ratio rows
- `54b3dd4` feat(bench): add memory and dictionary benchmark reporting
- `7e032ba` test(bench): align decompression benchmark paths
- `bc3bc2f` test(bench): include scenario ids in report tables
- `007d523` fix(bench): guard dictionary ratio division
- `c4e58d1` fix(bench): bound Silesia fixture loading
- `e5cdee1` docs(bench): clarify memory estimates in reports
- `4fd6c11` perf(bench): cache benchmark scenario generation
- `0b2813b` chore(bench): drop unused stats_alloc dep
- `86f27c8` fix(bench): allow filtered runs without dict rows
- `42cfc46` fix(bench): avoid duplicate dict fallback samples
- `bf8bba5` style(bench): add is_empty for Scenario
- `a3a54a4` fix(bench): remove needless borrows in scenario loops
- `c99d33b` fix(bench): sanitize Silesia scenario report fields
- `c9639b5` perf(bench): bound Silesia dir walk by max_files
- `d63a2b8` build(bench): ship decode corpus fixture in crate
- `60c4ec4` fix(bench): avoid packaging decode corpus fixtures
- `26ad87b` fix(bench): parse escaped labels in report script
- `6994c8a` fix(bench): pass criterion filter correctly to flamegraph
- `c42f34c` style(bench): format runtime corpus loader
- `63144cc` fix(bench): stabilize corpus fallback scenarios
- `dc11bd6` fix(bench): gate report precompute and escape labels
- `94f1c6d` fix(bench): harden silesia fixture identity and size checks
- `c2a2988` fix(bench): tighten label escaping and id dedupe guard
`.github/scripts/run-benchmarks.sh` (final version after this PR's changes; the HTML-escaped characters in `markdown_table_escape` are restored to their likely original entities):

```bash
#!/bin/bash
# Run the Criterion benchmark matrix and produce:
#   - benchmark-results.json for github-action-benchmark
#   - benchmark-report.md for human review
#
# Output format note:
#   - benchmark JSON uses customSmallerIsBetter (lower ms/iter is better)
#   - report markdown also includes per-scenario compression size + ratio summaries
set -eo pipefail

echo "Running benchmark matrix..." >&2

if [ -n "${GITHUB_ACTIONS:-}" ] && [ -z "${STRUCTURED_ZSTD_BENCH_LARGE_BYTES:-}" ]; then
    export STRUCTURED_ZSTD_BENCH_LARGE_BYTES=16777216
fi
BENCH_RAW_FILE="$(mktemp -t structured-zstd-bench-raw.XXXXXX)"
trap 'rm -f "$BENCH_RAW_FILE"' EXIT

export STRUCTURED_ZSTD_EMIT_REPORT=1
cargo bench --bench compare_ffi -p structured-zstd -- --output-format bencher | tee "$BENCH_RAW_FILE"

echo "Parsing results..." >&2

BENCH_RAW_FILE="$BENCH_RAW_FILE" python3 - <<'PYEOF'
import json
import os
import re
import sys

BENCH_RE = re.compile(r"test (\S+)\s+\.\.\. bench:\s+([\d,]+) ns/iter")
REPORT_RE = re.compile(
    r'^REPORT scenario=(\S+) label="((?:[^"\\]|\\.)+)" level=(\S+) input_bytes=(\d+) rust_bytes=(\d+) ffi_bytes=(\d+) rust_ratio=([0-9.]+) ffi_ratio=([0-9.]+)$'
)
MEM_RE = re.compile(
    r'^REPORT_MEM scenario=(\S+) label="((?:[^"\\]|\\.)+)" level=(\S+) stage=(\S+) rust_buffer_bytes_estimate=(\d+) ffi_buffer_bytes_estimate=(\d+)$'
)
DICT_RE = re.compile(
    r'^REPORT_DICT scenario=(\S+) label="((?:[^"\\]|\\.)+)" level=(\S+) dict_bytes=(\d+) train_ms=([0-9.]+) ffi_no_dict_bytes=(\d+) ffi_with_dict_bytes=(\d+) ffi_no_dict_ratio=([0-9.]+) ffi_with_dict_ratio=([0-9.]+)$'
)


def unescape_report_label(value):
    output = []
    i = 0
    while i < len(value):
        ch = value[i]
        if ch == "\\" and i + 1 < len(value):
            i += 1
            output.append(value[i])
        else:
            output.append(ch)
        i += 1
    return "".join(output)


def markdown_table_escape(value):
    escaped = value.strip()
    escaped = escaped.replace("\\", "\\\\")
    escaped = escaped.replace("|", "\\|")
    escaped = escaped.replace("`", "\\`")
    escaped = escaped.replace("[", "\\[")
    escaped = escaped.replace("]", "\\]")
    escaped = escaped.replace("*", "\\*")
    escaped = escaped.replace("_", "\\_")
    escaped = escaped.replace("<", "&lt;")
    escaped = escaped.replace(">", "&gt;")
    escaped = escaped.replace("%", "&#37;")
    return escaped.replace("\n", "<br>")


benchmark_results = []
timings = []
ratios = []
memory_rows = []
dictionary_rows = []
raw_path = os.environ["BENCH_RAW_FILE"]

with open(raw_path) as f:
    for raw_line in f:
        line = raw_line.strip()

        bench_match = BENCH_RE.match(line)
        if bench_match:
            name = bench_match.group(1)
            ns = int(bench_match.group(2).replace(",", ""))
            # Convert ns to ms for readability.
            ms = ns / 1_000_000
            benchmark_results.append({
                "name": name,
                "unit": "ms",
                "value": round(ms, 3),
            })
            timings.append((name, ms))
            continue

        report_match = REPORT_RE.match(line)
        if report_match:
            scenario, label, level, input_bytes, rust_bytes, ffi_bytes, rust_ratio, ffi_ratio = report_match.groups()
            label = unescape_report_label(label)
            ratios.append({
                "scenario": scenario,
                "label": label,
                "level": level,
                "input_bytes": int(input_bytes),
                "rust_bytes": int(rust_bytes),
                "ffi_bytes": int(ffi_bytes),
                "rust_ratio": float(rust_ratio),
                "ffi_ratio": float(ffi_ratio),
            })
            continue

        mem_match = MEM_RE.match(line)
        if mem_match:
            (
                scenario,
                label,
                level,
                stage,
                rust_buffer_bytes_estimate,
                ffi_buffer_bytes_estimate,
            ) = mem_match.groups()
            label = unescape_report_label(label)
            memory_rows.append({
                "scenario": scenario,
                "label": label,
                "level": level,
                "stage": stage,
                "rust_buffer_bytes_estimate": int(rust_buffer_bytes_estimate),
                "ffi_buffer_bytes_estimate": int(ffi_buffer_bytes_estimate),
            })
            continue

        dict_match = DICT_RE.match(line)
        if dict_match:
            (
                scenario,
                label,
                level,
                dict_bytes,
                train_ms,
                ffi_no_dict_bytes,
                ffi_with_dict_bytes,
                ffi_no_dict_ratio,
                ffi_with_dict_ratio,
            ) = dict_match.groups()
            label = unescape_report_label(label)
            dictionary_rows.append({
                "scenario": scenario,
                "label": label,
                "level": level,
                "dict_bytes": int(dict_bytes),
                "train_ms": float(train_ms),
                "ffi_no_dict_bytes": int(ffi_no_dict_bytes),
                "ffi_with_dict_bytes": int(ffi_with_dict_bytes),
                "ffi_no_dict_ratio": float(ffi_no_dict_ratio),
                "ffi_with_dict_ratio": float(ffi_with_dict_ratio),
            })

if not benchmark_results:
    print("ERROR: No benchmark results parsed!", file=sys.stderr)
    sys.exit(1)

if not ratios:
    print(
        "ERROR: No REPORT ratio lines parsed; benchmark-report.md would have an empty ratio section.",
        file=sys.stderr,
    )
    sys.exit(1)

if not memory_rows:
    print("ERROR: No REPORT_MEM lines parsed; memory section would be empty.", file=sys.stderr)
    sys.exit(1)

if not dictionary_rows:
    print("WARN: No REPORT_DICT lines parsed; dictionary section will be empty.", file=sys.stderr)

with open("benchmark-results.json", "w") as f:
    json.dump(benchmark_results, f, indent=2)

lines = [
    "# Benchmark Report",
    "",
    "Generated by `.github/scripts/run-benchmarks.sh` from `cargo bench --bench compare_ffi`.",
    "",
    "## Compression Ratios",
    "",
    "| Scenario | Label | Level | Input bytes | Rust bytes | C bytes | Rust ratio | C ratio |",
    "| --- | --- | --- | ---: | ---: | ---: | ---: | ---: |",
]

for row in sorted(ratios, key=lambda item: (item["scenario"], item["level"])):
    label = markdown_table_escape(row["label"])
    lines.append(
        f'| {row["scenario"]} | {label} | {row["level"]} | {row["input_bytes"]} | {row["rust_bytes"]} | {row["ffi_bytes"]} | {row["rust_ratio"]:.4f} | {row["ffi_ratio"]:.4f} |'
    )

lines.extend([
    "",
    "## Buffer Size Estimates (Input + Output)",
    "",
    "| Scenario | Label | Level | Stage | Rust buffer bytes (estimate) | C buffer bytes (estimate) |",
    "| --- | --- | --- | --- | ---: | ---: |",
])

for row in sorted(memory_rows, key=lambda item: (item["scenario"], item["level"], item["stage"])):
    label = markdown_table_escape(row["label"])
    lines.append(
        f'| {row["scenario"]} | {label} | {row["level"]} | {row["stage"]} | {row["rust_buffer_bytes_estimate"]} | {row["ffi_buffer_bytes_estimate"]} |'
    )

lines.extend([
    "",
    "## Dictionary Compression (C FFI)",
    "",
    "| Scenario | Label | Level | Dict bytes | Train ms | C bytes (no dict) | C bytes (with dict) | C ratio (no dict) | C ratio (with dict) |",
    "| --- | --- | --- | ---: | ---: | ---: | ---: | ---: | ---: |",
])

for row in sorted(dictionary_rows, key=lambda item: (item["scenario"], item["level"])):
    label = markdown_table_escape(row["label"])
    lines.append(
        f'| {row["scenario"]} | {label} | {row["level"]} | {row["dict_bytes"]} | {row["train_ms"]:.3f} | {row["ffi_no_dict_bytes"]} | {row["ffi_with_dict_bytes"]} | {row["ffi_no_dict_ratio"]:.4f} | {row["ffi_with_dict_ratio"]:.4f} |'
    )

lines.extend([
    "",
    "## Timing Metrics",
    "",
    "| Benchmark | ms/iter |",
    "| --- | ---: |",
])

for name, ms in sorted(timings):
    lines.append(f"| `{name}` | {ms:.3f} |")

with open("benchmark-report.md", "w") as f:
    f.write("\n".join(lines) + "\n")

print(f"Wrote {len(benchmark_results)} timing results to benchmark-results.json", file=sys.stderr)
print(f"Wrote {len(ratios)} ratio rows to benchmark-report.md", file=sys.stderr)
print(f"Wrote {len(memory_rows)} memory rows to benchmark-report.md", file=sys.stderr)
print(f"Wrote {len(dictionary_rows)} dictionary rows to benchmark-report.md", file=sys.stderr)
PYEOF
```
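The parser's three line formats can be exercised in isolation. This sketch feeds the script's `BENCH_RE` and `REPORT_RE` patterns synthetic lines; the sample benchmark name and byte counts are invented for illustration:

```python
import re

# Same patterns as in run-benchmarks.sh.
BENCH_RE = re.compile(r"test (\S+)\s+\.\.\. bench:\s+([\d,]+) ns/iter")
REPORT_RE = re.compile(
    r'^REPORT scenario=(\S+) label="((?:[^"\\]|\\.)+)" level=(\S+) input_bytes=(\d+) '
    r'rust_bytes=(\d+) ffi_bytes=(\d+) rust_ratio=([0-9.]+) ffi_ratio=([0-9.]+)$'
)

# Hypothetical lines in the shapes the script expects.
bench_line = "test compress/default/log-4k ... bench: 1,234,567 ns/iter (+/- 89)"
report_line = (
    'REPORT scenario=log-4k label="structured log" level=3 '
    'input_bytes=4096 rust_bytes=512 ffi_bytes=500 rust_ratio=8.0000 ffi_ratio=8.1920'
)

m = BENCH_RE.match(bench_line)
assert m is not None
ns = int(m.group(2).replace(",", ""))  # strip thousands separators
ms = ns / 1_000_000  # same ns -> ms conversion the script performs
print(m.group(1), round(ms, 3))

r = REPORT_RE.match(report_line)
assert r is not None
print(r.group(1), r.group(2))  # scenario id and quoted label
```

Note that `re.match` anchors at the start of the line, which is why only `REPORT_RE` needs an explicit `^`/`$` pair: its fields must consume the whole line, while the bencher pattern tolerates the trailing variance suffix.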
`.gitignore` gains entries for the two generated artifacts:

```
/orig-zstd
fuzz_decodecorpus
perf.data*
benchmark-results.json
benchmark-report.md
fuzz/corpus
.idea
```
New file: a benchmark suite README (75 lines):
# Benchmark Suite

`structured-zstd` keeps its compression/decompression performance tracking in the Criterion bench
matrix at `zstd/benches/compare_ffi.rs`.

## Scenarios

The current matrix covers:
- small random payloads (`1 KiB`, `10 KiB`)
- a small structured log payload (`4 KiB`)
- a repository corpus fixture (`decodecorpus_files/z000033`)
- high-entropy random payloads (`1 MiB`)
- low-entropy repeated payloads (`1 MiB`)
- a large structured stream (`100 MiB`)
- optional Silesia corpus files when `STRUCTURED_ZSTD_SILESIA_DIR=/path/to/silesia` is set;
  loading is bounded by `STRUCTURED_ZSTD_SILESIA_MAX_FILES` (default `12`) and
  `STRUCTURED_ZSTD_SILESIA_MAX_FILE_BYTES` (default `67108864`, i.e. 64 MiB)
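The bounding described above amounts to capping both the number of files taken and the size of each individual file. A minimal sketch of that selection policy (the function name and walk order are illustrative, not the crate's actual loader):

```python
import os

def bounded_corpus_files(root, max_files=12, max_file_bytes=64 * 1024 * 1024):
    """Collect at most `max_files` regular files from `root`,
    skipping any file larger than `max_file_bytes`."""
    selected = []
    for name in sorted(os.listdir(root)):  # sorted for deterministic runs
        path = os.path.join(root, name)
        if not os.path.isfile(path):
            continue
        if os.path.getsize(path) > max_file_bytes:
            continue  # skip oversized fixtures rather than truncating them
        selected.append(path)
        if len(selected) >= max_files:
            break  # hard cap on corpus size
    return selected
```

Skipping oversized files (instead of truncating them) keeps scenario inputs byte-identical across machines that share the same corpus.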
The local default for the large scenario is `100 MiB`. In GitHub Actions, when
`STRUCTURED_ZSTD_BENCH_LARGE_BYTES` is unset, `.github/scripts/run-benchmarks.sh` defaults it to
`16 MiB` to keep CI regression runs bounded while still exercising the same code path.
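The defaulting rule can be paraphrased in Python (the shell script only exports the 16 MiB value; the 100 MiB fallback lives in the bench itself):

```python
def effective_large_bytes(env):
    """Mirror the sizing rule: an explicit override always wins; otherwise
    GitHub Actions runs get 16 MiB and local runs get the bench's 100 MiB."""
    override = env.get("STRUCTURED_ZSTD_BENCH_LARGE_BYTES")
    if override:
        return int(override)
    if env.get("GITHUB_ACTIONS"):
        return 16 * 1024 * 1024   # 16777216, exported by run-benchmarks.sh
    return 100 * 1024 * 1024      # local default baked into the bench
```

Because the override is checked first, a CI job can still opt back into the full 100 MiB stream by setting `STRUCTURED_ZSTD_BENCH_LARGE_BYTES` explicitly.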
## Level Mapping

The benchmark suite only compares levels that are currently implemented end-to-end in the pure Rust
encoder:

- `structured-zstd::Fastest` vs `zstd` level `1`
- `structured-zstd::Default` vs `zstd` level `3`

`Better` and `Best` are intentionally excluded until the encoder implements them.
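For quick reference, the mapping above can be written out as data (the variant names come from the doc; the dict itself is just an illustration, not crate code):

```python
# Only levels implemented end-to-end in the pure Rust encoder are compared.
LEVEL_MAP = {
    "Fastest": 1,  # structured-zstd::Fastest vs C zstd level 1
    "Default": 3,  # structured-zstd::Default vs C zstd level 3
}
EXCLUDED = ("Better", "Best")  # pending pure Rust encoder support

for variant, c_level in LEVEL_MAP.items():
    print(f"{variant} -> zstd level {c_level}")
```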
Dictionary benchmarks are tracked separately with C FFI `with_dict` vs `without_dict` runs, using a
dictionary trained from scenario samples. Pure Rust dictionary compression is still pending and is
therefore not part of the pure-Rust-vs-C timing matrix yet.
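The dictionary rows compare a ratio with and without the trained dictionary, and the report script guards the division (see the `guard dictionary ratio division` commit). A sketch of that arithmetic, assuming ratio is defined as input bytes over compressed bytes:

```python
def safe_ratio(input_bytes, compressed_bytes):
    """Compression ratio = input / compressed; 0.0 for an empty output
    so a degenerate sample cannot crash report generation."""
    if compressed_bytes == 0:
        return 0.0
    return input_bytes / compressed_bytes

def dict_improvement(input_bytes, no_dict_bytes, with_dict_bytes):
    """How much the trained dictionary improves the C FFI ratio."""
    return safe_ratio(input_bytes, with_dict_bytes) - safe_ratio(input_bytes, no_dict_bytes)

# 4 KiB input: 1024 bytes without a dict, 512 bytes with one.
print(dict_improvement(4096, 1024, 512))  # → 4.0
```

Dictionaries help most on many small, similar samples, which is exactly where the plain ratio columns understate their value, hence the dedicated table.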
## Commands

Run the full Criterion matrix:

```bash
cargo bench --bench compare_ffi -p structured-zstd -- --output-format bencher
```

Generate the CI-style JSON and markdown report locally:

```bash
bash .github/scripts/run-benchmarks.sh
```

Generate a flamegraph for a hot path:

```bash
bash scripts/bench-flamegraph.sh
```

Override the benchmark targeted by the flamegraph script:

```bash
bash scripts/bench-flamegraph.sh decompress/default/decodecorpus-z000033/matrix/pure_rust
```
## Outputs

`run-benchmarks.sh` writes:

- `benchmark-results.json` for GitHub regression tracking
- `benchmark-report.md` with:
  - compression ratio tables (`REPORT`)
  - input+output buffer size estimate tables (`REPORT_MEM`)
  - dictionary compression tables (`REPORT_DICT`)
  - timing rows for all benchmark functions

Criterion also writes its usual detailed estimates under `target/criterion/`.
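The JSON for github-action-benchmark is a flat list of `{name, unit, value}` entries in the customSmallerIsBetter shape, so it is trivial to post-process locally. A sketch of reading it back (the sample entries are invented):

```python
import json

# Shape written by run-benchmarks.sh; values here are made up.
sample = [
    {"name": "compress/default/log-4k", "unit": "ms", "value": 1.235},
    {"name": "decompress/default/log-4k", "unit": "ms", "value": 0.412},
]

payload = json.dumps(sample, indent=2)  # what benchmark-results.json looks like
results = json.loads(payload)

# customSmallerIsBetter: lower values are better, so surface the slowest entry.
slowest = max(results, key=lambda r: r["value"])
print(slowest["name"], slowest["unit"], slowest["value"])
```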