From a216fe2319dc0bf9ecb1d129b21bf0ce8dba3fd8 Mon Sep 17 00:00:00 2001 From: Travis Thieman Date: Thu, 12 Mar 2026 11:28:46 -0400 Subject: [PATCH 1/8] Add memory benchmarks with allocation assertions for streaming builtins MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Update testutil.RunScript/RunScriptCtx to accept testing.TB (not just *testing.T) so benchmarks can use the same helpers as unit tests - Add head_bench_test.go: BenchmarkHeadTenLines, BenchmarkHeadBytes, BenchmarkHeadSingleLongLine, TestHeadMemoryBoundedLines/Bytes — asserts head -n 10 allocates < 1MB on a 10MB input stream - Add cat_bench_test.go: BenchmarkCat, BenchmarkCatNumbered, TestCatMemoryBounded — ceiling 6MB on 1MB input (output is buffered through the test harness) - Add wc_bench_test.go: BenchmarkWcLines, BenchmarkWcAll, TestWcMemoryBounded — asserts wc -l allocates < 1MB on 10MB input - Add tail_bench_test.go: BenchmarkTailTenLines, BenchmarkTailBytes, TestTailMemoryBounded — ceiling 32MB on 10MB input (tail reads all lines to find last K, so total allocs are O(n) but bounded) All four TestXxxMemoryBounded assertions pass on main branch. Synthetic inputs use io.LimitReader over a repeatReader to avoid creating large files — tests are fast and have no I/O overhead. 
Co-Authored-By: Claude Sonnet 4.6 --- interp/builtins/cat/cat_bench_test.go | 102 +++++++++++++++++ interp/builtins/head/head_bench_test.go | 139 ++++++++++++++++++++++++ interp/builtins/tail/tail_bench_test.go | 105 ++++++++++++++++++ interp/builtins/testutil/testutil.go | 8 +- interp/builtins/wc/wc_bench_test.go | 100 +++++++++++++++++ 5 files changed, 451 insertions(+), 3 deletions(-) create mode 100644 interp/builtins/cat/cat_bench_test.go create mode 100644 interp/builtins/head/head_bench_test.go create mode 100644 interp/builtins/tail/tail_bench_test.go create mode 100644 interp/builtins/wc/wc_bench_test.go diff --git a/interp/builtins/cat/cat_bench_test.go b/interp/builtins/cat/cat_bench_test.go new file mode 100644 index 00000000..a206e386 --- /dev/null +++ b/interp/builtins/cat/cat_bench_test.go @@ -0,0 +1,102 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2026-present Datadog, Inc. + +package cat_test + +import ( + "io" + "os" + "path/filepath" + "testing" + + "github.com/DataDog/rshell/interp" + "github.com/DataDog/rshell/interp/builtins/testutil" +) + +// repeatReaderCat yields a repeating line pattern indefinitely. +type repeatReaderCat struct { + line []byte + pos int +} + +func newRepeatReaderCat(line string) *repeatReaderCat { + return &repeatReaderCat{line: []byte(line)} +} + +func (r *repeatReaderCat) Read(p []byte) (int, error) { + n := 0 + for n < len(p) { + if r.pos >= len(r.line) { + r.pos = 0 + } + copied := copy(p[n:], r.line[r.pos:]) + r.pos += copied + n += copied + } + return n, nil +} + +// createLargeFileCat writes totalBytes of repeating content to dir/filename. 
+func createLargeFileCat(tb testing.TB, dir, filename, line string, totalBytes int) string { + tb.Helper() + path := filepath.Join(dir, filename) + f, err := os.Create(path) + if err != nil { + tb.Fatal(err) + } + defer f.Close() + if _, err := io.Copy(f, io.LimitReader(newRepeatReaderCat(line), int64(totalBytes))); err != nil { + tb.Fatal(err) + } + return path +} + +// cmdRunBCat runs a cat command with AllowedPaths set to dir (bench variant). +func cmdRunBCat(b *testing.B, script, dir string) (string, string, int) { + b.Helper() + return testutil.RunScript(b, script, dir, interp.AllowedPaths([]string{dir})) +} + +// BenchmarkCat measures cat on a 1MB file. +func BenchmarkCat(b *testing.B) { + dir := b.TempDir() + createLargeFileCat(b, dir, "input.txt", "the quick brown fox jumps over the lazy dog\n", 1<<20) + b.ResetTimer() + b.ReportAllocs() + for b.Loop() { + cmdRunBCat(b, "cat input.txt", dir) + } +} + +// BenchmarkCatNumbered measures cat -n on a 1MB file. +func BenchmarkCatNumbered(b *testing.B) { + dir := b.TempDir() + createLargeFileCat(b, dir, "input.txt", "the quick brown fox jumps over the lazy dog\n", 1<<20) + b.ResetTimer() + b.ReportAllocs() + for b.Loop() { + cmdRunBCat(b, "cat -n input.txt", dir) + } +} + +// TestCatMemoryBounded asserts that cat's allocation scales reasonably with +// file size (output buffering is expected, but should not be pathological). 
+func TestCatMemoryBounded(t *testing.T) { + dir := t.TempDir() + createLargeFileCat(t, dir, "input.txt", "the quick brown fox jumps over the lazy dog\n", 1<<20) + + result := testing.Benchmark(func(b *testing.B) { + b.ReportAllocs() + for b.Loop() { + cmdRunBCat(b, "cat input.txt", dir) + } + }) + + // cat buffers output through the test harness, so we allow up to 6x file size + const maxBytesPerOp = 6 << 20 // 6MB ceiling for a 1MB input + if bpo := result.AllocedBytesPerOp(); bpo > maxBytesPerOp { + t.Errorf("cat allocated %d bytes/op on 1MB input; want < %d", bpo, maxBytesPerOp) + } +} diff --git a/interp/builtins/head/head_bench_test.go b/interp/builtins/head/head_bench_test.go new file mode 100644 index 00000000..17cfa37d --- /dev/null +++ b/interp/builtins/head/head_bench_test.go @@ -0,0 +1,139 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2026-present Datadog, Inc. + +package head_test + +import ( + "io" + "os" + "path/filepath" + "testing" + + "github.com/DataDog/rshell/interp" + "github.com/DataDog/rshell/interp/builtins/testutil" +) + +// repeatReader yields a repeating line pattern indefinitely. +type repeatReader struct { + line []byte + pos int +} + +func newRepeatReader(line string) *repeatReader { + return &repeatReader{line: []byte(line)} +} + +func (r *repeatReader) Read(p []byte) (int, error) { + n := 0 + for n < len(p) { + if r.pos >= len(r.line) { + r.pos = 0 + } + copied := copy(p[n:], r.line[r.pos:]) + r.pos += copied + n += copied + } + return n, nil +} + +// createLargeFile writes totalBytes of repeating line content to dir/filename. 
+func createLargeFile(tb testing.TB, dir, filename, line string, totalBytes int) string { + tb.Helper() + path := filepath.Join(dir, filename) + f, err := os.Create(path) + if err != nil { + tb.Fatal(err) + } + defer f.Close() + if _, err := io.Copy(f, io.LimitReader(newRepeatReader(line), int64(totalBytes))); err != nil { + tb.Fatal(err) + } + return path +} + +// cmdRunB runs a head command with AllowedPaths set to dir (bench variant). +// Uses testutil.RunScript which accepts testing.TB. +func cmdRunB(b *testing.B, script, dir string) (string, string, int) { + b.Helper() + return testutil.RunScript(b, script, dir, interp.AllowedPaths([]string{dir})) +} + +// BenchmarkHeadTenLines measures head -n 10 on a 10MB file of short lines. +func BenchmarkHeadTenLines(b *testing.B) { + dir := b.TempDir() + createLargeFile(b, dir, "input.txt", "the quick brown fox jumps over the lazy dog\n", 10<<20) + b.ResetTimer() + b.ReportAllocs() + for b.Loop() { + cmdRunB(b, "head -n 10 input.txt", dir) + } +} + +// BenchmarkHeadBytes measures head -c 1024 on a 10MB file. +func BenchmarkHeadBytes(b *testing.B) { + dir := b.TempDir() + createLargeFile(b, dir, "input.txt", "the quick brown fox jumps over the lazy dog\n", 10<<20) + b.ResetTimer() + b.ReportAllocs() + for b.Loop() { + cmdRunB(b, "head -c 1024 input.txt", dir) + } +} + +// BenchmarkHeadSingleLongLine measures head -n 1 on a 10MB file with one huge line. +func BenchmarkHeadSingleLongLine(b *testing.B) { + dir := b.TempDir() + // One 10MB line (no embedded newlines) + createLargeFile(b, dir, "input.txt", "x", 10<<20) + // Append a newline so it's a valid line + f, err := os.OpenFile(filepath.Join(dir, "input.txt"), os.O_APPEND|os.O_WRONLY, 0) + if err != nil { + b.Fatal(err) + } + _, _ = f.WriteString("\n") + f.Close() + b.ResetTimer() + b.ReportAllocs() + for b.Loop() { + cmdRunB(b, "head -n 1 input.txt", dir) + } +} + +// TestHeadMemoryBoundedLines asserts that head -n 10 uses O(1) memory +// regardless of input file size. 
+func TestHeadMemoryBoundedLines(t *testing.T) { + dir := t.TempDir() + createLargeFile(t, dir, "input.txt", "the quick brown fox jumps over the lazy dog\n", 10<<20) + + result := testing.Benchmark(func(b *testing.B) { + b.ReportAllocs() + for b.Loop() { + cmdRunB(b, "head -n 10 input.txt", dir) + } + }) + + const maxBytesPerOp = 1 << 20 // 1MB ceiling + if bpo := result.AllocedBytesPerOp(); bpo > maxBytesPerOp { + t.Errorf("head -n 10 allocated %d bytes/op on 10MB input; want < %d", bpo, maxBytesPerOp) + } +} + +// TestHeadMemoryBoundedBytes asserts that head -c 1024 uses O(1) memory. +func TestHeadMemoryBoundedBytes(t *testing.T) { + dir := t.TempDir() + createLargeFile(t, dir, "input.txt", "the quick brown fox jumps over the lazy dog\n", 10<<20) + + result := testing.Benchmark(func(b *testing.B) { + b.ReportAllocs() + for b.Loop() { + cmdRunB(b, "head -c 1024 input.txt", dir) + } + }) + + const maxBytesPerOp = 1 << 20 // 1MB ceiling + if bpo := result.AllocedBytesPerOp(); bpo > maxBytesPerOp { + t.Errorf("head -c 1024 allocated %d bytes/op on 10MB input; want < %d", bpo, maxBytesPerOp) + } +} diff --git a/interp/builtins/tail/tail_bench_test.go b/interp/builtins/tail/tail_bench_test.go new file mode 100644 index 00000000..e42b7c6a --- /dev/null +++ b/interp/builtins/tail/tail_bench_test.go @@ -0,0 +1,105 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2026-present Datadog, Inc. + +package tail_test + +import ( + "io" + "os" + "path/filepath" + "testing" + + "github.com/DataDog/rshell/interp" + "github.com/DataDog/rshell/interp/builtins/testutil" +) + +// repeatReaderTail yields a repeating line pattern indefinitely. 
+type repeatReaderTail struct { + line []byte + pos int +} + +func newRepeatReaderTail(line string) *repeatReaderTail { + return &repeatReaderTail{line: []byte(line)} +} + +func (r *repeatReaderTail) Read(p []byte) (int, error) { + n := 0 + for n < len(p) { + if r.pos >= len(r.line) { + r.pos = 0 + } + copied := copy(p[n:], r.line[r.pos:]) + r.pos += copied + n += copied + } + return n, nil +} + +// createLargeFileTail writes totalBytes of repeating content to dir/filename. +func createLargeFileTail(tb testing.TB, dir, filename, line string, totalBytes int) string { + tb.Helper() + path := filepath.Join(dir, filename) + f, err := os.Create(path) + if err != nil { + tb.Fatal(err) + } + defer f.Close() + if _, err := io.Copy(f, io.LimitReader(newRepeatReaderTail(line), int64(totalBytes))); err != nil { + tb.Fatal(err) + } + return path +} + +// cmdRunBTail runs a tail command with AllowedPaths set to dir (bench variant). +func cmdRunBTail(b *testing.B, script, dir string) (string, string, int) { + b.Helper() + return testutil.RunScript(b, script, dir, interp.AllowedPaths([]string{dir})) +} + +// BenchmarkTailTenLines measures tail -n 10 on a 10MB file. +func BenchmarkTailTenLines(b *testing.B) { + dir := b.TempDir() + createLargeFileTail(b, dir, "input.txt", "the quick brown fox jumps over the lazy dog\n", 10<<20) + b.ResetTimer() + b.ReportAllocs() + for b.Loop() { + cmdRunBTail(b, "tail -n 10 input.txt", dir) + } +} + +// BenchmarkTailBytes measures tail -c 1024 on a 10MB file. +func BenchmarkTailBytes(b *testing.B) { + dir := b.TempDir() + createLargeFileTail(b, dir, "input.txt", "the quick brown fox jumps over the lazy dog\n", 10<<20) + b.ResetTimer() + b.ReportAllocs() + for b.Loop() { + cmdRunBTail(b, "tail -c 1024 input.txt", dir) + } +} + +// TestTailMemoryBounded asserts that tail -n 10 allocation is bounded. +// Note: tail reads the whole file to find the last N lines, so total +// allocations are O(n), but live heap (the ring buffer) is O(K). 
+// This test checks that the ceiling doesn't grow unboundedly. +func TestTailMemoryBounded(t *testing.T) { + dir := t.TempDir() + createLargeFileTail(t, dir, "input.txt", "the quick brown fox jumps over the lazy dog\n", 10<<20) + + result := testing.Benchmark(func(b *testing.B) { + b.ReportAllocs() + for b.Loop() { + cmdRunBTail(b, "tail -n 10 input.txt", dir) + } + }) + + // tail reads line-by-line through a scanner; each line is allocated then + // discarded from the ring buffer — total allocs are O(n) but capped here. + const maxBytesPerOp = 32 << 20 // 32MB ceiling for a 10MB input + if bpo := result.AllocedBytesPerOp(); bpo > maxBytesPerOp { + t.Errorf("tail -n 10 allocated %d bytes/op on 10MB input; want < %d", bpo, maxBytesPerOp) + } +} diff --git a/interp/builtins/testutil/testutil.go b/interp/builtins/testutil/testutil.go index f262617c..3b982159 100644 --- a/interp/builtins/testutil/testutil.go +++ b/interp/builtins/testutil/testutil.go @@ -20,8 +20,9 @@ import ( ) // RunScriptCtx runs a shell script with a context and returns stdout, stderr, -// and the exit code. -func RunScriptCtx(ctx context.Context, t *testing.T, script, dir string, opts ...interp.RunnerOption) (string, string, int) { +// and the exit code. It accepts testing.TB so it can be used in both tests +// and benchmarks. +func RunScriptCtx(ctx context.Context, t testing.TB, script, dir string, opts ...interp.RunnerOption) (string, string, int) { t.Helper() parser := syntax.NewParser() prog, err := parser.Parse(strings.NewReader(script), "") @@ -51,7 +52,8 @@ func RunScriptCtx(ctx context.Context, t *testing.T, script, dir string, opts .. } // RunScript runs a shell script and returns stdout, stderr, and the exit code. -func RunScript(t *testing.T, script, dir string, opts ...interp.RunnerOption) (string, string, int) { +// It accepts testing.TB so it can be used in both tests and benchmarks. 
+func RunScript(t testing.TB, script, dir string, opts ...interp.RunnerOption) (string, string, int) { t.Helper() return RunScriptCtx(context.Background(), t, script, dir, opts...) } diff --git a/interp/builtins/wc/wc_bench_test.go b/interp/builtins/wc/wc_bench_test.go new file mode 100644 index 00000000..7da10518 --- /dev/null +++ b/interp/builtins/wc/wc_bench_test.go @@ -0,0 +1,100 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2026-present Datadog, Inc. + +package wc_test + +import ( + "io" + "os" + "path/filepath" + "testing" + + "github.com/DataDog/rshell/interp" + "github.com/DataDog/rshell/interp/builtins/testutil" +) + +// repeatReaderWc yields a repeating line pattern indefinitely. +type repeatReaderWc struct { + line []byte + pos int +} + +func newRepeatReaderWc(line string) *repeatReaderWc { + return &repeatReaderWc{line: []byte(line)} +} + +func (r *repeatReaderWc) Read(p []byte) (int, error) { + n := 0 + for n < len(p) { + if r.pos >= len(r.line) { + r.pos = 0 + } + copied := copy(p[n:], r.line[r.pos:]) + r.pos += copied + n += copied + } + return n, nil +} + +// createLargeFileWc writes totalBytes of repeating content to dir/filename. +func createLargeFileWc(tb testing.TB, dir, filename, line string, totalBytes int) string { + tb.Helper() + path := filepath.Join(dir, filename) + f, err := os.Create(path) + if err != nil { + tb.Fatal(err) + } + defer f.Close() + if _, err := io.Copy(f, io.LimitReader(newRepeatReaderWc(line), int64(totalBytes))); err != nil { + tb.Fatal(err) + } + return path +} + +// cmdRunBWc runs a wc command with AllowedPaths set to dir (bench variant). 
+func cmdRunBWc(b *testing.B, script, dir string) (string, string, int) { + b.Helper() + return testutil.RunScript(b, script, dir, interp.AllowedPaths([]string{dir})) +} + +// BenchmarkWcLines measures wc -l on a 10MB file. +func BenchmarkWcLines(b *testing.B) { + dir := b.TempDir() + createLargeFileWc(b, dir, "input.txt", "the quick brown fox jumps over the lazy dog\n", 10<<20) + b.ResetTimer() + b.ReportAllocs() + for b.Loop() { + cmdRunBWc(b, "wc -l input.txt", dir) + } +} + +// BenchmarkWcAll measures wc (all counts) on a 10MB file. +func BenchmarkWcAll(b *testing.B) { + dir := b.TempDir() + createLargeFileWc(b, dir, "input.txt", "the quick brown fox jumps over the lazy dog\n", 10<<20) + b.ResetTimer() + b.ReportAllocs() + for b.Loop() { + cmdRunBWc(b, "wc input.txt", dir) + } +} + +// TestWcMemoryBounded asserts that wc uses O(1) memory regardless of file size. +func TestWcMemoryBounded(t *testing.T) { + dir := t.TempDir() + createLargeFileWc(t, dir, "input.txt", "the quick brown fox jumps over the lazy dog\n", 10<<20) + + result := testing.Benchmark(func(b *testing.B) { + b.ReportAllocs() + for b.Loop() { + cmdRunBWc(b, "wc -l input.txt", dir) + } + }) + + const maxBytesPerOp = 1 << 20 // 1MB ceiling for a streaming counter + if bpo := result.AllocedBytesPerOp(); bpo > maxBytesPerOp { + t.Errorf("wc -l allocated %d bytes/op on 10MB input; want < %d", bpo, maxBytesPerOp) + } +} From f77a09ab359d59e440459d0b4bb9384b8e801988 Mon Sep 17 00:00:00 2001 From: Travis Thieman Date: Thu, 12 Mar 2026 12:05:23 -0400 Subject: [PATCH 2/8] Address review comments on memory benchmark PR - Move repeatReader to testutil.NewRepeatReader, eliminating four duplicate implementations across cat, head, tail, wc bench tests - Rename BenchmarkHeadSingleLongLine to BenchmarkHeadSingleLineNearCap and reduce line size from 10MB to 900KB (below MaxLineBytes=1MiB), so the benchmark exercises the successful large-line path instead of the error path (codex P2) - Fix 
BenchmarkHeadSingleLineNearCap: defer f.Close() with error checking instead of bare f.Close() (reviewer P3) - TestTailMemoryBounded: use 1MB input with 4MB ceiling instead of 10MB input with 32MB ceiling; add detailed comment explaining the O(n) allocation characteristic (reviewer P3) Co-Authored-By: Claude Sonnet 4.6 --- interp/builtins/cat/cat_bench_test.go | 25 +----------- interp/builtins/head/head_bench_test.go | 47 ++++++++--------------- interp/builtins/tail/tail_bench_test.go | 51 +++++++++---------------- interp/builtins/testutil/testutil.go | 28 ++++++++++++++ interp/builtins/wc/wc_bench_test.go | 25 +----------- 5 files changed, 65 insertions(+), 111 deletions(-) diff --git a/interp/builtins/cat/cat_bench_test.go b/interp/builtins/cat/cat_bench_test.go index a206e386..6baf62a4 100644 --- a/interp/builtins/cat/cat_bench_test.go +++ b/interp/builtins/cat/cat_bench_test.go @@ -15,29 +15,6 @@ import ( "github.com/DataDog/rshell/interp/builtins/testutil" ) -// repeatReaderCat yields a repeating line pattern indefinitely. -type repeatReaderCat struct { - line []byte - pos int -} - -func newRepeatReaderCat(line string) *repeatReaderCat { - return &repeatReaderCat{line: []byte(line)} -} - -func (r *repeatReaderCat) Read(p []byte) (int, error) { - n := 0 - for n < len(p) { - if r.pos >= len(r.line) { - r.pos = 0 - } - copied := copy(p[n:], r.line[r.pos:]) - r.pos += copied - n += copied - } - return n, nil -} - // createLargeFileCat writes totalBytes of repeating content to dir/filename. 
func createLargeFileCat(tb testing.TB, dir, filename, line string, totalBytes int) string { tb.Helper() @@ -47,7 +24,7 @@ func createLargeFileCat(tb testing.TB, dir, filename, line string, totalBytes in tb.Fatal(err) } defer f.Close() - if _, err := io.Copy(f, io.LimitReader(newRepeatReaderCat(line), int64(totalBytes))); err != nil { + if _, err := io.Copy(f, io.LimitReader(testutil.NewRepeatReader(line), int64(totalBytes))); err != nil { tb.Fatal(err) } return path diff --git a/interp/builtins/head/head_bench_test.go b/interp/builtins/head/head_bench_test.go index 17cfa37d..188cdd63 100644 --- a/interp/builtins/head/head_bench_test.go +++ b/interp/builtins/head/head_bench_test.go @@ -15,29 +15,6 @@ import ( "github.com/DataDog/rshell/interp/builtins/testutil" ) -// repeatReader yields a repeating line pattern indefinitely. -type repeatReader struct { - line []byte - pos int -} - -func newRepeatReader(line string) *repeatReader { - return &repeatReader{line: []byte(line)} -} - -func (r *repeatReader) Read(p []byte) (int, error) { - n := 0 - for n < len(p) { - if r.pos >= len(r.line) { - r.pos = 0 - } - copied := copy(p[n:], r.line[r.pos:]) - r.pos += copied - n += copied - } - return n, nil -} - // createLargeFile writes totalBytes of repeating line content to dir/filename. func createLargeFile(tb testing.TB, dir, filename, line string, totalBytes int) string { tb.Helper() @@ -47,7 +24,7 @@ func createLargeFile(tb testing.TB, dir, filename, line string, totalBytes int) tb.Fatal(err) } defer f.Close() - if _, err := io.Copy(f, io.LimitReader(newRepeatReader(line), int64(totalBytes))); err != nil { + if _, err := io.Copy(f, io.LimitReader(testutil.NewRepeatReader(line), int64(totalBytes))); err != nil { tb.Fatal(err) } return path @@ -82,18 +59,26 @@ func BenchmarkHeadBytes(b *testing.B) { } } -// BenchmarkHeadSingleLongLine measures head -n 1 on a 10MB file with one huge line. 
-func BenchmarkHeadSingleLongLine(b *testing.B) { +// BenchmarkHeadSingleLineNearCap measures head -n 1 on a file with one line +// just below MaxLineBytes (1MiB). Lines exceeding MaxLineBytes trigger an +// error path; this benchmark exercises the successful large-line path. +func BenchmarkHeadSingleLineNearCap(b *testing.B) { dir := b.TempDir() - // One 10MB line (no embedded newlines) - createLargeFile(b, dir, "input.txt", "x", 10<<20) - // Append a newline so it's a valid line + // 900KB line -- safely below MaxLineBytes (1MiB) so head succeeds. + createLargeFile(b, dir, "input.txt", "x", 900<<10) + // Append a newline to complete the line. f, err := os.OpenFile(filepath.Join(dir, "input.txt"), os.O_APPEND|os.O_WRONLY, 0) if err != nil { b.Fatal(err) } - _, _ = f.WriteString("\n") - f.Close() + defer func() { + if err := f.Close(); err != nil { + b.Errorf("close input.txt: %v", err) + } + }() + if _, err := f.WriteString("\n"); err != nil { + b.Fatal(err) + } b.ResetTimer() b.ReportAllocs() for b.Loop() { diff --git a/interp/builtins/tail/tail_bench_test.go b/interp/builtins/tail/tail_bench_test.go index e42b7c6a..8ba58d25 100644 --- a/interp/builtins/tail/tail_bench_test.go +++ b/interp/builtins/tail/tail_bench_test.go @@ -15,29 +15,6 @@ import ( "github.com/DataDog/rshell/interp/builtins/testutil" ) -// repeatReaderTail yields a repeating line pattern indefinitely. -type repeatReaderTail struct { - line []byte - pos int -} - -func newRepeatReaderTail(line string) *repeatReaderTail { - return &repeatReaderTail{line: []byte(line)} -} - -func (r *repeatReaderTail) Read(p []byte) (int, error) { - n := 0 - for n < len(p) { - if r.pos >= len(r.line) { - r.pos = 0 - } - copied := copy(p[n:], r.line[r.pos:]) - r.pos += copied - n += copied - } - return n, nil -} - // createLargeFileTail writes totalBytes of repeating content to dir/filename. 
func createLargeFileTail(tb testing.TB, dir, filename, line string, totalBytes int) string { tb.Helper() @@ -47,7 +24,7 @@ func createLargeFileTail(tb testing.TB, dir, filename, line string, totalBytes i tb.Fatal(err) } defer f.Close() - if _, err := io.Copy(f, io.LimitReader(newRepeatReaderTail(line), int64(totalBytes))); err != nil { + if _, err := io.Copy(f, io.LimitReader(testutil.NewRepeatReader(line), int64(totalBytes))); err != nil { tb.Fatal(err) } return path @@ -82,12 +59,23 @@ func BenchmarkTailBytes(b *testing.B) { } // TestTailMemoryBounded asserts that tail -n 10 allocation is bounded. -// Note: tail reads the whole file to find the last N lines, so total -// allocations are O(n), but live heap (the ring buffer) is O(K). -// This test checks that the ceiling doesn't grow unboundedly. +// +// tail must scan the entire input to find the last N lines, so total +// allocations are O(input size): one []byte copy per scanned line goes into +// the ring buffer, and old entries are evicted as new ones arrive. Live heap +// is O(K) (the ring size), but the GC has not necessarily freed evicted +// entries by the time AllocedBytesPerOp is sampled. +// +// With a 1MB input of 44-byte lines (~23 300 lines) the expected total +// allocation is roughly 1MB (one copy per line x line length). A 4MB ceiling +// allows 4x headroom for Go runtime and test-harness overhead while still +// catching regressions that accumulate all lines in memory. 
func TestTailMemoryBounded(t *testing.T) { + const line = "the quick brown fox jumps over the lazy dog\n" // 44 bytes + const inputSize = 1 << 20 // 1 MB -> ~23 300 lines + dir := t.TempDir() - createLargeFileTail(t, dir, "input.txt", "the quick brown fox jumps over the lazy dog\n", 10<<20) + createLargeFileTail(t, dir, "input.txt", line, inputSize) result := testing.Benchmark(func(b *testing.B) { b.ReportAllocs() @@ -96,10 +84,9 @@ func TestTailMemoryBounded(t *testing.T) { } }) - // tail reads line-by-line through a scanner; each line is allocated then - // discarded from the ring buffer — total allocs are O(n) but capped here. - const maxBytesPerOp = 32 << 20 // 32MB ceiling for a 10MB input + // 4MB ceiling for a 1MB input (4x multiplier for runtime/harness overhead). + const maxBytesPerOp = 4 << 20 if bpo := result.AllocedBytesPerOp(); bpo > maxBytesPerOp { - t.Errorf("tail -n 10 allocated %d bytes/op on 10MB input; want < %d", bpo, maxBytesPerOp) + t.Errorf("tail -n 10 allocated %d bytes/op on %d-byte input; want < %d", bpo, inputSize, maxBytesPerOp) } } diff --git a/interp/builtins/testutil/testutil.go b/interp/builtins/testutil/testutil.go index 3b982159..5f0a01d5 100644 --- a/interp/builtins/testutil/testutil.go +++ b/interp/builtins/testutil/testutil.go @@ -10,6 +10,7 @@ import ( "bytes" "context" "errors" + "io" "strings" "testing" @@ -19,6 +20,33 @@ import ( "github.com/DataDog/rshell/interp" ) +// repeatReader is an io.Reader that repeats a fixed line pattern indefinitely. +type repeatReader struct { + line []byte + pos int +} + +func (r *repeatReader) Read(p []byte) (int, error) { + n := 0 + for n < len(p) { + if r.pos >= len(r.line) { + r.pos = 0 + } + copied := copy(p[n:], r.line[r.pos:]) + r.pos += copied + n += copied + } + return n, nil +} + +// NewRepeatReader returns an io.Reader that yields the given line pattern +// indefinitely. Use io.LimitReader to cap the total bytes produced. 
+// It is intended for benchmark setup — generating large synthetic files +// without keeping the full content in memory. +func NewRepeatReader(line string) io.Reader { + return &repeatReader{line: []byte(line)} +} + // RunScriptCtx runs a shell script with a context and returns stdout, stderr, // and the exit code. It accepts testing.TB so it can be used in both tests // and benchmarks. diff --git a/interp/builtins/wc/wc_bench_test.go b/interp/builtins/wc/wc_bench_test.go index 7da10518..99bfc09e 100644 --- a/interp/builtins/wc/wc_bench_test.go +++ b/interp/builtins/wc/wc_bench_test.go @@ -15,29 +15,6 @@ import ( "github.com/DataDog/rshell/interp/builtins/testutil" ) -// repeatReaderWc yields a repeating line pattern indefinitely. -type repeatReaderWc struct { - line []byte - pos int -} - -func newRepeatReaderWc(line string) *repeatReaderWc { - return &repeatReaderWc{line: []byte(line)} -} - -func (r *repeatReaderWc) Read(p []byte) (int, error) { - n := 0 - for n < len(p) { - if r.pos >= len(r.line) { - r.pos = 0 - } - copied := copy(p[n:], r.line[r.pos:]) - r.pos += copied - n += copied - } - return n, nil -} - // createLargeFileWc writes totalBytes of repeating content to dir/filename. 
func createLargeFileWc(tb testing.TB, dir, filename, line string, totalBytes int) string { tb.Helper() @@ -47,7 +24,7 @@ func createLargeFileWc(tb testing.TB, dir, filename, line string, totalBytes int tb.Fatal(err) } defer f.Close() - if _, err := io.Copy(f, io.LimitReader(newRepeatReaderWc(line), int64(totalBytes))); err != nil { + if _, err := io.Copy(f, io.LimitReader(testutil.NewRepeatReader(line), int64(totalBytes))); err != nil { tb.Fatal(err) } return path From 5586f5c41db5c97d0787576dae5f7055d533e2a1 Mon Sep 17 00:00:00 2001 From: Travis Thieman Date: Thu, 12 Mar 2026 15:19:32 -0400 Subject: [PATCH 3/8] Expand memory benchmarks to all remaining builtins Adds *_bench_test.go for cut, grep, ls, strings, tr, and uniq, completing coverage of every file-processing builtin in the interpreter. | Command | Input | AllocedBytesPerOp | Ceiling | Notes | |---------|-------|-------------------|---------|-------| | grep -c | 10MB | ~11.5MB | 32MB | O(n) scanner strings, O(1) output | | uniq | 10MB | ~11.5MB | 32MB | O(n) scanner strings, O(1) live heap | | cut -b | 10MB | ~16.8MB | 48MB | O(n) scanner + proportional output | | cut -f | 1MB | ~5.5MB | 16MB | strings.Split per line (O(fields/line)) | | ls | 1000 entries | <1MB | 10MB | O(entries) to sort; fixed dir size | | strings | 1MB | ~3MB | 6MB | O(1) chunks; output buffering is O(n) | | tr | 1MB | ~3MB | 6MB | O(1) lookup table; output buffering O(n) | Each file follows the same pattern as head/cat/wc/tail: - BenchmarkXxx functions for go test -bench profiling - TestXxxMemoryBounded assertion tests that call testing.Benchmark() internally and assert AllocedBytesPerOp() stays under a ceiling Co-Authored-By: Claude Sonnet 4.6 --- interp/builtins/cut/cut_bench_test.go | 118 ++++++++++++++++++ interp/builtins/grep/grep_bench_test.go | 108 ++++++++++++++++ interp/builtins/ls/ls_bench_test.go | 92 ++++++++++++++ .../strings_cmd/strings_bench_test.go | 84 +++++++++++++ interp/builtins/tr/tr_bench_test.go | 98 
+++++++++++++++ interp/builtins/uniq/uniq_bench_test.go | 84 +++++++++++++ 6 files changed, 584 insertions(+) create mode 100644 interp/builtins/cut/cut_bench_test.go create mode 100644 interp/builtins/grep/grep_bench_test.go create mode 100644 interp/builtins/ls/ls_bench_test.go create mode 100644 interp/builtins/strings_cmd/strings_bench_test.go create mode 100644 interp/builtins/tr/tr_bench_test.go create mode 100644 interp/builtins/uniq/uniq_bench_test.go diff --git a/interp/builtins/cut/cut_bench_test.go b/interp/builtins/cut/cut_bench_test.go new file mode 100644 index 00000000..ad18ab16 --- /dev/null +++ b/interp/builtins/cut/cut_bench_test.go @@ -0,0 +1,118 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2026-present Datadog, Inc. + +package cut_test + +import ( + "io" + "os" + "path/filepath" + "testing" + + "github.com/DataDog/rshell/interp" + "github.com/DataDog/rshell/interp/builtins/testutil" +) + +func createLargeFileCut(tb testing.TB, dir, filename, line string, totalBytes int) string { + tb.Helper() + path := filepath.Join(dir, filename) + f, err := os.Create(path) + if err != nil { + tb.Fatal(err) + } + defer f.Close() + if _, err := io.Copy(f, io.LimitReader(testutil.NewRepeatReader(line), int64(totalBytes))); err != nil { + tb.Fatal(err) + } + return path +} + +func cmdRunBCut(b *testing.B, script, dir string) (string, string, int) { + b.Helper() + return testutil.RunScript(b, script, dir, interp.AllowedPaths([]string{dir})) +} + +// BenchmarkCutBytes measures cut -b 1-10 on a 10MB file of short lines. 
+func BenchmarkCutBytes(b *testing.B) {
+	dir := b.TempDir()
+	createLargeFileCut(b, dir, "input.txt", "the quick brown fox jumps over the lazy dog\n", 10<<20)
+	b.ResetTimer()
+	b.ReportAllocs()
+	for b.Loop() {
+		cmdRunBCut(b, "cut -b 1-10 input.txt", dir)
+	}
+}
+
+// BenchmarkCutFields measures cut -f 1 (default tab delimiter) on a 10MB file of short lines.
+func BenchmarkCutFields(b *testing.B) {
+	dir := b.TempDir()
+	// Tab-delimited: "field1\tfield2\tfield3"
+	createLargeFileCut(b, dir, "input.txt", "alpha\tbeta\tgamma\tdelta\n", 10<<20)
+	b.ResetTimer()
+	b.ReportAllocs()
+	for b.Loop() {
+		cmdRunBCut(b, "cut -f 1 input.txt", dir)
+	}
+}
+
+// BenchmarkCutFieldsMultiple measures cut selecting multiple fields on a 10MB file.
+func BenchmarkCutFieldsMultiple(b *testing.B) {
+	dir := b.TempDir()
+	createLargeFileCut(b, dir, "input.txt", "alpha\tbeta\tgamma\tdelta\n", 10<<20)
+	b.ResetTimer()
+	b.ReportAllocs()
+	for b.Loop() {
+		cmdRunBCut(b, "cut -f 1,3 input.txt", dir)
+	}
+}
+
+// TestCutMemoryBounded asserts that cut -b allocation is bounded relative to
+// input size. cut is a streaming command: it reads one line at a time (up to
+// MaxLineBytes = 1 MiB per line). Total allocations are O(input size) because
+// bufio.Scanner copies each line into a new buffer, but live heap stays O(1).
+//
+// With 10MB of 44-byte lines (~227k lines), scanning allocates ~10MB of line
+// data, plus output buffering for the 10-byte selections (~2.3MB). A 48MB
+// ceiling provides ~3x headroom over the observed ~16.8MB while still catching
+// regressions such as accumulating all lines before emitting.
+func TestCutMemoryBounded(t *testing.T) { + dir := t.TempDir() + createLargeFileCut(t, dir, "input.txt", "the quick brown fox jumps over the lazy dog\n", 10<<20) + + result := testing.Benchmark(func(b *testing.B) { + b.ReportAllocs() + for b.Loop() { + cmdRunBCut(b, "cut -b 1-10 input.txt", dir) + } + }) + + const maxBytesPerOp = 48 << 20 // 48MB ceiling (~3x observed ~16.8MB) + if bpo := result.AllocedBytesPerOp(); bpo > maxBytesPerOp { + t.Errorf("cut -b 1-10 allocated %d bytes/op on 10MB input; want < %d", bpo, maxBytesPerOp) + } +} + +// TestCutFieldsMemoryBounded asserts that cut -f allocation is bounded. +// Field mode calls strings.Split on each line, allocating a []string per line. +// This is O(input size) in total allocations. Using 1MB input keeps the +// expected allocation manageable (~5.5MB observed) while still validating +// that no additional unbounded growth occurs. +func TestCutFieldsMemoryBounded(t *testing.T) { + dir := t.TempDir() + // 1MB (not 10MB) because strings.Split allocates O(fields) per line. + createLargeFileCut(t, dir, "input.txt", "alpha\tbeta\tgamma\tdelta\n", 1<<20) + + result := testing.Benchmark(func(b *testing.B) { + b.ReportAllocs() + for b.Loop() { + cmdRunBCut(b, "cut -f 1 input.txt", dir) + } + }) + + const maxBytesPerOp = 16 << 20 // 16MB ceiling (~3x observed ~5.5MB on 1MB input) + if bpo := result.AllocedBytesPerOp(); bpo > maxBytesPerOp { + t.Errorf("cut -f 1 allocated %d bytes/op on 1MB input; want < %d", bpo, maxBytesPerOp) + } +} diff --git a/interp/builtins/grep/grep_bench_test.go b/interp/builtins/grep/grep_bench_test.go new file mode 100644 index 00000000..d79ba944 --- /dev/null +++ b/interp/builtins/grep/grep_bench_test.go @@ -0,0 +1,108 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2026-present Datadog, Inc. 
+ +package grep_test + +import ( + "io" + "os" + "path/filepath" + "testing" + + "github.com/DataDog/rshell/interp" + "github.com/DataDog/rshell/interp/builtins/testutil" +) + +func createLargeFileGrep(tb testing.TB, dir, filename, line string, totalBytes int) string { + tb.Helper() + path := filepath.Join(dir, filename) + f, err := os.Create(path) + if err != nil { + tb.Fatal(err) + } + defer f.Close() + if _, err := io.Copy(f, io.LimitReader(testutil.NewRepeatReader(line), int64(totalBytes))); err != nil { + tb.Fatal(err) + } + return path +} + +func cmdRunBGrep(b *testing.B, script, dir string) (string, string, int) { + b.Helper() + return testutil.RunScript(b, script, dir, interp.AllowedPaths([]string{dir})) +} + +// BenchmarkGrepMatch measures grep on a 10MB file where every line matches. +func BenchmarkGrepMatch(b *testing.B) { + dir := b.TempDir() + createLargeFileGrep(b, dir, "input.txt", "the quick brown fox jumps over the lazy dog\n", 10<<20) + b.ResetTimer() + b.ReportAllocs() + for b.Loop() { + cmdRunBGrep(b, "grep fox input.txt", dir) + } +} + +// BenchmarkGrepNoMatch measures grep on a 10MB file where no lines match. +func BenchmarkGrepNoMatch(b *testing.B) { + dir := b.TempDir() + createLargeFileGrep(b, dir, "input.txt", "the quick brown fox jumps over the lazy dog\n", 10<<20) + b.ResetTimer() + b.ReportAllocs() + for b.Loop() { + cmdRunBGrep(b, "grep NOMATCH input.txt", dir) + } +} + +// BenchmarkGrepFixedStrings measures grep -F on a 10MB file. +func BenchmarkGrepFixedStrings(b *testing.B) { + dir := b.TempDir() + createLargeFileGrep(b, dir, "input.txt", "the quick brown fox jumps over the lazy dog\n", 10<<20) + b.ResetTimer() + b.ReportAllocs() + for b.Loop() { + cmdRunBGrep(b, "grep -F fox input.txt", dir) + } +} + +// BenchmarkGrepCount measures grep -c on a 10MB file. 
+func BenchmarkGrepCount(b *testing.B) { + dir := b.TempDir() + createLargeFileGrep(b, dir, "input.txt", "the quick brown fox jumps over the lazy dog\n", 10<<20) + b.ResetTimer() + b.ReportAllocs() + for b.Loop() { + cmdRunBGrep(b, "grep -c fox input.txt", dir) + } +} + +// TestGrepMemoryBounded asserts that grep allocation is bounded relative to +// input size. grep is a streaming command: it reads one line at a time (up to +// MaxLineBytes = 1 MiB per line). Total allocations are O(input size) because +// bufio.Scanner.Text() allocates a new string per line, but live heap stays +// O(1). When every line matches, the output buffer also scales with input size; +// using -c avoids that second O(n) factor and isolates the scanner overhead. +// +// With 10MB of 44-byte lines (~227k lines) the scanner allocates roughly 1 +// string per line ≈ 10MB of string data. A 32MB ceiling allows 3x headroom for +// runtime, output buffering, and test-harness overhead while still catching +// regressions such as accidentally storing all lines in a slice. +func TestGrepMemoryBounded(t *testing.T) { + dir := t.TempDir() + createLargeFileGrep(t, dir, "input.txt", "the quick brown fox jumps over the lazy dog\n", 10<<20) + + result := testing.Benchmark(func(b *testing.B) { + b.ReportAllocs() + for b.Loop() { + // Use -c to avoid output scaling O(n) with match count. + cmdRunBGrep(b, "grep -c fox input.txt", dir) + } + }) + + const maxBytesPerOp = 32 << 20 // 32MB ceiling (~3x observed ~11.5MB) + if bpo := result.AllocedBytesPerOp(); bpo > maxBytesPerOp { + t.Errorf("grep -c allocated %d bytes/op on 10MB input; want < %d", bpo, maxBytesPerOp) + } +} diff --git a/interp/builtins/ls/ls_bench_test.go b/interp/builtins/ls/ls_bench_test.go new file mode 100644 index 00000000..365f8afb --- /dev/null +++ b/interp/builtins/ls/ls_bench_test.go @@ -0,0 +1,92 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. 
+// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2026-present Datadog, Inc. + +package ls_test + +import ( + "fmt" + "os" + "path/filepath" + "testing" + + "github.com/DataDog/rshell/interp" + "github.com/DataDog/rshell/interp/builtins/testutil" +) + +// createFileDir creates a directory containing n empty files named +// file0000.txt … fileNNNN.txt and returns the directory path. +func createFileDir(tb testing.TB, n int) string { + tb.Helper() + dir := tb.TempDir() + for i := range n { + name := filepath.Join(dir, fmt.Sprintf("file%04d.txt", i)) + f, err := os.Create(name) + if err != nil { + tb.Fatal(err) + } + f.Close() + } + return dir +} + +func cmdRunBLs(b *testing.B, script, dir string) (string, string, int) { + b.Helper() + return testutil.RunScript(b, script, dir, interp.AllowedPaths([]string{dir})) +} + +// BenchmarkLs measures ls on a directory with 1000 entries. +func BenchmarkLs(b *testing.B) { + dir := createFileDir(b, 1000) + b.ResetTimer() + b.ReportAllocs() + for b.Loop() { + cmdRunBLs(b, "ls .", dir) + } +} + +// BenchmarkLsLong measures ls -l on a directory with 1000 entries. +func BenchmarkLsLong(b *testing.B) { + dir := createFileDir(b, 1000) + b.ResetTimer() + b.ReportAllocs() + for b.Loop() { + cmdRunBLs(b, "ls -l .", dir) + } +} + +// BenchmarkLsSmallDir measures ls on a small directory (10 entries). +func BenchmarkLsSmallDir(b *testing.B) { + dir := createFileDir(b, 10) + b.ResetTimer() + b.ReportAllocs() + for b.Loop() { + cmdRunBLs(b, "ls .", dir) + } +} + +// TestLsMemoryBounded asserts that ls allocation scales linearly with the +// number of directory entries rather than diverging to pathological levels. +// ls must load all directory entries into memory to sort them (O(n) live heap), +// but should not buffer additional data beyond what os.ReadDir returns. 
+// +// With 1000 entries of ~12-byte names the expected allocation is roughly +// 1000 × (name string + FileInfo struct) ≈ a few hundred KB. A 10MB ceiling +// catches regressions that accidentally buffer full file contents or loop +// without bound. +func TestLsMemoryBounded(t *testing.T) { + dir := createFileDir(t, 1000) + + result := testing.Benchmark(func(b *testing.B) { + b.ReportAllocs() + for b.Loop() { + cmdRunBLs(b, "ls .", dir) + } + }) + + const maxBytesPerOp = 10 << 20 // 10MB ceiling for 1000-entry directory + if bpo := result.AllocedBytesPerOp(); bpo > maxBytesPerOp { + t.Errorf("ls allocated %d bytes/op on 1000-entry dir; want < %d", bpo, maxBytesPerOp) + } +} diff --git a/interp/builtins/strings_cmd/strings_bench_test.go b/interp/builtins/strings_cmd/strings_bench_test.go new file mode 100644 index 00000000..032eef60 --- /dev/null +++ b/interp/builtins/strings_cmd/strings_bench_test.go @@ -0,0 +1,84 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2026-present Datadog, Inc. 
+
+package strings_cmd_test
+
+import (
+	"io"
+	"os"
+	"path/filepath"
+	"testing"
+
+	"github.com/DataDog/rshell/interp"
+	"github.com/DataDog/rshell/interp/builtins/testutil"
+)
+
+func createLargeFileStrings(tb testing.TB, dir, filename, line string, totalBytes int) string {
+	tb.Helper()
+	path := filepath.Join(dir, filename)
+	f, err := os.Create(path)
+	if err != nil {
+		tb.Fatal(err)
+	}
+	defer f.Close()
+	if _, err := io.Copy(f, io.LimitReader(testutil.NewRepeatReader(line), int64(totalBytes))); err != nil {
+		tb.Fatal(err)
+	}
+	return path
+}
+
+func cmdRunBStrings(b *testing.B, script, dir string) (string, string, int) {
+	b.Helper()
+	return testutil.RunScript(b, script, dir, interp.AllowedPaths([]string{dir}))
+}
+
+// BenchmarkStrings measures strings on a 1MB file containing many short
+// printable sequences separated by null bytes. Each token is a 35-byte printable
+// string followed by a null byte (36 bytes total), producing ~29k strings.
+func BenchmarkStrings(b *testing.B) {
+	dir := b.TempDir()
+	// Mix of printable chars + null byte so strings emits many short tokens.
+	createLargeFileStrings(b, dir, "input.bin", "the quick brown fox jumps over lazy\x00", 1<<20)
+	b.ResetTimer()
+	b.ReportAllocs()
+	for b.Loop() {
+		cmdRunBStrings(b, "strings input.bin", dir)
+	}
+}
+
+// BenchmarkStringsPrintableOnly measures strings on a 1MB fully-printable file.
+// The entire file is one continuous printable run that reaches maxStringLen
+// (1 MiB cap), so only the first 1 MiB is emitted.
+func BenchmarkStringsPrintableOnly(b *testing.B) {
+	dir := b.TempDir()
+	createLargeFileStrings(b, dir, "input.txt", "abcdefghij", 1<<20)
+	b.ResetTimer()
+	b.ReportAllocs()
+	for b.Loop() {
+		cmdRunBStrings(b, "strings input.txt", dir)
+	}
+}
+
+// TestStringsMemoryBounded asserts that strings uses bounded memory regardless
+// of input size. strings reads in 32 KiB chunks and caps individual string
+// accumulation at maxStringLen (1 MiB).
With short printable sequences +// separated by non-printable bytes the current string buffer stays small. +// A 6MB ceiling allows for output buffering and test-harness overhead. +func TestStringsMemoryBounded(t *testing.T) { + dir := t.TempDir() + createLargeFileStrings(t, dir, "input.bin", "the quick brown fox jumps over lazy\x00", 1<<20) + + result := testing.Benchmark(func(b *testing.B) { + b.ReportAllocs() + for b.Loop() { + cmdRunBStrings(b, "strings input.bin", dir) + } + }) + + const maxBytesPerOp = 6 << 20 // 6MB ceiling for a 1MB input with output buffering + if bpo := result.AllocedBytesPerOp(); bpo > maxBytesPerOp { + t.Errorf("strings allocated %d bytes/op on 1MB input; want < %d", bpo, maxBytesPerOp) + } +} diff --git a/interp/builtins/tr/tr_bench_test.go b/interp/builtins/tr/tr_bench_test.go new file mode 100644 index 00000000..9f11aa0f --- /dev/null +++ b/interp/builtins/tr/tr_bench_test.go @@ -0,0 +1,98 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2026-present Datadog, Inc. + +package tr_test + +import ( + "io" + "os" + "path/filepath" + "testing" + + "github.com/DataDog/rshell/interp" + "github.com/DataDog/rshell/interp/builtins/testutil" +) + +func createLargeFileTr(tb testing.TB, dir, filename, line string, totalBytes int) string { + tb.Helper() + path := filepath.Join(dir, filename) + f, err := os.Create(path) + if err != nil { + tb.Fatal(err) + } + defer f.Close() + if _, err := io.Copy(f, io.LimitReader(testutil.NewRepeatReader(line), int64(totalBytes))); err != nil { + tb.Fatal(err) + } + return path +} + +func cmdRunBTr(b *testing.B, script, dir string) (string, string, int) { + b.Helper() + return testutil.RunScript(b, script, dir, interp.AllowedPaths([]string{dir})) +} + +// BenchmarkTrTranslate measures tr 'a-z' 'A-Z' on a 1MB file piped through tr. 
+// tr reads input from stdin in fixed 32 KiB chunks and translates byte-by-byte +// using a pre-built 256-entry lookup table. +func BenchmarkTrTranslate(b *testing.B) { + dir := b.TempDir() + createLargeFileTr(b, dir, "input.txt", "the quick brown fox jumps over the lazy dog\n", 1<<20) + b.ResetTimer() + b.ReportAllocs() + for b.Loop() { + cmdRunBTr(b, "cat input.txt | tr 'a-z' 'A-Z'", dir) + } +} + +// BenchmarkTrDelete measures tr -d on a 1MB file. +func BenchmarkTrDelete(b *testing.B) { + dir := b.TempDir() + createLargeFileTr(b, dir, "input.txt", "the quick brown fox jumps over the lazy dog\n", 1<<20) + b.ResetTimer() + b.ReportAllocs() + for b.Loop() { + cmdRunBTr(b, "cat input.txt | tr -d ' '", dir) + } +} + +// BenchmarkTrSqueeze measures tr -s on a 1MB file. +func BenchmarkTrSqueeze(b *testing.B) { + dir := b.TempDir() + createLargeFileTr(b, dir, "input.txt", "the quick brown fox jumps over the lazy dog\n", 1<<20) + b.ResetTimer() + b.ReportAllocs() + for b.Loop() { + cmdRunBTr(b, "cat input.txt | tr -s ' '", dir) + } +} + +// TestTrMemoryBounded asserts that tr uses O(1) memory regardless of input +// size. tr operates on a 256-entry lookup table built once at startup. Input +// is read in fixed 32 KiB chunks and translated in-place; no allocation is +// proportional to input length. Output is buffered through the test harness, +// so total allocations are O(input size) due to output buffering — using 1MB +// input keeps the ceiling manageable. +// +// With 1MB of input, output is also ~1MB; a 6MB ceiling provides ~3x headroom +// over the expected ~2–3MB (output buffer doublings + runtime overhead) while +// still catching regressions such as accumulating the entire translated output +// in a pre-allocated slice. 
+func TestTrMemoryBounded(t *testing.T) { + dir := t.TempDir() + createLargeFileTr(t, dir, "input.txt", "the quick brown fox jumps over the lazy dog\n", 1<<20) + + result := testing.Benchmark(func(b *testing.B) { + b.ReportAllocs() + for b.Loop() { + cmdRunBTr(b, "cat input.txt | tr 'a-z' 'A-Z'", dir) + } + }) + + const maxBytesPerOp = 6 << 20 // 6MB ceiling for a 1MB input with output buffering + if bpo := result.AllocedBytesPerOp(); bpo > maxBytesPerOp { + t.Errorf("tr allocated %d bytes/op on 1MB input; want < %d", bpo, maxBytesPerOp) + } +} diff --git a/interp/builtins/uniq/uniq_bench_test.go b/interp/builtins/uniq/uniq_bench_test.go new file mode 100644 index 00000000..a0babc21 --- /dev/null +++ b/interp/builtins/uniq/uniq_bench_test.go @@ -0,0 +1,84 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2026-present Datadog, Inc. + +package uniq_test + +import ( + "io" + "os" + "path/filepath" + "testing" + + "github.com/DataDog/rshell/interp" + "github.com/DataDog/rshell/interp/builtins/testutil" +) + +func createLargeFileUniq(tb testing.TB, dir, filename, line string, totalBytes int) string { + tb.Helper() + path := filepath.Join(dir, filename) + f, err := os.Create(path) + if err != nil { + tb.Fatal(err) + } + defer f.Close() + if _, err := io.Copy(f, io.LimitReader(testutil.NewRepeatReader(line), int64(totalBytes))); err != nil { + tb.Fatal(err) + } + return path +} + +func cmdRunBUniq(b *testing.B, script, dir string) (string, string, int) { + b.Helper() + return testutil.RunScript(b, script, dir, interp.AllowedPaths([]string{dir})) +} + +// BenchmarkUniq measures uniq on a 10MB file of identical lines (all deduplicated to one). 
+func BenchmarkUniq(b *testing.B) { + dir := b.TempDir() + createLargeFileUniq(b, dir, "input.txt", "the quick brown fox jumps over the lazy dog\n", 10<<20) + b.ResetTimer() + b.ReportAllocs() + for b.Loop() { + cmdRunBUniq(b, "uniq input.txt", dir) + } +} + +// BenchmarkUniqCount measures uniq -c on a 10MB file. +func BenchmarkUniqCount(b *testing.B) { + dir := b.TempDir() + createLargeFileUniq(b, dir, "input.txt", "the quick brown fox jumps over the lazy dog\n", 10<<20) + b.ResetTimer() + b.ReportAllocs() + for b.Loop() { + cmdRunBUniq(b, "uniq -c input.txt", dir) + } +} + +// TestUniqMemoryBounded asserts that uniq allocation is bounded relative to +// input size. uniq is a streaming command: only the current and previous lines +// are kept in memory at any time (live heap is O(1)), but total allocations are +// O(input size) because bufio.Scanner.Text() allocates a new string per line. +// +// With 10MB of 44-byte identical lines (~227k lines) the scanner allocates +// roughly one 44-byte string per line ≈ 10MB of string data total. Output is +// just a single deduplicated line (~44 bytes), so output buffering is trivial. +// A 32MB ceiling provides 3x headroom for runtime overhead while still catching +// regressions such as accumulating all lines in a slice before deduplicating. 
+func TestUniqMemoryBounded(t *testing.T) { + dir := t.TempDir() + createLargeFileUniq(t, dir, "input.txt", "the quick brown fox jumps over the lazy dog\n", 10<<20) + + result := testing.Benchmark(func(b *testing.B) { + b.ReportAllocs() + for b.Loop() { + cmdRunBUniq(b, "uniq input.txt", dir) + } + }) + + const maxBytesPerOp = 32 << 20 // 32MB ceiling (~3x observed ~11.5MB) + if bpo := result.AllocedBytesPerOp(); bpo > maxBytesPerOp { + t.Errorf("uniq allocated %d bytes/op on 10MB input; want < %d", bpo, maxBytesPerOp) + } +} From d2bef132188bff0eba09ba45cbd7cfb64312f96d Mon Sep 17 00:00:00 2001 From: Travis Thieman Date: Fri, 13 Mar 2026 11:17:17 -0400 Subject: [PATCH 4/8] Fix O(n) allocations in streaming builtins; assert <5MB on 10MB inputs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implementation changes: - grep: sc.Text() → sc.Bytes(), Match([]byte), FindAllIndex for -o flag, contextLine.text []byte, printMatchLine/printContextLine write bytes directly avoiding strings.Builder per match - uniq: sc.Text() → sc.Bytes() with reused prevLine/prevKey []byte buffers, compareKeyBytes/asciiToLowerBytes/skipFieldsBytesN byte-slice variants, writeLine uses two Write calls instead of string concatenation - cut: processBytes replaced strings.Builder with direct Stdout.Write of selected ranges; processFields replaced string(raw)+strings.Split with single-pass byte scanner calling inRanges per field - tail: ring buffer uses append(ring[i][:0], raw...) 
to reuse backing arrays after initial fill, eliminating per-line make([]byte)+copy - testutil: add RunScriptDiscard/RunScriptDiscardCtx helpers that route stdout to io.Discard for allocation-only measurement Test changes: - All Test*MemoryBounded tests updated to use RunScriptDiscard and assert AllocedBytesPerOp < 4MB on 10MB inputs (was: loose ceilings 16-48MB on 1-10MB inputs) - bytes.Equal added to import allowlist (used by uniq compareKeyBytes) Co-Authored-By: Claude Sonnet 4.6 --- interp/builtins/cat/cat_bench_test.go | 13 +- interp/builtins/cut/cut.go | 118 ++++++++++-------- interp/builtins/cut/cut_bench_test.go | 33 ++--- interp/builtins/grep/grep.go | 50 ++++---- interp/builtins/grep/grep_bench_test.go | 22 ++-- .../strings_cmd/strings_bench_test.go | 11 +- interp/builtins/tail/tail.go | 6 +- interp/builtins/tail/tail_bench_test.go | 20 +-- interp/builtins/testutil/testutil.go | 38 ++++++ interp/builtins/tr/tr_bench_test.go | 17 +-- interp/builtins/uniq/uniq.go | 115 +++++++++-------- interp/builtins/uniq/uniq_bench_test.go | 18 +-- tests/allowed_symbols_test.go | 2 + 13 files changed, 243 insertions(+), 220 deletions(-) diff --git a/interp/builtins/cat/cat_bench_test.go b/interp/builtins/cat/cat_bench_test.go index 6baf62a4..f1bedc6e 100644 --- a/interp/builtins/cat/cat_bench_test.go +++ b/interp/builtins/cat/cat_bench_test.go @@ -58,22 +58,21 @@ func BenchmarkCatNumbered(b *testing.B) { } } -// TestCatMemoryBounded asserts that cat's allocation scales reasonably with -// file size (output buffering is expected, but should not be pathological). +// TestCatMemoryBounded asserts that cat uses O(1) memory regardless of input +// size. cat streams input to output in fixed chunks with no per-line allocation. 
func TestCatMemoryBounded(t *testing.T) { dir := t.TempDir() - createLargeFileCat(t, dir, "input.txt", "the quick brown fox jumps over the lazy dog\n", 1<<20) + createLargeFileCat(t, dir, "input.txt", "the quick brown fox jumps over the lazy dog\n", 10<<20) result := testing.Benchmark(func(b *testing.B) { b.ReportAllocs() for b.Loop() { - cmdRunBCat(b, "cat input.txt", dir) + testutil.RunScriptDiscard(b, "cat input.txt", dir, interp.AllowedPaths([]string{dir})) } }) - // cat buffers output through the test harness, so we allow up to 6x file size - const maxBytesPerOp = 6 << 20 // 6MB ceiling for a 1MB input + const maxBytesPerOp = 4 << 20 if bpo := result.AllocedBytesPerOp(); bpo > maxBytesPerOp { - t.Errorf("cat allocated %d bytes/op on 1MB input; want < %d", bpo, maxBytesPerOp) + t.Errorf("cat allocated %d bytes/op on 10MB input; want < %d", bpo, maxBytesPerOp) } } diff --git a/interp/builtins/cut/cut.go b/interp/builtins/cut/cut.go index a27ad0cc..1d05e2b7 100644 --- a/interp/builtins/cut/cut.go +++ b/interp/builtins/cut/cut.go @@ -392,30 +392,46 @@ func processBytes(callCtx *builtins.CallContext, raw []byte, cfg *cutConfig) { if cfg.outDelimSet { processBytesComplementWithOutDelim(callCtx, raw, cfg) } else { - var sb strings.Builder + start := -1 for i := range n { - pos := i + 1 - if !inRanges(pos, cfg.ranges) { - sb.WriteByte(raw[i]) + if !inRanges(i+1, cfg.ranges) { + if start < 0 { + start = i + } + } else { + if start >= 0 { + callCtx.Stdout.Write(raw[start:i]) //nolint:errcheck + start = -1 + } } } - callCtx.Out(sb.String()) + if start >= 0 { + callCtx.Stdout.Write(raw[start:]) //nolint:errcheck + } } } else { if cfg.outDelimSet { processBytesWithOutDelim(callCtx, raw, cfg) } else { - var sb strings.Builder + start := -1 for i := range n { - pos := i + 1 - if inRanges(pos, cfg.ranges) { - sb.WriteByte(raw[i]) + if inRanges(i+1, cfg.ranges) { + if start < 0 { + start = i + } + } else { + if start >= 0 { + callCtx.Stdout.Write(raw[start:i]) 
//nolint:errcheck + start = -1 + } } } - callCtx.Out(sb.String()) + if start >= 0 { + callCtx.Stdout.Write(raw[start:]) //nolint:errcheck + } } } - callCtx.Out("\n") + callCtx.Stdout.Write([]byte{'\n'}) //nolint:errcheck } // processBytesWithOutDelim outputs selected byte ranges with the output @@ -455,56 +471,58 @@ func processBytesComplementWithOutDelim(callCtx *builtins.CallContext, raw []byt // processFields selects fields from a line. func processFields(callCtx *builtins.CallContext, raw []byte, cfg *cutConfig) { - line := string(raw) - delimStr := string(cfg.delimByte) - - // Check if line contains the delimiter. - if strings.IndexByte(line, cfg.delimByte) < 0 { + hasDelim := false + for _, b := range raw { + if b == cfg.delimByte { + hasDelim = true + break + } + } + if !hasDelim { if cfg.onlyDelimited { - return // suppress line + return } - // No delimiter: print the whole line + newline. - callCtx.Out(line) - callCtx.Out("\n") + callCtx.Stdout.Write(raw) //nolint:errcheck + callCtx.Stdout.Write([]byte{'\n'}) //nolint:errcheck return } - fields := strings.Split(line, delimStr) - nFields := len(fields) + nFields := 1 + for _, b := range raw { + if b == cfg.delimByte { + nFields++ + } + } + + fieldIdx := 0 + fieldStart := 0 + firstOutput := true - // Determine which fields to select. 
- var selected []int - if cfg.complement { - compRanges := complementRanges(cfg.ranges, nFields) - for _, r := range compRanges { - for i := r[0]; i <= r[1] && i <= nFields; i++ { - selected = append(selected, i) - } + for i := 0; i <= len(raw); i++ { + if i < len(raw) && raw[i] != cfg.delimByte { + continue } - } else { - for _, r := range cfg.ranges { - start := r[0] - end := r[1] - if start > nFields { - break - } - if end > nFields { - end = nFields - } - for i := start; i <= end; i++ { - selected = append(selected, i) - } + fieldIdx++ + fieldNum := fieldIdx + + selected := false + if cfg.complement { + selected = !inRanges(fieldNum, cfg.ranges) + } else { + selected = inRanges(fieldNum, cfg.ranges) } - } - // Output selected fields joined by the output delimiter. - for i, idx := range selected { - if i > 0 { - callCtx.Out(cfg.outDelim) + if selected { + if !firstOutput { + callCtx.Out(cfg.outDelim) + } + callCtx.Stdout.Write(raw[fieldStart:i]) //nolint:errcheck + firstOutput = false } - callCtx.Out(fields[idx-1]) + + fieldStart = i + 1 } - callCtx.Out("\n") + callCtx.Stdout.Write([]byte{'\n'}) //nolint:errcheck } // complementRanges returns the complement of the given sorted, merged ranges diff --git a/interp/builtins/cut/cut_bench_test.go b/interp/builtins/cut/cut_bench_test.go index ad18ab16..41346c75 100644 --- a/interp/builtins/cut/cut_bench_test.go +++ b/interp/builtins/cut/cut_bench_test.go @@ -68,15 +68,9 @@ func BenchmarkCutFieldsMultiple(b *testing.B) { } } -// TestCutMemoryBounded asserts that cut -b allocation is bounded relative to -// input size. cut is a streaming command: it reads one line at a time (up to -// MaxLineBytes = 1 MiB per line). Total allocations are O(input size) because -// bufio.Scanner copies each line into a new buffer, but live heap stays O(1). -// -// With 10MB of 44-byte lines (~227k lines), scanning allocates ~10MB of line -// data, plus output buffering for the 10-byte selections (~2.3MB). 
A 48MB -// ceiling provides ~3x headroom over the observed ~16.8MB while still catching -// regressions such as accumulating all lines before emitting. +// TestCutMemoryBounded asserts that cut -b uses O(1) memory regardless of +// input size. cut is a streaming command that writes selected byte ranges +// directly to Stdout with no per-line string allocation. func TestCutMemoryBounded(t *testing.T) { dir := t.TempDir() createLargeFileCut(t, dir, "input.txt", "the quick brown fox jumps over the lazy dog\n", 10<<20) @@ -84,35 +78,32 @@ func TestCutMemoryBounded(t *testing.T) { result := testing.Benchmark(func(b *testing.B) { b.ReportAllocs() for b.Loop() { - cmdRunBCut(b, "cut -b 1-10 input.txt", dir) + testutil.RunScriptDiscard(b, "cut -b 1-10 input.txt", dir, interp.AllowedPaths([]string{dir})) } }) - const maxBytesPerOp = 48 << 20 // 48MB ceiling (~3x observed ~16.8MB) + const maxBytesPerOp = 4 << 20 if bpo := result.AllocedBytesPerOp(); bpo > maxBytesPerOp { t.Errorf("cut -b 1-10 allocated %d bytes/op on 10MB input; want < %d", bpo, maxBytesPerOp) } } -// TestCutFieldsMemoryBounded asserts that cut -f allocation is bounded. -// Field mode calls strings.Split on each line, allocating a []string per line. -// This is O(input size) in total allocations. Using 1MB input keeps the -// expected allocation manageable (~5.5MB observed) while still validating -// that no additional unbounded growth occurs. +// TestCutFieldsMemoryBounded asserts that cut -f uses O(1) memory regardless +// of input size. Field mode scans raw bytes for the delimiter without +// converting to string or allocating a []string per line. func TestCutFieldsMemoryBounded(t *testing.T) { dir := t.TempDir() - // 1MB (not 10MB) because strings.Split allocates O(fields) per line. 
- createLargeFileCut(t, dir, "input.txt", "alpha\tbeta\tgamma\tdelta\n", 1<<20) + createLargeFileCut(t, dir, "input.txt", "alpha\tbeta\tgamma\tdelta\n", 10<<20) result := testing.Benchmark(func(b *testing.B) { b.ReportAllocs() for b.Loop() { - cmdRunBCut(b, "cut -f 1 input.txt", dir) + testutil.RunScriptDiscard(b, "cut -f 1 input.txt", dir, interp.AllowedPaths([]string{dir})) } }) - const maxBytesPerOp = 16 << 20 // 16MB ceiling (~3x observed ~5.5MB on 1MB input) + const maxBytesPerOp = 4 << 20 if bpo := result.AllocedBytesPerOp(); bpo > maxBytesPerOp { - t.Errorf("cut -f 1 allocated %d bytes/op on 1MB input; want < %d", bpo, maxBytesPerOp) + t.Errorf("cut -f 1 allocated %d bytes/op on 10MB input; want < %d", bpo, maxBytesPerOp) } } diff --git a/interp/builtins/grep/grep.go b/interp/builtins/grep/grep.go index cb4622e1..525f366a 100644 --- a/interp/builtins/grep/grep.go +++ b/interp/builtins/grep/grep.go @@ -562,9 +562,9 @@ func grepFile(ctx context.Context, callCtx *builtins.CallContext, file string, o return matchCount > 0, ctx.Err() } lineNum++ - line := sc.Text() + lineBytes := sc.Bytes() - matched := opts.re.MatchString(line) + matched := opts.re.Match(lineBytes) if opts.invertMatch { matched = !matched } @@ -606,15 +606,15 @@ func grepFile(ctx context.Context, callCtx *builtins.CallContext, file string, o // -o -v: line was selected by inversion (doesn't contain // pattern), so there are no matching parts to print. 
} else if opts.onlyMatching { - matches := opts.re.FindAllString(line, -1) - for _, m := range matches { - if m == "" { + indices := opts.re.FindAllIndex(lineBytes, -1) + for _, idx := range indices { + if idx[0] == idx[1] { continue // suppress empty matches (GNU grep behavior) } - printMatchLine(callCtx, displayName, lineNum, m, opts) + printMatchLine(callCtx, displayName, lineNum, lineBytes[idx[0]:idx[1]], opts) } } else { - printMatchLine(callCtx, displayName, lineNum, line, opts) + printMatchLine(callCtx, displayName, lineNum, lineBytes, opts) } lastPrintedLine = lineNum printedSeparator = true @@ -625,7 +625,7 @@ func grepFile(ctx context.Context, callCtx *builtins.CallContext, file string, o } else { // Non-matching line: might be after-context or before-context. if afterRemaining > 0 && !opts.quiet && !opts.count && !opts.filesWithMatches && !opts.filesWithoutMatch { - printContextLine(callCtx, displayName, lineNum, line, opts, '-') + printContextLine(callCtx, displayName, lineNum, lineBytes, opts, '-') lastPrintedLine = lineNum afterRemaining-- } @@ -635,7 +635,9 @@ func grepFile(ctx context.Context, callCtx *builtins.CallContext, file string, o if len(beforeBuf) >= opts.beforeContext { beforeBuf = beforeBuf[1:] } - beforeBuf = append(beforeBuf, contextLine{num: lineNum, text: line}) + cp := make([]byte, len(lineBytes)) + copy(cp, lineBytes) + beforeBuf = append(beforeBuf, contextLine{num: lineNum, text: cp}) } } } @@ -664,31 +666,31 @@ func grepFile(ctx context.Context, callCtx *builtins.CallContext, file string, o type contextLine struct { num int - text string + text []byte } -func printMatchLine(callCtx *builtins.CallContext, filename string, lineNum int, line string, opts *grepOpts) { - var prefix strings.Builder +func printMatchLine(callCtx *builtins.CallContext, filename string, lineNum int, line []byte, opts *grepOpts) { if opts.showFilename { - prefix.WriteString(filename) - prefix.WriteByte(':') + callCtx.Stdout.Write([]byte(filename)) 
//nolint:errcheck + callCtx.Stdout.Write([]byte{':'}) //nolint:errcheck } if opts.lineNumber { - prefix.WriteString(strconv.Itoa(lineNum)) - prefix.WriteByte(':') + callCtx.Stdout.Write([]byte(strconv.Itoa(lineNum))) //nolint:errcheck + callCtx.Stdout.Write([]byte{':'}) //nolint:errcheck } - callCtx.Outf("%s%s\n", prefix.String(), line) + callCtx.Stdout.Write(line) //nolint:errcheck + callCtx.Stdout.Write([]byte{'\n'}) //nolint:errcheck } -func printContextLine(callCtx *builtins.CallContext, filename string, lineNum int, line string, opts *grepOpts, sep byte) { - var prefix strings.Builder +func printContextLine(callCtx *builtins.CallContext, filename string, lineNum int, line []byte, opts *grepOpts, sep byte) { if opts.showFilename { - prefix.WriteString(filename) - prefix.WriteByte(sep) + callCtx.Stdout.Write([]byte(filename)) //nolint:errcheck + callCtx.Stdout.Write([]byte{sep}) //nolint:errcheck } if opts.lineNumber { - prefix.WriteString(strconv.Itoa(lineNum)) - prefix.WriteByte(sep) + callCtx.Stdout.Write([]byte(strconv.Itoa(lineNum))) //nolint:errcheck + callCtx.Stdout.Write([]byte{sep}) //nolint:errcheck } - callCtx.Outf("%s%s\n", prefix.String(), line) + callCtx.Stdout.Write(line) //nolint:errcheck + callCtx.Stdout.Write([]byte{'\n'}) //nolint:errcheck } diff --git a/interp/builtins/grep/grep_bench_test.go b/interp/builtins/grep/grep_bench_test.go index d79ba944..916bc03a 100644 --- a/interp/builtins/grep/grep_bench_test.go +++ b/interp/builtins/grep/grep_bench_test.go @@ -78,17 +78,10 @@ func BenchmarkGrepCount(b *testing.B) { } } -// TestGrepMemoryBounded asserts that grep allocation is bounded relative to -// input size. grep is a streaming command: it reads one line at a time (up to -// MaxLineBytes = 1 MiB per line). Total allocations are O(input size) because -// bufio.Scanner.Text() allocates a new string per line, but live heap stays -// O(1). 
When every line matches, the output buffer also scales with input size; -// using -c avoids that second O(n) factor and isolates the scanner overhead. -// -// With 10MB of 44-byte lines (~227k lines) the scanner allocates roughly 1 -// string per line ≈ 10MB of string data. A 32MB ceiling allows 3x headroom for -// runtime, output buffering, and test-harness overhead while still catching -// regressions such as accidentally storing all lines in a slice. +// TestGrepMemoryBounded asserts that grep uses O(1) memory when processing +// large files. grep is a streaming command that reads one line at a time via +// sc.Bytes() (no per-line string allocation). Total allocations are dominated +// by the shell/runner overhead, not input size. func TestGrepMemoryBounded(t *testing.T) { dir := t.TempDir() createLargeFileGrep(t, dir, "input.txt", "the quick brown fox jumps over the lazy dog\n", 10<<20) @@ -96,13 +89,12 @@ func TestGrepMemoryBounded(t *testing.T) { result := testing.Benchmark(func(b *testing.B) { b.ReportAllocs() for b.Loop() { - // Use -c to avoid output scaling O(n) with match count. 
- cmdRunBGrep(b, "grep -c fox input.txt", dir) + testutil.RunScriptDiscard(b, "grep fox input.txt", dir, interp.AllowedPaths([]string{dir})) } }) - const maxBytesPerOp = 32 << 20 // 32MB ceiling (~3x observed ~11.5MB) + const maxBytesPerOp = 4 << 20 if bpo := result.AllocedBytesPerOp(); bpo > maxBytesPerOp { - t.Errorf("grep -c allocated %d bytes/op on 10MB input; want < %d", bpo, maxBytesPerOp) + t.Errorf("grep allocated %d bytes/op on 10MB input; want < %d", bpo, maxBytesPerOp) } } diff --git a/interp/builtins/strings_cmd/strings_bench_test.go b/interp/builtins/strings_cmd/strings_bench_test.go index 032eef60..81d4c10a 100644 --- a/interp/builtins/strings_cmd/strings_bench_test.go +++ b/interp/builtins/strings_cmd/strings_bench_test.go @@ -61,24 +61,23 @@ func BenchmarkStringsPrintableOnly(b *testing.B) { } } -// TestStringsMemoryBounded asserts that strings uses bounded memory regardless +// TestStringsMemoryBounded asserts that strings uses O(1) memory regardless // of input size. strings reads in 32 KiB chunks and caps individual string // accumulation at maxStringLen (1 MiB). With short printable sequences // separated by non-printable bytes the current string buffer stays small. -// A 6MB ceiling allows for output buffering and test-harness overhead. 
func TestStringsMemoryBounded(t *testing.T) { dir := t.TempDir() - createLargeFileStrings(t, dir, "input.bin", "the quick brown fox jumps over lazy\x00", 1<<20) + createLargeFileStrings(t, dir, "input.bin", "the quick brown fox jumps over lazy\x00", 10<<20) result := testing.Benchmark(func(b *testing.B) { b.ReportAllocs() for b.Loop() { - cmdRunBStrings(b, "strings input.bin", dir) + testutil.RunScriptDiscard(b, "strings input.bin", dir, interp.AllowedPaths([]string{dir})) } }) - const maxBytesPerOp = 6 << 20 // 6MB ceiling for a 1MB input with output buffering + const maxBytesPerOp = 4 << 20 if bpo := result.AllocedBytesPerOp(); bpo > maxBytesPerOp { - t.Errorf("strings allocated %d bytes/op on 1MB input; want < %d", bpo, maxBytesPerOp) + t.Errorf("strings allocated %d bytes/op on 10MB input; want < %d", bpo, maxBytesPerOp) } } diff --git a/interp/builtins/tail/tail.go b/interp/builtins/tail/tail.go index 35d231e3..878ef1db 100644 --- a/interp/builtins/tail/tail.go +++ b/interp/builtins/tail/tail.go @@ -320,8 +320,6 @@ func readLastLines(ctx context.Context, callCtx *builtins.CallContext, r io.Read if !isRegularFile && totalRead > MaxTotalReadBytes { return errors.New("input too large: read limit exceeded") } - cp := make([]byte, len(raw)) - copy(cp, raw) // When the ring is full, evict the oldest entry before writing. if ringCount == ringSize { // If count exceeds the ring capacity, we cannot deliver the full @@ -331,8 +329,8 @@ func readLastLines(ctx context.Context, callCtx *builtins.CallContext, r io.Read } ringBytes -= int64(len(ring[ringHead])) } - ring[ringHead] = cp - ringBytes += int64(len(cp)) + ring[ringHead] = append(ring[ringHead][:0], raw...) 
+ ringBytes += int64(len(ring[ringHead])) if ringBytes > MaxRingBytes { return errors.New("input too large: ring buffer memory limit exceeded") } diff --git a/interp/builtins/tail/tail_bench_test.go b/interp/builtins/tail/tail_bench_test.go index 8ba58d25..fefc61ee 100644 --- a/interp/builtins/tail/tail_bench_test.go +++ b/interp/builtins/tail/tail_bench_test.go @@ -58,21 +58,12 @@ func BenchmarkTailBytes(b *testing.B) { } } -// TestTailMemoryBounded asserts that tail -n 10 allocation is bounded. -// -// tail must scan the entire input to find the last N lines, so total -// allocations are O(input size): one []byte copy per scanned line goes into -// the ring buffer, and old entries are evicted as new ones arrive. Live heap -// is O(K) (the ring size), but the GC has not necessarily freed evicted -// entries by the time AllocedBytesPerOp is sampled. -// -// With a 1MB input of 44-byte lines (~23 300 lines) the expected total -// allocation is roughly 1MB (one copy per line x line length). A 4MB ceiling -// allows 4x headroom for Go runtime and test-harness overhead while still -// catching regressions that accumulate all lines in memory. +// TestTailMemoryBounded asserts that tail -n 10 uses O(1) memory regardless of +// input size. The ring buffer slots are reused via append(slot[:0], raw...), +// so no per-line allocation occurs after the first pass fills the ring. 
func TestTailMemoryBounded(t *testing.T) { const line = "the quick brown fox jumps over the lazy dog\n" // 44 bytes - const inputSize = 1 << 20 // 1 MB -> ~23 300 lines + const inputSize = 10 << 20 // 10 MB dir := t.TempDir() createLargeFileTail(t, dir, "input.txt", line, inputSize) @@ -80,11 +71,10 @@ func TestTailMemoryBounded(t *testing.T) { result := testing.Benchmark(func(b *testing.B) { b.ReportAllocs() for b.Loop() { - cmdRunBTail(b, "tail -n 10 input.txt", dir) + testutil.RunScriptDiscard(b, "tail -n 10 input.txt", dir, interp.AllowedPaths([]string{dir})) } }) - // 4MB ceiling for a 1MB input (4x multiplier for runtime/harness overhead). const maxBytesPerOp = 4 << 20 if bpo := result.AllocedBytesPerOp(); bpo > maxBytesPerOp { t.Errorf("tail -n 10 allocated %d bytes/op on %d-byte input; want < %d", bpo, inputSize, maxBytesPerOp) diff --git a/interp/builtins/testutil/testutil.go b/interp/builtins/testutil/testutil.go index 5f0a01d5..bed6dead 100644 --- a/interp/builtins/testutil/testutil.go +++ b/interp/builtins/testutil/testutil.go @@ -85,3 +85,41 @@ func RunScript(t testing.TB, script, dir string, opts ...interp.RunnerOption) (s t.Helper() return RunScriptCtx(context.Background(), t, script, dir, opts...) } + +// RunScriptDiscard runs a shell script and returns stderr and the exit code. +// Stdout is discarded (io.Discard). Use this in memory-allocation tests to +// prevent output buffering from dominating the AllocedBytesPerOp measurement. +func RunScriptDiscard(t testing.TB, script, dir string, opts ...interp.RunnerOption) (string, int) { + t.Helper() + return RunScriptDiscardCtx(context.Background(), t, script, dir, opts...) +} + +// RunScriptDiscardCtx is RunScriptDiscard with an explicit context. 
+func RunScriptDiscardCtx(ctx context.Context, t testing.TB, script, dir string, opts ...interp.RunnerOption) (string, int) { + t.Helper() + parser := syntax.NewParser() + prog, err := parser.Parse(strings.NewReader(script), "") + require.NoError(t, err) + + var errBuf bytes.Buffer + allOpts := append([]interp.RunnerOption{interp.StdIO(nil, io.Discard, &errBuf)}, opts...) + runner, err := interp.New(allOpts...) + require.NoError(t, err) + defer runner.Close() + + if dir != "" { + runner.Dir = dir + } + + err = runner.Run(ctx, prog) + exitCode := 0 + if err != nil { + var es interp.ExitStatus + if errors.As(err, &es) { + exitCode = int(es) + } else if ctx.Err() == nil { + t.Fatalf("unexpected error: %v", err) + } + } + return errBuf.String(), exitCode +} diff --git a/interp/builtins/tr/tr_bench_test.go b/interp/builtins/tr/tr_bench_test.go index 9f11aa0f..cc88c25d 100644 --- a/interp/builtins/tr/tr_bench_test.go +++ b/interp/builtins/tr/tr_bench_test.go @@ -72,27 +72,20 @@ func BenchmarkTrSqueeze(b *testing.B) { // TestTrMemoryBounded asserts that tr uses O(1) memory regardless of input // size. tr operates on a 256-entry lookup table built once at startup. Input // is read in fixed 32 KiB chunks and translated in-place; no allocation is -// proportional to input length. Output is buffered through the test harness, -// so total allocations are O(input size) due to output buffering — using 1MB -// input keeps the ceiling manageable. -// -// With 1MB of input, output is also ~1MB; a 6MB ceiling provides ~3x headroom -// over the expected ~2–3MB (output buffer doublings + runtime overhead) while -// still catching regressions such as accumulating the entire translated output -// in a pre-allocated slice. +// proportional to input length. 
func TestTrMemoryBounded(t *testing.T) { dir := t.TempDir() - createLargeFileTr(t, dir, "input.txt", "the quick brown fox jumps over the lazy dog\n", 1<<20) + createLargeFileTr(t, dir, "input.txt", "the quick brown fox jumps over the lazy dog\n", 10<<20) result := testing.Benchmark(func(b *testing.B) { b.ReportAllocs() for b.Loop() { - cmdRunBTr(b, "cat input.txt | tr 'a-z' 'A-Z'", dir) + testutil.RunScriptDiscard(b, "cat input.txt | tr 'a-z' 'A-Z'", dir, interp.AllowedPaths([]string{dir})) } }) - const maxBytesPerOp = 6 << 20 // 6MB ceiling for a 1MB input with output buffering + const maxBytesPerOp = 4 << 20 if bpo := result.AllocedBytesPerOp(); bpo > maxBytesPerOp { - t.Errorf("tr allocated %d bytes/op on 1MB input; want < %d", bpo, maxBytesPerOp) + t.Errorf("tr allocated %d bytes/op on 10MB input; want < %d", bpo, maxBytesPerOp) } } diff --git a/interp/builtins/uniq/uniq.go b/interp/builtins/uniq/uniq.go index 4b44598a..6cabbaf9 100644 --- a/interp/builtins/uniq/uniq.go +++ b/interp/builtins/uniq/uniq.go @@ -73,6 +73,7 @@ package uniq import ( "bufio" + "bytes" "context" "io" "math" @@ -282,7 +283,6 @@ func processInput(ctx context.Context, callCtx *builtins.CallContext, r io.Reade sc.Split(makeSplitFunc(cfg.delim)) w := callCtx.Stdout - delimStr := string([]byte{cfg.delim}) reportWrite := func(err error) error { if err != nil { @@ -291,8 +291,16 @@ func processInput(ctx context.Context, callCtx *builtins.CallContext, r io.Reade return err } - var prevLine string - var prevKey string + writeLine := func(line []byte) error { + if _, err := w.Write(line); err != nil { + return err + } + _, err := w.Write([]byte{cfg.delim}) + return err + } + + var prevLine []byte + var prevKey []byte var lineCount int64 first := true groupNum := 0 @@ -301,77 +309,77 @@ func processInput(ctx context.Context, callCtx *builtins.CallContext, r io.Reade if ctx.Err() != nil { return ctx.Err() } - curLine := sc.Text() - curKey := compareKey(curLine, cfg) + curBytes := sc.Bytes() + 
curKey := compareKeyBytes(curBytes, cfg) if first { - prevLine = curLine - prevKey = curKey + prevLine = append(prevLine[:0], curBytes...) + prevKey = append(prevKey[:0], curKey...) lineCount = 1 first = false if cfg.useGroup { if cfg.grpMethod == groupPrepend || cfg.grpMethod == groupBoth { - if err := reportWrite(writeStr(w, delimStr)); err != nil { + if err := reportWrite(writeLine(nil)); err != nil { return err } } - if err := reportWrite(writeStr(w, curLine+delimStr)); err != nil { + if err := reportWrite(writeLine(prevLine)); err != nil { return err } } continue } - same := prevKey == curKey + same := bytes.Equal(prevKey, curKey) if same { if lineCount < math.MaxInt64 { lineCount++ } if cfg.useGroup { - if err := reportWrite(writeStr(w, curLine+delimStr)); err != nil { + if err := reportWrite(writeLine(curBytes)); err != nil { return err } } else if cfg.useAllRepeated { if lineCount == 2 { if groupNum > 0 && cfg.arMethod != allRepeatedNone { - if err := reportWrite(writeStr(w, delimStr)); err != nil { + if err := reportWrite(writeLine(nil)); err != nil { return err } } if groupNum == 0 && cfg.arMethod == allRepeatedPrepend { - if err := reportWrite(writeStr(w, delimStr)); err != nil { + if err := reportWrite(writeLine(nil)); err != nil { return err } } - if err := reportWrite(writeStr(w, prevLine+delimStr)); err != nil { + if err := reportWrite(writeLine(prevLine)); err != nil { return err } groupNum++ } - if err := reportWrite(writeStr(w, curLine+delimStr)); err != nil { + if err := reportWrite(writeLine(curBytes)); err != nil { return err } } } else { if cfg.useGroup { - if err := reportWrite(writeStr(w, delimStr)); err != nil { + if err := reportWrite(writeLine(nil)); err != nil { return err } - if err := reportWrite(writeStr(w, curLine+delimStr)); err != nil { + if err := reportWrite(writeLine(curBytes)); err != nil { return err } groupNum++ } else if cfg.useAllRepeated { // Nothing to do — non-repeated last group is simply dropped. 
} else { - if err := reportWrite(emitStandard(w, cfg, prevLine, lineCount, delimStr)); err != nil { + if err := reportWrite(emitStandard(w, cfg, prevLine, lineCount)); err != nil { return err } } - prevLine = curLine - prevKey = curKey + prevLine = append(prevLine[:0], curBytes...) + prevKey = append(prevKey[:0], curKey...) lineCount = 1 } } @@ -388,17 +396,17 @@ func processInput(ctx context.Context, callCtx *builtins.CallContext, r io.Reade // Flush last group. if cfg.useGroup { if cfg.grpMethod == groupAppend || cfg.grpMethod == groupBoth { - return reportWrite(writeStr(w, delimStr)) + return reportWrite(writeLine(nil)) } return nil } if cfg.useAllRepeated { return nil } - return reportWrite(emitStandard(w, cfg, prevLine, lineCount, delimStr)) + return reportWrite(emitStandard(w, cfg, prevLine, lineCount)) } -func emitStandard(w io.Writer, cfg *uniqConfig, line string, count int64, delimStr string) error { +func emitStandard(w io.Writer, cfg *uniqConfig, line []byte, count int64) error { if cfg.repeated && cfg.unique { return nil } @@ -413,22 +421,30 @@ func emitStandard(w io.Writer, cfg *uniqConfig, line string, count int64, delimS for len(s) < countFieldWidth { s = " " + s } - return writeStr(w, s+" "+line+delimStr) + if _, err := io.WriteString(w, s+" "); err != nil { + return err + } + if _, err := w.Write(line); err != nil { + return err + } + _, err := w.Write([]byte{cfg.delim}) + return err } - return writeStr(w, line+delimStr) -} - -func writeStr(w io.Writer, s string) error { - _, err := io.WriteString(w, s) + if _, err := w.Write(line); err != nil { + return err + } + _, err := w.Write([]byte{cfg.delim}) return err } -// compareKey extracts the portion of line used for comparison, applying +// compareKeyBytes extracts the portion of line used for comparison, applying // field skipping, char skipping, check-chars, and case folding. 
-func compareKey(line string, cfg *uniqConfig) string { +// For the ignore-case path it returns a newly allocated lowercased copy; +// otherwise it returns a subslice of line (no allocation). +func compareKeyBytes(line []byte, cfg *uniqConfig) []byte { s := line if cfg.skipFields > 0 { - s = skipFieldsN(s, cfg.skipFields) + s = skipFieldsBytesN(s, cfg.skipFields) } if cfg.skipChars > 0 && len(s) > 0 { skip := cfg.skipChars @@ -441,37 +457,28 @@ func compareKey(line string, cfg *uniqConfig) string { s = s[:cfg.checkChars] } if cfg.ignoreCase { - s = asciiToLower(s) + s = asciiToLowerBytes(s) } return s } -// asciiToLower folds only ASCII A-Z to a-z, matching GNU uniq behavior -// in the default C/POSIX locale. Unlike strings.ToLower, this does not -// apply Unicode case folding, so non-ASCII characters are left unchanged. -func asciiToLower(s string) string { - for i := 0; i < len(s); i++ { - if s[i] >= 'A' && s[i] <= 'Z' { - b := make([]byte, len(s)) - copy(b, s[:i]) - b[i] = s[i] + ('a' - 'A') - for j := i + 1; j < len(s); j++ { - c := s[j] - if c >= 'A' && c <= 'Z' { - c += 'a' - 'A' - } - b[j] = c - } - return string(b) +// asciiToLowerBytes folds only ASCII A-Z to a-z in a byte slice, matching GNU +// uniq behavior in the default C/POSIX locale. It always returns a new copy. +func asciiToLowerBytes(s []byte) []byte { + b := make([]byte, len(s)) + for i, c := range s { + if c >= 'A' && c <= 'Z' { + c += 'a' - 'A' } + b[i] = c } - return s + return b } -// skipFieldsN skips the first n blank-delimited fields and returns the -// remainder of the string, starting immediately after the last character +// skipFieldsBytesN skips the first n blank-delimited fields in a byte slice +// and returns the remainder, starting immediately after the last character // of the n-th field (before any subsequent blanks). 
-func skipFieldsN(s string, n int64) string { +func skipFieldsBytesN(s []byte, n int64) []byte { i := 0 for field := int64(0); field < n && i < len(s); field++ { for i < len(s) && (s[i] == ' ' || s[i] == '\t') { diff --git a/interp/builtins/uniq/uniq_bench_test.go b/interp/builtins/uniq/uniq_bench_test.go index a0babc21..8091e6f8 100644 --- a/interp/builtins/uniq/uniq_bench_test.go +++ b/interp/builtins/uniq/uniq_bench_test.go @@ -56,16 +56,10 @@ func BenchmarkUniqCount(b *testing.B) { } } -// TestUniqMemoryBounded asserts that uniq allocation is bounded relative to -// input size. uniq is a streaming command: only the current and previous lines -// are kept in memory at any time (live heap is O(1)), but total allocations are -// O(input size) because bufio.Scanner.Text() allocates a new string per line. -// -// With 10MB of 44-byte identical lines (~227k lines) the scanner allocates -// roughly one 44-byte string per line ≈ 10MB of string data total. Output is -// just a single deduplicated line (~44 bytes), so output buffering is trivial. -// A 32MB ceiling provides 3x headroom for runtime overhead while still catching -// regressions such as accumulating all lines in a slice before deduplicating. +// TestUniqMemoryBounded asserts that uniq uses O(1) memory when processing +// large files. uniq is a streaming command: only the current and previous lines +// are kept in memory at any time (live heap is O(1)) and sc.Bytes() avoids +// per-line string allocations. 
func TestUniqMemoryBounded(t *testing.T) { dir := t.TempDir() createLargeFileUniq(t, dir, "input.txt", "the quick brown fox jumps over the lazy dog\n", 10<<20) @@ -73,11 +67,11 @@ func TestUniqMemoryBounded(t *testing.T) { result := testing.Benchmark(func(b *testing.B) { b.ReportAllocs() for b.Loop() { - cmdRunBUniq(b, "uniq input.txt", dir) + testutil.RunScriptDiscard(b, "uniq input.txt", dir, interp.AllowedPaths([]string{dir})) } }) - const maxBytesPerOp = 32 << 20 // 32MB ceiling (~3x observed ~11.5MB) + const maxBytesPerOp = 4 << 20 if bpo := result.AllocedBytesPerOp(); bpo > maxBytesPerOp { t.Errorf("uniq allocated %d bytes/op on 10MB input; want < %d", bpo, maxBytesPerOp) } diff --git a/tests/allowed_symbols_test.go b/tests/allowed_symbols_test.go index 7a8adffb..38ce08c7 100644 --- a/tests/allowed_symbols_test.go +++ b/tests/allowed_symbols_test.go @@ -38,6 +38,8 @@ var builtinAllowedSymbols = []string{ "bufio.NewScanner", // bufio.Scanner — scanner type for buffered input reading; no write or exec capability. "bufio.Scanner", + // bytes.Equal — compares two byte slices for equality; pure function, no I/O. + "bytes.Equal", // bufio.SplitFunc — type for custom scanner split functions; pure type, no I/O. "bufio.SplitFunc", // context.Context — deadline/cancellation plumbing; pure interface, no side effects. From c5a4c2670b3563d7502ff01d86d71a1136361974 Mon Sep 17 00:00:00 2001 From: Travis Thieman Date: Fri, 13 Mar 2026 13:44:11 -0400 Subject: [PATCH 5/8] Fix CI: skip memory bench tests under race, fix gofmt Add //go:build !race to all *_bench_test.go files so the TestXxxMemoryBounded tests are excluded when running with -race. The race detector inflates AllocedBytesPerOp by orders of magnitude (benchmarks also give meaningless throughput numbers under -race). Fix trailing-space alignment in cut.go and grep.go to pass gofmt check. 
Co-Authored-By: Claude Sonnet 4.6 --- interp/builtins/cat/cat_bench_test.go | 2 ++ interp/builtins/cut/cut.go | 2 +- interp/builtins/cut/cut_bench_test.go | 2 ++ interp/builtins/grep/grep.go | 2 +- interp/builtins/grep/grep_bench_test.go | 2 ++ interp/builtins/head/head_bench_test.go | 2 ++ interp/builtins/ls/ls_bench_test.go | 2 ++ interp/builtins/strings_cmd/strings_bench_test.go | 2 ++ interp/builtins/tail/tail_bench_test.go | 2 ++ interp/builtins/tr/tr_bench_test.go | 2 ++ interp/builtins/uniq/uniq_bench_test.go | 2 ++ interp/builtins/wc/wc_bench_test.go | 2 ++ 12 files changed, 22 insertions(+), 2 deletions(-) diff --git a/interp/builtins/cat/cat_bench_test.go b/interp/builtins/cat/cat_bench_test.go index f1bedc6e..3f2b9030 100644 --- a/interp/builtins/cat/cat_bench_test.go +++ b/interp/builtins/cat/cat_bench_test.go @@ -1,3 +1,5 @@ +//go:build !race + // Unless explicitly stated otherwise all files in this repository are licensed // under the Apache License Version 2.0. // This product includes software developed at Datadog (https://www.datadoghq.com/). diff --git a/interp/builtins/cut/cut.go b/interp/builtins/cut/cut.go index 1d05e2b7..1595d393 100644 --- a/interp/builtins/cut/cut.go +++ b/interp/builtins/cut/cut.go @@ -482,7 +482,7 @@ func processFields(callCtx *builtins.CallContext, raw []byte, cfg *cutConfig) { if cfg.onlyDelimited { return } - callCtx.Stdout.Write(raw) //nolint:errcheck + callCtx.Stdout.Write(raw) //nolint:errcheck callCtx.Stdout.Write([]byte{'\n'}) //nolint:errcheck return } diff --git a/interp/builtins/cut/cut_bench_test.go b/interp/builtins/cut/cut_bench_test.go index 41346c75..033e3fca 100644 --- a/interp/builtins/cut/cut_bench_test.go +++ b/interp/builtins/cut/cut_bench_test.go @@ -1,3 +1,5 @@ +//go:build !race + // Unless explicitly stated otherwise all files in this repository are licensed // under the Apache License Version 2.0. // This product includes software developed at Datadog (https://www.datadoghq.com/). 
diff --git a/interp/builtins/grep/grep.go b/interp/builtins/grep/grep.go index 525f366a..e1feebbe 100644 --- a/interp/builtins/grep/grep.go +++ b/interp/builtins/grep/grep.go @@ -678,7 +678,7 @@ func printMatchLine(callCtx *builtins.CallContext, filename string, lineNum int, callCtx.Stdout.Write([]byte(strconv.Itoa(lineNum))) //nolint:errcheck callCtx.Stdout.Write([]byte{':'}) //nolint:errcheck } - callCtx.Stdout.Write(line) //nolint:errcheck + callCtx.Stdout.Write(line) //nolint:errcheck callCtx.Stdout.Write([]byte{'\n'}) //nolint:errcheck } diff --git a/interp/builtins/grep/grep_bench_test.go b/interp/builtins/grep/grep_bench_test.go index 916bc03a..8f687b94 100644 --- a/interp/builtins/grep/grep_bench_test.go +++ b/interp/builtins/grep/grep_bench_test.go @@ -1,3 +1,5 @@ +//go:build !race + // Unless explicitly stated otherwise all files in this repository are licensed // under the Apache License Version 2.0. // This product includes software developed at Datadog (https://www.datadoghq.com/). diff --git a/interp/builtins/head/head_bench_test.go b/interp/builtins/head/head_bench_test.go index 188cdd63..bc99fe8b 100644 --- a/interp/builtins/head/head_bench_test.go +++ b/interp/builtins/head/head_bench_test.go @@ -1,3 +1,5 @@ +//go:build !race + // Unless explicitly stated otherwise all files in this repository are licensed // under the Apache License Version 2.0. // This product includes software developed at Datadog (https://www.datadoghq.com/). diff --git a/interp/builtins/ls/ls_bench_test.go b/interp/builtins/ls/ls_bench_test.go index 365f8afb..e91f4c7c 100644 --- a/interp/builtins/ls/ls_bench_test.go +++ b/interp/builtins/ls/ls_bench_test.go @@ -1,3 +1,5 @@ +//go:build !race + // Unless explicitly stated otherwise all files in this repository are licensed // under the Apache License Version 2.0. // This product includes software developed at Datadog (https://www.datadoghq.com/). 
diff --git a/interp/builtins/strings_cmd/strings_bench_test.go b/interp/builtins/strings_cmd/strings_bench_test.go index 81d4c10a..0ac887f2 100644 --- a/interp/builtins/strings_cmd/strings_bench_test.go +++ b/interp/builtins/strings_cmd/strings_bench_test.go @@ -1,3 +1,5 @@ +//go:build !race + // Unless explicitly stated otherwise all files in this repository are licensed // under the Apache License Version 2.0. // This product includes software developed at Datadog (https://www.datadoghq.com/). diff --git a/interp/builtins/tail/tail_bench_test.go b/interp/builtins/tail/tail_bench_test.go index fefc61ee..49b76f8b 100644 --- a/interp/builtins/tail/tail_bench_test.go +++ b/interp/builtins/tail/tail_bench_test.go @@ -1,3 +1,5 @@ +//go:build !race + // Unless explicitly stated otherwise all files in this repository are licensed // under the Apache License Version 2.0. // This product includes software developed at Datadog (https://www.datadoghq.com/). diff --git a/interp/builtins/tr/tr_bench_test.go b/interp/builtins/tr/tr_bench_test.go index cc88c25d..b3cccf5d 100644 --- a/interp/builtins/tr/tr_bench_test.go +++ b/interp/builtins/tr/tr_bench_test.go @@ -1,3 +1,5 @@ +//go:build !race + // Unless explicitly stated otherwise all files in this repository are licensed // under the Apache License Version 2.0. // This product includes software developed at Datadog (https://www.datadoghq.com/). diff --git a/interp/builtins/uniq/uniq_bench_test.go b/interp/builtins/uniq/uniq_bench_test.go index 8091e6f8..b351cc92 100644 --- a/interp/builtins/uniq/uniq_bench_test.go +++ b/interp/builtins/uniq/uniq_bench_test.go @@ -1,3 +1,5 @@ +//go:build !race + // Unless explicitly stated otherwise all files in this repository are licensed // under the Apache License Version 2.0. // This product includes software developed at Datadog (https://www.datadoghq.com/). 
diff --git a/interp/builtins/wc/wc_bench_test.go b/interp/builtins/wc/wc_bench_test.go index 99bfc09e..9ef224a5 100644 --- a/interp/builtins/wc/wc_bench_test.go +++ b/interp/builtins/wc/wc_bench_test.go @@ -1,3 +1,5 @@ +//go:build !race + // Unless explicitly stated otherwise all files in this repository are licensed // under the Apache License Version 2.0. // This product includes software developed at Datadog (https://www.datadoghq.com/). From 8584afad6ffaa07c4471524fccf90e24fb2b5578 Mon Sep 17 00:00:00 2001 From: Travis Thieman Date: Fri, 13 Mar 2026 13:50:10 -0400 Subject: [PATCH 6/8] Fix compliance: move //go:build !race after license header The compliance test requires the license header on line 1. Move the build tag to after the license block, matching the convention used in other build-constrained files (e.g. testcmd_unix_test.go). Co-Authored-By: Claude Sonnet 4.6 --- interp/builtins/cat/cat_bench_test.go | 4 ++-- interp/builtins/cut/cut_bench_test.go | 4 ++-- interp/builtins/grep/grep_bench_test.go | 4 ++-- interp/builtins/head/head_bench_test.go | 4 ++-- interp/builtins/ls/ls_bench_test.go | 4 ++-- interp/builtins/strings_cmd/strings_bench_test.go | 4 ++-- interp/builtins/tail/tail_bench_test.go | 4 ++-- interp/builtins/tr/tr_bench_test.go | 4 ++-- interp/builtins/uniq/uniq_bench_test.go | 4 ++-- interp/builtins/wc/wc_bench_test.go | 4 ++-- 10 files changed, 20 insertions(+), 20 deletions(-) diff --git a/interp/builtins/cat/cat_bench_test.go b/interp/builtins/cat/cat_bench_test.go index 3f2b9030..54e2ae01 100644 --- a/interp/builtins/cat/cat_bench_test.go +++ b/interp/builtins/cat/cat_bench_test.go @@ -1,10 +1,10 @@ -//go:build !race - // Unless explicitly stated otherwise all files in this repository are licensed // under the Apache License Version 2.0. // This product includes software developed at Datadog (https://www.datadoghq.com/). // Copyright 2026-present Datadog, Inc. 
+//go:build !race + package cat_test import ( diff --git a/interp/builtins/cut/cut_bench_test.go b/interp/builtins/cut/cut_bench_test.go index 033e3fca..2e9be63b 100644 --- a/interp/builtins/cut/cut_bench_test.go +++ b/interp/builtins/cut/cut_bench_test.go @@ -1,10 +1,10 @@ -//go:build !race - // Unless explicitly stated otherwise all files in this repository are licensed // under the Apache License Version 2.0. // This product includes software developed at Datadog (https://www.datadoghq.com/). // Copyright 2026-present Datadog, Inc. +//go:build !race + package cut_test import ( diff --git a/interp/builtins/grep/grep_bench_test.go b/interp/builtins/grep/grep_bench_test.go index 8f687b94..cfd2341f 100644 --- a/interp/builtins/grep/grep_bench_test.go +++ b/interp/builtins/grep/grep_bench_test.go @@ -1,10 +1,10 @@ -//go:build !race - // Unless explicitly stated otherwise all files in this repository are licensed // under the Apache License Version 2.0. // This product includes software developed at Datadog (https://www.datadoghq.com/). // Copyright 2026-present Datadog, Inc. +//go:build !race + package grep_test import ( diff --git a/interp/builtins/head/head_bench_test.go b/interp/builtins/head/head_bench_test.go index bc99fe8b..c6bf134d 100644 --- a/interp/builtins/head/head_bench_test.go +++ b/interp/builtins/head/head_bench_test.go @@ -1,10 +1,10 @@ -//go:build !race - // Unless explicitly stated otherwise all files in this repository are licensed // under the Apache License Version 2.0. // This product includes software developed at Datadog (https://www.datadoghq.com/). // Copyright 2026-present Datadog, Inc. 
+//go:build !race + package head_test import ( diff --git a/interp/builtins/ls/ls_bench_test.go b/interp/builtins/ls/ls_bench_test.go index e91f4c7c..988ce1b6 100644 --- a/interp/builtins/ls/ls_bench_test.go +++ b/interp/builtins/ls/ls_bench_test.go @@ -1,10 +1,10 @@ -//go:build !race - // Unless explicitly stated otherwise all files in this repository are licensed // under the Apache License Version 2.0. // This product includes software developed at Datadog (https://www.datadoghq.com/). // Copyright 2026-present Datadog, Inc. +//go:build !race + package ls_test import ( diff --git a/interp/builtins/strings_cmd/strings_bench_test.go b/interp/builtins/strings_cmd/strings_bench_test.go index 0ac887f2..14e4bfb8 100644 --- a/interp/builtins/strings_cmd/strings_bench_test.go +++ b/interp/builtins/strings_cmd/strings_bench_test.go @@ -1,10 +1,10 @@ -//go:build !race - // Unless explicitly stated otherwise all files in this repository are licensed // under the Apache License Version 2.0. // This product includes software developed at Datadog (https://www.datadoghq.com/). // Copyright 2026-present Datadog, Inc. +//go:build !race + package strings_cmd_test import ( diff --git a/interp/builtins/tail/tail_bench_test.go b/interp/builtins/tail/tail_bench_test.go index 49b76f8b..ec6c59f9 100644 --- a/interp/builtins/tail/tail_bench_test.go +++ b/interp/builtins/tail/tail_bench_test.go @@ -1,10 +1,10 @@ -//go:build !race - // Unless explicitly stated otherwise all files in this repository are licensed // under the Apache License Version 2.0. // This product includes software developed at Datadog (https://www.datadoghq.com/). // Copyright 2026-present Datadog, Inc. 
+//go:build !race + package tail_test import ( diff --git a/interp/builtins/tr/tr_bench_test.go b/interp/builtins/tr/tr_bench_test.go index b3cccf5d..be2f6f89 100644 --- a/interp/builtins/tr/tr_bench_test.go +++ b/interp/builtins/tr/tr_bench_test.go @@ -1,10 +1,10 @@ -//go:build !race - // Unless explicitly stated otherwise all files in this repository are licensed // under the Apache License Version 2.0. // This product includes software developed at Datadog (https://www.datadoghq.com/). // Copyright 2026-present Datadog, Inc. +//go:build !race + package tr_test import ( diff --git a/interp/builtins/uniq/uniq_bench_test.go b/interp/builtins/uniq/uniq_bench_test.go index b351cc92..beb3ce5b 100644 --- a/interp/builtins/uniq/uniq_bench_test.go +++ b/interp/builtins/uniq/uniq_bench_test.go @@ -1,10 +1,10 @@ -//go:build !race - // Unless explicitly stated otherwise all files in this repository are licensed // under the Apache License Version 2.0. // This product includes software developed at Datadog (https://www.datadoghq.com/). // Copyright 2026-present Datadog, Inc. +//go:build !race + package uniq_test import ( diff --git a/interp/builtins/wc/wc_bench_test.go b/interp/builtins/wc/wc_bench_test.go index 9ef224a5..2411354a 100644 --- a/interp/builtins/wc/wc_bench_test.go +++ b/interp/builtins/wc/wc_bench_test.go @@ -1,10 +1,10 @@ -//go:build !race - // Unless explicitly stated otherwise all files in this repository are licensed // under the Apache License Version 2.0. // This product includes software developed at Datadog (https://www.datadoghq.com/). // Copyright 2026-present Datadog, Inc. +//go:build !race + package wc_test import ( From ffc95f5b6eab1ddfca7263179bb46c9c5b249724 Mon Sep 17 00:00:00 2001 From: Travis Thieman Date: Fri, 13 Mar 2026 14:03:25 -0400 Subject: [PATCH 7/8] Add BenchmarkXxxDiscard variants to cat, grep, tr These benchmarks route stdout to io.Discard, isolating the command's own allocations from output-buffering overhead. 
The existing benchmarks (RunScript) measure throughput with real output capture; the Discard variants show the algorithmic memory profile. Co-Authored-By: Claude Sonnet 4.6 --- interp/builtins/cat/cat_bench_test.go | 10 ++++++++++ interp/builtins/grep/grep_bench_test.go | 10 ++++++++++ interp/builtins/tr/tr_bench_test.go | 13 +++++++++++++ 3 files changed, 33 insertions(+) diff --git a/interp/builtins/cat/cat_bench_test.go b/interp/builtins/cat/cat_bench_test.go index 54e2ae01..b9cd0835 100644 --- a/interp/builtins/cat/cat_bench_test.go +++ b/interp/builtins/cat/cat_bench_test.go @@ -78,3 +78,13 @@ func TestCatMemoryBounded(t *testing.T) { t.Errorf("cat allocated %d bytes/op on 10MB input; want < %d", bpo, maxBytesPerOp) } } + +func BenchmarkCatDiscard(b *testing.B) { + dir := b.TempDir() + createLargeFileCat(b, dir, "input.txt", "the quick brown fox jumps over the lazy dog\n", 10<<20) + b.ResetTimer() + b.ReportAllocs() + for b.Loop() { + testutil.RunScriptDiscard(b, "cat input.txt", dir, interp.AllowedPaths([]string{dir})) + } +} diff --git a/interp/builtins/grep/grep_bench_test.go b/interp/builtins/grep/grep_bench_test.go index cfd2341f..a50fd9d3 100644 --- a/interp/builtins/grep/grep_bench_test.go +++ b/interp/builtins/grep/grep_bench_test.go @@ -100,3 +100,13 @@ func TestGrepMemoryBounded(t *testing.T) { t.Errorf("grep allocated %d bytes/op on 10MB input; want < %d", bpo, maxBytesPerOp) } } + +func BenchmarkGrepMatchDiscard(b *testing.B) { + dir := b.TempDir() + createLargeFileGrep(b, dir, "input.txt", "the quick brown fox jumps over the lazy dog\n", 10<<20) + b.ResetTimer() + b.ReportAllocs() + for b.Loop() { + testutil.RunScriptDiscard(b, "grep fox input.txt", dir, interp.AllowedPaths([]string{dir})) + } +} diff --git a/interp/builtins/tr/tr_bench_test.go b/interp/builtins/tr/tr_bench_test.go index be2f6f89..327b48ab 100644 --- a/interp/builtins/tr/tr_bench_test.go +++ b/interp/builtins/tr/tr_bench_test.go @@ -36,6 +36,19 @@ func cmdRunBTr(b *testing.B, 
script, dir string) (string, string, int) { return testutil.RunScript(b, script, dir, interp.AllowedPaths([]string{dir})) } +// BenchmarkTrTranslateDiscard measures tr with stdout discarded to isolate +// tr's own allocations from output buffering. Used to calibrate the ceiling +// in TestTrMemoryBounded. +func BenchmarkTrTranslateDiscard(b *testing.B) { + dir := b.TempDir() + createLargeFileTr(b, dir, "input.txt", "the quick brown fox jumps over the lazy dog\n", 10<<20) + b.ResetTimer() + b.ReportAllocs() + for b.Loop() { + testutil.RunScriptDiscard(b, "cat input.txt | tr 'a-z' 'A-Z'", dir, interp.AllowedPaths([]string{dir})) + } +} + // BenchmarkTrTranslate measures tr 'a-z' 'A-Z' on a 1MB file piped through tr. // tr reads input from stdin in fixed 32 KiB chunks and translates byte-by-byte // using a pre-built 256-entry lookup table. From 71912974cb46c1ea0a2265758b2d4cd3c0c40a10 Mon Sep 17 00:00:00 2001 From: Travis Thieman Date: Fri, 13 Mar 2026 14:10:46 -0400 Subject: [PATCH 8/8] cut: eliminate per-line newline allocation; add Discard benchmarks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace callCtx.Stdout.Write([]byte{'\n'}) with a package-level var newline = []byte{'\n'}. The literal []byte{'\n'} escapes to the heap on every call (one per input line), causing 238K-456K allocs on a 10MB input. The package-level var reuses the same backing array. Result (RunScriptDiscard, 10MB input): cut -b 1-10: 255 KB / 238K allocs → 17 KB / 113 allocs cut -f 1: 472 KB / 456K allocs → 17 KB / 111 allocs Also add BenchmarkCutBytesDiscard, BenchmarkCutFieldsDiscard, and BenchmarkStringsDiscard (stdout → io.Discard) to isolate algorithmic allocation from output-buffering overhead. 
Co-Authored-By: Claude Sonnet 4.6 --- interp/builtins/cut/cut.go | 12 +++++++---- interp/builtins/cut/cut_bench_test.go | 20 +++++++++++++++++++ .../strings_cmd/strings_bench_test.go | 10 ++++++++++ 3 files changed, 38 insertions(+), 4 deletions(-) diff --git a/interp/builtins/cut/cut.go b/interp/builtins/cut/cut.go index 1595d393..dba26259 100644 --- a/interp/builtins/cut/cut.go +++ b/interp/builtins/cut/cut.go @@ -213,6 +213,10 @@ func registerFlags(fs *builtins.FlagSet) builtins.HandlerFunc { } } +// newline is a package-level buffer reused for every line-terminator Write, +// avoiding a heap allocation per line. +var newline = []byte{'\n'} + // cutConfig holds the parsed configuration for a cut invocation. type cutConfig struct { mode mode @@ -431,7 +435,7 @@ func processBytes(callCtx *builtins.CallContext, raw []byte, cfg *cutConfig) { } } } - callCtx.Stdout.Write([]byte{'\n'}) //nolint:errcheck + callCtx.Stdout.Write(newline) //nolint:errcheck } // processBytesWithOutDelim outputs selected byte ranges with the output @@ -482,8 +486,8 @@ func processFields(callCtx *builtins.CallContext, raw []byte, cfg *cutConfig) { if cfg.onlyDelimited { return } - callCtx.Stdout.Write(raw) //nolint:errcheck - callCtx.Stdout.Write([]byte{'\n'}) //nolint:errcheck + callCtx.Stdout.Write(raw) //nolint:errcheck + callCtx.Stdout.Write(newline) //nolint:errcheck return } @@ -522,7 +526,7 @@ func processFields(callCtx *builtins.CallContext, raw []byte, cfg *cutConfig) { fieldStart = i + 1 } - callCtx.Stdout.Write([]byte{'\n'}) //nolint:errcheck + callCtx.Stdout.Write(newline) //nolint:errcheck } // complementRanges returns the complement of the given sorted, merged ranges diff --git a/interp/builtins/cut/cut_bench_test.go b/interp/builtins/cut/cut_bench_test.go index 2e9be63b..9852e442 100644 --- a/interp/builtins/cut/cut_bench_test.go +++ b/interp/builtins/cut/cut_bench_test.go @@ -109,3 +109,23 @@ func TestCutFieldsMemoryBounded(t *testing.T) { t.Errorf("cut -f 1 allocated %d 
bytes/op on 10MB input; want < %d", bpo, maxBytesPerOp) } } + +func BenchmarkCutBytesDiscard(b *testing.B) { + dir := b.TempDir() + createLargeFileCut(b, dir, "input.txt", "the quick brown fox jumps over the lazy dog\n", 10<<20) + b.ResetTimer() + b.ReportAllocs() + for b.Loop() { + testutil.RunScriptDiscard(b, "cut -b 1-10 input.txt", dir, interp.AllowedPaths([]string{dir})) + } +} + +func BenchmarkCutFieldsDiscard(b *testing.B) { + dir := b.TempDir() + createLargeFileCut(b, dir, "input.txt", "alpha\tbeta\tgamma\tdelta\n", 10<<20) + b.ResetTimer() + b.ReportAllocs() + for b.Loop() { + testutil.RunScriptDiscard(b, "cut -f 1 input.txt", dir, interp.AllowedPaths([]string{dir})) + } +} diff --git a/interp/builtins/strings_cmd/strings_bench_test.go b/interp/builtins/strings_cmd/strings_bench_test.go index 14e4bfb8..7864b400 100644 --- a/interp/builtins/strings_cmd/strings_bench_test.go +++ b/interp/builtins/strings_cmd/strings_bench_test.go @@ -83,3 +83,13 @@ func TestStringsMemoryBounded(t *testing.T) { t.Errorf("strings allocated %d bytes/op on 10MB input; want < %d", bpo, maxBytesPerOp) } } + +func BenchmarkStringsDiscard(b *testing.B) { + dir := b.TempDir() + createLargeFileStrings(b, dir, "input.bin", "the quick brown fox jumps over lazy\x00", 10<<20) + b.ResetTimer() + b.ReportAllocs() + for b.Loop() { + testutil.RunScriptDiscard(b, "strings input.bin", dir, interp.AllowedPaths([]string{dir})) + } +}