diff --git a/interp/builtins/cat/cat_bench_test.go b/interp/builtins/cat/cat_bench_test.go new file mode 100644 index 00000000..b9cd0835 --- /dev/null +++ b/interp/builtins/cat/cat_bench_test.go @@ -0,0 +1,90 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2026-present Datadog, Inc. + +//go:build !race + +package cat_test + +import ( + "io" + "os" + "path/filepath" + "testing" + + "github.com/DataDog/rshell/interp" + "github.com/DataDog/rshell/interp/builtins/testutil" +) + +// createLargeFileCat writes totalBytes of repeating content to dir/filename. +func createLargeFileCat(tb testing.TB, dir, filename, line string, totalBytes int) string { + tb.Helper() + path := filepath.Join(dir, filename) + f, err := os.Create(path) + if err != nil { + tb.Fatal(err) + } + defer f.Close() + if _, err := io.Copy(f, io.LimitReader(testutil.NewRepeatReader(line), int64(totalBytes))); err != nil { + tb.Fatal(err) + } + return path +} + +// cmdRunBCat runs a cat command with AllowedPaths set to dir (bench variant). +func cmdRunBCat(b *testing.B, script, dir string) (string, string, int) { + b.Helper() + return testutil.RunScript(b, script, dir, interp.AllowedPaths([]string{dir})) +} + +// BenchmarkCat measures cat on a 1MB file. +func BenchmarkCat(b *testing.B) { + dir := b.TempDir() + createLargeFileCat(b, dir, "input.txt", "the quick brown fox jumps over the lazy dog\n", 1<<20) + b.ResetTimer() + b.ReportAllocs() + for b.Loop() { + cmdRunBCat(b, "cat input.txt", dir) + } +} + +// BenchmarkCatNumbered measures cat -n on a 1MB file. 
+func BenchmarkCatNumbered(b *testing.B) { + dir := b.TempDir() + createLargeFileCat(b, dir, "input.txt", "the quick brown fox jumps over the lazy dog\n", 1<<20) + b.ResetTimer() + b.ReportAllocs() + for b.Loop() { + cmdRunBCat(b, "cat -n input.txt", dir) + } +} + +// TestCatMemoryBounded asserts that cat uses O(1) memory regardless of input +// size. cat streams input to output in fixed chunks with no per-line allocation. +func TestCatMemoryBounded(t *testing.T) { + dir := t.TempDir() + createLargeFileCat(t, dir, "input.txt", "the quick brown fox jumps over the lazy dog\n", 10<<20) + + result := testing.Benchmark(func(b *testing.B) { + b.ReportAllocs() + for b.Loop() { + testutil.RunScriptDiscard(b, "cat input.txt", dir, interp.AllowedPaths([]string{dir})) + } + }) + + const maxBytesPerOp = 4 << 20 + if bpo := result.AllocedBytesPerOp(); bpo > maxBytesPerOp { + t.Errorf("cat allocated %d bytes/op on 10MB input; want < %d", bpo, maxBytesPerOp) + } +} + +func BenchmarkCatDiscard(b *testing.B) { + dir := b.TempDir() + createLargeFileCat(b, dir, "input.txt", "the quick brown fox jumps over the lazy dog\n", 10<<20) + b.ResetTimer() + b.ReportAllocs() + for b.Loop() { + testutil.RunScriptDiscard(b, "cat input.txt", dir, interp.AllowedPaths([]string{dir})) + } +} diff --git a/interp/builtins/cut/cut.go b/interp/builtins/cut/cut.go index a27ad0cc..dba26259 100644 --- a/interp/builtins/cut/cut.go +++ b/interp/builtins/cut/cut.go @@ -213,6 +213,10 @@ func registerFlags(fs *builtins.FlagSet) builtins.HandlerFunc { } } +// newline is a package-level buffer reused for every line-terminator Write, +// avoiding a heap allocation per line. +var newline = []byte{'\n'} + // cutConfig holds the parsed configuration for a cut invocation. 
type cutConfig struct { mode mode @@ -392,30 +396,46 @@ func processBytes(callCtx *builtins.CallContext, raw []byte, cfg *cutConfig) { if cfg.outDelimSet { processBytesComplementWithOutDelim(callCtx, raw, cfg) } else { - var sb strings.Builder + start := -1 for i := range n { - pos := i + 1 - if !inRanges(pos, cfg.ranges) { - sb.WriteByte(raw[i]) + if !inRanges(i+1, cfg.ranges) { + if start < 0 { + start = i + } + } else { + if start >= 0 { + callCtx.Stdout.Write(raw[start:i]) //nolint:errcheck + start = -1 + } } } - callCtx.Out(sb.String()) + if start >= 0 { + callCtx.Stdout.Write(raw[start:]) //nolint:errcheck + } } } else { if cfg.outDelimSet { processBytesWithOutDelim(callCtx, raw, cfg) } else { - var sb strings.Builder + start := -1 for i := range n { - pos := i + 1 - if inRanges(pos, cfg.ranges) { - sb.WriteByte(raw[i]) + if inRanges(i+1, cfg.ranges) { + if start < 0 { + start = i + } + } else { + if start >= 0 { + callCtx.Stdout.Write(raw[start:i]) //nolint:errcheck + start = -1 + } } } - callCtx.Out(sb.String()) + if start >= 0 { + callCtx.Stdout.Write(raw[start:]) //nolint:errcheck + } } } - callCtx.Out("\n") + callCtx.Stdout.Write(newline) //nolint:errcheck } // processBytesWithOutDelim outputs selected byte ranges with the output @@ -455,56 +475,58 @@ func processBytesComplementWithOutDelim(callCtx *builtins.CallContext, raw []byt // processFields selects fields from a line. func processFields(callCtx *builtins.CallContext, raw []byte, cfg *cutConfig) { - line := string(raw) - delimStr := string(cfg.delimByte) - - // Check if line contains the delimiter. - if strings.IndexByte(line, cfg.delimByte) < 0 { + hasDelim := false + for _, b := range raw { + if b == cfg.delimByte { + hasDelim = true + break + } + } + if !hasDelim { if cfg.onlyDelimited { - return // suppress line + return } - // No delimiter: print the whole line + newline. 
- callCtx.Out(line) - callCtx.Out("\n") + callCtx.Stdout.Write(raw) //nolint:errcheck + callCtx.Stdout.Write(newline) //nolint:errcheck return } - fields := strings.Split(line, delimStr) - nFields := len(fields) + nFields := 1 + for _, b := range raw { + if b == cfg.delimByte { + nFields++ + } + } - // Determine which fields to select. - var selected []int - if cfg.complement { - compRanges := complementRanges(cfg.ranges, nFields) - for _, r := range compRanges { - for i := r[0]; i <= r[1] && i <= nFields; i++ { - selected = append(selected, i) - } + fieldIdx := 0 + fieldStart := 0 + firstOutput := true + + for i := 0; i <= len(raw); i++ { + if i < len(raw) && raw[i] != cfg.delimByte { + continue } - } else { - for _, r := range cfg.ranges { - start := r[0] - end := r[1] - if start > nFields { - break - } - if end > nFields { - end = nFields - } - for i := start; i <= end; i++ { - selected = append(selected, i) - } + fieldIdx++ + fieldNum := fieldIdx + + selected := false + if cfg.complement { + selected = !inRanges(fieldNum, cfg.ranges) + } else { + selected = inRanges(fieldNum, cfg.ranges) } - } - // Output selected fields joined by the output delimiter. - for i, idx := range selected { - if i > 0 { - callCtx.Out(cfg.outDelim) + if selected { + if !firstOutput { + callCtx.Out(cfg.outDelim) + } + callCtx.Stdout.Write(raw[fieldStart:i]) //nolint:errcheck + firstOutput = false } - callCtx.Out(fields[idx-1]) + + fieldStart = i + 1 } - callCtx.Out("\n") + callCtx.Stdout.Write(newline) //nolint:errcheck } // complementRanges returns the complement of the given sorted, merged ranges diff --git a/interp/builtins/cut/cut_bench_test.go b/interp/builtins/cut/cut_bench_test.go new file mode 100644 index 00000000..9852e442 --- /dev/null +++ b/interp/builtins/cut/cut_bench_test.go @@ -0,0 +1,131 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. 
+// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2026-present Datadog, Inc. + +//go:build !race + +package cut_test + +import ( + "io" + "os" + "path/filepath" + "testing" + + "github.com/DataDog/rshell/interp" + "github.com/DataDog/rshell/interp/builtins/testutil" +) + +func createLargeFileCut(tb testing.TB, dir, filename, line string, totalBytes int) string { + tb.Helper() + path := filepath.Join(dir, filename) + f, err := os.Create(path) + if err != nil { + tb.Fatal(err) + } + defer f.Close() + if _, err := io.Copy(f, io.LimitReader(testutil.NewRepeatReader(line), int64(totalBytes))); err != nil { + tb.Fatal(err) + } + return path +} + +func cmdRunBCut(b *testing.B, script, dir string) (string, string, int) { + b.Helper() + return testutil.RunScript(b, script, dir, interp.AllowedPaths([]string{dir})) +} + +// BenchmarkCutBytes measures cut -b 1-10 on a 10MB file of short lines. +func BenchmarkCutBytes(b *testing.B) { + dir := b.TempDir() + createLargeFileCut(b, dir, "input.txt", "the quick brown fox jumps over the lazy dog\n", 10<<20) + b.ResetTimer() + b.ReportAllocs() + for b.Loop() { + cmdRunBCut(b, "cut -b 1-10 input.txt", dir) + } +} + +// BenchmarkCutFields measures cut -f 1 -d ' ' on a 10MB file of short lines. +func BenchmarkCutFields(b *testing.B) { + dir := b.TempDir() + // Tab-delimited: "field1\tfield2\tfield3" + createLargeFileCut(b, dir, "input.txt", "alpha\tbeta\tgamma\tdelta\n", 10<<20) + b.ResetTimer() + b.ReportAllocs() + for b.Loop() { + cmdRunBCut(b, "cut -f 1 input.txt", dir) + } +} + +// BenchmarkCutFieldsMultiple measures cut selecting multiple fields on a 10MB file. 
+func BenchmarkCutFieldsMultiple(b *testing.B) { + dir := b.TempDir() + createLargeFileCut(b, dir, "input.txt", "alpha\tbeta\tgamma\tdelta\n", 10<<20) + b.ResetTimer() + b.ReportAllocs() + for b.Loop() { + cmdRunBCut(b, "cut -f 1,3 input.txt", dir) + } +} + +// TestCutMemoryBounded asserts that cut -b uses O(1) memory regardless of +// input size. cut is a streaming command that writes selected byte ranges +// directly to Stdout with no per-line string allocation. +func TestCutMemoryBounded(t *testing.T) { + dir := t.TempDir() + createLargeFileCut(t, dir, "input.txt", "the quick brown fox jumps over the lazy dog\n", 10<<20) + + result := testing.Benchmark(func(b *testing.B) { + b.ReportAllocs() + for b.Loop() { + testutil.RunScriptDiscard(b, "cut -b 1-10 input.txt", dir, interp.AllowedPaths([]string{dir})) + } + }) + + const maxBytesPerOp = 4 << 20 + if bpo := result.AllocedBytesPerOp(); bpo > maxBytesPerOp { + t.Errorf("cut -b 1-10 allocated %d bytes/op on 10MB input; want < %d", bpo, maxBytesPerOp) + } +} + +// TestCutFieldsMemoryBounded asserts that cut -f uses O(1) memory regardless +// of input size. Field mode scans raw bytes for the delimiter without +// converting to string or allocating a []string per line. 
+func TestCutFieldsMemoryBounded(t *testing.T) { + dir := t.TempDir() + createLargeFileCut(t, dir, "input.txt", "alpha\tbeta\tgamma\tdelta\n", 10<<20) + + result := testing.Benchmark(func(b *testing.B) { + b.ReportAllocs() + for b.Loop() { + testutil.RunScriptDiscard(b, "cut -f 1 input.txt", dir, interp.AllowedPaths([]string{dir})) + } + }) + + const maxBytesPerOp = 4 << 20 + if bpo := result.AllocedBytesPerOp(); bpo > maxBytesPerOp { + t.Errorf("cut -f 1 allocated %d bytes/op on 10MB input; want < %d", bpo, maxBytesPerOp) + } +} + +func BenchmarkCutBytesDiscard(b *testing.B) { + dir := b.TempDir() + createLargeFileCut(b, dir, "input.txt", "the quick brown fox jumps over the lazy dog\n", 10<<20) + b.ResetTimer() + b.ReportAllocs() + for b.Loop() { + testutil.RunScriptDiscard(b, "cut -b 1-10 input.txt", dir, interp.AllowedPaths([]string{dir})) + } +} + +func BenchmarkCutFieldsDiscard(b *testing.B) { + dir := b.TempDir() + createLargeFileCut(b, dir, "input.txt", "alpha\tbeta\tgamma\tdelta\n", 10<<20) + b.ResetTimer() + b.ReportAllocs() + for b.Loop() { + testutil.RunScriptDiscard(b, "cut -f 1 input.txt", dir, interp.AllowedPaths([]string{dir})) + } +} diff --git a/interp/builtins/grep/grep.go b/interp/builtins/grep/grep.go index cb4622e1..e1feebbe 100644 --- a/interp/builtins/grep/grep.go +++ b/interp/builtins/grep/grep.go @@ -562,9 +562,9 @@ func grepFile(ctx context.Context, callCtx *builtins.CallContext, file string, o return matchCount > 0, ctx.Err() } lineNum++ - line := sc.Text() + lineBytes := sc.Bytes() - matched := opts.re.MatchString(line) + matched := opts.re.Match(lineBytes) if opts.invertMatch { matched = !matched } @@ -606,15 +606,15 @@ func grepFile(ctx context.Context, callCtx *builtins.CallContext, file string, o // -o -v: line was selected by inversion (doesn't contain // pattern), so there are no matching parts to print. 
} else if opts.onlyMatching { - matches := opts.re.FindAllString(line, -1) - for _, m := range matches { - if m == "" { + indices := opts.re.FindAllIndex(lineBytes, -1) + for _, idx := range indices { + if idx[0] == idx[1] { continue // suppress empty matches (GNU grep behavior) } - printMatchLine(callCtx, displayName, lineNum, m, opts) + printMatchLine(callCtx, displayName, lineNum, lineBytes[idx[0]:idx[1]], opts) } } else { - printMatchLine(callCtx, displayName, lineNum, line, opts) + printMatchLine(callCtx, displayName, lineNum, lineBytes, opts) } lastPrintedLine = lineNum printedSeparator = true @@ -625,7 +625,7 @@ func grepFile(ctx context.Context, callCtx *builtins.CallContext, file string, o } else { // Non-matching line: might be after-context or before-context. if afterRemaining > 0 && !opts.quiet && !opts.count && !opts.filesWithMatches && !opts.filesWithoutMatch { - printContextLine(callCtx, displayName, lineNum, line, opts, '-') + printContextLine(callCtx, displayName, lineNum, lineBytes, opts, '-') lastPrintedLine = lineNum afterRemaining-- } @@ -635,7 +635,9 @@ func grepFile(ctx context.Context, callCtx *builtins.CallContext, file string, o if len(beforeBuf) >= opts.beforeContext { beforeBuf = beforeBuf[1:] } - beforeBuf = append(beforeBuf, contextLine{num: lineNum, text: line}) + cp := make([]byte, len(lineBytes)) + copy(cp, lineBytes) + beforeBuf = append(beforeBuf, contextLine{num: lineNum, text: cp}) } } } @@ -664,31 +666,31 @@ func grepFile(ctx context.Context, callCtx *builtins.CallContext, file string, o type contextLine struct { num int - text string + text []byte } -func printMatchLine(callCtx *builtins.CallContext, filename string, lineNum int, line string, opts *grepOpts) { - var prefix strings.Builder +func printMatchLine(callCtx *builtins.CallContext, filename string, lineNum int, line []byte, opts *grepOpts) { if opts.showFilename { - prefix.WriteString(filename) - prefix.WriteByte(':') + callCtx.Stdout.Write([]byte(filename)) 
//nolint:errcheck + callCtx.Stdout.Write([]byte{':'}) //nolint:errcheck } if opts.lineNumber { - prefix.WriteString(strconv.Itoa(lineNum)) - prefix.WriteByte(':') + callCtx.Stdout.Write([]byte(strconv.Itoa(lineNum))) //nolint:errcheck + callCtx.Stdout.Write([]byte{':'}) //nolint:errcheck } - callCtx.Outf("%s%s\n", prefix.String(), line) + callCtx.Stdout.Write(line) //nolint:errcheck + callCtx.Stdout.Write([]byte{'\n'}) //nolint:errcheck } -func printContextLine(callCtx *builtins.CallContext, filename string, lineNum int, line string, opts *grepOpts, sep byte) { - var prefix strings.Builder +func printContextLine(callCtx *builtins.CallContext, filename string, lineNum int, line []byte, opts *grepOpts, sep byte) { if opts.showFilename { - prefix.WriteString(filename) - prefix.WriteByte(sep) + callCtx.Stdout.Write([]byte(filename)) //nolint:errcheck + callCtx.Stdout.Write([]byte{sep}) //nolint:errcheck } if opts.lineNumber { - prefix.WriteString(strconv.Itoa(lineNum)) - prefix.WriteByte(sep) + callCtx.Stdout.Write([]byte(strconv.Itoa(lineNum))) //nolint:errcheck + callCtx.Stdout.Write([]byte{sep}) //nolint:errcheck } - callCtx.Outf("%s%s\n", prefix.String(), line) + callCtx.Stdout.Write(line) //nolint:errcheck + callCtx.Stdout.Write([]byte{'\n'}) //nolint:errcheck } diff --git a/interp/builtins/grep/grep_bench_test.go b/interp/builtins/grep/grep_bench_test.go new file mode 100644 index 00000000..a50fd9d3 --- /dev/null +++ b/interp/builtins/grep/grep_bench_test.go @@ -0,0 +1,112 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2026-present Datadog, Inc. 
+ +//go:build !race + +package grep_test + +import ( + "io" + "os" + "path/filepath" + "testing" + + "github.com/DataDog/rshell/interp" + "github.com/DataDog/rshell/interp/builtins/testutil" +) + +func createLargeFileGrep(tb testing.TB, dir, filename, line string, totalBytes int) string { + tb.Helper() + path := filepath.Join(dir, filename) + f, err := os.Create(path) + if err != nil { + tb.Fatal(err) + } + defer f.Close() + if _, err := io.Copy(f, io.LimitReader(testutil.NewRepeatReader(line), int64(totalBytes))); err != nil { + tb.Fatal(err) + } + return path +} + +func cmdRunBGrep(b *testing.B, script, dir string) (string, string, int) { + b.Helper() + return testutil.RunScript(b, script, dir, interp.AllowedPaths([]string{dir})) +} + +// BenchmarkGrepMatch measures grep on a 10MB file where every line matches. +func BenchmarkGrepMatch(b *testing.B) { + dir := b.TempDir() + createLargeFileGrep(b, dir, "input.txt", "the quick brown fox jumps over the lazy dog\n", 10<<20) + b.ResetTimer() + b.ReportAllocs() + for b.Loop() { + cmdRunBGrep(b, "grep fox input.txt", dir) + } +} + +// BenchmarkGrepNoMatch measures grep on a 10MB file where no lines match. +func BenchmarkGrepNoMatch(b *testing.B) { + dir := b.TempDir() + createLargeFileGrep(b, dir, "input.txt", "the quick brown fox jumps over the lazy dog\n", 10<<20) + b.ResetTimer() + b.ReportAllocs() + for b.Loop() { + cmdRunBGrep(b, "grep NOMATCH input.txt", dir) + } +} + +// BenchmarkGrepFixedStrings measures grep -F on a 10MB file. +func BenchmarkGrepFixedStrings(b *testing.B) { + dir := b.TempDir() + createLargeFileGrep(b, dir, "input.txt", "the quick brown fox jumps over the lazy dog\n", 10<<20) + b.ResetTimer() + b.ReportAllocs() + for b.Loop() { + cmdRunBGrep(b, "grep -F fox input.txt", dir) + } +} + +// BenchmarkGrepCount measures grep -c on a 10MB file. 
+func BenchmarkGrepCount(b *testing.B) { + dir := b.TempDir() + createLargeFileGrep(b, dir, "input.txt", "the quick brown fox jumps over the lazy dog\n", 10<<20) + b.ResetTimer() + b.ReportAllocs() + for b.Loop() { + cmdRunBGrep(b, "grep -c fox input.txt", dir) + } +} + +// TestGrepMemoryBounded asserts that grep uses O(1) memory when processing +// large files. grep is a streaming command that reads one line at a time via +// sc.Bytes() (no per-line string allocation). Total allocations are dominated +// by the shell/runner overhead, not input size. +func TestGrepMemoryBounded(t *testing.T) { + dir := t.TempDir() + createLargeFileGrep(t, dir, "input.txt", "the quick brown fox jumps over the lazy dog\n", 10<<20) + + result := testing.Benchmark(func(b *testing.B) { + b.ReportAllocs() + for b.Loop() { + testutil.RunScriptDiscard(b, "grep fox input.txt", dir, interp.AllowedPaths([]string{dir})) + } + }) + + const maxBytesPerOp = 4 << 20 + if bpo := result.AllocedBytesPerOp(); bpo > maxBytesPerOp { + t.Errorf("grep allocated %d bytes/op on 10MB input; want < %d", bpo, maxBytesPerOp) + } +} + +func BenchmarkGrepMatchDiscard(b *testing.B) { + dir := b.TempDir() + createLargeFileGrep(b, dir, "input.txt", "the quick brown fox jumps over the lazy dog\n", 10<<20) + b.ResetTimer() + b.ReportAllocs() + for b.Loop() { + testutil.RunScriptDiscard(b, "grep fox input.txt", dir, interp.AllowedPaths([]string{dir})) + } +} diff --git a/interp/builtins/head/head_bench_test.go b/interp/builtins/head/head_bench_test.go new file mode 100644 index 00000000..c6bf134d --- /dev/null +++ b/interp/builtins/head/head_bench_test.go @@ -0,0 +1,126 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2026-present Datadog, Inc. 
+ +//go:build !race + +package head_test + +import ( + "io" + "os" + "path/filepath" + "testing" + + "github.com/DataDog/rshell/interp" + "github.com/DataDog/rshell/interp/builtins/testutil" +) + +// createLargeFile writes totalBytes of repeating line content to dir/filename. +func createLargeFile(tb testing.TB, dir, filename, line string, totalBytes int) string { + tb.Helper() + path := filepath.Join(dir, filename) + f, err := os.Create(path) + if err != nil { + tb.Fatal(err) + } + defer f.Close() + if _, err := io.Copy(f, io.LimitReader(testutil.NewRepeatReader(line), int64(totalBytes))); err != nil { + tb.Fatal(err) + } + return path +} + +// cmdRunB runs a head command with AllowedPaths set to dir (bench variant). +// Uses testutil.RunScript which accepts testing.TB. +func cmdRunB(b *testing.B, script, dir string) (string, string, int) { + b.Helper() + return testutil.RunScript(b, script, dir, interp.AllowedPaths([]string{dir})) +} + +// BenchmarkHeadTenLines measures head -n 10 on a 10MB file of short lines. +func BenchmarkHeadTenLines(b *testing.B) { + dir := b.TempDir() + createLargeFile(b, dir, "input.txt", "the quick brown fox jumps over the lazy dog\n", 10<<20) + b.ResetTimer() + b.ReportAllocs() + for b.Loop() { + cmdRunB(b, "head -n 10 input.txt", dir) + } +} + +// BenchmarkHeadBytes measures head -c 1024 on a 10MB file. +func BenchmarkHeadBytes(b *testing.B) { + dir := b.TempDir() + createLargeFile(b, dir, "input.txt", "the quick brown fox jumps over the lazy dog\n", 10<<20) + b.ResetTimer() + b.ReportAllocs() + for b.Loop() { + cmdRunB(b, "head -c 1024 input.txt", dir) + } +} + +// BenchmarkHeadSingleLineNearCap measures head -n 1 on a file with one line +// just below MaxLineBytes (1MiB). Lines exceeding MaxLineBytes trigger an +// error path; this benchmark exercises the successful large-line path. +func BenchmarkHeadSingleLineNearCap(b *testing.B) { + dir := b.TempDir() + // 900KB line -- safely below MaxLineBytes (1MiB) so head succeeds. 
+ createLargeFile(b, dir, "input.txt", "x", 900<<10) + // Append a newline to complete the line. + f, err := os.OpenFile(filepath.Join(dir, "input.txt"), os.O_APPEND|os.O_WRONLY, 0) + if err != nil { + b.Fatal(err) + } + defer func() { + if err := f.Close(); err != nil { + b.Errorf("close input.txt: %v", err) + } + }() + if _, err := f.WriteString("\n"); err != nil { + b.Fatal(err) + } + b.ResetTimer() + b.ReportAllocs() + for b.Loop() { + cmdRunB(b, "head -n 1 input.txt", dir) + } +} + +// TestHeadMemoryBoundedLines asserts that head -n 10 uses O(1) memory +// regardless of input file size. +func TestHeadMemoryBoundedLines(t *testing.T) { + dir := t.TempDir() + createLargeFile(t, dir, "input.txt", "the quick brown fox jumps over the lazy dog\n", 10<<20) + + result := testing.Benchmark(func(b *testing.B) { + b.ReportAllocs() + for b.Loop() { + cmdRunB(b, "head -n 10 input.txt", dir) + } + }) + + const maxBytesPerOp = 1 << 20 // 1MB ceiling + if bpo := result.AllocedBytesPerOp(); bpo > maxBytesPerOp { + t.Errorf("head -n 10 allocated %d bytes/op on 10MB input; want < %d", bpo, maxBytesPerOp) + } +} + +// TestHeadMemoryBoundedBytes asserts that head -c 1024 uses O(1) memory. 
+func TestHeadMemoryBoundedBytes(t *testing.T) { + dir := t.TempDir() + createLargeFile(t, dir, "input.txt", "the quick brown fox jumps over the lazy dog\n", 10<<20) + + result := testing.Benchmark(func(b *testing.B) { + b.ReportAllocs() + for b.Loop() { + cmdRunB(b, "head -c 1024 input.txt", dir) + } + }) + + const maxBytesPerOp = 1 << 20 // 1MB ceiling + if bpo := result.AllocedBytesPerOp(); bpo > maxBytesPerOp { + t.Errorf("head -c 1024 allocated %d bytes/op on 10MB input; want < %d", bpo, maxBytesPerOp) + } +} diff --git a/interp/builtins/ls/ls_bench_test.go b/interp/builtins/ls/ls_bench_test.go new file mode 100644 index 00000000..988ce1b6 --- /dev/null +++ b/interp/builtins/ls/ls_bench_test.go @@ -0,0 +1,94 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2026-present Datadog, Inc. + +//go:build !race + +package ls_test + +import ( + "fmt" + "os" + "path/filepath" + "testing" + + "github.com/DataDog/rshell/interp" + "github.com/DataDog/rshell/interp/builtins/testutil" +) + +// createFileDir creates a directory containing n empty files named +// file0000.txt … fileNNNN.txt and returns the directory path. +func createFileDir(tb testing.TB, n int) string { + tb.Helper() + dir := tb.TempDir() + for i := range n { + name := filepath.Join(dir, fmt.Sprintf("file%04d.txt", i)) + f, err := os.Create(name) + if err != nil { + tb.Fatal(err) + } + f.Close() + } + return dir +} + +func cmdRunBLs(b *testing.B, script, dir string) (string, string, int) { + b.Helper() + return testutil.RunScript(b, script, dir, interp.AllowedPaths([]string{dir})) +} + +// BenchmarkLs measures ls on a directory with 1000 entries. 
+func BenchmarkLs(b *testing.B) { + dir := createFileDir(b, 1000) + b.ResetTimer() + b.ReportAllocs() + for b.Loop() { + cmdRunBLs(b, "ls .", dir) + } +} + +// BenchmarkLsLong measures ls -l on a directory with 1000 entries. +func BenchmarkLsLong(b *testing.B) { + dir := createFileDir(b, 1000) + b.ResetTimer() + b.ReportAllocs() + for b.Loop() { + cmdRunBLs(b, "ls -l .", dir) + } +} + +// BenchmarkLsSmallDir measures ls on a small directory (10 entries). +func BenchmarkLsSmallDir(b *testing.B) { + dir := createFileDir(b, 10) + b.ResetTimer() + b.ReportAllocs() + for b.Loop() { + cmdRunBLs(b, "ls .", dir) + } +} + +// TestLsMemoryBounded asserts that ls allocation scales linearly with the +// number of directory entries rather than diverging to pathological levels. +// ls must load all directory entries into memory to sort them (O(n) live heap), +// but should not buffer additional data beyond what os.ReadDir returns. +// +// With 1000 entries of ~12-byte names the expected allocation is roughly +// 1000 × (name string + FileInfo struct) ≈ a few hundred KB. A 10MB ceiling +// catches regressions that accidentally buffer full file contents or loop +// without bound. +func TestLsMemoryBounded(t *testing.T) { + dir := createFileDir(t, 1000) + + result := testing.Benchmark(func(b *testing.B) { + b.ReportAllocs() + for b.Loop() { + cmdRunBLs(b, "ls .", dir) + } + }) + + const maxBytesPerOp = 10 << 20 // 10MB ceiling for 1000-entry directory + if bpo := result.AllocedBytesPerOp(); bpo > maxBytesPerOp { + t.Errorf("ls allocated %d bytes/op on 1000-entry dir; want < %d", bpo, maxBytesPerOp) + } +} diff --git a/interp/builtins/strings_cmd/strings_bench_test.go b/interp/builtins/strings_cmd/strings_bench_test.go new file mode 100644 index 00000000..7864b400 --- /dev/null +++ b/interp/builtins/strings_cmd/strings_bench_test.go @@ -0,0 +1,95 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. 
+// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2026-present Datadog, Inc. + +//go:build !race + +package strings_cmd_test + +import ( + "io" + "os" + "path/filepath" + "testing" + + "github.com/DataDog/rshell/interp" + "github.com/DataDog/rshell/interp/builtins/testutil" +) + +func createLargeFileStrings(tb testing.TB, dir, filename, line string, totalBytes int) string { + tb.Helper() + path := filepath.Join(dir, filename) + f, err := os.Create(path) + if err != nil { + tb.Fatal(err) + } + defer f.Close() + if _, err := io.Copy(f, io.LimitReader(testutil.NewRepeatReader(line), int64(totalBytes))); err != nil { + tb.Fatal(err) + } + return path +} + +func cmdRunBStrings(b *testing.B, script, dir string) (string, string, int) { + b.Helper() + return testutil.RunScript(b, script, dir, interp.AllowedPaths([]string{dir})) +} + +// BenchmarkStrings measures strings on a 1MB file containing many short +// printable sequences separated by null bytes. Each line is a 43-byte printable +// string followed by a null byte, producing ~24k strings. +func BenchmarkStrings(b *testing.B) { + dir := b.TempDir() + // Mix of printable chars + null byte so strings emits many short tokens. + createLargeFileStrings(b, dir, "input.bin", "the quick brown fox jumps over lazy\x00", 1<<20) + b.ResetTimer() + b.ReportAllocs() + for b.Loop() { + cmdRunBStrings(b, "strings input.bin", dir) + } +} + +// BenchmarkStringsPrintableOnly measures strings on a 1MB fully-printable file. +// The entire file is one continuous printable run that exceeds maxStringLen +// (1 MiB cap), so only the first 1 MiB is emitted. 
+func BenchmarkStringsPrintableOnly(b *testing.B) { + dir := b.TempDir() + createLargeFileStrings(b, dir, "input.txt", "abcdefghij", 1<<20) + b.ResetTimer() + b.ReportAllocs() + for b.Loop() { + cmdRunBStrings(b, "strings input.txt", dir) + } +} + +// TestStringsMemoryBounded asserts that strings uses O(1) memory regardless +// of input size. strings reads in 32 KiB chunks and caps individual string +// accumulation at maxStringLen (1 MiB). With short printable sequences +// separated by non-printable bytes the current string buffer stays small. +func TestStringsMemoryBounded(t *testing.T) { + dir := t.TempDir() + createLargeFileStrings(t, dir, "input.bin", "the quick brown fox jumps over lazy\x00", 10<<20) + + result := testing.Benchmark(func(b *testing.B) { + b.ReportAllocs() + for b.Loop() { + testutil.RunScriptDiscard(b, "strings input.bin", dir, interp.AllowedPaths([]string{dir})) + } + }) + + const maxBytesPerOp = 4 << 20 + if bpo := result.AllocedBytesPerOp(); bpo > maxBytesPerOp { + t.Errorf("strings allocated %d bytes/op on 10MB input; want < %d", bpo, maxBytesPerOp) + } +} + +func BenchmarkStringsDiscard(b *testing.B) { + dir := b.TempDir() + createLargeFileStrings(b, dir, "input.bin", "the quick brown fox jumps over lazy\x00", 10<<20) + b.ResetTimer() + b.ReportAllocs() + for b.Loop() { + testutil.RunScriptDiscard(b, "strings input.bin", dir, interp.AllowedPaths([]string{dir})) + } +} diff --git a/interp/builtins/tail/tail.go b/interp/builtins/tail/tail.go index 35d231e3..878ef1db 100644 --- a/interp/builtins/tail/tail.go +++ b/interp/builtins/tail/tail.go @@ -320,8 +320,6 @@ func readLastLines(ctx context.Context, callCtx *builtins.CallContext, r io.Read if !isRegularFile && totalRead > MaxTotalReadBytes { return errors.New("input too large: read limit exceeded") } - cp := make([]byte, len(raw)) - copy(cp, raw) // When the ring is full, evict the oldest entry before writing. 
if ringCount == ringSize { // If count exceeds the ring capacity, we cannot deliver the full @@ -331,8 +329,8 @@ func readLastLines(ctx context.Context, callCtx *builtins.CallContext, r io.Read } ringBytes -= int64(len(ring[ringHead])) } - ring[ringHead] = cp - ringBytes += int64(len(cp)) + ring[ringHead] = append(ring[ringHead][:0], raw...) + ringBytes += int64(len(ring[ringHead])) if ringBytes > MaxRingBytes { return errors.New("input too large: ring buffer memory limit exceeded") } diff --git a/interp/builtins/tail/tail_bench_test.go b/interp/builtins/tail/tail_bench_test.go new file mode 100644 index 00000000..ec6c59f9 --- /dev/null +++ b/interp/builtins/tail/tail_bench_test.go @@ -0,0 +1,84 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2026-present Datadog, Inc. + +//go:build !race + +package tail_test + +import ( + "io" + "os" + "path/filepath" + "testing" + + "github.com/DataDog/rshell/interp" + "github.com/DataDog/rshell/interp/builtins/testutil" +) + +// createLargeFileTail writes totalBytes of repeating content to dir/filename. +func createLargeFileTail(tb testing.TB, dir, filename, line string, totalBytes int) string { + tb.Helper() + path := filepath.Join(dir, filename) + f, err := os.Create(path) + if err != nil { + tb.Fatal(err) + } + defer f.Close() + if _, err := io.Copy(f, io.LimitReader(testutil.NewRepeatReader(line), int64(totalBytes))); err != nil { + tb.Fatal(err) + } + return path +} + +// cmdRunBTail runs a tail command with AllowedPaths set to dir (bench variant). +func cmdRunBTail(b *testing.B, script, dir string) (string, string, int) { + b.Helper() + return testutil.RunScript(b, script, dir, interp.AllowedPaths([]string{dir})) +} + +// BenchmarkTailTenLines measures tail -n 10 on a 10MB file. 
+func BenchmarkTailTenLines(b *testing.B) { + dir := b.TempDir() + createLargeFileTail(b, dir, "input.txt", "the quick brown fox jumps over the lazy dog\n", 10<<20) + b.ResetTimer() + b.ReportAllocs() + for b.Loop() { + cmdRunBTail(b, "tail -n 10 input.txt", dir) + } +} + +// BenchmarkTailBytes measures tail -c 1024 on a 10MB file. +func BenchmarkTailBytes(b *testing.B) { + dir := b.TempDir() + createLargeFileTail(b, dir, "input.txt", "the quick brown fox jumps over the lazy dog\n", 10<<20) + b.ResetTimer() + b.ReportAllocs() + for b.Loop() { + cmdRunBTail(b, "tail -c 1024 input.txt", dir) + } +} + +// TestTailMemoryBounded asserts that tail -n 10 uses O(1) memory regardless of +// input size. The ring buffer slots are reused via append(slot[:0], raw...), +// so no per-line allocation occurs after the first pass fills the ring. +func TestTailMemoryBounded(t *testing.T) { + const line = "the quick brown fox jumps over the lazy dog\n" // 44 bytes + const inputSize = 10 << 20 // 10 MB + + dir := t.TempDir() + createLargeFileTail(t, dir, "input.txt", line, inputSize) + + result := testing.Benchmark(func(b *testing.B) { + b.ReportAllocs() + for b.Loop() { + testutil.RunScriptDiscard(b, "tail -n 10 input.txt", dir, interp.AllowedPaths([]string{dir})) + } + }) + + const maxBytesPerOp = 4 << 20 + if bpo := result.AllocedBytesPerOp(); bpo > maxBytesPerOp { + t.Errorf("tail -n 10 allocated %d bytes/op on %d-byte input; want < %d", bpo, inputSize, maxBytesPerOp) + } +} diff --git a/interp/builtins/testutil/testutil.go b/interp/builtins/testutil/testutil.go index f262617c..bed6dead 100644 --- a/interp/builtins/testutil/testutil.go +++ b/interp/builtins/testutil/testutil.go @@ -10,6 +10,7 @@ import ( "bytes" "context" "errors" + "io" "strings" "testing" @@ -19,9 +20,37 @@ import ( "github.com/DataDog/rshell/interp" ) +// repeatReader is an io.Reader that repeats a fixed line pattern indefinitely. 
+type repeatReader struct { + line []byte + pos int +} + +func (r *repeatReader) Read(p []byte) (int, error) { + n := 0 + for n < len(p) { + if r.pos >= len(r.line) { + r.pos = 0 + } + copied := copy(p[n:], r.line[r.pos:]) + r.pos += copied + n += copied + } + return n, nil +} + +// NewRepeatReader returns an io.Reader that yields the given line pattern +// indefinitely. Use io.LimitReader to cap the total bytes produced. +// It is intended for benchmark setup — generating large synthetic files +// without keeping the full content in memory. +func NewRepeatReader(line string) io.Reader { + return &repeatReader{line: []byte(line)} +} + // RunScriptCtx runs a shell script with a context and returns stdout, stderr, -// and the exit code. -func RunScriptCtx(ctx context.Context, t *testing.T, script, dir string, opts ...interp.RunnerOption) (string, string, int) { +// and the exit code. It accepts testing.TB so it can be used in both tests +// and benchmarks. +func RunScriptCtx(ctx context.Context, t testing.TB, script, dir string, opts ...interp.RunnerOption) (string, string, int) { t.Helper() parser := syntax.NewParser() prog, err := parser.Parse(strings.NewReader(script), "") @@ -51,7 +80,46 @@ func RunScriptCtx(ctx context.Context, t *testing.T, script, dir string, opts .. } // RunScript runs a shell script and returns stdout, stderr, and the exit code. -func RunScript(t *testing.T, script, dir string, opts ...interp.RunnerOption) (string, string, int) { +// It accepts testing.TB so it can be used in both tests and benchmarks. +func RunScript(t testing.TB, script, dir string, opts ...interp.RunnerOption) (string, string, int) { t.Helper() return RunScriptCtx(context.Background(), t, script, dir, opts...) } + +// RunScriptDiscard runs a shell script and returns stderr and the exit code. +// Stdout is discarded (io.Discard). Use this in memory-allocation tests to +// prevent output buffering from dominating the AllocedBytesPerOp measurement. 
+func RunScriptDiscard(t testing.TB, script, dir string, opts ...interp.RunnerOption) (string, int) { + t.Helper() + return RunScriptDiscardCtx(context.Background(), t, script, dir, opts...) +} + +// RunScriptDiscardCtx is RunScriptDiscard with an explicit context. +func RunScriptDiscardCtx(ctx context.Context, t testing.TB, script, dir string, opts ...interp.RunnerOption) (string, int) { + t.Helper() + parser := syntax.NewParser() + prog, err := parser.Parse(strings.NewReader(script), "") + require.NoError(t, err) + + var errBuf bytes.Buffer + allOpts := append([]interp.RunnerOption{interp.StdIO(nil, io.Discard, &errBuf)}, opts...) + runner, err := interp.New(allOpts...) + require.NoError(t, err) + defer runner.Close() + + if dir != "" { + runner.Dir = dir + } + + err = runner.Run(ctx, prog) + exitCode := 0 + if err != nil { + var es interp.ExitStatus + if errors.As(err, &es) { + exitCode = int(es) + } else if ctx.Err() == nil { + t.Fatalf("unexpected error: %v", err) + } + } + return errBuf.String(), exitCode +} diff --git a/interp/builtins/tr/tr_bench_test.go b/interp/builtins/tr/tr_bench_test.go new file mode 100644 index 00000000..327b48ab --- /dev/null +++ b/interp/builtins/tr/tr_bench_test.go @@ -0,0 +1,106 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2026-present Datadog, Inc. 
+ +//go:build !race + +package tr_test + +import ( + "io" + "os" + "path/filepath" + "testing" + + "github.com/DataDog/rshell/interp" + "github.com/DataDog/rshell/interp/builtins/testutil" +) + +func createLargeFileTr(tb testing.TB, dir, filename, line string, totalBytes int) string { + tb.Helper() + path := filepath.Join(dir, filename) + f, err := os.Create(path) + if err != nil { + tb.Fatal(err) + } + defer f.Close() + if _, err := io.Copy(f, io.LimitReader(testutil.NewRepeatReader(line), int64(totalBytes))); err != nil { + tb.Fatal(err) + } + return path +} + +func cmdRunBTr(b *testing.B, script, dir string) (string, string, int) { + b.Helper() + return testutil.RunScript(b, script, dir, interp.AllowedPaths([]string{dir})) +} + +// BenchmarkTrTranslateDiscard measures tr with stdout discarded to isolate +// tr's own allocations from output buffering. Used to calibrate the ceiling +// in TestTrMemoryBounded. +func BenchmarkTrTranslateDiscard(b *testing.B) { + dir := b.TempDir() + createLargeFileTr(b, dir, "input.txt", "the quick brown fox jumps over the lazy dog\n", 10<<20) + b.ResetTimer() + b.ReportAllocs() + for b.Loop() { + testutil.RunScriptDiscard(b, "cat input.txt | tr 'a-z' 'A-Z'", dir, interp.AllowedPaths([]string{dir})) + } +} + +// BenchmarkTrTranslate measures tr 'a-z' 'A-Z' on a 1MB file piped through tr. +// tr reads input from stdin in fixed 32 KiB chunks and translates byte-by-byte +// using a pre-built 256-entry lookup table. +func BenchmarkTrTranslate(b *testing.B) { + dir := b.TempDir() + createLargeFileTr(b, dir, "input.txt", "the quick brown fox jumps over the lazy dog\n", 1<<20) + b.ResetTimer() + b.ReportAllocs() + for b.Loop() { + cmdRunBTr(b, "cat input.txt | tr 'a-z' 'A-Z'", dir) + } +} + +// BenchmarkTrDelete measures tr -d on a 1MB file. 
+func BenchmarkTrDelete(b *testing.B) { + dir := b.TempDir() + createLargeFileTr(b, dir, "input.txt", "the quick brown fox jumps over the lazy dog\n", 1<<20) + b.ResetTimer() + b.ReportAllocs() + for b.Loop() { + cmdRunBTr(b, "cat input.txt | tr -d ' '", dir) + } +} + +// BenchmarkTrSqueeze measures tr -s on a 1MB file. +func BenchmarkTrSqueeze(b *testing.B) { + dir := b.TempDir() + createLargeFileTr(b, dir, "input.txt", "the quick brown fox jumps over the lazy dog\n", 1<<20) + b.ResetTimer() + b.ReportAllocs() + for b.Loop() { + cmdRunBTr(b, "cat input.txt | tr -s ' '", dir) + } +} + +// TestTrMemoryBounded asserts that tr uses O(1) memory regardless of input +// size. tr operates on a 256-entry lookup table built once at startup. Input +// is read in fixed 32 KiB chunks and translated in-place; no allocation is +// proportional to input length. +func TestTrMemoryBounded(t *testing.T) { + dir := t.TempDir() + createLargeFileTr(t, dir, "input.txt", "the quick brown fox jumps over the lazy dog\n", 10<<20) + + result := testing.Benchmark(func(b *testing.B) { + b.ReportAllocs() + for b.Loop() { + testutil.RunScriptDiscard(b, "cat input.txt | tr 'a-z' 'A-Z'", dir, interp.AllowedPaths([]string{dir})) + } + }) + + const maxBytesPerOp = 4 << 20 + if bpo := result.AllocedBytesPerOp(); bpo > maxBytesPerOp { + t.Errorf("tr allocated %d bytes/op on 10MB input; want < %d", bpo, maxBytesPerOp) + } +} diff --git a/interp/builtins/uniq/uniq.go b/interp/builtins/uniq/uniq.go index 4b44598a..6cabbaf9 100644 --- a/interp/builtins/uniq/uniq.go +++ b/interp/builtins/uniq/uniq.go @@ -73,6 +73,7 @@ package uniq import ( "bufio" + "bytes" "context" "io" "math" @@ -282,7 +283,6 @@ func processInput(ctx context.Context, callCtx *builtins.CallContext, r io.Reade sc.Split(makeSplitFunc(cfg.delim)) w := callCtx.Stdout - delimStr := string([]byte{cfg.delim}) reportWrite := func(err error) error { if err != nil { @@ -291,8 +291,16 @@ func processInput(ctx context.Context, callCtx 
*builtins.CallContext, r io.Reade return err } - var prevLine string - var prevKey string + writeLine := func(line []byte) error { + if _, err := w.Write(line); err != nil { + return err + } + _, err := w.Write([]byte{cfg.delim}) + return err + } + + var prevLine []byte + var prevKey []byte var lineCount int64 first := true groupNum := 0 @@ -301,77 +309,77 @@ func processInput(ctx context.Context, callCtx *builtins.CallContext, r io.Reade if ctx.Err() != nil { return ctx.Err() } - curLine := sc.Text() - curKey := compareKey(curLine, cfg) + curBytes := sc.Bytes() + curKey := compareKeyBytes(curBytes, cfg) if first { - prevLine = curLine - prevKey = curKey + prevLine = append(prevLine[:0], curBytes...) + prevKey = append(prevKey[:0], curKey...) lineCount = 1 first = false if cfg.useGroup { if cfg.grpMethod == groupPrepend || cfg.grpMethod == groupBoth { - if err := reportWrite(writeStr(w, delimStr)); err != nil { + if err := reportWrite(writeLine(nil)); err != nil { return err } } - if err := reportWrite(writeStr(w, curLine+delimStr)); err != nil { + if err := reportWrite(writeLine(prevLine)); err != nil { return err } } continue } - same := prevKey == curKey + same := bytes.Equal(prevKey, curKey) if same { if lineCount < math.MaxInt64 { lineCount++ } if cfg.useGroup { - if err := reportWrite(writeStr(w, curLine+delimStr)); err != nil { + if err := reportWrite(writeLine(curBytes)); err != nil { return err } } else if cfg.useAllRepeated { if lineCount == 2 { if groupNum > 0 && cfg.arMethod != allRepeatedNone { - if err := reportWrite(writeStr(w, delimStr)); err != nil { + if err := reportWrite(writeLine(nil)); err != nil { return err } } if groupNum == 0 && cfg.arMethod == allRepeatedPrepend { - if err := reportWrite(writeStr(w, delimStr)); err != nil { + if err := reportWrite(writeLine(nil)); err != nil { return err } } - if err := reportWrite(writeStr(w, prevLine+delimStr)); err != nil { + if err := reportWrite(writeLine(prevLine)); err != nil { return err } 
groupNum++ } - if err := reportWrite(writeStr(w, curLine+delimStr)); err != nil { + if err := reportWrite(writeLine(curBytes)); err != nil { return err } } } else { if cfg.useGroup { - if err := reportWrite(writeStr(w, delimStr)); err != nil { + if err := reportWrite(writeLine(nil)); err != nil { return err } - if err := reportWrite(writeStr(w, curLine+delimStr)); err != nil { + if err := reportWrite(writeLine(curBytes)); err != nil { return err } groupNum++ } else if cfg.useAllRepeated { // Nothing to do — non-repeated last group is simply dropped. } else { - if err := reportWrite(emitStandard(w, cfg, prevLine, lineCount, delimStr)); err != nil { + if err := reportWrite(emitStandard(w, cfg, prevLine, lineCount)); err != nil { return err } } - prevLine = curLine - prevKey = curKey + prevLine = append(prevLine[:0], curBytes...) + prevKey = append(prevKey[:0], curKey...) lineCount = 1 } } @@ -388,17 +396,17 @@ func processInput(ctx context.Context, callCtx *builtins.CallContext, r io.Reade // Flush last group. 
if cfg.useGroup { if cfg.grpMethod == groupAppend || cfg.grpMethod == groupBoth { - return reportWrite(writeStr(w, delimStr)) + return reportWrite(writeLine(nil)) } return nil } if cfg.useAllRepeated { return nil } - return reportWrite(emitStandard(w, cfg, prevLine, lineCount, delimStr)) + return reportWrite(emitStandard(w, cfg, prevLine, lineCount)) } -func emitStandard(w io.Writer, cfg *uniqConfig, line string, count int64, delimStr string) error { +func emitStandard(w io.Writer, cfg *uniqConfig, line []byte, count int64) error { if cfg.repeated && cfg.unique { return nil } @@ -413,22 +421,30 @@ func emitStandard(w io.Writer, cfg *uniqConfig, line string, count int64, delimS for len(s) < countFieldWidth { s = " " + s } - return writeStr(w, s+" "+line+delimStr) + if _, err := io.WriteString(w, s+" "); err != nil { + return err + } + if _, err := w.Write(line); err != nil { + return err + } + _, err := w.Write([]byte{cfg.delim}) + return err } - return writeStr(w, line+delimStr) -} - -func writeStr(w io.Writer, s string) error { - _, err := io.WriteString(w, s) + if _, err := w.Write(line); err != nil { + return err + } + _, err := w.Write([]byte{cfg.delim}) return err } -// compareKey extracts the portion of line used for comparison, applying +// compareKeyBytes extracts the portion of line used for comparison, applying // field skipping, char skipping, check-chars, and case folding. -func compareKey(line string, cfg *uniqConfig) string { +// For the ignore-case path it returns a newly allocated lowercased copy; +// otherwise it returns a subslice of line (no allocation). 
+func compareKeyBytes(line []byte, cfg *uniqConfig) []byte { s := line if cfg.skipFields > 0 { - s = skipFieldsN(s, cfg.skipFields) + s = skipFieldsBytesN(s, cfg.skipFields) } if cfg.skipChars > 0 && len(s) > 0 { skip := cfg.skipChars @@ -441,37 +457,28 @@ func compareKey(line string, cfg *uniqConfig) string { s = s[:cfg.checkChars] } if cfg.ignoreCase { - s = asciiToLower(s) + s = asciiToLowerBytes(s) } return s } -// asciiToLower folds only ASCII A-Z to a-z, matching GNU uniq behavior -// in the default C/POSIX locale. Unlike strings.ToLower, this does not -// apply Unicode case folding, so non-ASCII characters are left unchanged. -func asciiToLower(s string) string { - for i := 0; i < len(s); i++ { - if s[i] >= 'A' && s[i] <= 'Z' { - b := make([]byte, len(s)) - copy(b, s[:i]) - b[i] = s[i] + ('a' - 'A') - for j := i + 1; j < len(s); j++ { - c := s[j] - if c >= 'A' && c <= 'Z' { - c += 'a' - 'A' - } - b[j] = c - } - return string(b) +// asciiToLowerBytes folds only ASCII A-Z to a-z in a byte slice, matching GNU +// uniq behavior in the default C/POSIX locale. It always returns a new copy. +func asciiToLowerBytes(s []byte) []byte { + b := make([]byte, len(s)) + for i, c := range s { + if c >= 'A' && c <= 'Z' { + c += 'a' - 'A' } + b[i] = c } - return s + return b } -// skipFieldsN skips the first n blank-delimited fields and returns the -// remainder of the string, starting immediately after the last character +// skipFieldsBytesN skips the first n blank-delimited fields in a byte slice +// and returns the remainder, starting immediately after the last character // of the n-th field (before any subsequent blanks). 
-func skipFieldsN(s string, n int64) string { +func skipFieldsBytesN(s []byte, n int64) []byte { i := 0 for field := int64(0); field < n && i < len(s); field++ { for i < len(s) && (s[i] == ' ' || s[i] == '\t') { diff --git a/interp/builtins/uniq/uniq_bench_test.go b/interp/builtins/uniq/uniq_bench_test.go new file mode 100644 index 00000000..beb3ce5b --- /dev/null +++ b/interp/builtins/uniq/uniq_bench_test.go @@ -0,0 +1,80 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2026-present Datadog, Inc. + +//go:build !race + +package uniq_test + +import ( + "io" + "os" + "path/filepath" + "testing" + + "github.com/DataDog/rshell/interp" + "github.com/DataDog/rshell/interp/builtins/testutil" +) + +func createLargeFileUniq(tb testing.TB, dir, filename, line string, totalBytes int) string { + tb.Helper() + path := filepath.Join(dir, filename) + f, err := os.Create(path) + if err != nil { + tb.Fatal(err) + } + defer f.Close() + if _, err := io.Copy(f, io.LimitReader(testutil.NewRepeatReader(line), int64(totalBytes))); err != nil { + tb.Fatal(err) + } + return path +} + +func cmdRunBUniq(b *testing.B, script, dir string) (string, string, int) { + b.Helper() + return testutil.RunScript(b, script, dir, interp.AllowedPaths([]string{dir})) +} + +// BenchmarkUniq measures uniq on a 10MB file of identical lines (all deduplicated to one). +func BenchmarkUniq(b *testing.B) { + dir := b.TempDir() + createLargeFileUniq(b, dir, "input.txt", "the quick brown fox jumps over the lazy dog\n", 10<<20) + b.ResetTimer() + b.ReportAllocs() + for b.Loop() { + cmdRunBUniq(b, "uniq input.txt", dir) + } +} + +// BenchmarkUniqCount measures uniq -c on a 10MB file. 
+func BenchmarkUniqCount(b *testing.B) { + dir := b.TempDir() + createLargeFileUniq(b, dir, "input.txt", "the quick brown fox jumps over the lazy dog\n", 10<<20) + b.ResetTimer() + b.ReportAllocs() + for b.Loop() { + cmdRunBUniq(b, "uniq -c input.txt", dir) + } +} + +// TestUniqMemoryBounded asserts that uniq uses O(1) memory when processing +// large files. uniq is a streaming command: only the current and previous lines +// are kept in memory at any time (live heap is O(1)) and sc.Bytes() avoids +// per-line string allocations. +func TestUniqMemoryBounded(t *testing.T) { + dir := t.TempDir() + createLargeFileUniq(t, dir, "input.txt", "the quick brown fox jumps over the lazy dog\n", 10<<20) + + result := testing.Benchmark(func(b *testing.B) { + b.ReportAllocs() + for b.Loop() { + testutil.RunScriptDiscard(b, "uniq input.txt", dir, interp.AllowedPaths([]string{dir})) + } + }) + + const maxBytesPerOp = 4 << 20 + if bpo := result.AllocedBytesPerOp(); bpo > maxBytesPerOp { + t.Errorf("uniq allocated %d bytes/op on 10MB input; want < %d", bpo, maxBytesPerOp) + } +} diff --git a/interp/builtins/wc/wc_bench_test.go b/interp/builtins/wc/wc_bench_test.go new file mode 100644 index 00000000..2411354a --- /dev/null +++ b/interp/builtins/wc/wc_bench_test.go @@ -0,0 +1,79 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2026-present Datadog, Inc. + +//go:build !race + +package wc_test + +import ( + "io" + "os" + "path/filepath" + "testing" + + "github.com/DataDog/rshell/interp" + "github.com/DataDog/rshell/interp/builtins/testutil" +) + +// createLargeFileWc writes totalBytes of repeating content to dir/filename. 
+func createLargeFileWc(tb testing.TB, dir, filename, line string, totalBytes int) string { + tb.Helper() + path := filepath.Join(dir, filename) + f, err := os.Create(path) + if err != nil { + tb.Fatal(err) + } + defer f.Close() + if _, err := io.Copy(f, io.LimitReader(testutil.NewRepeatReader(line), int64(totalBytes))); err != nil { + tb.Fatal(err) + } + return path +} + +// cmdRunBWc runs a wc command with AllowedPaths set to dir (bench variant). +func cmdRunBWc(b *testing.B, script, dir string) (string, string, int) { + b.Helper() + return testutil.RunScript(b, script, dir, interp.AllowedPaths([]string{dir})) +} + +// BenchmarkWcLines measures wc -l on a 10MB file. +func BenchmarkWcLines(b *testing.B) { + dir := b.TempDir() + createLargeFileWc(b, dir, "input.txt", "the quick brown fox jumps over the lazy dog\n", 10<<20) + b.ResetTimer() + b.ReportAllocs() + for b.Loop() { + cmdRunBWc(b, "wc -l input.txt", dir) + } +} + +// BenchmarkWcAll measures wc (all counts) on a 10MB file. +func BenchmarkWcAll(b *testing.B) { + dir := b.TempDir() + createLargeFileWc(b, dir, "input.txt", "the quick brown fox jumps over the lazy dog\n", 10<<20) + b.ResetTimer() + b.ReportAllocs() + for b.Loop() { + cmdRunBWc(b, "wc input.txt", dir) + } +} + +// TestWcMemoryBounded asserts that wc uses O(1) memory regardless of file size. 
+func TestWcMemoryBounded(t *testing.T) {
+	dir := t.TempDir()
+	createLargeFileWc(t, dir, "input.txt", "the quick brown fox jumps over the lazy dog\n", 10<<20)
+
+	result := testing.Benchmark(func(b *testing.B) {
+		b.ReportAllocs()
+		for b.Loop() {
+			cmdRunBWc(b, "wc -l input.txt", dir)
+		}
+	})
+
+	const maxBytesPerOp = 1 << 20 // 1MB ceiling for a streaming counter
+	if bpo := result.AllocedBytesPerOp(); bpo > maxBytesPerOp {
+		t.Errorf("wc -l allocated %d bytes/op on 10MB input; want < %d", bpo, maxBytesPerOp)
+	}
+}
diff --git a/tests/allowed_symbols_test.go b/tests/allowed_symbols_test.go
index 7a8adffb..38ce08c7 100644
--- a/tests/allowed_symbols_test.go
+++ b/tests/allowed_symbols_test.go
@@ -38,6 +38,8 @@ var builtinAllowedSymbols = []string{
 	"bufio.NewScanner",
 	// bufio.Scanner — scanner type for buffered input reading; no write or exec capability.
 	"bufio.Scanner",
 	// bufio.SplitFunc — type for custom scanner split functions; pure type, no I/O.
 	"bufio.SplitFunc",
+	// bytes.Equal — compares two byte slices for equality; pure function, no I/O.
+	"bytes.Equal",
 	// context.Context — deadline/cancellation plumbing; pure interface, no side effects.